In [165]:
# Configuration to automatically reload modified modules
%load_ext autoreload
%autoreload 2

# This allows changes in imported modules to be reflected automatically
# without needing to restart the kernel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [166]:
import nltk
import pandas as pd
from nltk.corpus import treebank
from pos_spacy import spacy_pos
from standford_pos_nltk import stanford_pos
from test_corpus import accuracy, compare_pos_taggers, extract_raw_text, extract_tokens_from_gold


# Check if treebank corpus is available, if not, download it
try:
    treebank.tagged_sents()
except LookupError:
    print("Downloading NLTK treebank corpus...")
    nltk.download("treebank")

In [167]:
gold = treebank.tagged_sents()[:15]  # POS PTB

In [168]:
raw_text = extract_raw_text(gold)

In [169]:
# Tokenize the raw text for both taggers
tokenized_text = extract_tokens_from_gold(gold)

In [170]:
pos_tag_output_st_nltk = stanford_pos(tokenized_text)
pos_tag_output_spacy = spacy_pos(tokenized_text)
mismatches = compare_pos_taggers(pos_tag_output_st_nltk, pos_tag_output_spacy)

In [171]:
pd.DataFrame(mismatches).drop_duplicates()

Unnamed: 0,word1,tag1,word2,tag2
0,*-1,CD,*-1,NNP
1,*T*-1,NN,*T*-1,UH
2,*T*-1,NN,*T*-1,RB
3,*T*-2,NN,*T*-2,UH
4,that,WDT,that,IN
5,*T*-2,NN,*T*-2,NNP
6,*-2,CD,*-2,NNP
7,Neither,CC,Neither,DT
8,*T*-3,VBP,*T*-3,NNP
9,studied,VBN,studied,VBD


In [172]:
acc_st_nltk, mismatches_st_nltk = accuracy(pos_tag_output_st_nltk, gold, return_mismatches=True)
acc_spacy, mismatches_spacy = accuracy(pos_tag_output_spacy, gold, return_mismatches=True)
print(f"Accuracy (Stanford NLTK): {acc_st_nltk:.4f}")
print(f"Accuracy (spaCy): {acc_spacy:.4f}")

Accuracy (Stanford NLTK): 0.8856
Accuracy (spaCy): 0.8883


In [173]:
pd.DataFrame(mismatches_st_nltk).drop_duplicates()

Unnamed: 0,word,model_tag,gold_word,gold_tag
0,Dutch,JJ,Dutch,NNP
1,publishing,NN,publishing,VBG
2,*-1,CD,*-1,-NONE-
3,used,VBD,used,VBN
4,*,NFP,*,-NONE-
7,to,IN,to,TO
8,more,JJR,more,RBR
9,ago,RB,ago,IN
10,0,CD,0,-NONE-
11,*T*-1,NN,*T*-1,-NONE-


In [174]:
pd.DataFrame(mismatches_spacy).drop_duplicates()

Unnamed: 0,word,model_tag,gold_word,gold_tag
0,Dutch,JJ,Dutch,NNP
1,publishing,NN,publishing,VBG
2,*-1,NNP,*-1,-NONE-
3,used,VBD,used,VBN
4,*,NFP,*,-NONE-
7,to,IN,to,TO
8,more,JJR,more,RBR
9,ago,RB,ago,IN
10,0,CD,0,-NONE-
11,*T*-1,UH,*T*-1,-NONE-
