In [None]:
# Configuration to automatically reload modified modules
%load_ext autoreload
%autoreload 2

# This allows changes in imported modules to be reflected automatically
# without needing to restart the kernel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [224]:
import nltk
import pandas as pd
from nltk.corpus import treebank
from pos_spacy import spacy_pos
from standford_pos_nltk import stanford_pos
from test_corpus import (
    accuracy,
    compare_pos_taggers,
    drop_traces_tagged,
    extract_raw_text,
    extract_tokens_from_gold,
)


# Check if treebank corpus is available, if not, download it
try:
    treebank.tagged_sents()
except LookupError:
    print("Downloading NLTK treebank corpus...")
    nltk.download("treebank")

In [225]:
gold = treebank.tagged_sents()[:15]  # POS PTB
gold_no_traces = drop_traces_tagged(gold)

In [226]:
extract_raw_text(gold_no_traces)

"Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29. Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group. Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, was named a nonexecutive director of this British industrial conglomerate. A form of asbestos once used to make Kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than 30 years ago, researchers reported. The asbestos fiber, crocidolite, is unusually resilient once it enters the lungs, with even brief exposures to it causing symptoms that show up decades later, researchers said. Lorillard Inc., the unit of New York-based Loews Corp. that makes Kent cigarettes, stopped using crocidolite in its Micronite cigarette filters in 1956. Although preliminary findings were reported more than a year ago, the latest results appear in today 's New England Journal of Medicine, a forum likely to bring new att

In [227]:
# Tokenize the raw text for both taggers
tokenized_text = extract_tokens_from_gold(gold_no_traces)

In [228]:
pos_tag_output_st_nltk = stanford_pos(tokenized_text)
pos_tag_output_spacy = spacy_pos(tokenized_text)
mismatches = compare_pos_taggers(pos_tag_output_st_nltk, pos_tag_output_spacy)

In [229]:
pd.DataFrame(mismatches).drop_duplicates()

Unnamed: 0,word1,tag1,word2,tag2
0,used,VBD,used,VBN
1,Neither,CC,Neither,DT
2,replaced,VBN,replaced,VBD


In [230]:
# Accuracy (Stanford NLTK): 0.9451
# Accuracy (spaCy): 0.9480

In [231]:
acc_st_nltk, mismatches_st_nltk = accuracy(
    pos_tag_output_st_nltk, gold_no_traces, return_mismatches=True
)  # type: ignore
acc_spacy, mismatches_spacy = accuracy(pos_tag_output_spacy, gold_no_traces, return_mismatches=True)  # type: ignore
print(f"Accuracy (Stanford NLTK): {acc_st_nltk:.4f}")
print(f"Accuracy (spaCy): {acc_spacy:.4f}")

Accuracy (Stanford NLTK): 0.9451
Accuracy (spaCy): 0.9480


In [232]:
mismatches_st_nltk_df = pd.DataFrame(mismatches_st_nltk).drop_duplicates()
mismatches_spacy_df = pd.DataFrame(mismatches_spacy).drop_duplicates()

In [235]:
mismatches_spacy_df

Unnamed: 0,word,model_tag,gold_word,gold_tag
0,Dutch,JJ,Dutch,NNP
1,publishing,NN,publishing,VBG
2,to,IN,to,TO
3,more,JJR,more,RBR
4,ago,RB,ago,IN
6,later,RB,later,JJ
7,New,NNP,New,JJ
8,York-based,NNP,York-based,JJ
9,Micronite,NNP,Micronite,NN
14,heard,VBN,heard,VBD


In [234]:
mismatches_spacy_df

Unnamed: 0,word,model_tag,gold_word,gold_tag
0,Dutch,JJ,Dutch,NNP
1,publishing,NN,publishing,VBG
2,to,IN,to,TO
3,more,JJR,more,RBR
4,ago,RB,ago,IN
6,later,RB,later,JJ
7,New,NNP,New,JJ
8,York-based,NNP,York-based,JJ
9,Micronite,NNP,Micronite,NN
14,heard,VBN,heard,VBD
