In [1]:
import nltk
from nltk.corpus import conll2000

import kg.ner.utils as utils
from kg.ner.unsupervised import NounPhraseDetection, EntityDetection

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tmorrill002/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tmorrill002/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/tmorrill002/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tmorrill002/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data_directory = '/Users/tmorrill002/Documents/datasets/conll/transformed'

In [3]:
df_dict = utils.load_train_data(data_directory)

## What fraction of NER tags are Noun Phrases?

In [4]:
train_df = df_dict['train.csv']

In [5]:
def nouns_over_NER(df, noun_col = 'Chunk_Tag', ner_col = 'NER_Tag_Normalized'):
    ner_df = df[df[ner_col] != 'O']
    noun_phrase_token_count = len(ner_df[(ner_df[noun_col] == 'I-NP') | (ner_df[noun_col] == 'B-NP')])
    print(f'Count of noun phrase tokens among NER tokens: {noun_phrase_token_count}')
    print(f'Count of NER tokens: {len(ner_df)}')
    print(f'Percent of NER tokens that are part of noun phrases: {round(noun_phrase_token_count / len(ner_df),4) * 100}%')

In [6]:
nouns_over_NER(train_df)

Count of noun phrase tokens among NER tokens: 33054
Count of NER tokens: 34043
Percent of NER tokens that are part of noun phrases: 97.09%


## What fraction of Noun Phrase tokens are NER tagged?

In [7]:
def NER_over_nouns(df, noun_col = 'Chunk_Tag', ner_col = 'NER_Tag_Normalized'):
    noun_phrase_df = df[(df[noun_col] == 'I-NP') | (df[noun_col] == 'B-NP')]
    ner_tag_token_count = len(noun_phrase_df[noun_phrase_df[ner_col] != 'O'])
    print(f'Count of NER tokens among noun phrase tokens: {ner_tag_token_count}')
    print(f'Count of noun phrase tokens: {len(noun_phrase_df)}')
    print(f'Percent of noun phrase tokens that are part of NER tags: {round(ner_tag_token_count / len(noun_phrase_df), 4) * 100}%')

In [8]:
NER_over_nouns(train_df)

Count of NER tokens among noun phrase tokens: 33054
Count of noun phrase tokens: 124032
Percent of noun phrase tokens that are part of NER tags: 26.650000000000002%


### Conclusions:
1. NER Tags are almost exclusively noun phrases (97%) -> noun phrase candidates will yield high recall
2. Noun phrases encompass a lot more than NER tags -> noun phrase candidates will yield low precision and other techniques should be used to reduce the number of false positives

### Evaluate Unsupervised Noun Phrase Detection Against CoNLL-2000 and CoNLL-2003

In [9]:
conll_2000_test_sentences = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

In [10]:
chunk_parser = NounPhraseDetection()

In [11]:
print(chunk_parser.evaluate(conll_2000_test_sentences))

ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     70.6%%
    Recall:        67.8%%
    F-Measure:     69.2%%


In [12]:
def consolidate(x):
    return (x['Token'], x['POS_Tag'], x['Chunk_Tag'])

In [13]:
train_df['Tags'] = train_df.apply(consolidate, axis=1) #passes a Series object, row-wise

In [14]:
# ground truth labeled data containing (token, pos, chunk), used for evaluation
train_conll_2003 = train_df.groupby(['Article_ID', 'Sentence_ID'], )['Tags'].apply(list).values.tolist()

In [15]:
# pos tagged sentences only containing (token, pos), used to make predictions
train_conll_2003_pos_tags = []
for sentence in train_conll_2003:
    temp_sentence = []
    for token, pos, chunk in sentence:
        temp_sentence.append((token, pos))
    train_conll_2003_pos_tags.append(temp_sentence)

In [16]:
# examples must be in "tree" format to use evaluation code
trees = []
for example in train_conll_2003:
    trees.append(nltk.chunk.conlltags2tree(example))

In [17]:
# getting harmed here because B-NP tags aren't marked appropriately in CoNLL-2003
print(chunk_parser.evaluate(trees))

ChunkParse score:
    IOB Accuracy:  67.4%%
    Precision:     80.5%%
    Recall:        43.7%%
    F-Measure:     56.7%%


In [18]:
prediction_trees = []
for example in train_conll_2003_pos_tags:
    prediction_trees.append(chunk_parser.parse(example))

In [19]:
predictions = []
for sentence in prediction_trees:
    predictions.append(nltk.chunk.tree2conlltags(sentence))

In [20]:
flattened = [prediction for sentence in predictions for prediction in sentence ]

In [21]:
train_df['Prediction'] = flattened

In [22]:
train_df['Prediction'] = train_df['Prediction'].apply(lambda x: x[2])

In [23]:
train_df['Noun_Phrase'] = train_df['Prediction'] != 'O'

In [24]:
nouns_over_NER(train_df, 'Prediction', 'NER_Tag_Normalized')

Count of noun phrase tokens among NER tokens: 33030
Count of NER tokens: 34043
Percent of NER tokens that are part of noun phrases: 97.02%


In [25]:
NER_over_nouns(train_df, 'Prediction', 'NER_Tag_Normalized')

Count of NER tokens among noun phrase tokens: 33030
Count of noun phrase tokens: 124428
Percent of noun phrase tokens that are part of NER tags: 26.55%


In [26]:
train_df['NP_Chunk_Tag'] = (train_df['Chunk_Tag'] == 'I-NP') | (train_df['Chunk_Tag'] == 'B-NP')

In [27]:
# agreement between existing ConLL-2003 noun phrase tags and the predicted tags
(train_df['Noun_Phrase'] == train_df['NP_Chunk_Tag']).sum() / len(train_df)

0.9570476522559068

## Evaluate TF-IDF Rankings

In [28]:
entity_extractor = EntityDetection(chunk_parser)

In [29]:
train_df

Unnamed: 0,Article_ID,Sentence_ID,Token_ID,Token,POS_Tag,Chunk_Tag,NER_Tag,NER_Tag_ID,NER_Tag_Normalized,Tags,Prediction,Noun_Phrase,NP_Chunk_Tag
0,0,0,0,EU,NNP,I-NP,I-ORG,0.0,ORG,"(EU, NNP, I-NP)",B-NP,True,True
1,0,0,1,rejects,VBZ,I-VP,O,,O,"(rejects, VBZ, I-VP)",O,False,False
2,0,0,2,German,JJ,I-NP,I-MISC,1.0,MISC,"(German, JJ, I-NP)",B-NP,True,True
3,0,0,3,call,NN,I-NP,O,,O,"(call, NN, I-NP)",I-NP,True,True
4,0,0,4,to,TO,I-VP,O,,O,"(to, TO, I-VP)",O,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
203616,945,6,1,three,CD,I-NP,O,,O,"(three, CD, I-NP)",I-NP,True,True
203617,945,7,0,Swansea,NN,I-NP,I-ORG,23497.0,ORG,"(Swansea, NN, I-NP)",B-NP,True,True
203618,945,7,1,1,CD,I-NP,O,,O,"(1, CD, I-NP)",I-NP,True,True
203619,945,7,2,Lincoln,NNP,I-NP,I-ORG,23498.0,ORG,"(Lincoln, NNP, I-NP)",I-NP,True,True


In [30]:
# gather up articles
articles = train_df.groupby(['Article_ID'], )['Token'].apply(lambda x: ' '.join([str(y) for y in list(x)])).values.tolist()

In [31]:
entity_extractor.fit_tfidf(articles)

In [32]:
candidate_phrases = []
for article in articles:
    for candidate in entity_extractor.candidates(article):
        candidate_phrases.append(candidate)

In [33]:
# look at first article
candidates = entity_extractor.candidates(articles[0])

In [34]:
scores = entity_extractor.score_phrases(candidates)

In [41]:
scored = list(set(zip(candidates, scores.tolist())))

In [42]:
scored.sort(key=lambda x: x[1], reverse=True)

In [51]:
train_df[train_df['Article_ID'] == 0].iloc[40:50]

Unnamed: 0,Article_ID,Sentence_ID,Token_ID,Token,POS_Tag,Chunk_Tag,NER_Tag,NER_Tag_ID,NER_Tag_Normalized,Tags,Prediction,Noun_Phrase,NP_Chunk_Tag
40,0,3,27,to,TO,I-PP,O,,O,"(to, TO, I-PP)",O,False,False
41,0,3,28,sheep,NN,I-NP,O,,O,"(sheep, NN, I-NP)",B-NP,True,True
42,0,3,29,.,.,O,O,,O,"(., ., O)",O,False,False
43,0,4,0,Germany,NNP,I-NP,I-LOC,8.0,LOC,"(Germany, NNP, I-NP)",B-NP,True,True
44,0,4,1,'s,POS,B-NP,O,,O,"('s, POS, B-NP)",I-NP,True,True
45,0,4,2,representative,NN,I-NP,O,,O,"(representative, NN, I-NP)",I-NP,True,True
46,0,4,3,to,TO,I-PP,O,,O,"(to, TO, I-PP)",O,False,False
47,0,4,4,the,DT,I-NP,O,,O,"(the, DT, I-NP)",B-NP,True,True
48,0,4,5,European,NNP,I-NP,I-ORG,9.0,ORG,"(European, NNP, I-NP)",I-NP,True,True
49,0,4,6,Union,NNP,I-NP,I-ORG,9.0,ORG,"(Union, NNP, I-NP)",I-NP,True,True


In [44]:
articles[0]

'EU rejects German call to boycott British lamb . Peter Blackburn BRUSSELS 1996-08-22 The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep . Germany \'s representative to the European Union \'s veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer . " We do n\'t support any such recommendation because we do n\'t see any grounds for it , " the Commission \'s chief spokesman Nikolaus van der Pas told a news briefing . He said further scientific study was required and if it was found that action was needed it should be taken by the European Union . He said a proposal last month by EU Farm Commissioner Franz Fischler to ban sheep brains , spleens and spinal cords from the human and animal food chains was a highly specific and precautionary move to protec

In [43]:
scored

[('careful', 7.160151912626133),
 ('clearer', 7.160151912626133),
 ('Palacio', 7.160151912626133),
 ('NFU', 7.160151912626133),
 ('Fischler', 6.754686804517968),
 ('recommendations', 6.754686804517968),
 ('sheepmeat', 6.754686804517968),
 ('unjustified alarm', 6.702006546689056),
 ('sheep brains', 6.610845768292078),
 ('humans', 6.467004732066187),
 ('consumers', 6.467004732066187),
 ('dangerous generalisation', 6.264272178012106),
 ("Fischler 's proposal", 6.264272178012105),
 ('scientists', 6.243861180751978),
 ('BSE', 6.243861180751978),
 ('scientific study', 6.120431141786215),
 ('animal waste', 6.061539623958024),
 ('EU Farm Commissioner Franz Fischler', 5.911075961208344),
 ('contract Bovine Spongiform Encephalopathy', 5.910110914068893),
 ('beef', 5.907388944130765),
 ('laboratory conditions sheep', 5.883938113796435),
 ('contract scrapie', 5.831773459268803),
 ('BBC radio', 5.758471722172866),
 ('mad cow disease', 5.643951967792901),
 ('a slight risk', 5.57112499745216),
 ('EU-