In [159]:
import nltk
from nltk.corpus import conll2000
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

import kg.ner.utils as utils
from kg.ner.unsupervised import NounPhraseDetection, EntityDetection

In [2]:
data_directory = '/Users/tmorrill002/Documents/datasets/conll/transformed'

In [3]:
df_dict = utils.load_train_data(data_directory)

## What fraction of NER tags are Noun Phrases?

In [4]:
train_df = df_dict['train.csv']

In [5]:
def nouns_over_NER(df, noun_col = 'Chunk_Tag', ner_col = 'NER_Tag_Normalized'):
    ner_df = df[df[ner_col] != 'O']
    noun_phrase_token_count = len(ner_df[(ner_df[noun_col] == 'I-NP') | (ner_df[noun_col] == 'B-NP')])
    print(f'Count of noun phrase tokens among NER tokens: {noun_phrase_token_count}')
    print(f'Count of NER tokens: {len(ner_df)}')
    print(f'Percent of NER tokens that are part of noun phrases: {round(noun_phrase_token_count / len(ner_df),4) * 100}%')

In [6]:
nouns_over_NER(train_df)

Count of noun phrase tokens among NER tokens: 33054
Count of NER tokens: 34043
Percent of NER tokens that are part of noun phrases: 97.09%


## What fraction of Noun Phrase tokens are NER tagged?

In [7]:
def NER_over_nouns(df, noun_col = 'Chunk_Tag', ner_col = 'NER_Tag_Normalized'):
    noun_phrase_df = df[(df[noun_col] == 'I-NP') | (df[noun_col] == 'B-NP')]
    ner_tag_token_count = len(noun_phrase_df[noun_phrase_df[ner_col] != 'O'])
    print(f'Count of NER tokens among noun phrase tokens: {ner_tag_token_count}')
    print(f'Count of noun phrase tokens: {len(noun_phrase_df)}')
    print(f'Percent of noun phrase tokens that are part of NER tags: {round(ner_tag_token_count / len(noun_phrase_df), 4) * 100}%')

In [8]:
NER_over_nouns(train_df)

Count of NER tokens among noun phrase tokens: 33054
Count of noun phrase tokens: 124032
Percent of noun phrase tokens that are part of NER tags: 26.650000000000002%


### Conclusions:
1. NER Tags are almost exclusively noun phrases (97%) -> noun phrase candidates will yield high recall
2. Noun phrases encompass a lot more than NER tags -> noun phrase candidates will yield low precision and other techniques should be used to reduce the number of false positives

### Evaluate Unsupervised Noun Phrase Detection Against CoNLL-2000 and CoNLL-2003

In [9]:
conll_2000_test_sentences = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

In [10]:
chunk_parser = NounPhraseDetection()

In [11]:
print(chunk_parser.evaluate(conll_2000_test_sentences))

ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     70.6%%
    Recall:        67.8%%
    F-Measure:     69.2%%


In [12]:
def consolidate(x):
    return (x['Token'], x['POS_Tag'], x['Chunk_Tag'])

In [13]:
train_df['Tags'] = train_df.apply(consolidate, axis=1) #passes a Series object, row-wise

In [14]:
# ground truth labeled data containing (token, pos, chunk), used for evaluation
train_conll_2003 = train_df.groupby(['Article_ID', 'Sentence_ID'], )['Tags'].apply(list).values.tolist()

In [15]:
# pos tagged sentences only containing (token, pos), used to make predictions
train_conll_2003_pos_tags = []
for sentence in train_conll_2003:
    temp_sentence = []
    for token, pos, chunk in sentence:
        temp_sentence.append((token, pos))
    train_conll_2003_pos_tags.append(temp_sentence)

In [16]:
# examples must be in "tree" format to use evaluation code
trees = []
for example in train_conll_2003:
    trees.append(nltk.chunk.conlltags2tree(example))

In [17]:
# getting harmed here because B-NP tags aren't marked appropriately in CoNLL-2003
print(chunk_parser.evaluate(trees))

ChunkParse score:
    IOB Accuracy:  67.4%%
    Precision:     80.5%%
    Recall:        43.7%%
    F-Measure:     56.7%%


In [18]:
prediction_trees = []
for example in train_conll_2003_pos_tags:
    prediction_trees.append(chunk_parser.parse(example))

In [19]:
predictions = []
for sentence in prediction_trees:
    predictions.append(nltk.chunk.tree2conlltags(sentence))

In [20]:
flattened = [prediction for sentence in predictions for prediction in sentence ]

In [21]:
train_df['Prediction'] = flattened

In [22]:
train_df['Prediction'] = train_df['Prediction'].apply(lambda x: x[2])

In [23]:
train_df['Noun_Phrase'] = train_df['Prediction'] != 'O'

In [24]:
nouns_over_NER(train_df, 'Prediction', 'NER_Tag_Normalized')

Count of noun phrase tokens among NER tokens: 33030
Count of NER tokens: 34043
Percent of NER tokens that are part of noun phrases: 97.02%


In [25]:
NER_over_nouns(train_df, 'Prediction', 'NER_Tag_Normalized')

Count of NER tokens among noun phrase tokens: 33030
Count of noun phrase tokens: 124428
Percent of noun phrase tokens that are part of NER tags: 26.55%


In [26]:
train_df['NP_Chunk_Tag'] = (train_df['Chunk_Tag'] == 'I-NP') | (train_df['Chunk_Tag'] == 'B-NP')

In [27]:
# agreement between existing ConLL-2003 noun phrase tags and the predicted tags
(train_df['Noun_Phrase'] == train_df['NP_Chunk_Tag']).sum() / len(train_df)

0.9570476522559068

## Evaluate TF-IDF Rankings

In [28]:
entity_extractor = EntityDetection(chunk_parser)

In [65]:
train_df['NER_Tag_Flag'] = train_df['NER_Tag'] != 'O'

In [66]:
# gather up articles
articles = train_df.groupby(['Article_ID'], )['Token'].apply(lambda x: ' '.join([str(y) for y in list(x)])).values.tolist()

In [67]:
entity_extractor.fit_tfidf(articles)

In [108]:
article = article.split()

In [120]:
candidates = []
for article in articles:
    # manually tokenize because nltk tokenizer is converting 'C$' -> ['C', '$'] and throwing off comparison
    sentences = utils.tokenize_text(article)
    article = [sentence.split() for sentence in sentences]
    article = utils.tag_pos(article)
    for candidate in entity_extractor.candidates(article, preprocess = False):
        candidates.append(candidate)

In [121]:
scores = entity_extractor.score_phrases(candidates)

  scores = sums / token_counts


In [122]:
prediction_df = pd.DataFrame(scores, columns=['Phrase', 'Noun_Phrase_Flag', 'Score'])

In [123]:
prediction_df['Phrase_ID'] = prediction_df.index

In [124]:
prediction_df.head()

Unnamed: 0,Phrase,Noun_Phrase_Flag,Score,Phrase_ID
0,EU,True,5.455404,0
1,rejects,False,7.160152,1
2,German call,True,4.048864,2
3,to boycott,False,3.81318,3
4,British lamb,True,5.459553,4


In [125]:
prediction_df['Phrase'] = prediction_df['Phrase'].apply(lambda x: x.split())

In [126]:
prediction_df = prediction_df.explode('Phrase')

In [127]:
# punctuation isn't getting assigned a score, fill with zero for now
prediction_df['Score'] = prediction_df['Score'].fillna(0.0)

In [128]:
prediction_df = prediction_df.rename(columns={'Phrase': 'Predicted_Phrase'})

In [129]:
# TODO: Investigate why this is happening
prediction_df['Predicted_Phrase'] = prediction_df['Predicted_Phrase'].replace('``', '"')

In [130]:
# df_sample = train_df[train_df['Article_ID'] == 0]

In [131]:
eval_df = pd.concat((train_df, prediction_df.reset_index(drop=True)), axis=1)

In [135]:
assert len(train_df) == len(prediction_df)

In [137]:
# TODO: why are some CoNLL-2003 tokens NaN?
eval_df = eval_df.dropna(subset=['Token'])

In [138]:
assert (eval_df['Token'] == eval_df['Predicted_Phrase']).sum() == len(eval_df) 

In [139]:
eval_df['Predicted_Entity_Flag'] = eval_df['Noun_Phrase_Flag']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df['Predicted_Entity_Flag'] = eval_df['Noun_Phrase_Flag']


In [140]:
# baseline of just using noun phrases to identify entities (high recall, low precision)
print(classification_report(eval_df['NER_Tag_Flag'], eval_df['Predicted_Entity_Flag']))

              precision    recall  f1-score   support

       False       0.99      0.51      0.68    169575
        True       0.29      0.98      0.44     34043

    accuracy                           0.59    203618
   macro avg       0.64      0.74      0.56    203618
weighted avg       0.87      0.59      0.64    203618



In [151]:
median_threshold = eval_df['Score'].describe()['50%']

In [153]:
# use TF-IDF score
eval_df['Predicted_Entity_Flag_TFIDF_Median'] = (eval_df['Noun_Phrase_Flag'] & (eval_df['Score'] > median_threshold))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_df['Predicted_Entity_Flag_TFIDF_Median'] = (eval_df['Noun_Phrase_Flag'] & (eval_df['Score'] > median_threshold))


In [154]:
# using noun phrases and median TFIDF score to identify entities
print(classification_report(eval_df['NER_Tag_Flag'], eval_df['Predicted_Entity_Flag_TFIDF_Median']))

              precision    recall  f1-score   support

       False       0.95      0.69      0.80    169575
        True       0.34      0.81      0.48     34043

    accuracy                           0.71    203618
   macro avg       0.64      0.75      0.64    203618
weighted avg       0.85      0.71      0.74    203618



In [156]:
# optimize threshold to maximize macro f1
start = eval_df['Score'].describe()['min']
stop = eval_df['Score'].describe()['max']

In [163]:
# 0.25 increments
predictions = []
for thresh in np.arange(start, stop, step=0.25):
    predictions.append((thresh, (eval_df['Noun_Phrase_Flag'] & (eval_df['Score'] > thresh)).values))

In [183]:
macro_f1_scores = []
for prediction in predictions:
    report = classification_report(eval_df['NER_Tag_Flag'], prediction[1], output_dict=True)
    macro_f1_scores.append((prediction[0], report['True']['f1-score']))

In [184]:
max(macro_f1_scores, key=lambda x: x[1])

(4.5, 0.4838175288794013)

In [185]:
print(classification_report(eval_df['NER_Tag_Flag'], (eval_df['Noun_Phrase_Flag'] & (eval_df['Score'] > 4.5))))

              precision    recall  f1-score   support

       False       0.92      0.80      0.85    169575
        True       0.39      0.64      0.48     34043

    accuracy                           0.77    203618
   macro avg       0.65      0.72      0.67    203618
weighted avg       0.83      0.77      0.79    203618

