In [1]:
import pandas as pd
import numpy as np

In [10]:
# Load all dependencies for convenience
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [2]:
data_cnn = pd.read_csv("basetable_CNN.csv")
data_fox = pd.read_csv("basetable_Fox.csv")

In [95]:
data_fox.head()

Unnamed: 0.1,Unnamed: 0,doc_id,sentence_id,sentence,token_id,token,lemma,upos,xpos,Entity
0,1,doc1,1,iowa state university,1,iowa,iowa,NOUN,NN,Unassigned
1,2,doc1,1,iowa state university,2,state,state,NOUN,NN,Unassigned
2,3,doc1,1,iowa state university,3,university,university,NOUN,NN,Profession
3,4,doc2,1,long live britain america true patriots best west,1,long,long,ADJ,JJ,Unassigned
4,5,doc2,1,long live britain america true patriots best west,2,live,live,ADJ,JJ,Unassigned


## Approach 1 : Recognition based on memorization only

In here, we write a first simplistic model that will learn from the occurences of each word and their assigned entity.
<br>
The most popular entities for a word will be assigned to any new word when predicting.

### Model building

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class MemoryRecognizer(BaseEstimator, TransformerMixin):
    # Building a class based on the sklearn estimators model
    # This will allow to easily evaluate the model using sklearn built-in features

    
    def fit(self, X, y, display_progress=False):
        """Finds out the most occuring entity for each word
        Expects a pd.Series for X & y"""
        voc = {}
        self.entities = []
            
        total_words = len(X)
        progress = 0
        # For each word, establish a list of dictionaries with the number of times an entity is associated with the word
        for word, entity in zip(X,y):
            if display_progress == True:
                progress += 1
                if progress % 10000 == 0:
                    print(str(progress), "/", str(total_words), "words classified.")
            
            if entity not in self.entities:
                self.entities.append(entity)
            if word in voc:
                if entity in voc[word]:
                    voc[word][entity] += 1
                else:
                    voc[word][entity] = 1
            else:
                voc[word] = {entity :1}
            
            self.memory = {}
            for word, entities in voc.items():
                self.memory[word] = max(entities, key=entities.get)
        
    def predict(self, X, y=None):
        """Dictionary look-up"""
        return [self.memory.get(word, 'Unassigned') for word in X]

### Evaluation

Here, we will assess the model's performance in two ways.
<br> 
The first is getting a predicted value for each word in the cnn_data set, using cross_validated predicting.
<br>
This means the dataset is split in a k-fold fashion (5 fold here), and all the words pass through the prediction method at some point, as the testing (unseen) set.
This allows for evaluation metrics for the whole dataset while avoiding biased results.

<br> 
The second way to evaluate the model is to test it on the opposite data set (train on cnn, test on fox) or vice-versa.

In [97]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [98]:
words = data_fox.token.values.tolist()
entities = data_fox.Entity.values.tolist()

pred = cross_val_predict(estimator=MemoryRecognizer(), 
                         X=words, y=entities, cv=5, 
                         fit_params={"display_progress":True})

print(classification_report(y_pred = pred, y_true=entities))

10000 / 32459 words classified.
20000 / 32459 words classified.
30000 / 32459 words classified.
10000 / 32459 words classified.
20000 / 32459 words classified.
30000 / 32459 words classified.
10000 / 32459 words classified.
20000 / 32459 words classified.
30000 / 32459 words classified.
10000 / 32459 words classified.
20000 / 32459 words classified.
30000 / 32459 words classified.
10000 / 32460 words classified.
20000 / 32460 words classified.
30000 / 32460 words classified.
              precision    recall  f1-score   support

     Hobbies       1.00      0.96      0.98       713
Organization       1.00      0.92      0.96       193
 Personality       1.00      0.95      0.97       713
  Profession       1.00      0.98      0.99      2469
    Religion       1.00      0.91      0.95        34
  Unassigned       1.00      1.00      1.00     36452

 avg / total       1.00      1.00      1.00     40574



#### Training on CNN / Testing on Fox

In [99]:
recognizer = MemoryRecognizer()

words_cnn = data_cnn.token.values.tolist()
entities_cnn = data_cnn.Entity.values.tolist()

words_fox = data_fox.token.values.tolist()
entities_fox = data_fox.Entity.values.tolist()

recognizer.fit(X=words_cnn, y=entities_cnn, display_progress=True)
preds = recognizer.predict(X=words_fox)

print(classification_report(y_pred=preds, y_true=entities_fox))

10000 / 31356 words classified.
20000 / 31356 words classified.
30000 / 31356 words classified.
              precision    recall  f1-score   support

     Hobbies       1.00      0.96      0.98       713
Organization       1.00      0.85      0.92       193
 Personality       1.00      0.95      0.97       713
  Profession       1.00      0.97      0.99      2469
    Religion       1.00      1.00      1.00        34
  Unassigned       1.00      1.00      1.00     36452

 avg / total       1.00      1.00      1.00     40574



#### Training on Fox / Testing on CNN

In [100]:
recognizer = MemoryRecognizer()

words_fox = data_fox.token.values.tolist()
entities_fox = data_fox.Entity.values.tolist()

words_cnn = data_cnn.token.values.tolist()
entities_cnn = data_cnn.Entity.values.tolist()

recognizer.fit(X=words_fox, y=entities_fox, display_progress=True)
preds = recognizer.predict(X=words_cnn)

print(classification_report(y_pred=preds, y_true=entities_cnn))

10000 / 40574 words classified.
20000 / 40574 words classified.
30000 / 40574 words classified.
40000 / 40574 words classified.
              precision    recall  f1-score   support

     Hobbies       1.00      0.96      0.98       581
Organization       1.00      0.82      0.90       135
 Personality       1.00      0.96      0.98       478
  Profession       1.00      0.98      0.99      2308
    Religion       1.00      0.95      0.98        22
  Unassigned       1.00      1.00      1.00     27832

 avg / total       1.00      1.00      1.00     31356



## Approach 2 : Simple classification model

Here, we derive simple features from each word and use those to try to predict their entity type, using a Random Forest classifier.

In [101]:
from sklearn.ensemble import RandomForestClassifier

In [102]:
def get_features(word):
    return pd.Series([word.istitle(), word.islower(), 
                      len(word), word.isdigit(), word.isalpha()])

In [103]:
features_fox.head()

Unnamed: 0,istitle,islower,length,isdigit,isalpha
0,False,True,4,False,True
1,False,True,5,False,True
2,False,True,10,False,True
3,False,True,4,False,True
4,False,True,4,False,True


### Evaluation using cross-validated predictions (Fox dataset)

In [104]:
features_fox = data_fox.token.apply(get_features)
features_fox.columns = ["istitle", "islower", 
                      "length", "isdigit", "isalpha"]
preds = cross_val_predict(RandomForestClassifier(n_estimators=500),
                         X=features_fox, y=data_fox.Entity, cv=5)

print(classification_report(y_pred=preds, y_true=data_fox.Entity))

              precision    recall  f1-score   support

     Hobbies       0.00      0.00      0.00       713
Organization       0.00      0.00      0.00       193
 Personality       0.00      0.00      0.00       713
  Profession       0.00      0.00      0.00      2469
    Religion       0.00      0.00      0.00        34
  Unassigned       0.90      1.00      0.95     36452

 avg / total       0.81      0.90      0.85     40574



  'precision', 'predicted', average, warn_for)


#### Training on Fox / Testing on CNN

In [105]:
X_fox = data_fox.token.apply(get_features)
X_fox.columns = ["istitle", "islower", 
                      "length", "isdigit", "isalpha"]

X_cnn = data_cnn.token.apply(get_features)
X_cnn.columns = ["istitle", "islower", 
                      "length", "isdigit", "isalpha"]

In [106]:
rfc = RandomForestClassifier()
rfc.fit(X=X_fox, y=data_fox.Entity)

preds = rfc.predict(X_cnn)
print(classification_report(y_pred=preds, y_true=data_cnn.Entity))

              precision    recall  f1-score   support

     Hobbies       0.00      0.00      0.00       581
Organization       0.00      0.00      0.00       135
 Personality       0.00      0.00      0.00       478
  Profession       0.00      0.00      0.00      2308
    Religion       0.00      0.00      0.00        22
  Unassigned       0.89      1.00      0.94     27832

 avg / total       0.79      0.89      0.83     31356



  'precision', 'predicted', average, warn_for)


#### Training on CNN / Testing on Fox

In [107]:
rfc = RandomForestClassifier()
rfc.fit(X=X_cnn, y=data_cnn.Entity)

preds = rfc.predict(X_fox)
print(classification_report(y_pred=preds, y_true=data_fox.Entity))

              precision    recall  f1-score   support

     Hobbies       0.00      0.00      0.00       713
Organization       0.00      0.00      0.00       193
 Personality       0.00      0.00      0.00       713
  Profession       0.00      0.00      0.00      2469
    Religion       0.00      0.00      0.00        34
  Unassigned       0.90      1.00      0.95     36452

 avg / total       0.81      0.90      0.85     40574



  'precision', 'predicted', average, warn_for)


The results using a simple classification method are terrible. 

## Approach 3 : Combination of featurization, memorization & context information

For this final model, we combine an approach based on memory of previously seen words, word featurization, context information, and classify the resulting observations using a Random Forest classifier. 
The memory-based part is obtained by learning the most common entity identifications for each word. Once it is trained, if a new word has already been seen, it will be classified as it was in the past. 


In addition to this major feature, other information are drawn from the word itself. It includes whether the word contains uppercase, lowercase letters, whether it is in the title format, how many characters it is made of and whether there is digits within the word. The part of speech tag is also retrieved.
These features are also retrieved for the preceding word and the following one, and combined to form a feature space.

Finally, these variables are used to predict the named entity using a random forest classifier.

### Model building

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin

class Featurizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.memory_tagger = MemoryRecognizer()
        self.tag_encoder = LabelEncoder()
        self.pos_encoder = LabelEncoder()
        
    def fit(self, X, y):
        self.pos = X.upos.values.tolist()
        self.memory_tagger.fit(X.token, y)
        entities = X.Entity.values.tolist()
        self.pos_encoder.fit(self.pos)
        self.tag_encoder.fit(X.Entity)
        return self
    
    def transform(self, X, y=None):
        def pos_default(p):
            if p in self.pos:
                return self.pos_encoder.transform([p])[0]
            else:
                return -1
        
        pos = X.upos.values.tolist()
        words = X.token.values.tolist()
        out = []
        for i in range(len(words)):
            w = words[i]
            p = pos[i]
            if i < len(words) - 1:
                wp = self.tag_encoder.transform(self.memory_tagger.predict([words[i+1]]))
                posp = pos_default(pos[i+1])
            else:
                wp = self.tag_encoder.transform(['Unassigned'])[0]
                posp = pos_default(".")
                
            if i > 0:
                if words[i-1] != ".":
                    wm = self.tag_encoder.transform(self.memory_tagger.predict([words[i-1]]))[0]
                    posm = pos_default(pos[i-1])
                    
                else:
                    wm = self.tag_encoder.transform(["Unassigned"])[0]
                    posm = pos_default(".")
                    
            else:
                posm = pos_default(".")
                wm = self.tag_encoder.transform(["Unassigned"])[0]
                
            out.append(np.array([w.istitle(), w.islower(), w.isupper(), len(w), w.isdigit(), w.isalpha(),
                                 self.tag_encoder.transform(self.memory_tagger.predict([w]))[0],
                                 pos_default(p), wp, wm, posp, posm]))
        
        return out

### Evaluation using cross-validated predictions (Fox)

In [109]:
from sklearn.pipeline import Pipeline
X_fox = data_fox[["token", "upos", "Entity"]]

y_fox = data_fox.Entity.values.tolist()

pred = cross_val_predict(Pipeline([("feature_map", Featurizer()), 
                                   ("clf", RandomForestClassifier(n_estimators=20, n_jobs=3))]),
                         X=data_fox, y=y_fox, cv=5)

In [110]:
print(classification_report(y_pred=pred, y_true=y_fox))

              precision    recall  f1-score   support

     Hobbies       0.99      0.95      0.97       713
Organization       0.98      0.82      0.90       193
 Personality       0.99      0.95      0.97       713
  Profession       0.99      0.98      0.98      2469
    Religion       1.00      0.47      0.64        34
  Unassigned       1.00      1.00      1.00     36452

 avg / total       1.00      1.00      1.00     40574



##### Data preparation

In [4]:
X_fox = data_fox[["token", "upos", "Entity"]]
y_fox = data_fox.Entity.values.tolist()

X_cnn = data_cnn[["token", "upos", "Entity"]]
y_cnn = data_cnn.Entity.values.tolist()

#### Training on Fox / Testing on CNN

In [114]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([("feature_map", Featurizer()), 
                ("clf", RandomForestClassifier(n_estimators=20, n_jobs=3))])
pipe = pipe.fit(X_fox, y_fox)

pred_cnn = pipe.predict(X_cnn)

In [116]:
print(classification_report(y_pred=pred_cnn, y_true=y_cnn))

              precision    recall  f1-score   support

     Hobbies       1.00      0.95      0.98       581
Organization       0.96      0.80      0.87       135
 Personality       0.99      0.95      0.97       478
  Profession       1.00      0.98      0.99      2308
    Religion       1.00      0.68      0.81        22
  Unassigned       1.00      1.00      1.00     27832

 avg / total       1.00      1.00      1.00     31356



#### Training on CNN / Testing on Fox

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline([("feature_map", Featurizer()), 
                ("clf", RandomForestClassifier(n_estimators=20, n_jobs=3))])
pipe = pipe.fit(X_fox, y_fox)

pred_cnn = pipe.predict(X_cnn)

In [11]:
print(classification_report(y_pred=pred_cnn, y_true=y_cnn))

              precision    recall  f1-score   support

     Hobbies       1.00      0.96      0.98       581
Organization       1.00      0.82      0.90       135
 Personality       1.00      0.95      0.98       478
  Profession       1.00      0.98      0.99      2308
    Religion       1.00      0.95      0.98        22
  Unassigned       1.00      1.00      1.00     27832

 avg / total       1.00      1.00      1.00     31356



##### Outputting results in a standard format for reporting

In [48]:
data1 = data_cnn[["doc_id", "token", "upos", "Entity"]]
data2 = pd.DataFrame(pred_cnn, columns=["label"])
output_results = pd.concat([data1, data2], axis=1,ignore_index=True)
output_results.columns = [["doc_id", "token", "upos", "Entity", "label"]]
output_results.head()

Unnamed: 0,doc_id,token,upos,Entity,label
0,doc1,optimistic,ADJ,Personality,Personality
1,doc2,fav,VERB,Unassigned,Unassigned
2,doc2,youtuber,ADP,Unassigned,Unassigned
3,doc2,moesargi,NOUN,Unassigned,Unassigned
4,doc2,fav,NOUN,Unassigned,Unassigned


In [49]:
output_results.to_csv("Random Forest Fox Predictions.csv")

## Studying the decision-making of the Random Forest Classifier

In [57]:
# Intelligible list of variable names
vars = ["word.istitle", "word.islower", "word.isupper", "length", "word.isdigit", "word.isalpha",
"word.memoryEntity", "word.postag", "nextword.memoryEntity", "prevword.memoryEntity",
"nextword.postag", "prevword.postag"]

vars_importances = pipe.named_steps.clf.feature_importances_

var_ranking = []
for var, imp in zip(vars, vars_importances):
    var_ranking.append([var, imp])
df = pd.DataFrame(var_ranking, columns=["Variable", "Importance"]).sort_values("Importance", ascending=False)
df.head()

Unnamed: 0,Variable,Importance
6,word.memoryEntity,0.916774
7,word.postag,0.035291
3,length,0.024707
9,prevword.memoryEntity,0.006488
8,nextword.memoryEntity,0.006335
