In [4]:
import pandas as pd
import numpy as np

data = pd.read_csv('../../data/ner_dataset.csv', encoding='latin1')

In [5]:
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,,impact,NN,O
1048566,,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,,forces,NNS,O
1048569,,said,VBD,O
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O
1048574,,attack,NN,O


In [6]:
data = data.fillna(method="ffill")

In [7]:
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [9]:
words = list(set(data["Word"].values))

In [11]:
n_words = len(words); n_words

35178

In [12]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data 
        self.empty = False
        
    def get_next(self):
        try:
            s = self.data[self.data["Sentence #"] == "Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s["Word"].values.tolist(), s["POS"].values.tolist(), s["Tag"].values.tolist()
        except:
            self.empty = True
            return None, None, None

In [13]:
getter = SentenceGetter(data)

In [14]:
sent, pos, tag = getter.get_next()

In [15]:
print(sent); print(pos); print(tag)

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


## Memorization 
- simple baseline that remember common named entity for every word and predict that

In [27]:
from sklearn.base import BaseEstimator, TransformerMixin

class MemoryTagger(BaseEstimator, TransformerMixin):
    def fit(self, X, y):
        '''
        Expects a list of words as X and a list of tags as y.
        '''
        voc = {}
        self.tags = []
        for x, t in zip(X, y):
            if t not in self.tags:
                self.tags.append(t)
            if x in voc:
                if t in voc[x]:
                    voc[x][t] += 1
                else:
                    voc[x][t] = 1
            else:
                voc[x] = {t: 1}
                
        self.memory = {}
        for k, d in voc.items():
            self.memory[k] = max(d, key=d.get) #word의 tag중에서 가장 많이 나온 tag를 memorization
            
    def predict(self, X, y=None):
        '''
        Predict the tag from memory. If word is unkown, predict 'O'
        '''
        return [self.memory.get(x, 'O') for x in X]

In [28]:
tagger = MemoryTagger()

In [29]:
tagger.fit(sent, tag)

In [30]:
print(tagger.predict(sent))

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [31]:
tagger.tags

['O', 'B-geo', 'B-gpe']

In [40]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [34]:
words = data["Word"].values.tolist()
tags = data["Tag"].values.tolist()

In [43]:
pred = cross_val_predict(estimator=MemoryTagger(), X=words, y=tags, cv=5)

In [44]:
report = classification_report(y_pred=pred, y_true=tags)
print(report)
# precision quite well. but recall is poor. because we cannot perdict on words we don't know

              precision    recall  f1-score   support

       B-art       0.20      0.05      0.09       402
       B-eve       0.54      0.25      0.34       308
       B-geo       0.78      0.85      0.81     37644
       B-gpe       0.94      0.93      0.94     15870
       B-nat       0.42      0.28      0.33       201
       B-org       0.67      0.49      0.56     20143
       B-per       0.78      0.65      0.71     16990
       B-tim       0.87      0.77      0.82     20333
       I-art       0.04      0.01      0.01       297
       I-eve       0.39      0.12      0.18       253
       I-geo       0.73      0.58      0.65      7414
       I-gpe       0.62      0.45      0.52       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.69      0.53      0.60     16784
       I-per       0.73      0.65      0.69     17251
       I-tim       0.58      0.13      0.21      6528
           O       0.97      0.99      0.98    887908

    accuracy              

## A simple machine learning approach

In [45]:
from sklearn.ensemble import RandomForestClassifier

In [67]:
def feature_map(word):
    '''Simple feature map.'''
    return np.array([word.istitle(), word.islower(), word.isupper(), len(word), word.isdigit(),  word.isalpha()])

In [51]:
words = [feature_map(w) for w in data["Word"].values.tolist()]

In [52]:
words[:10]

[array([1, 0, 0, 9, 0, 1]),
 array([0, 1, 0, 2, 0, 1]),
 array([ 0,  1,  0, 13,  0,  1]),
 array([0, 1, 0, 4, 0, 1]),
 array([0, 1, 0, 7, 0, 1]),
 array([0, 1, 0, 7, 0, 1]),
 array([1, 0, 0, 6, 0, 1]),
 array([0, 1, 0, 2, 0, 1]),
 array([0, 1, 0, 7, 0, 1]),
 array([0, 1, 0, 3, 0, 1])]

In [55]:
%%time
pred = cross_val_predict(RandomForestClassifier(n_estimators=20), X=words, y=tags, cv=5)

CPU times: user 15min 48s, sys: 22.9 s, total: 16min 11s
Wall time: 55.6 s


In [56]:
report = classification_report(y_pred=pred, y_true=tags)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00       402
       B-eve       0.00      0.00      0.00       308
       B-geo       0.26      0.80      0.40     37644
       B-gpe       0.25      0.04      0.07     15870
       B-nat       0.00      0.00      0.00       201
       B-org       0.65      0.17      0.27     20143
       B-per       0.96      0.20      0.33     16990
       B-tim       0.29      0.32      0.30     20333
       I-art       0.00      0.00      0.00       297
       I-eve       0.00      0.00      0.00       253
       I-geo       0.00      0.00      0.00      7414
       I-gpe       0.00      0.00      0.00       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.36      0.03      0.06     16784
       I-per       0.47      0.02      0.04     17251
       I-tim       0.50      0.06      0.11      6528
           O       0.97      0.98      0.97    887908

    accuracy              

that looks really bad.\
since the features lack a lot of information necessary for the decision.\
So now we enhance our simple features on the one hand by memory and on the other hand by using context information.


In [79]:
from sklearn.preprocessing import LabelEncoder

class FeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.memory_tagger = MemoryTagger()
        self.tag_encoder = LabelEncoder()
        self.pos_encoder = LabelEncoder()
        
        
    def fit(self, X, y): 
        '''
        X: data[Sentence #, Word, POS, Tag]
        y: data['Tag']
        '''
        words = X["Word"].values.tolist()
        self.pos = X["POS"].values.tolist()
        tags = X["Tag"].values.tolist()
        self.memory_tagger.fit(words, tags)
        self.tag_encoder.fit(tags)
        self.pos_encoder.fit(self.pos)
        return self
    
    
    def transform(self, X, y=None):
        def pos_default(p):
            if p in self.pos:
                return self.pos_encoder.transform([p])[0]
            else:
                return -1
            
        pos = X["POS"].values.tolist()
        words = X["Word"].values.tolist()
        out = []
        for i in range(len(words)):
            w = words[i]
            p = pos[i]
            if i < len(words) - 1:
                wp = self.tag_encoder.transform(self.memory_tagger.predict([words[i+1]]))[0]
                posp = pos_default(pos[i+1])
            else:
                wp = self.tag_encoder.transform(['O'])[0]
                posp = pos_default(".")
            if i > 0:
                if words[i-1] != ".":
                    wm = self.tag_encoder.transform(self.memory_tagger.predict([words[i-1]]))[0]
                    posm = pos_default(".")
                    
                else:
                    wm = self.tag_encoder.transform(['O'])[0]
                    posm = pos_default(".")
            
            else:
                posm = pos_default(".")
                wm = self.tag_encoder.transform(['O'])[0]
            
            out.append(np.array([w.istitle(), w.islower(), w.isupper(), len(w), w.isdigit(), w.isalpha(),
                                self.tag_encoder.transform(self.memory_tagger.predict([w]))[0],
                                pos_default(p), wp, wm, posp, posm]))
        return out

In [80]:
from sklearn.pipeline import Pipeline

In [81]:
%%time
pred = cross_val_predict(Pipeline([("feature_map", FeatureTransformer()),
                                  ("clf", RandomForestClassifier(n_estimators=20, n_jobs=3))]),
                        X=data, y=tags, cv=5)

CPU times: user 35min 17s, sys: 2min 43s, total: 38min 1s
Wall time: 31min 22s


In [82]:
report = classification_report(y_pred=pred, y_true=tags)
print(report)

              precision    recall  f1-score   support

       B-art       0.16      0.07      0.09       402
       B-eve       0.41      0.26      0.32       308
       B-geo       0.82      0.86      0.84     37644
       B-gpe       0.98      0.93      0.95     15870
       B-nat       0.22      0.22      0.22       201
       B-org       0.71      0.62      0.66     20143
       B-per       0.80      0.76      0.78     16990
       B-tim       0.88      0.79      0.84     20333
       I-art       0.03      0.01      0.02       297
       I-eve       0.25      0.12      0.17       253
       I-geo       0.78      0.64      0.70      7414
       I-gpe       0.77      0.46      0.58       198
       I-nat       0.42      0.16      0.23        51
       I-org       0.72      0.64      0.68     16784
       I-per       0.86      0.72      0.78     17251
       I-tim       0.84      0.46      0.60      6528
           O       0.98      1.00      0.99    887908

    accuracy              