# Assignment 4: Named entity recognition

Построить модель для обнаружения и классификации именованных сущностей (named entities). На базе корпуса CoNLL 2002.  

Используйте в своем решении ансамбли над решающими деревьями: RandomForest, Gradient Boosting (xgboost, lightgbm, catboost) 
Tutorials:  
1. https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide
1. https://github.com/catboost/tutorials 


Чем больше baseline'ов вы превзойдете, тем выше ваша оценка
Метрика качества f1 (f1_macro) (чем выше, тем лучше)
 
baseline 1: 0.0604      random labels  
baseline 2: 0.3966      PoS features + logistic regression  
baseline 3: 0.8122      word2vec cbow embedding + baseline 2 + svm    

! Your results must be reproducible. Если ваша модель - стохастическая, то вы явно должны задавать все seed и random_state в параметрах моделей   

bonus, think about:  
1. How can you exploit that words belong to some sentence?
2. Why we selected f1 score with macro averaging as our classification quality measure? What other metrics are suitable?   

In [40]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import metrics
from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


SEED=1337

In [41]:
df = pd.read_csv('ner_short.csv', index_col=0)
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [12]:
df.shape

(66874, 12)

In [6]:
# number of sentences
df.sentence_idx.max()

1500.0

In [18]:
df['tag'][6]

'B-geo'

In [8]:
# class distribution
df.tag.value_counts(normalize=True )

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [42]:
# sentence length
tdf = df.set_index('sentence_idx')
tdf['length'] = df.groupby('sentence_idx').tag.count()
df = tdf.reset_index(drop=False)

Длина самих слов

In [43]:
df['word_lenght'] = df['word'].apply(len)

In [44]:
# encode categorial variables

le = LabelEncoder()
df['pos'] = le.fit_transform(df.pos)
df['next-pos'] = le.fit_transform(df['next-pos'])
df['next-next-pos'] = le.fit_transform(df['next-next-pos'])
df['prev-pos'] = le.fit_transform(df['prev-pos'])
df['prev-prev-pos'] = le.fit_transform(df['prev-prev-pos'])

In [5]:
df.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48


In [50]:
# splitting
y = LabelEncoder().fit_transform(df.tag)

df_train, df_test, y_train, y_test = model_selection.train_test_split(df.drop(['tag'], axis=1), y, stratify=y, 
                                                                      test_size=0.25, random_state=SEED, shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 50155
test 16719


## РЕШЕНИЕ

In [None]:
Построить модель для обнаружения и классификации именованных сущностей (named entities). 

In [7]:
from sklearn.tree import DecisionTreeClassifier

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
baseline 1: 0.0604 random labels
baseline 2: 0.3966 PoS features + logistic regression
baseline 3: 0.8122 word2vec cbow embedding + baseline 2 + svm

Чисто DecisionTree

In [45]:
columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

In [32]:

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', DecisionTreeClassifier(random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.7541294230304936
test 0.6065548557583169


## Bagging

На рандомных значениях уже получается побить первый и второй бейзлайны.

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
%%time
model = RandomForestClassifier(n_estimators=100, max_depth=10,
                             random_state=SEED)

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.39930991812212674
test 0.32715166932255213
CPU times: user 3.72 s, sys: 304 ms, total: 4.02 s
Wall time: 4.26 s


Попробуем подобрать параметры

In [11]:
params = [('gini', 100, None, None), ('gini', 150, None, None), 
          ('gini', 100, 10, None), ('entropy', 100, 10, None),
          ('entropy', 100, None, None), ('entropy', 150, None, None),
         ('entropy', 160, 20, None), ('gini', 160, 20, None)]

def grid(params, X_train, X_test, y_train, y_test):

    t_par = tqdm_notebook(params, desc='cv', leave=True)
    res = {}

    for param in t_par:

        model = RandomForestClassifier(criterion=param[0], n_estimators=param[1], 
                                       max_depth=param[2], max_features=param[3],
                                       random_state=SEED)
        model.fit(X_train, y_train)

        res[param] = metrics.f1_score(y_test, model.predict(X_test), average='macro')

    return sorted(res.items(), key=lambda kv: kv[1])

In [40]:
grid(params, df_train[columns], df_test[columns], y_train, y_test)

HBox(children=(IntProgress(value=0, description='cv', max=8), HTML(value='')))

[(('gini', 100, 10, None), 0.34258269375668465),
 (('entropy', 100, 10, None), 0.34679923281682984),
 (('entropy', 160, 20, None), 0.5854528179186669),
 (('entropy', 100, None, None), 0.5855549663506368),
 (('entropy', 150, None, None), 0.5860315511894434),
 (('gini', 160, 20, None), 0.5891794999496678),
 (('gini', 150, None, None), 0.589667605949408),
 (('gini', 100, None, None), 0.5908287922182608)]

Добавим значение длины

In [46]:
columns += ['length']

In [42]:
gridparams, df_train[columns], df_test[columns], y_train, y_test)

HBox(children=(IntProgress(value=0, description='cv', max=8), HTML(value='')))

[(('gini', 100, 10, None), 0.3573981596689184),
 (('entropy', 100, 10, None), 0.3597785998854501),
 (('gini', 160, 20, None), 0.768895126679233),
 (('entropy', 100, None, None), 0.7697471539274654),
 (('entropy', 160, 20, None), 0.7702924896260623),
 (('entropy', 150, None, None), 0.7706634706208975),
 (('gini', 100, None, None), 0.7767623140652997),
 (('gini', 150, None, None), 0.7770172310431464)]

Значения значительно выросли

Добавим длину слова

In [47]:
columns += ['word_lenght']

In [44]:
grid(params, df_train[columns], df_test[columns], y_train, y_test)

HBox(children=(IntProgress(value=0, description='cv', max=8), HTML(value='')))

[(('gini', 100, 10, None), 0.36689616656253665),
 (('entropy', 100, 10, None), 0.3853284912399867),
 (('entropy', 100, None, None), 0.8114340473931465),
 (('entropy', 160, 20, None), 0.8122191110249137),
 (('entropy', 150, None, None), 0.8131469171535812),
 (('gini', 160, 20, None), 0.819675026301672),
 (('gini', 100, None, None), 0.823682670569716),
 (('gini', 150, None, None), 0.823747286529106)]

Стало еще лучше, и мы побили третий бейзлайн!

In [38]:
params = [('gini', 40, None, 5), ('gini', 20, None, 5), 
          ('gini', 30, 10, 5), ('entropy', 10, 10, 5),
          ('entropy', 50, None, 5), ('entropy', 30, None, 5),
         ('entropy', 40, 20, 5), ('gini', 40, 20, 5)]

In [39]:
grid(params, df_train[columns], df_test[columns], y_train, y_test)

HBox(children=(IntProgress(value=0, description='cv', max=8), HTML(value='')))




[(('gini', 30, 10, 5), 0.36090765251399937),
 (('entropy', 10, 10, 5), 0.3774924040651807),
 (('gini', 20, None, 5), 0.7968103986463004),
 (('gini', 40, 20, 5), 0.7990773334704147),
 (('entropy', 40, 20, 5), 0.8121164875527964),
 (('entropy', 30, None, 5), 0.8126728039308379),
 (('entropy', 50, None, 5), 0.8160220867004018),
 (('gini', 40, None, 5), 0.8175458156853598)]

In [37]:
%%time
model = RandomForestClassifier(criterion='gini', n_estimators=40, 
                                       max_depth=None, max_features=5,
                                       random_state=SEED)

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.9903483228677615
test 0.8175458156853598
CPU times: user 2.61 s, sys: 112 ms, total: 2.72 s
Wall time: 2.73 s


## Gradient Boosting (xgboost)

In [48]:
import xgboost

Без указания параметров

In [14]:
model = xgboost.XGBClassifier(random_state=SEED)
model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.3656648506998212
test 0.33681206992843266


In [15]:
params = [(0.01, 5, 1), (0.01, 10, 1),
          (0.03, 5, 1), (0.03, 10, 1),
          (0.1, 5, 1),  (0.1, 10, 1),
          (0.2, 5, 1), (0.2, 10, 1)]

In [16]:
def grid_xgb(params, X_train, X_test, y_train, y_test):

    t_par = tqdm_notebook(params, desc='cv', leave=True)
    res = {}

    for param in t_par:

        model = xgboost.XGBClassifier(min_child_weight=param[2], max_depth=param[1],
                                      eta=param[0], random_state=SEED)
        model.fit(X_train, y_train)

        res[param] = metrics.f1_score(y_test, model.predict(X_test), average='macro')

    return sorted(res.items(), key=lambda kv: kv[1])
    

In [17]:
grid_xgb(params, df_train[columns], df_test[columns], y_train, y_test)

HBox(children=(IntProgress(value=0, description='cv', max=8), HTML(value='')))




[((0.01, 5, 1), 0.38464173022741166),
 ((0.03, 5, 1), 0.38464173022741166),
 ((0.1, 5, 1), 0.38464173022741166),
 ((0.2, 5, 1), 0.38464173022741166),
 ((0.01, 10, 1), 0.6643041434306864),
 ((0.03, 10, 1), 0.6643041434306864),
 ((0.1, 10, 1), 0.6643041434306864),
 ((0.2, 10, 1), 0.6643041434306864)]

In [18]:
params = [(0.01, 10, 2), (0.01, 10, 5),
          (0.03, 10, 2), (0.03, 10, 5),
          (0.1, 10, 2),  (0.1, 10, 5),
          (0.2, 10, 2), (0.2, 10, 5)]

In [19]:
grid_xgb(params, df_train[columns], df_test[columns], y_train, y_test)

HBox(children=(IntProgress(value=0, description='cv', max=8), HTML(value='')))




[((0.01, 10, 5), 0.4506142430442806),
 ((0.03, 10, 5), 0.4506142430442806),
 ((0.1, 10, 5), 0.4506142430442806),
 ((0.2, 10, 5), 0.4506142430442806),
 ((0.01, 10, 2), 0.5899193110417742),
 ((0.03, 10, 2), 0.5899193110417742),
 ((0.1, 10, 2), 0.5899193110417742),
 ((0.2, 10, 2), 0.5899193110417742)]

In [20]:
params = [(0.01, 15, 1), (0.01, 20, 1),
          (0.03, 15, 1), (0.03, 20, 1),
          (0.15, 15, 1),  (0.15, 20, 1),
          (0.3, 15, 1), (0.3, 20, 1)]

grid_xgb(params, df_train[columns], df_test[columns], y_train, y_test)

HBox(children=(IntProgress(value=0, description='cv', max=8), HTML(value='')))




[((0.01, 15, 1), 0.7913716980126662),
 ((0.03, 15, 1), 0.7913716980126662),
 ((0.15, 15, 1), 0.7913716980126662),
 ((0.3, 15, 1), 0.7913716980126662),
 ((0.01, 20, 1), 0.8223722486148932),
 ((0.03, 20, 1), 0.8223722486148932),
 ((0.15, 20, 1), 0.8223722486148932),
 ((0.3, 20, 1), 0.8223722486148932)]

In [53]:
model = xgboost.XGBClassifier(min_child_weight=1, max_depth=20,
                                      eta=0.3, random_state=SEED)
model.fit(df_train[columns], y_train)

metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro')

0.8223722486148932

##### How can you exploit that words belong to some sentence?

В предложении слова чаще всего стоят в определенном порядке, этот порядок может сильно помочь в определении именнованных сущностей, например, если это язык, в котором порядок слов четко фиксированный. Если у нас в предложении на первом месте, например, всегда стоит подлежащее (SVO), вероятность того, что именнованная сущность будет стоять на первом месте будет выше, чем на втором, где должен стоять глагол. 

##### Why we selected f1 score with macro averaging as our classification quality measure? What other metrics are suitable?

Так как f1 мера объединяет в себе precision и recall, можно сделать преположение, что мы еще можем использовать их для измерения качества. Мы используем macro averaging, а не например, binary, так как у нас multilabel targets. А macro мы используем потому, что хотим узнать, как система работает в целом по наборам данных, не учитывая дисбаланс меток. Accuracy здесь не подходходит именно потому, что у нас несбалансированные данные, так бы мы могли предсказывать просто самый частотный класс. 

In [23]:
# some wrappers to work with word2vec
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from collections import defaultdict

   
class Word2VecWrapper(TransformerMixin):
    def __init__(self, window=5,negative=5, size=100, iter=100, is_cbow=False, random_state=SEED):
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None
        self.random_state = random_state
        
    def get_size(self):
        return self.size_

    def fit(self, X, y=None):
        """
        X: list of strings
        """
        sentences_list = [x.split() for x in X]
        self.w2v = Word2Vec(sentences_list, 
                            window=self.window_,
                            negative=self.negative_, 
                            size=self.size_, 
                            iter=self.iter_,
                            sg=not self.is_cbow_, seed=self.random_state)

        return self
    
    def has(self, word):
        return word in self.w2v

    def transform(self, X):
        """
        X: a list of words
        """
        if self.w2v is None:
            raise Exception('model not fitted')
        return np.array([self.w2v[w] if w in self.w2v else np.zeros(self.size_) for w in X ])
    


In [10]:
%%time
# here we exploit that word2vec is an unsupervised learning algorithm
# so we can train it on the whole dataset (subject to discussion)

sentences_list = [x.strip() for x in ' '.join(df.word).split('.')]

w2v_cbow = Word2VecWrapper(window=5, negative=5, size=300, iter=300, is_cbow=True, random_state=SEED)
w2v_cbow.fit(sentences_list)

CPU times: user 24.1 s, sys: 304 ms, total: 24.4 s
Wall time: 10.2 s


In [12]:
%%time
# baseline 1 
# random labels
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyClassifier


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', DummyClassifier(random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))


train 0.05887736725599869
test 0.060439542712750365
CPU times: user 104 ms, sys: 13.3 ms, total: 118 ms
Wall time: 122 ms


In [16]:
%%time
# baseline 2 
# pos features + one hot encoding + logistic regression
from sklearn.preprocessing import OneHotEncoder


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', LogisticRegressionCV(Cs=5, cv=5, n_jobs=-1, scoring='f1_macro', 
                             penalty='l2', solver='newton-cg', multi_class='multinomial', random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.46639500282346874
test 0.39660981421559566
CPU times: user 3min 14s, sys: 13.1 s, total: 3min 27s
Wall time: 11min 30s


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [13]:
%%time
# baseline 3
# use word2vec cbow embedding + baseline 2 + svm
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
import scipy.sparse as sp

embeding = w2v_cbow
encoder_pos = OneHotEncoder()
X_train = sp.hstack([
    embeding.transform(df_train.word),
    embeding.transform(df_train['next-word']),
    embeding.transform(df_train['next-next-word']),
    embeding.transform(df_train['prev-word']),
    embeding.transform(df_train['prev-prev-word']),
    encoder_pos.fit_transform(df_train[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])
X_test = sp.hstack([
    embeding.transform(df_test.word),
    embeding.transform(df_test['next-word']),
    embeding.transform(df_test['next-next-word']),
    embeding.transform(df_test['prev-word']),
    embeding.transform(df_test['prev-prev-word']),
    encoder_pos.transform(df_test[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])

model = model_selection.GridSearchCV(LinearSVC(penalty='l2', multi_class='ovr', random_state=SEED), 
                                    {'C': np.logspace(-4, 0, 5)}, 
                                    cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  5.0min finished


train 0.95846030477073
test 0.8122113864662406
CPU times: user 1min 56s, sys: 5.51 s, total: 2min 1s
Wall time: 7min 2s
