# Assignment 4: Named entity recognition

Построить модель для обнаружения и классификации именованных сущностей (named entities) на базе корпуса CoNLL 2002.  

Используйте в своем решении ансамбли над решающими деревьями: RandomForest, Gradient Boosting (xgboost, lightgbm, catboost) 
Tutorials:  
1. https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide
1. https://github.com/catboost/tutorials 


Чем больше baseline'ов вы превзойдете, тем выше ваша оценка.  
Метрика качества: f1 (f1_macro) (чем выше, тем лучше).
 
baseline 1: 0.0604      random labels  
baseline 2: 0.3966      PoS features + logistic regression  
baseline 3: 0.8122      word2vec cbow embedding + baseline 2 + svm    

! Your results must be reproducible. Если ваша модель - стохастическая, то вы явно должны задавать все seed и random_state в параметрах моделей   

bonus, think about:  
1. How can you exploit that words belong to some sentence?
2. Why did we select the f1 score with macro averaging as our classification quality measure? What other metrics are suitable?   

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


SEED=1337

In [4]:
# Load the token-level table; the first CSV column becomes the row index.
df = pd.read_csv('ner_short.csv', index_col=0)
# Preview the first ten rows.
df.head(10)

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O
5,TO,to,NNP,London,IN,VBN,VBP,have,marched,1.0,through,O
6,VB,protest,TO,to,NNP,IN,VBN,marched,through,1.0,London,B-geo
7,DT,the,VB,protest,TO,NNP,IN,through,London,1.0,to,O
8,NN,war,DT,the,VB,TO,NNP,London,to,1.0,protest,O
9,IN,in,NN,war,DT,VB,TO,to,protest,1.0,the,O


In [5]:
# number of sentences
# (largest sentence id; equals the sentence count only if ids run 1..N without gaps)
df.sentence_idx.max()

1500.0

In [24]:
# class distribution
# Share of token rows carrying each tag (normalize=True yields fractions, not counts).
df.tag.value_counts(normalize=True)

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [7]:
# sentence length
# Count the tagged tokens of every sentence, then attach that count to each
# token row of the sentence.
tokens_per_sentence = df.groupby('sentence_idx')['tag'].count()
tdf = df.set_index('sentence_idx')
# Index-aligned assignment: each row receives the count stored under its own sentence_idx.
tdf['length'] = tokens_per_sentence
# Move sentence_idx back into the columns (it becomes the first column).
df = tdf.reset_index()

In [8]:
# Preview the sentence-indexed frame with the new `length` column.
tdf.head()

Unnamed: 0_level_0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
sentence_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1.0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,Thousands,O,48
1.0,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,of,O,48
1.0,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,demonstrators,O,48
1.0,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,have,O,48
1.0,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,marched,O,48


In [9]:
# encode categorical variables

# Replace every POS-tag column with integer codes. One encoder object is refit
# per column, so each column gets its own mapping and `le` ends up holding the
# mapping of the last column processed.
le = LabelEncoder()
for pos_column in ('pos', 'next-pos', 'next-next-pos', 'prev-pos', 'prev-prev-pos'):
    df[pos_column] = le.fit_transform(df[pos_column])

In [10]:
# Preview the frame after the POS columns were replaced by integer codes.
df.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48


In [11]:
# splitting
# Integer-encode the target tags, then hold out 25% of the token rows.
# The split is stratified by tag and seeded, and it is made over individual
# token rows, so tokens of one sentence can land in both parts.
y = LabelEncoder().fit_transform(df.tag)

split_parts = model_selection.train_test_split(
    df, y, test_size=0.25, stratify=y, shuffle=True, random_state=SEED,
)
df_train, df_test, y_train, y_test = split_parts

for part_name, part_frame in (('train', df_train), ('test', df_test)):
    print(part_name, part_frame.shape[0])

train 50155
test 16719


In [12]:
# some wrappers to work with word2vec
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from collections import defaultdict

   
class Word2VecWrapper(TransformerMixin):
    """Scikit-learn style transformer around a gensim ``Word2Vec`` model.

    ``fit`` trains the embedding on whitespace-tokenised sentences and
    ``transform`` maps individual words to their vectors. Words that are not
    in the trained vocabulary are mapped to all-zero vectors.
    """

    def __init__(self, window=5,negative=5, size=100, iter=100, is_cbow=False, random_state=SEED):
        """Store the Word2Vec hyperparameters; no training happens here.

        window: context window size passed to Word2Vec.
        negative: number of negative samples passed to Word2Vec.
        size: dimensionality of the word vectors.
        iter: number of training iterations passed to Word2Vec.
        is_cbow: train CBOW when True, skip-gram when False.
        random_state: seed passed to Word2Vec.
        """
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None  # set by fit()
        self.random_state = random_state

    def get_size(self):
        """Return the dimensionality of the word vectors."""
        return self.size_

    def _vectors(self):
        """Return the trained keyed vectors; raise if ``fit`` was not called."""
        if self.w2v is None:
            # RuntimeError is still caught by callers that catch Exception.
            raise RuntimeError('model not fitted')
        # Query the vectors through `.wv`: indexing the model object itself is
        # the deprecated access path in gensim.
        return self.w2v.wv

    def fit(self, X, y=None):
        """Train the embedding.

        X: list of strings; each string is one sentence and is split on
           whitespace into tokens.
        y: ignored, present for scikit-learn API compatibility.

        Returns self.
        """
        sentences_list = [x.split() for x in X]
        self.w2v = Word2Vec(sentences_list,
                            window=self.window_,
                            negative=self.negative_,
                            size=self.size_,
                            iter=self.iter_,
                            sg=int(not self.is_cbow_),  # 1 = skip-gram, 0 = CBOW
                            seed=self.random_state)

        return self

    def has(self, word):
        """Return True if `word` is in the trained vocabulary.

        Raises RuntimeError if the model has not been fitted.
        """
        return word in self._vectors()

    def transform(self, X):
        """Map words to their embedding vectors.

        X: a list of words

        Returns an array with one row per word; unknown words get a zero
        vector of length `size`. An empty input yields an array of shape
        (0, size).

        Raises RuntimeError if the model has not been fitted.
        """
        vectors = self._vectors()
        rows = [vectors[w] if w in vectors else np.zeros(self.size_) for w in X]
        if not rows:
            # Keep the result two-dimensional even when there is nothing to embed.
            return np.zeros((0, self.size_))
        return np.array(rows)
    


In [13]:
%%time
# here we exploit that word2vec is an unsupervised learning algorithm
# so we can train it on the whole dataset (subject to discussion)

# Rebuild "sentences" by joining all tokens with spaces and cutting the text at every '.'.
# NOTE(review): this also cuts at any dot that occurs inside a token and ignores the
# sentence_idx column; grouping words by sentence_idx would follow the annotated
# boundaries. Left as is so the scores recorded below stay reproducible.
sentences_list = [x.strip() for x in ' '.join(df.word).split('.')]

# CBOW embedding (is_cbow=True) with 300-dimensional vectors, seeded for reproducibility.
w2v_cbow = Word2VecWrapper(window=5, negative=5, size=300, iter=300, is_cbow=True, random_state=SEED)
w2v_cbow.fit(sentences_list)

Wall time: 34 s


In [14]:
%%time
# baseline 1 
# random labels
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyClassifier


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', DummyClassifier(random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))


train 0.05887736725599869
test 0.060439542712750365
Wall time: 335 ms


In [15]:
%%time
# baseline 2 
# pos features + one hot encoding + logistic regression
from sklearn.preprocessing import OneHotEncoder


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', LogisticRegressionCV(Cs=5, cv=5, n_jobs=-1, scoring='f1_macro', 
                             penalty='l2', solver='newton-cg', multi_class='multinomial', random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.46639500282346874
test 0.39660981421559566
Wall time: 2h 29min 28s


In [16]:
%%time
# baseline 3
# use word2vec cbow embedding + baseline 2 + svm
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
import scipy.sparse as sp

embeding = w2v_cbow
encoder_pos = OneHotEncoder()
X_train = sp.hstack([
    embeding.transform(df_train.word),
    embeding.transform(df_train['next-word']),
    embeding.transform(df_train['next-next-word']),
    embeding.transform(df_train['prev-word']),
    embeding.transform(df_train['prev-prev-word']),
    encoder_pos.fit_transform(df_train[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])
X_test = sp.hstack([
    embeding.transform(df_test.word),
    embeding.transform(df_test['next-word']),
    embeding.transform(df_test['next-next-word']),
    embeding.transform(df_test['prev-word']),
    embeding.transform(df_test['prev-prev-word']),
    encoder_pos.transform(df_test[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])

model = model_selection.GridSearchCV(LinearSVC(penalty='l2', multi_class='ovr', random_state=SEED), 
                                    {'C': np.logspace(-4, 0, 5)}, 
                                    cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 14.2min finished


train 0.9568881501474211
test 0.8093016932978506
Wall time: 17min 49s


### Попробуем random forest classifier

Возьмём сразу те данные, которые с word2vec cbow embeddings, потому что, как видно из сравнения baseline 2 и baseline 3, это повышает f1 score.

In [17]:
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

In [18]:
# Random forest with library-default hyperparameters (the repr printed after
# fitting lists n_estimators=10), seeded and using all CPU cores.
clf = RandomForestClassifier(n_jobs=-1, random_state=SEED)

In [19]:
%%time
# Fit the forest on the stacked embedding + one-hot POS features built for baseline 3.
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=1337, verbose=0,
            warm_start=False)

In [20]:
%%time
print('train', metrics.f1_score(y_train, clf.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, clf.predict(X_test), average='macro'))

train 0.9850458602901792
test 0.821452120112657
Wall time: 6.59 s


*Ура, бейзлайн побит*

### Теперь попробуем градиентный бустинг

In [28]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [22]:
# Wrap both splits as LightGBM Dataset objects; the eval set references the training set.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# I ended up using the fit method rather than train, so these were never needed

In [25]:
# number of classes for parameters in lgb
# (number of distinct tags in the full dataset)
len(df.tag.value_counts(normalize=True))

17

In [32]:
# Parameter dictionary in the style of the native lgb.train API.
# NOTE(review): no later cell visible here references `params`; the models below
# are configured through LGBMClassifier keyword arguments instead.
params = {'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'num_class': 17,  # distinct tags, as counted in the previous cell
          'learning_rate': 0.03,
          'num_leaves': 40,
          'feature_fraction': 0.5,
          'bagging_fraction': 0.3,
          'reg_alpha': 0.15,
          'reg_lambda': 0.15,
          'seed': SEED}

In [39]:
# First gradient-boosting model: multiclass GBDT over the 17 tags.
# The seed is passed as the extra keyword `seed`; the estimator repr printed after
# fitting shows random_state=None alongside seed=1337.
gbm = LGBMClassifier(boosting_type='gbdt',
                     objective='multiclass',
                     num_class=17,
                     learning_rate=0.03,
                     num_leaves=40,
                     seed=SEED)

In [41]:
%%time
# train
# The held-out test split doubles as the early-stopping validation set here,
# so the test score reported for this model does not come from untouched data.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=5)  # stop once the validation metric has not improved for 5 rounds

[1]	valid_0's multi_logloss: 0.651774
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 0.600979
[3]	valid_0's multi_logloss: 0.574088
[4]	valid_0's multi_logloss: 0.535768
[5]	valid_0's multi_logloss: 0.510486
[6]	valid_0's multi_logloss: 0.490365
[7]	valid_0's multi_logloss: 0.479085
[8]	valid_0's multi_logloss: 0.452253
[9]	valid_0's multi_logloss: 0.436436
[10]	valid_0's multi_logloss: 0.428979
[11]	valid_0's multi_logloss: 0.410815
[12]	valid_0's multi_logloss: 0.39742
[13]	valid_0's multi_logloss: 0.385544
[14]	valid_0's multi_logloss: 0.374343
[15]	valid_0's multi_logloss: 0.363954
[16]	valid_0's multi_logloss: 0.354038
[17]	valid_0's multi_logloss: 0.344688
[18]	valid_0's multi_logloss: 0.335916
[19]	valid_0's multi_logloss: 0.327443
[20]	valid_0's multi_logloss: 0.31944
[21]	valid_0's multi_logloss: 0.311866
[22]	valid_0's multi_logloss: 0.304594
[23]	valid_0's multi_logloss: 0.297608
[24]	valid_0's multi_logloss: 0.290889
[25]	valid_0's

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.03, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_class=17, num_leaves=40,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, seed=1337, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [43]:
%%time
print('Starting predicting...')
# eval
print('train', metrics.f1_score(y_train, gbm.predict(X_train, num_iteration=100), average='macro'))
print('test', metrics.f1_score(y_test, gbm.predict(X_test, num_iteration=100), average='macro'))

Starting predicting...
train 0.9800463804961659
test 0.8268352127815337
Wall time: 17.9 s


0.827 — это, конечно, немного лучше, чем 0.822. Но мне не нравится, что в fit использовались данные теста для валидации, поэтому попробуем обойтись без этого и добавить больше параметров. Например, bagging и т.д.

In [54]:
# Second model: more leaves (45) plus feature and row subsampling settings.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes effect
# when bagging_freq > 0; the fitted repr below shows subsample_freq=0, so row
# bagging is probably not active here — confirm before relying on it.
gbm2 = LGBMClassifier(boosting_type='gbdt',
                      objective='multiclass',
                      num_class=17,
                      learning_rate=0.03,
                      num_leaves=45,
                      feature_fraction=0.5,
                      bagging_fraction=0.3,
                      seed=SEED)

In [55]:
%%time
# train
# No eval_set this time: the test split is not seen during fitting.
gbm2.fit(X_train, y_train)

Wall time: 23min 43s


LGBMClassifier(bagging_fraction=0.3, boosting_type='gbdt', class_weight=None,
        colsample_bytree=1.0, feature_fraction=0.5,
        importance_type='split', learning_rate=0.03, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_class=17, num_leaves=45,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, seed=1337, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [56]:
%%time
print('Starting predicting...')
# eval
print('train', metrics.f1_score(y_train, gbm2.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, gbm2.predict(X_test), average='macro'))

Starting predicting...
train 0.9810162197763342
test 0.8421804050931931
Wall time: 39.8 s


Ура, f1 score 0.84. Это выше и бейзлайна, и предыдущих двух методов.

Поиграв еще немного с параметром количества листьев, я получила модель еще получше.

In [57]:
# Third model: same settings as gbm2 except num_leaves is raised from 45 to 60.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes effect
# when bagging_freq > 0; the fitted repr below shows subsample_freq=0, so row
# bagging is probably not active here — confirm before relying on it.
gbm3 = LGBMClassifier(boosting_type='gbdt',
                      objective='multiclass',
                      num_class=17,
                      learning_rate=0.03,
                      num_leaves=60,
                      feature_fraction=0.5,
                      bagging_fraction=0.3,
                      seed=SEED)

In [58]:
%%time
# train
# No eval_set: the test split is not seen during fitting.
gbm3.fit(X_train, y_train)

Wall time: 18min 24s


LGBMClassifier(bagging_fraction=0.3, boosting_type='gbdt', class_weight=None,
        colsample_bytree=1.0, feature_fraction=0.5,
        importance_type='split', learning_rate=0.03, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_class=17, num_leaves=60,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, seed=1337, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [59]:
%%time
print('Starting predicting...')
# eval
print('train', metrics.f1_score(y_train, gbm3.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, gbm3.predict(X_test), average='macro'))

Starting predicting...
train 0.986631846426279
test 0.8534014454489132
Wall time: 20.9 s
