# Assignment 4: Named entity recognition

Построить модель для обнаружения и классификации именованных сущностей (named entities). На базе корпуса CoNLL 2002.  

Используйте в своем решении ансамбли над решающими деревьями: RandomForest, Gradient Boosting (xgboost, lightgbm, catboost) 
Tutorials:  
1. https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide
1. https://github.com/catboost/tutorials 


Чем больше baseline'ов вы превзойдете, тем выше ваша оценка
Метрика качества f1 (f1_macro) (чем выше, тем лучше)
 
baseline 1: 0.0604      random labels  
baseline 2: 0.3966      PoS features + logistic regression  
baseline 3: 0.8122      word2vec cbow embedding + baseline 2 + svm    

! Your results must be reproducible. Если ваша модель - стохастическая, то вы явно должны задавать все seed и random_state в параметрах моделей   

bonus, think about:  
1. How can you exploit that words belong to some sentence?
2. Why we selected f1 score with macro averaging as our classification quality measure? What other metrics are suitable?   

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


SEED=1337

In [4]:
df = pd.read_csv('ner_short.csv', index_col=0)
df.head(10)

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O
5,TO,to,NNP,London,IN,VBN,VBP,have,marched,1.0,through,O
6,VB,protest,TO,to,NNP,IN,VBN,marched,through,1.0,London,B-geo
7,DT,the,VB,protest,TO,NNP,IN,through,London,1.0,to,O
8,NN,war,DT,the,VB,TO,NNP,London,to,1.0,protest,O
9,IN,in,NN,war,DT,VB,TO,to,protest,1.0,the,O


In [5]:
# number of sentences
df.sentence_idx.max()

1500.0

In [24]:
# class distribution
df.tag.value_counts(normalize=True)

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [7]:
# sentence length
tdf = df.set_index('sentence_idx')
tdf['length'] = df.groupby('sentence_idx').tag.count()
df = tdf.reset_index(drop=False)

In [8]:
tdf.head()

Unnamed: 0_level_0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
sentence_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1.0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,Thousands,O,48
1.0,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,of,O,48
1.0,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,demonstrators,O,48
1.0,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,have,O,48
1.0,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,marched,O,48


In [9]:
# encode categorial variables

le = LabelEncoder()
df['pos'] = le.fit_transform(df.pos)
df['next-pos'] = le.fit_transform(df['next-pos'])
df['next-next-pos'] = le.fit_transform(df['next-next-pos'])
df['prev-pos'] = le.fit_transform(df['prev-pos'])
df['prev-prev-pos'] = le.fit_transform(df['prev-prev-pos'])

In [10]:
df.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48


In [11]:
# splitting
y = LabelEncoder().fit_transform(df.tag)

df_train, df_test, y_train, y_test = model_selection.train_test_split(df, y, stratify=y, 
                                                                      test_size=0.25, random_state=SEED, shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 50155
test 16719


In [12]:
# some wrappers to work with word2vec
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from collections import defaultdict

   
class Word2VecWrapper(TransformerMixin):
    def __init__(self, window=5,negative=5, size=100, iter=100, is_cbow=False, random_state=SEED):
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None
        self.random_state = random_state
        
    def get_size(self):
        return self.size_

    def fit(self, X, y=None):
        """
        X: list of strings
        """
        sentences_list = [x.split() for x in X]
        self.w2v = Word2Vec(sentences_list, 
                            window=self.window_,
                            negative=self.negative_, 
                            size=self.size_, 
                            iter=self.iter_,
                            sg=not self.is_cbow_, seed=self.random_state)

        return self
    
    def has(self, word):
        return word in self.w2v

    def transform(self, X):
        """
        X: a list of words
        """
        if self.w2v is None:
            raise Exception('model not fitted')
        return np.array([self.w2v[w] if w in self.w2v else np.zeros(self.size_) for w in X ])
    


In [13]:
%%time
# here we exploit that word2vec is an unsupervised learning algorithm
# so we can train it on the whole dataset (subject to discussion)

sentences_list = [x.strip() for x in ' '.join(df.word).split('.')]

w2v_cbow = Word2VecWrapper(window=5, negative=5, size=300, iter=300, is_cbow=True, random_state=SEED)
w2v_cbow.fit(sentences_list)

Wall time: 34 s


In [14]:
%%time
# baseline 1 
# random labels
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyClassifier


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', DummyClassifier(random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))


train 0.05887736725599869
test 0.060439542712750365
Wall time: 335 ms


In [15]:
%%time
# baseline 2 
# pos features + one hot encoding + logistic regression
from sklearn.preprocessing import OneHotEncoder


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', LogisticRegressionCV(Cs=5, cv=5, n_jobs=-1, scoring='f1_macro', 
                             penalty='l2', solver='newton-cg', multi_class='multinomial', random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.46639500282346874
test 0.39660981421559566
Wall time: 2h 29min 28s


In [16]:
%%time
# baseline 3
# use word2vec cbow embedding + baseline 2 + svm
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
import scipy.sparse as sp

embeding = w2v_cbow
encoder_pos = OneHotEncoder()
X_train = sp.hstack([
    embeding.transform(df_train.word),
    embeding.transform(df_train['next-word']),
    embeding.transform(df_train['next-next-word']),
    embeding.transform(df_train['prev-word']),
    embeding.transform(df_train['prev-prev-word']),
    encoder_pos.fit_transform(df_train[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])
X_test = sp.hstack([
    embeding.transform(df_test.word),
    embeding.transform(df_test['next-word']),
    embeding.transform(df_test['next-next-word']),
    embeding.transform(df_test['prev-word']),
    embeding.transform(df_test['prev-prev-word']),
    encoder_pos.transform(df_test[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])

model = model_selection.GridSearchCV(LinearSVC(penalty='l2', multi_class='ovr', random_state=SEED), 
                                    {'C': np.logspace(-4, 0, 5)}, 
                                    cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 14.2min finished


train 0.9568881501474211
test 0.8093016932978506
Wall time: 17min 49s


### Попробуем random forest classifier

Возьмём сразу те данные, которые с word2vec cbow embeddings, потому что как видно на сравнении baseline 2 и baseline 3 - это повышает f1 score.

In [17]:
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

In [18]:
clf = RandomForestClassifier(n_jobs=-1, random_state=SEED)

In [19]:
%%time
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=1337, verbose=0,
            warm_start=False)

In [20]:
%%time
print('train', metrics.f1_score(y_train, clf.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, clf.predict(X_test), average='macro'))

train 0.9850458602901792
test 0.821452120112657
Wall time: 6.59 s


*Ура, бейзлайн побит*

### Теперь попробуем градиентный бустинг

In [28]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [22]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# я в итоге стала использовать метод fit, а не train, поэтому это не пригодилось

In [25]:
# number of classes for parameters in lgb
len(df.tag.value_counts(normalize=True))

17

In [39]:
gbm = LGBMClassifier(boosting_type='gbdt',
                     objective='multiclass',
                     num_class=17,
                     learning_rate=0.03,
                     num_leaves=40,
                     seed=SEED)

In [41]:
%%time
# train
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=5)

[1]	valid_0's multi_logloss: 0.651774
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's multi_logloss: 0.600979
[3]	valid_0's multi_logloss: 0.574088
[4]	valid_0's multi_logloss: 0.535768
[5]	valid_0's multi_logloss: 0.510486
[6]	valid_0's multi_logloss: 0.490365
[7]	valid_0's multi_logloss: 0.479085
[8]	valid_0's multi_logloss: 0.452253
[9]	valid_0's multi_logloss: 0.436436
[10]	valid_0's multi_logloss: 0.428979
[11]	valid_0's multi_logloss: 0.410815
[12]	valid_0's multi_logloss: 0.39742
[13]	valid_0's multi_logloss: 0.385544
[14]	valid_0's multi_logloss: 0.374343
[15]	valid_0's multi_logloss: 0.363954
[16]	valid_0's multi_logloss: 0.354038
[17]	valid_0's multi_logloss: 0.344688
[18]	valid_0's multi_logloss: 0.335916
[19]	valid_0's multi_logloss: 0.327443
[20]	valid_0's multi_logloss: 0.31944
[21]	valid_0's multi_logloss: 0.311866
[22]	valid_0's multi_logloss: 0.304594
[23]	valid_0's multi_logloss: 0.297608
[24]	valid_0's multi_logloss: 0.290889
[25]	valid_0's

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.03, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_class=17, num_leaves=40,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, seed=1337, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [43]:
%%time
print('Starting predicting...')
# eval
print('train', metrics.f1_score(y_train, gbm.predict(X_train, num_iteration=100), average='macro'))
print('test', metrics.f1_score(y_test, gbm.predict(X_test, num_iteration=100), average='macro'))

Starting predicting...
train 0.9800463804961659
test 0.8268352127815337
Wall time: 17.9 s


0.827 это конечно немного лучше, чем 0.822. Но мне не нравится, что в fit использовались данные теста для валидации, поэтому попробуем обойтись без этого и добавить больше параметров. Например, bagging и т.д.

In [54]:
gbm2 = LGBMClassifier(boosting_type='gbdt',
                      objective='multiclass',
                      num_class=17,
                      learning_rate=0.03,
                      num_leaves=45,
                      feature_fraction=0.5,
                      bagging_fraction=0.3,
                      seed=SEED)

In [55]:
%%time
# train
gbm2.fit(X_train, y_train)

Wall time: 23min 43s


LGBMClassifier(bagging_fraction=0.3, boosting_type='gbdt', class_weight=None,
        colsample_bytree=1.0, feature_fraction=0.5,
        importance_type='split', learning_rate=0.03, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_class=17, num_leaves=45,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, seed=1337, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [56]:
%%time
print('Starting predicting...')
# eval
print('train', metrics.f1_score(y_train, gbm2.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, gbm2.predict(X_test), average='macro'))

Starting predicting...
train 0.9810162197763342
test 0.8421804050931931
Wall time: 39.8 s


Ура, f1 score 0.84. Это выше и бейзлайна и предыдущих двух методов.

Поиграв еще немного с парметром количество листьев, я получила модель еще получше.

In [57]:
gbm3 = LGBMClassifier(boosting_type='gbdt',
                      objective='multiclass',
                      num_class=17,
                      learning_rate=0.03,
                      num_leaves=60,
                      feature_fraction=0.5,
                      bagging_fraction=0.3,
                      seed=SEED)

In [58]:
%%time
# train
gbm3.fit(X_train, y_train)

Wall time: 18min 24s


LGBMClassifier(bagging_fraction=0.3, boosting_type='gbdt', class_weight=None,
        colsample_bytree=1.0, feature_fraction=0.5,
        importance_type='split', learning_rate=0.03, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_class=17, num_leaves=60,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, seed=1337, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [59]:
%%time
print('Starting predicting...')
# eval
print('train', metrics.f1_score(y_train, gbm3.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, gbm3.predict(X_test), average='macro'))

Starting predicting...
train 0.986631846426279
test 0.8534014454489132
Wall time: 20.9 s


### Попробуем catboost

In [60]:
from catboost import CatBoostClassifier, Pool, cv

In [62]:
cat = CatBoostClassifier(random_seed=SEED,
                         depth=2,
                         loss_function='MultiClass')

In [65]:
%%time
cat.fit(df_train[columns], y_train)

0:	learn: -2.4422040	total: 250ms	remaining: 4m 9s
1:	learn: -2.2019679	total: 368ms	remaining: 3m 3s
2:	learn: -2.0255858	total: 505ms	remaining: 2m 47s
3:	learn: -1.8859962	total: 624ms	remaining: 2m 35s
4:	learn: -1.7706699	total: 788ms	remaining: 2m 36s
5:	learn: -1.6726621	total: 921ms	remaining: 2m 32s
6:	learn: -1.5876849	total: 1.07s	remaining: 2m 32s
7:	learn: -1.5128906	total: 1.19s	remaining: 2m 27s
8:	learn: -1.4462847	total: 1.32s	remaining: 2m 25s
9:	learn: -1.3864124	total: 1.44s	remaining: 2m 23s
10:	learn: -1.3321784	total: 1.59s	remaining: 2m 22s
11:	learn: -1.2827358	total: 1.71s	remaining: 2m 20s
12:	learn: -1.2374161	total: 1.84s	remaining: 2m 19s
13:	learn: -1.1956817	total: 1.95s	remaining: 2m 17s
14:	learn: -1.1570934	total: 2.08s	remaining: 2m 16s
15:	learn: -1.1212877	total: 2.2s	remaining: 2m 15s
16:	learn: -1.0879603	total: 2.32s	remaining: 2m 14s
17:	learn: -1.0568537	total: 2.45s	remaining: 2m 13s
18:	learn: -1.0277478	total: 2.58s	remaining: 2m 13s
19:	le

158:	learn: -0.3677486	total: 21.4s	remaining: 1m 53s
159:	learn: -0.3663047	total: 21.6s	remaining: 1m 53s
160:	learn: -0.3659489	total: 21.7s	remaining: 1m 53s
161:	learn: -0.3655853	total: 22s	remaining: 1m 53s
162:	learn: -0.3650402	total: 22.2s	remaining: 1m 54s
163:	learn: -0.3643894	total: 22.4s	remaining: 1m 54s
164:	learn: -0.3640357	total: 22.6s	remaining: 1m 54s
165:	learn: -0.3636213	total: 22.9s	remaining: 1m 55s
166:	learn: -0.3622644	total: 23.1s	remaining: 1m 55s
167:	learn: -0.3619513	total: 23.3s	remaining: 1m 55s
168:	learn: -0.3614441	total: 23.4s	remaining: 1m 55s
169:	learn: -0.3606342	total: 23.5s	remaining: 1m 54s
170:	learn: -0.3602544	total: 23.6s	remaining: 1m 54s
171:	learn: -0.3589721	total: 23.8s	remaining: 1m 54s
172:	learn: -0.3577412	total: 23.9s	remaining: 1m 54s
173:	learn: -0.3573126	total: 24s	remaining: 1m 53s
174:	learn: -0.3569939	total: 24.1s	remaining: 1m 53s
175:	learn: -0.3566885	total: 24.3s	remaining: 1m 53s
176:	learn: -0.3564160	total: 24

311:	learn: -0.3142287	total: 42s	remaining: 1m 32s
312:	learn: -0.3140911	total: 42.1s	remaining: 1m 32s
313:	learn: -0.3139311	total: 42.2s	remaining: 1m 32s
314:	learn: -0.3137198	total: 42.4s	remaining: 1m 32s
315:	learn: -0.3135323	total: 42.5s	remaining: 1m 31s
316:	learn: -0.3133364	total: 42.6s	remaining: 1m 31s
317:	learn: -0.3131857	total: 42.7s	remaining: 1m 31s
318:	learn: -0.3130340	total: 42.8s	remaining: 1m 31s
319:	learn: -0.3126357	total: 43s	remaining: 1m 31s
320:	learn: -0.3123906	total: 43.1s	remaining: 1m 31s
321:	learn: -0.3122806	total: 43.2s	remaining: 1m 30s
322:	learn: -0.3118415	total: 43.3s	remaining: 1m 30s
323:	learn: -0.3116694	total: 43.4s	remaining: 1m 30s
324:	learn: -0.3114411	total: 43.5s	remaining: 1m 30s
325:	learn: -0.3112617	total: 43.7s	remaining: 1m 30s
326:	learn: -0.3111974	total: 43.8s	remaining: 1m 30s
327:	learn: -0.3110601	total: 43.9s	remaining: 1m 29s
328:	learn: -0.3108510	total: 44s	remaining: 1m 29s
329:	learn: -0.3106335	total: 44.1

465:	learn: -0.2927284	total: 1m 1s	remaining: 1m 10s
466:	learn: -0.2926750	total: 1m 1s	remaining: 1m 10s
467:	learn: -0.2926022	total: 1m 1s	remaining: 1m 10s
468:	learn: -0.2925639	total: 1m 1s	remaining: 1m 10s
469:	learn: -0.2925400	total: 1m 1s	remaining: 1m 9s
470:	learn: -0.2924179	total: 1m 2s	remaining: 1m 9s
471:	learn: -0.2923265	total: 1m 2s	remaining: 1m 9s
472:	learn: -0.2921817	total: 1m 2s	remaining: 1m 9s
473:	learn: -0.2921311	total: 1m 2s	remaining: 1m 9s
474:	learn: -0.2920891	total: 1m 2s	remaining: 1m 9s
475:	learn: -0.2920435	total: 1m 2s	remaining: 1m 8s
476:	learn: -0.2919933	total: 1m 2s	remaining: 1m 8s
477:	learn: -0.2919061	total: 1m 2s	remaining: 1m 8s
478:	learn: -0.2918658	total: 1m 3s	remaining: 1m 8s
479:	learn: -0.2918440	total: 1m 3s	remaining: 1m 8s
480:	learn: -0.2917480	total: 1m 3s	remaining: 1m 8s
481:	learn: -0.2916992	total: 1m 3s	remaining: 1m 8s
482:	learn: -0.2914419	total: 1m 3s	remaining: 1m 7s
483:	learn: -0.2913458	total: 1m 3s	remain

619:	learn: -0.2815545	total: 1m 21s	remaining: 49.9s
620:	learn: -0.2815226	total: 1m 21s	remaining: 49.7s
621:	learn: -0.2814671	total: 1m 21s	remaining: 49.6s
622:	learn: -0.2814403	total: 1m 21s	remaining: 49.5s
623:	learn: -0.2813797	total: 1m 21s	remaining: 49.4s
624:	learn: -0.2812929	total: 1m 22s	remaining: 49.3s
625:	learn: -0.2812587	total: 1m 22s	remaining: 49.2s
626:	learn: -0.2812204	total: 1m 22s	remaining: 49s
627:	learn: -0.2811960	total: 1m 22s	remaining: 48.9s
628:	learn: -0.2810793	total: 1m 22s	remaining: 48.8s
629:	learn: -0.2810039	total: 1m 22s	remaining: 48.7s
630:	learn: -0.2809804	total: 1m 23s	remaining: 48.6s
631:	learn: -0.2809301	total: 1m 23s	remaining: 48.5s
632:	learn: -0.2808780	total: 1m 23s	remaining: 48.3s
633:	learn: -0.2807729	total: 1m 23s	remaining: 48.2s
634:	learn: -0.2807227	total: 1m 23s	remaining: 48s
635:	learn: -0.2805340	total: 1m 23s	remaining: 47.9s
636:	learn: -0.2805102	total: 1m 23s	remaining: 47.8s
637:	learn: -0.2804621	total: 1m

772:	learn: -0.2741172	total: 1m 40s	remaining: 29.5s
773:	learn: -0.2740628	total: 1m 40s	remaining: 29.4s
774:	learn: -0.2740131	total: 1m 40s	remaining: 29.2s
775:	learn: -0.2739793	total: 1m 40s	remaining: 29.1s
776:	learn: -0.2739185	total: 1m 41s	remaining: 29s
777:	learn: -0.2738633	total: 1m 41s	remaining: 28.9s
778:	learn: -0.2738317	total: 1m 41s	remaining: 28.7s
779:	learn: -0.2737856	total: 1m 41s	remaining: 28.6s
780:	learn: -0.2737353	total: 1m 41s	remaining: 28.5s
781:	learn: -0.2737043	total: 1m 41s	remaining: 28.4s
782:	learn: -0.2736732	total: 1m 41s	remaining: 28.2s
783:	learn: -0.2736633	total: 1m 42s	remaining: 28.1s
784:	learn: -0.2736134	total: 1m 42s	remaining: 28s
785:	learn: -0.2735667	total: 1m 42s	remaining: 27.8s
786:	learn: -0.2735599	total: 1m 42s	remaining: 27.7s
787:	learn: -0.2735123	total: 1m 42s	remaining: 27.6s
788:	learn: -0.2734881	total: 1m 42s	remaining: 27.5s
789:	learn: -0.2734299	total: 1m 42s	remaining: 27.3s
790:	learn: -0.2734132	total: 1m

926:	learn: -0.2688085	total: 2m	remaining: 9.47s
927:	learn: -0.2687843	total: 2m	remaining: 9.34s
928:	learn: -0.2687625	total: 2m	remaining: 9.21s
929:	learn: -0.2687346	total: 2m	remaining: 9.08s
930:	learn: -0.2686757	total: 2m	remaining: 8.95s
931:	learn: -0.2686699	total: 2m	remaining: 8.82s
932:	learn: -0.2686424	total: 2m 1s	remaining: 8.69s
933:	learn: -0.2685666	total: 2m 1s	remaining: 8.56s
934:	learn: -0.2684964	total: 2m 1s	remaining: 8.43s
935:	learn: -0.2684392	total: 2m 1s	remaining: 8.3s
936:	learn: -0.2684086	total: 2m 1s	remaining: 8.17s
937:	learn: -0.2683856	total: 2m 1s	remaining: 8.04s
938:	learn: -0.2683641	total: 2m 1s	remaining: 7.91s
939:	learn: -0.2683322	total: 2m 1s	remaining: 7.78s
940:	learn: -0.2682832	total: 2m 1s	remaining: 7.65s
941:	learn: -0.2682332	total: 2m 2s	remaining: 7.52s
942:	learn: -0.2682096	total: 2m 2s	remaining: 7.39s
943:	learn: -0.2681790	total: 2m 2s	remaining: 7.26s
944:	learn: -0.2681472	total: 2m 2s	remaining: 7.13s
945:	learn: 

<catboost.core.CatBoostClassifier at 0x21964ef1a90>

In [67]:
%%time
print('Starting predicting...')
# eval
print('train', metrics.f1_score(y_train, cat.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, cat.predict(df_test[columns]), average='macro'))

Starting predicting...
train 0.29978423173517954
test 0.28731485224709036
Wall time: 559 ms


Тут получилось плохо, потому что я использовала данные без эмбеддингов. Но это лучше, чем первый бейзлайн.

In [69]:
cat = CatBoostClassifier(random_seed=SEED,
                          depth=2,
                          loss_function='MultiClass')

In [72]:
X_train.toarray()

array([[ 2.67694783,  1.82457614, -0.02699823, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.61131239, -1.02338362, -1.01135623, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.81724876,  0.03102222, -0.31903329, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.34841216, -1.23699093,  0.28605303, ...,  0.        ,
         0.        ,  0.        ],
       [-0.86828572, -0.11770303,  0.16762055, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.5446353 , -1.2318145 , -0.07649441, ...,  0.        ,
         0.        ,  0.        ]])

In [73]:
%%time
cat.fit(X_train.toarray(), y_train)

0:	learn: -2.4369752	total: 2.53s	remaining: 42m 7s
1:	learn: -2.1921323	total: 5.94s	remaining: 49m 22s
2:	learn: -2.0118647	total: 8.08s	remaining: 44m 44s
3:	learn: -1.8689676	total: 10.1s	remaining: 42m 6s
4:	learn: -1.7507913	total: 12.1s	remaining: 40m 16s
5:	learn: -1.6503004	total: 14.2s	remaining: 39m 11s
6:	learn: -1.5631393	total: 16.8s	remaining: 39m 41s
7:	learn: -1.4864086	total: 19.4s	remaining: 40m 4s
8:	learn: -1.4180737	total: 22.5s	remaining: 41m 16s
9:	learn: -1.3566483	total: 25.3s	remaining: 41m 47s
10:	learn: -1.3010118	total: 28.3s	remaining: 42m 21s
11:	learn: -1.2502970	total: 31.2s	remaining: 42m 46s
12:	learn: -1.2026253	total: 33.7s	remaining: 42m 35s
13:	learn: -1.1591311	total: 36.1s	remaining: 42m 25s
14:	learn: -1.1191471	total: 38.7s	remaining: 42m 22s
15:	learn: -1.0822404	total: 41.6s	remaining: 42m 39s
16:	learn: -1.0477819	total: 44.5s	remaining: 42m 55s
17:	learn: -1.0157219	total: 47.1s	remaining: 42m 51s
18:	learn: -0.9858426	total: 49.6s	remain

150:	learn: -0.3201331	total: 5m 23s	remaining: 30m 21s
151:	learn: -0.3199062	total: 5m 25s	remaining: 30m 16s
152:	learn: -0.3192748	total: 5m 27s	remaining: 30m 12s
153:	learn: -0.3187549	total: 5m 29s	remaining: 30m 8s
154:	learn: -0.3183352	total: 5m 30s	remaining: 30m 3s
155:	learn: -0.3179813	total: 5m 32s	remaining: 29m 59s
156:	learn: -0.3174143	total: 5m 34s	remaining: 29m 54s
157:	learn: -0.3171070	total: 5m 36s	remaining: 29m 50s
158:	learn: -0.3161655	total: 5m 37s	remaining: 29m 46s
159:	learn: -0.3158085	total: 5m 39s	remaining: 29m 42s
160:	learn: -0.3156277	total: 5m 41s	remaining: 29m 37s
161:	learn: -0.3154289	total: 5m 42s	remaining: 29m 33s
162:	learn: -0.3146792	total: 5m 44s	remaining: 29m 29s
163:	learn: -0.3142899	total: 5m 46s	remaining: 29m 26s
164:	learn: -0.3137959	total: 5m 48s	remaining: 29m 22s
165:	learn: -0.3133215	total: 5m 50s	remaining: 29m 18s
166:	learn: -0.3130338	total: 5m 51s	remaining: 29m 14s
167:	learn: -0.3128719	total: 5m 53s	remaining: 29

298:	learn: -0.2795572	total: 9m 41s	remaining: 22m 43s
299:	learn: -0.2793551	total: 9m 43s	remaining: 22m 41s
300:	learn: -0.2793011	total: 9m 44s	remaining: 22m 38s
301:	learn: -0.2792263	total: 9m 46s	remaining: 22m 35s
302:	learn: -0.2789885	total: 9m 48s	remaining: 22m 33s
303:	learn: -0.2788913	total: 9m 50s	remaining: 22m 30s
304:	learn: -0.2788276	total: 9m 51s	remaining: 22m 28s
305:	learn: -0.2786405	total: 9m 53s	remaining: 22m 25s
306:	learn: -0.2784539	total: 9m 55s	remaining: 22m 23s
307:	learn: -0.2782732	total: 9m 56s	remaining: 22m 20s
308:	learn: -0.2777856	total: 9m 58s	remaining: 22m 18s
309:	learn: -0.2773605	total: 10m	remaining: 22m 16s
310:	learn: -0.2772358	total: 10m 2s	remaining: 22m 13s
311:	learn: -0.2770450	total: 10m 3s	remaining: 22m 11s
312:	learn: -0.2769524	total: 10m 5s	remaining: 22m 8s
313:	learn: -0.2767445	total: 10m 7s	remaining: 22m 6s
314:	learn: -0.2764326	total: 10m 8s	remaining: 22m 4s
315:	learn: -0.2762560	total: 10m 10s	remaining: 22m 1

443:	learn: -0.2563255	total: 14m 2s	remaining: 17m 35s
444:	learn: -0.2561434	total: 14m 4s	remaining: 17m 33s
445:	learn: -0.2560099	total: 14m 6s	remaining: 17m 31s
446:	learn: -0.2558414	total: 14m 8s	remaining: 17m 29s
447:	learn: -0.2557436	total: 14m 10s	remaining: 17m 27s
448:	learn: -0.2555871	total: 14m 12s	remaining: 17m 25s
449:	learn: -0.2554690	total: 14m 13s	remaining: 17m 23s
450:	learn: -0.2552469	total: 14m 15s	remaining: 17m 21s
451:	learn: -0.2551095	total: 14m 17s	remaining: 17m 19s
452:	learn: -0.2550793	total: 14m 19s	remaining: 17m 17s
453:	learn: -0.2548922	total: 14m 21s	remaining: 17m 15s
454:	learn: -0.2547906	total: 14m 22s	remaining: 17m 13s
455:	learn: -0.2543523	total: 14m 24s	remaining: 17m 11s
456:	learn: -0.2540613	total: 14m 27s	remaining: 17m 10s
457:	learn: -0.2539376	total: 14m 29s	remaining: 17m 8s
458:	learn: -0.2538021	total: 14m 31s	remaining: 17m 6s
459:	learn: -0.2535157	total: 14m 32s	remaining: 17m 4s
460:	learn: -0.2534114	total: 14m 34s	

588:	learn: -0.2385031	total: 18m 53s	remaining: 13m 11s
589:	learn: -0.2384209	total: 18m 55s	remaining: 13m 9s
590:	learn: -0.2383389	total: 18m 57s	remaining: 13m 7s
591:	learn: -0.2383036	total: 18m 59s	remaining: 13m 5s
592:	learn: -0.2381721	total: 19m	remaining: 13m 2s
593:	learn: -0.2380479	total: 19m 2s	remaining: 13m
594:	learn: -0.2378997	total: 19m 4s	remaining: 12m 58s
595:	learn: -0.2378066	total: 19m 5s	remaining: 12m 56s
596:	learn: -0.2377254	total: 19m 7s	remaining: 12m 54s
597:	learn: -0.2376110	total: 19m 9s	remaining: 12m 52s
598:	learn: -0.2374217	total: 19m 11s	remaining: 12m 50s
599:	learn: -0.2373167	total: 19m 13s	remaining: 12m 48s
600:	learn: -0.2372101	total: 19m 15s	remaining: 12m 46s
601:	learn: -0.2370886	total: 19m 16s	remaining: 12m 44s
602:	learn: -0.2369563	total: 19m 18s	remaining: 12m 43s
603:	learn: -0.2369363	total: 19m 20s	remaining: 12m 41s
604:	learn: -0.2368405	total: 19m 22s	remaining: 12m 39s
605:	learn: -0.2366988	total: 19m 24s	remaining:

734:	learn: -0.2255076	total: 23m 23s	remaining: 8m 26s
735:	learn: -0.2254484	total: 23m 25s	remaining: 8m 24s
736:	learn: -0.2253993	total: 23m 27s	remaining: 8m 22s
737:	learn: -0.2253482	total: 23m 29s	remaining: 8m 20s
738:	learn: -0.2252581	total: 23m 31s	remaining: 8m 18s
739:	learn: -0.2252230	total: 23m 32s	remaining: 8m 16s
740:	learn: -0.2250990	total: 23m 35s	remaining: 8m 14s
741:	learn: -0.2250499	total: 23m 37s	remaining: 8m 12s
742:	learn: -0.2249814	total: 23m 39s	remaining: 8m 10s
743:	learn: -0.2249592	total: 23m 40s	remaining: 8m 8s
744:	learn: -0.2249321	total: 23m 42s	remaining: 8m 6s
745:	learn: -0.2248415	total: 23m 44s	remaining: 8m 5s
746:	learn: -0.2247780	total: 23m 46s	remaining: 8m 3s
747:	learn: -0.2247170	total: 23m 48s	remaining: 8m 1s
748:	learn: -0.2246644	total: 23m 50s	remaining: 7m 59s
749:	learn: -0.2245856	total: 23m 52s	remaining: 7m 57s
750:	learn: -0.2245514	total: 23m 54s	remaining: 7m 55s
751:	learn: -0.2244106	total: 23m 56s	remaining: 7m 5

882:	learn: -0.2163839	total: 28m 18s	remaining: 3m 45s
883:	learn: -0.2163515	total: 28m 20s	remaining: 3m 43s
884:	learn: -0.2162582	total: 28m 23s	remaining: 3m 41s
885:	learn: -0.2161895	total: 28m 25s	remaining: 3m 39s
886:	learn: -0.2160758	total: 28m 26s	remaining: 3m 37s
887:	learn: -0.2159930	total: 28m 28s	remaining: 3m 35s
888:	learn: -0.2159735	total: 28m 30s	remaining: 3m 33s
889:	learn: -0.2159421	total: 28m 33s	remaining: 3m 31s
890:	learn: -0.2159087	total: 28m 35s	remaining: 3m 29s
891:	learn: -0.2158835	total: 28m 37s	remaining: 3m 27s
892:	learn: -0.2158454	total: 28m 39s	remaining: 3m 25s
893:	learn: -0.2158092	total: 28m 41s	remaining: 3m 24s
894:	learn: -0.2157840	total: 28m 43s	remaining: 3m 22s
895:	learn: -0.2157611	total: 28m 44s	remaining: 3m 20s
896:	learn: -0.2156868	total: 28m 46s	remaining: 3m 18s
897:	learn: -0.2156176	total: 28m 48s	remaining: 3m 16s
898:	learn: -0.2155222	total: 28m 50s	remaining: 3m 14s
899:	learn: -0.2154780	total: 28m 52s	remaining:

<catboost.core.CatBoostClassifier at 0x21964ef1a90>

In [74]:
%%time
print('Starting predicting...')
# eval
print('train', metrics.f1_score(y_train, cat.predict(X_train.toarray()), average='macro'))
print('test', metrics.f1_score(y_test, cat.predict(X_test.toarray()), average='macro'))

Starting predicting...
train 0.43051089263212056
test 0.39245543877309014
Wall time: 28.5 s


Это почти как второй бейзлайн, значит, параметры всё еще не очень. Но эмбеддинги немного помогли.

In [78]:
cat2 = CatBoostClassifier(random_seed=SEED,
                          depth=6,
                          learning_rate=0.3,  # add
                          classes_count=17,  # add
                          bagging_temperature=0.3,  # add
                          iterations=500,  # add
                          border_count=32,  # add
                          l2_leaf_reg=3,  #add
                          loss_function='MultiClass')

In [79]:
%%time
cat2.fit(X_train.toarray(), y_train)

0:	learn: -0.4375691	total: 10.7s	remaining: 1h 28m 46s
1:	learn: -0.3700690	total: 20.3s	remaining: 1h 24m 5s
2:	learn: -0.3331821	total: 29.5s	remaining: 1h 21m 34s
3:	learn: -0.3081472	total: 38.7s	remaining: 1h 19m 55s
4:	learn: -0.2896297	total: 47.8s	remaining: 1h 18m 49s
5:	learn: -0.2738268	total: 56.9s	remaining: 1h 18m 1s
6:	learn: -0.2585876	total: 1m 5s	remaining: 1h 17m 18s
7:	learn: -0.2494050	total: 1m 15s	remaining: 1h 17m 1s
8:	learn: -0.2413401	total: 1m 23s	remaining: 1h 16m 17s
9:	learn: -0.2338735	total: 1m 33s	remaining: 1h 16m 5s
10:	learn: -0.2297306	total: 1m 41s	remaining: 1h 15m 31s
11:	learn: -0.2256173	total: 1m 51s	remaining: 1h 15m 42s
12:	learn: -0.2196938	total: 2m 1s	remaining: 1h 15m 42s
13:	learn: -0.2166538	total: 2m 10s	remaining: 1h 15m 27s
14:	learn: -0.2109004	total: 2m 19s	remaining: 1h 15m 11s
15:	learn: -0.2072597	total: 2m 28s	remaining: 1h 14m 59s
16:	learn: -0.2035319	total: 2m 37s	remaining: 1h 14m 48s
17:	learn: -0.2008290	total: 2m 47s	

144:	learn: -0.0824904	total: 22m 37s	remaining: 55m 23s
145:	learn: -0.0823275	total: 22m 46s	remaining: 55m 12s
146:	learn: -0.0819062	total: 22m 54s	remaining: 55m 1s
147:	learn: -0.0815477	total: 23m 3s	remaining: 54m 51s
148:	learn: -0.0808520	total: 23m 12s	remaining: 54m 40s
149:	learn: -0.0805698	total: 23m 21s	remaining: 54m 29s
150:	learn: -0.0805376	total: 23m 30s	remaining: 54m 18s
151:	learn: -0.0798514	total: 23m 39s	remaining: 54m 10s
152:	learn: -0.0796068	total: 23m 48s	remaining: 53m 59s
153:	learn: -0.0794391	total: 23m 57s	remaining: 53m 48s
154:	learn: -0.0788847	total: 24m 5s	remaining: 53m 37s
155:	learn: -0.0787496	total: 24m 14s	remaining: 53m 28s
156:	learn: -0.0780934	total: 24m 23s	remaining: 53m 17s
157:	learn: -0.0778845	total: 24m 32s	remaining: 53m 7s
158:	learn: -0.0776208	total: 24m 41s	remaining: 52m 57s
159:	learn: -0.0774162	total: 24m 50s	remaining: 52m 47s
160:	learn: -0.0772249	total: 24m 59s	remaining: 52m 37s
161:	learn: -0.0766963	total: 25m 8

289:	learn: -0.0472743	total: 46m 27s	remaining: 33m 38s
290:	learn: -0.0470477	total: 46m 36s	remaining: 33m 28s
291:	learn: -0.0470301	total: 46m 45s	remaining: 33m 18s
292:	learn: -0.0469258	total: 46m 58s	remaining: 33m 11s
293:	learn: -0.0467109	total: 47m 9s	remaining: 33m 2s
294:	learn: -0.0466621	total: 47m 21s	remaining: 32m 54s
295:	learn: -0.0465846	total: 47m 32s	remaining: 32m 45s
296:	learn: -0.0463378	total: 47m 42s	remaining: 32m 36s
297:	learn: -0.0462627	total: 47m 54s	remaining: 32m 28s
298:	learn: -0.0462202	total: 48m 8s	remaining: 32m 21s
299:	learn: -0.0461937	total: 48m 21s	remaining: 32m 14s
300:	learn: -0.0461251	total: 48m 34s	remaining: 32m 7s
301:	learn: -0.0458505	total: 48m 52s	remaining: 32m 2s
302:	learn: -0.0455856	total: 49m 9s	remaining: 31m 57s
303:	learn: -0.0455419	total: 49m 25s	remaining: 31m 51s
304:	learn: -0.0454766	total: 49m 42s	remaining: 31m 46s
305:	learn: -0.0454068	total: 49m 59s	remaining: 31m 41s
306:	learn: -0.0453697	total: 50m 16s

431:	learn: -0.0314404	total: 1h 21m 51s	remaining: 12m 53s
432:	learn: -0.0313782	total: 1h 22m 5s	remaining: 12m 42s
433:	learn: -0.0312770	total: 1h 22m 20s	remaining: 12m 31s
434:	learn: -0.0311223	total: 1h 22m 35s	remaining: 12m 20s
435:	learn: -0.0311046	total: 1h 22m 49s	remaining: 12m 9s
436:	learn: -0.0310556	total: 1h 23m 4s	remaining: 11m 58s
437:	learn: -0.0309107	total: 1h 23m 19s	remaining: 11m 47s
438:	learn: -0.0308377	total: 1h 23m 33s	remaining: 11m 36s
439:	learn: -0.0308141	total: 1h 23m 48s	remaining: 11m 25s
440:	learn: -0.0308135	total: 1h 24m 3s	remaining: 11m 14s
441:	learn: -0.0306995	total: 1h 24m 18s	remaining: 11m 3s
442:	learn: -0.0306313	total: 1h 24m 33s	remaining: 10m 52s
443:	learn: -0.0305342	total: 1h 24m 48s	remaining: 10m 41s
444:	learn: -0.0305267	total: 1h 25m 2s	remaining: 10m 30s
445:	learn: -0.0305179	total: 1h 25m 18s	remaining: 10m 19s
446:	learn: -0.0304681	total: 1h 25m 32s	remaining: 10m 8s
447:	learn: -0.0303915	total: 1h 25m 46s	remain

<catboost.core.CatBoostClassifier at 0x2196517a710>

In [80]:
%%time
print('Starting predicting...')
# eval
print('train', metrics.f1_score(y_train, cat2.predict(X_train.toarray()), average='macro'))
print('test', metrics.f1_score(y_test, cat2.predict(X_test.toarray()), average='macro'))

Starting predicting...
train 0.9820775196557863
test 0.8577297664886203
Wall time: 32.6 s


0.858 - лучший результат. Но еще, конечно, во всех этих моделях можно было бы потюнить параметры и сделать GridSearchCV, но я на этом остановлюсь, потому что остальное долго училось бы.