In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


SEED=23

In [4]:
df = pd.read_csv('ner_short.csv', index_col=0)
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [5]:
# number of sentences
df.sentence_idx.max()

1500.0

In [6]:
# class distribution
df.tag.value_counts(normalize=True )

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [0]:
# sentence length
tdf = df.set_index('sentence_idx')
tdf['length'] = df.groupby('sentence_idx').tag.count()
df = tdf.reset_index(drop=False)

In [0]:
# encode categorial variables

le = LabelEncoder()
df['pos'] = le.fit_transform(df.pos)
df['next-pos'] = le.fit_transform(df['next-pos'])
df['next-next-pos'] = le.fit_transform(df['next-next-pos'])
df['prev-pos'] = le.fit_transform(df['prev-pos'])
df['prev-prev-pos'] = le.fit_transform(df['prev-prev-pos'])

In [9]:
df.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48


In [10]:
# splitting
y = LabelEncoder().fit_transform(df.tag)

df_train, df_test, y_train, y_test = model_selection.train_test_split(df, y, stratify=y, 
                                                                      test_size=0.25, random_state=SEED, shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 50155
test 16719


In [0]:
!pip install lightgbm

Collecting lightgbm
[?25l  Downloading https://files.pythonhosted.org/packages/4c/3b/4ae113193b4ee01387ed76d5eea32788aec0589df9ae7378a8b7443eaa8b/lightgbm-2.2.2-py2.py3-none-manylinux1_x86_64.whl (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 16.7MB/s 
Installing collected packages: lightgbm
Successfully installed lightgbm-2.2.2


In [0]:
import lightgbm as lgb
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [0]:
lg = lgb.LGBMClassifier(silent=False)
param_dist = {"max_depth": [50,75],
              "learning_rate" : [0.1, 0.5, 1],
              "num_leaves": [300,900,1200],
              "n_estimators": [200, 300]
             }

In [0]:
columns = ['sentence_idx','next-next-pos','next-pos','pos','prev-pos','prev-prev-pos']

In [0]:
grid_search = GridSearchCV(lg,
                           n_jobs=-1,
                           param_grid=param_dist,
                           cv = 3,
                           verbose=5)

grid_search.fit(df_train[columns], y_train)

In [0]:
print("Best parameters: {} with score: {}".format(grid_search.best_params_,
                                                  grid_search.best_score_))

In [0]:
model = grid_search.best_estimator_.fit(df_train[columns], y_train)

In [0]:
print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

In [12]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/19/ea/d6cbdf03fb1e8ea8c7c7a0b37b89c8f8b3825ec625d428d49b6230656c00/catboost-0.11.1-cp36-none-manylinux1_x86_64.whl (55.3MB)
[K    100% |████████████████████████████████| 55.3MB 423kB/s 
Collecting enum34 (from catboost)
  Downloading https://files.pythonhosted.org/packages/af/42/cb9355df32c69b553e72a2e28daee25d1611d2c0d9c272aa1d34204205b2/enum34-1.1.6-py3-none-any.whl
Installing collected packages: enum34, catboost
Successfully installed catboost-0.11.1 enum34-1.1.6


In [0]:
from catboost import CatBoostClassifier

from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [0]:
param_dist = {'learning_rate': [0.01, 0.1, 1],
              'depth': [5, 10]}

cbc = CatBoostClassifier(learning_rate=1,
                         iterations=800,
                         random_state=SEED,
                         early_stopping_rounds=10,
                         loss_function='MultiClassOneVsAll',
                         custom_loss='F1')

grid_search_cat = GridSearchCV(cbc,
                               n_jobs=-1,
                               param_grid=param_dist,
                               cv = 3,
                               verbose=1)

grid_search_cat.fit(df_train[columns], y_train)

In [21]:
print("Best parameters: {} with score: {}".format(grid_search_cat.best_params_,
                                                  grid_search_cat.best_score_))

Best parameters: {'depth': 10, 'learning_rate': 1} with score: 0.9584488086930515


In [0]:
model_cat = grid_search_cat.best_estimator_.fit(df_train[columns], y_train)

In [23]:
print('train', metrics.f1_score(y_train, model_cat.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model_cat.predict(df_test[columns]), average='macro'))

train 0.9948557391944455
test 0.8989441134767683


# Итак, подведем итоги.

Модель, которая побила baseline 3:






In [26]:
model_cat.get_params()

{'custom_loss': 'F1',
 'depth': 10,
 'early_stopping_rounds': 10,
 'iterations': 800,
 'learning_rate': 1,
 'loss_function': 'MultiClassOneVsAll',
 'random_state': 23}

С результатом:

In [27]:
print('train', metrics.f1_score(y_train, model_cat.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model_cat.predict(df_test[columns]), average='macro'))

train 0.9948557391944455
test 0.8989441134767683


Результат работы с LightGBM будет находиться в другом файле (assignment_4_LightGBM.ipynb),поскольку результат CatBoost оказался лучше, то в финальной версии я его и оставил.

# Вопросы

**1) Why we selected f1 score with macro averaging as our classification quality measure? What other metrics are suitable?**

Мы можем заметить, что наша выборка очень-очень несбалансированная. В f1 score у нас учитывается и плотность, и полнота, что помогает решить эту проблему.

**2) How can you exploit that words belong to some sentence?**

Во многих языках, имеются правила, которые определяют порядок слов. Тем самым, вероятность встретить существительное после глагола (пример с потолка) будет гораздо выше, чем прилагательное. Тем самым, зная правила, и зная характеристику слов, мы можем предаоложить характеристику послед. слова.