In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score
import spacy
from catboost import CatBoostClassifier
from tqdm import tqdm
tqdm.pandas()

import warnings
import joblib

from model.tokenizer import Tokenizer

warnings.filterwarnings("ignore")

In [2]:
test = pd.read_table('D:\Files\dl_workspace\Анализ тональности отзывов\data\products_sentiment_test.tsv', index_col='Id')
sample_submit = \
    pd.read_csv('D:\Files\dl_workspace\Анализ тональности отзывов\data\products_sentiment_sample_submission.csv', index_col='Id')
train = pd.read_table('D:\Files\dl_workspace\Анализ тональности отзывов\data\products_sentiment_train.tsv', header=None)
train.columns = ['text', 'target']

In [3]:
nlp = spacy.load("en_core_web_sm")

In [53]:
train.head()

Unnamed: 0,text,target,tokens,prepared_text
0,"2 . take around 10,000 640x480 pictures .",1,"[., take, around, NUM, NUM, picture, .]",. take around NUM NUM picture .
1,i downloaded a trial version of computer assoc...,1,"[PRON, download, a, trial, version, of, comput...",PRON download a trial version of computer asso...
2,the wrt54g plus the hga7t is a perfect solutio...,1,"[the, wrt54, g, plus, the, hga7, t, be, a, per...",the wrt54 g plus the hga7 t be a perfect solut...
3,i dont especially like how music files are uns...,0,"[PRON, do, not, especially, like, how, music, ...",PRON do not especially like how music file be ...
4,i was using the cheapie pail ... and it worked...,1,"[PRON, be, use, the, cheapie, pail, ..., and, ...",PRON be use the cheapie pail ... and PRON work...


In [4]:
eng_stopwords = list(nlp.Defaults.stop_words)

In [5]:
count_text_pipeline = \
    make_pipeline(Tokenizer(), CountVectorizer(ngram_range=(1, 2), stop_words=eng_stopwords, max_df=0.5), LogisticRegression(random_state=0, max_iter=1000))

In [43]:
accuracy_cv_score = cross_val_score(count_text_pipeline, X=train.text, y=train.target, cv=5, scoring='accuracy')
print('Mean accuracy - ', accuracy_cv_score.mean(), ' with std - ', accuracy_cv_score.std())

Mean accuracy -  0.756  with std -  0.013095800853708787


In [20]:
roc_auc_cv_score = cross_val_score(count_text_pipeline, X=train.text, y=train.target, cv=5, scoring='roc_auc')
print('Mean roc_auc - ', roc_auc_cv_score.mean(), ' with std - ', roc_auc_cv_score.std())

Mean roc_auc -  0.8112123323771503  with std -  0.008597149772213037


## CatBoost

In [126]:
catboost = CatBoostClassifier()

In [132]:
from catboost import cv, Pool

params = {
    'loss_function': 'Logloss',
    'iterations': 1000,
    'custom_loss': 'AUC',
    'random_seed': 63,
}

cv_data = cv(
    params=params,
    pool=Pool(train['prepared_text'], label=train['target'], text_features=[0]),
    fold_count=5, # Разбивка выборки на 5 кусочков
    shuffle=True, # Перемешаем наши данные
    partition_random_seed=0,
    plot=True, # Никуда без визуализатора
    stratified=True,
    verbose=False
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.4337022636
bestIteration = 876

Training on fold [1/5]

bestTest = 0.4771809359
bestIteration = 491

Training on fold [2/5]

bestTest = 0.4619427055
bestIteration = 440

Training on fold [3/5]

bestTest = 0.4198505803
bestIteration = 999

Training on fold [4/5]

bestTest = 0.4665979993
bestIteration = 789



In [147]:
catboost.fit(X=pd.DataFrame(train['prepared_text']), y=pd.DataFrame(train['target']), text_features=[0])

Learning rate set to 0.013851
0:	learn: 0.6885455	total: 39.8ms	remaining: 39.8s
1:	learn: 0.6843818	total: 120ms	remaining: 1m
2:	learn: 0.6796931	total: 177ms	remaining: 58.9s
3:	learn: 0.6751349	total: 236ms	remaining: 58.7s
4:	learn: 0.6709160	total: 282ms	remaining: 56.2s
5:	learn: 0.6668124	total: 384ms	remaining: 1m 3s
6:	learn: 0.6628923	total: 492ms	remaining: 1m 9s
7:	learn: 0.6589329	total: 536ms	remaining: 1m 6s
8:	learn: 0.6552600	total: 581ms	remaining: 1m 3s
9:	learn: 0.6514492	total: 625ms	remaining: 1m 1s
10:	learn: 0.6473506	total: 677ms	remaining: 1m
11:	learn: 0.6440403	total: 737ms	remaining: 1m
12:	learn: 0.6408657	total: 784ms	remaining: 59.5s
13:	learn: 0.6376177	total: 827ms	remaining: 58.2s
14:	learn: 0.6339431	total: 876ms	remaining: 57.5s
15:	learn: 0.6307834	total: 918ms	remaining: 56.5s
16:	learn: 0.6267192	total: 961ms	remaining: 55.6s
17:	learn: 0.6234446	total: 1.1s	remaining: 59.8s
18:	learn: 0.6203986	total: 1.16s	remaining: 60s
19:	learn: 0.6177014	t

<catboost.core.CatBoostClassifier at 0x140fb186910>

### Feature importance

In [6]:
count_text_pipeline.fit(X=train.text, y=train.target)

Pipeline(steps=[('tokenizer',
                 <model.tokenizer.Tokenizer object at 0x0000016716AFAD90>),
                ('countvectorizer',
                 CountVectorizer(max_df=0.5, ngram_range=(1, 2),
                                 stop_words=['enough', 'other', 'anyone',
                                             'will', 'n’t', 'we', 'seemed',
                                             "'d", 'latterly', 'less', 'yet',
                                             'seems', 'whoever', 'everywhere',
                                             'beforehand', 'move', 'becoming',
                                             'more', 'another', 'around',
                                             'take', 'get', 'six', 'nor', 'can',
                                             'done', 'so', 'already', 'once',
                                             'her', ...])),
                ('logisticregression',
                 LogisticRegression(max_iter=1000, random_state=0))])

In [38]:
df_coef = pd.DataFrame()
df_coef['feature']=count_text_pipeline.named_steps.countvectorizer.get_feature_names_out()
df_coef['coef']=np.abs(count_text_pipeline.named_steps.logisticregression.coef_[0])

df_coef.sort_values(by='coef', ascending=False).head(5)

Unnamed: 0,feature,coef
4638,great,1.976654
3528,excellent,1.549583
3252,easy,1.495637
12924,unfortunately,1.365266
6353,love,1.323106


## Sample

In [None]:
count_text_pipeline.fit(X=train.text, y=train.target)

In [23]:
sample_submit.head()

Unnamed: 0_level_0,y
Id,Unnamed: 1_level_1
0,0
1,1
2,0
3,1
4,0


In [30]:
sample_submit['y'] = count_text_pipeline.predict(test.text)

In [150]:
sample_submit.to_csv('D:\Files\dl_workspace\Анализ тональности отзывов\data\onswer.csv')

## Сохраним пайплайн

In [7]:
joblib.dump(count_text_pipeline,
            'D:\Files\dl_workspace\Анализ тональности отзывов\simple_demo\model\linear_regression_pipe.pkl')

['D:\\Files\\dl_workspace\\Анализ тональности отзывов\\simple_demo\\model\\linear_regression_pipe.pkl']