In [1]:
# пакеты ntlk могут ругаться при импорте и требовать загрузки
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import  VotingClassifier
from sklearn.svm import SVC
import nltk, re
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [2]:
# загрузим данные
test = pd.read_csv('products_sentiment_test.tsv', sep='\t', index_col=0)
train = pd.read_csv('products_sentiment_train.tsv', sep='\t', header=None, names=['text', 'y'])

In [3]:
# для удобства объеидим их в один корпус
all_texts = pd.concat([train[['text']], test], axis=0, ignore_index=True)
corpus = all_texts.text.to_list()

In [4]:
normalizer = WordNetLemmatizer()
# функция для нахождения наиболее вероятной части речи через синонимы
def get_part_of_speech(word):
    probable_part_of_speech = wordnet.synsets(word)
    pos_counts = Counter()
    pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
    pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
    pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
    pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
    most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
    return most_likely_part_of_speech

In [5]:
#шаблон для замены
special_chars = re.compile(r'[^A-Za-z_\d,.?!;:$\- \'\"]', re.IGNORECASE)

In [6]:
# препроцессинг
def preprocess_text(text):
    #чистка текста
    text = text.lower()
    text = text.replace("``", '"').replace("''", '"').replace("`", "'")
    text = text.replace("n 't", "n't").replace("can not", "cannot")
    text = special_chars.sub(' ', text)
    text = re.sub(' +', ' ', text)
    # токенизация
    tokenized = word_tokenize(text)
    # Лемматизация с использованием наиболее вероятной части речи
    normalized = " ".join([normalizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized])
    return normalized

In [7]:
#процессим текст
proccesed_corpus = [preprocess_text(sent) for sent in corpus]

In [8]:
#векторизация текста
bow_vectorizer = CountVectorizer(ngram_range=(1,2))

In [9]:
bow_vectorizer.fit(proccesed_corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [10]:
#Формируем признаки с CountVectorizer  c нграммами (1,2)
X_train = bow_vectorizer.transform(proccesed_corpus[:2000])
X_test = bow_vectorizer.transform(proccesed_corpus[2000:])
y = train.y

In [11]:
# модели для ансамбля
clf1 = LogisticRegression()
clf2 = MultinomialNB()
clf3 = SVC(C=1000, gamma=0.001, probability=True)
clf4 = CatBoostClassifier(eval_metric='Accuracy')

In [12]:
# Ансамбль моделей
eclf4 = VotingClassifier(estimators=[
        ('lr', clf1), ('nb', clf2), ('svm', clf3), ('cvc', clf4)], voting='soft')

In [13]:
eclf4 = eclf4.fit(X_train, y)

Learning rate set to 0.020385
0:	learn: 0.6375000	total: 113ms	remaining: 1m 52s
1:	learn: 0.6415000	total: 158ms	remaining: 1m 19s
2:	learn: 0.6405000	total: 206ms	remaining: 1m 8s
3:	learn: 0.6490000	total: 252ms	remaining: 1m 2s
4:	learn: 0.6710000	total: 298ms	remaining: 59.3s
5:	learn: 0.6720000	total: 344ms	remaining: 57s
6:	learn: 0.6780000	total: 393ms	remaining: 55.8s
7:	learn: 0.6760000	total: 438ms	remaining: 54.4s
8:	learn: 0.6825000	total: 486ms	remaining: 53.5s
9:	learn: 0.6810000	total: 530ms	remaining: 52.5s
10:	learn: 0.6690000	total: 575ms	remaining: 51.7s
11:	learn: 0.6715000	total: 620ms	remaining: 51.1s
12:	learn: 0.6690000	total: 665ms	remaining: 50.5s
13:	learn: 0.6720000	total: 709ms	remaining: 50s
14:	learn: 0.6695000	total: 754ms	remaining: 49.5s
15:	learn: 0.6705000	total: 798ms	remaining: 49.1s
16:	learn: 0.6690000	total: 844ms	remaining: 48.8s
17:	learn: 0.6670000	total: 890ms	remaining: 48.6s
18:	learn: 0.6735000	total: 936ms	remaining: 48.3s
19:	learn: 0.

161:	learn: 0.7305000	total: 7.41s	remaining: 38.3s
162:	learn: 0.7310000	total: 7.45s	remaining: 38.3s
163:	learn: 0.7320000	total: 7.49s	remaining: 38.2s
164:	learn: 0.7310000	total: 7.54s	remaining: 38.2s
165:	learn: 0.7320000	total: 7.58s	remaining: 38.1s
166:	learn: 0.7320000	total: 7.63s	remaining: 38.1s
167:	learn: 0.7325000	total: 7.67s	remaining: 38s
168:	learn: 0.7330000	total: 7.72s	remaining: 37.9s
169:	learn: 0.7330000	total: 7.76s	remaining: 37.9s
170:	learn: 0.7335000	total: 7.8s	remaining: 37.8s
171:	learn: 0.7350000	total: 7.85s	remaining: 37.8s
172:	learn: 0.7350000	total: 7.89s	remaining: 37.7s
173:	learn: 0.7350000	total: 7.94s	remaining: 37.7s
174:	learn: 0.7370000	total: 7.98s	remaining: 37.6s
175:	learn: 0.7360000	total: 8.03s	remaining: 37.6s
176:	learn: 0.7375000	total: 8.07s	remaining: 37.5s
177:	learn: 0.7380000	total: 8.12s	remaining: 37.5s
178:	learn: 0.7380000	total: 8.16s	remaining: 37.4s
179:	learn: 0.7390000	total: 8.21s	remaining: 37.4s
180:	learn: 0.7

321:	learn: 0.7920000	total: 14.5s	remaining: 30.5s
322:	learn: 0.7910000	total: 14.5s	remaining: 30.5s
323:	learn: 0.7910000	total: 14.6s	remaining: 30.4s
324:	learn: 0.7915000	total: 14.6s	remaining: 30.4s
325:	learn: 0.7920000	total: 14.7s	remaining: 30.3s
326:	learn: 0.7930000	total: 14.7s	remaining: 30.3s
327:	learn: 0.7920000	total: 14.8s	remaining: 30.2s
328:	learn: 0.7935000	total: 14.8s	remaining: 30.2s
329:	learn: 0.7930000	total: 14.8s	remaining: 30.1s
330:	learn: 0.7930000	total: 14.9s	remaining: 30.1s
331:	learn: 0.7935000	total: 14.9s	remaining: 30s
332:	learn: 0.7955000	total: 15s	remaining: 30s
333:	learn: 0.7960000	total: 15s	remaining: 30s
334:	learn: 0.7965000	total: 15.1s	remaining: 29.9s
335:	learn: 0.7970000	total: 15.1s	remaining: 29.9s
336:	learn: 0.7970000	total: 15.2s	remaining: 29.8s
337:	learn: 0.7970000	total: 15.2s	remaining: 29.8s
338:	learn: 0.7970000	total: 15.2s	remaining: 29.7s
339:	learn: 0.7980000	total: 15.3s	remaining: 29.7s
340:	learn: 0.7975000	

481:	learn: 0.8385000	total: 21.6s	remaining: 23.2s
482:	learn: 0.8395000	total: 21.6s	remaining: 23.2s
483:	learn: 0.8395000	total: 21.7s	remaining: 23.1s
484:	learn: 0.8400000	total: 21.7s	remaining: 23.1s
485:	learn: 0.8395000	total: 21.8s	remaining: 23s
486:	learn: 0.8410000	total: 21.8s	remaining: 23s
487:	learn: 0.8410000	total: 21.9s	remaining: 23s
488:	learn: 0.8415000	total: 21.9s	remaining: 22.9s
489:	learn: 0.8425000	total: 22s	remaining: 22.9s
490:	learn: 0.8425000	total: 22s	remaining: 22.8s
491:	learn: 0.8430000	total: 22.1s	remaining: 22.8s
492:	learn: 0.8430000	total: 22.1s	remaining: 22.7s
493:	learn: 0.8425000	total: 22.2s	remaining: 22.7s
494:	learn: 0.8425000	total: 22.2s	remaining: 22.7s
495:	learn: 0.8430000	total: 22.3s	remaining: 22.6s
496:	learn: 0.8430000	total: 22.3s	remaining: 22.6s
497:	learn: 0.8425000	total: 22.4s	remaining: 22.6s
498:	learn: 0.8430000	total: 22.4s	remaining: 22.5s
499:	learn: 0.8440000	total: 22.5s	remaining: 22.5s
500:	learn: 0.8460000	

642:	learn: 0.8790000	total: 29.7s	remaining: 16.5s
643:	learn: 0.8785000	total: 29.7s	remaining: 16.4s
644:	learn: 0.8785000	total: 29.8s	remaining: 16.4s
645:	learn: 0.8785000	total: 29.8s	remaining: 16.3s
646:	learn: 0.8790000	total: 29.9s	remaining: 16.3s
647:	learn: 0.8785000	total: 29.9s	remaining: 16.3s
648:	learn: 0.8785000	total: 30s	remaining: 16.2s
649:	learn: 0.8795000	total: 30s	remaining: 16.2s
650:	learn: 0.8805000	total: 30.1s	remaining: 16.1s
651:	learn: 0.8815000	total: 30.1s	remaining: 16.1s
652:	learn: 0.8815000	total: 30.2s	remaining: 16s
653:	learn: 0.8820000	total: 30.2s	remaining: 16s
654:	learn: 0.8825000	total: 30.3s	remaining: 15.9s
655:	learn: 0.8825000	total: 30.3s	remaining: 15.9s
656:	learn: 0.8820000	total: 30.4s	remaining: 15.9s
657:	learn: 0.8825000	total: 30.4s	remaining: 15.8s
658:	learn: 0.8825000	total: 30.5s	remaining: 15.8s
659:	learn: 0.8830000	total: 30.5s	remaining: 15.7s
660:	learn: 0.8835000	total: 30.6s	remaining: 15.7s
661:	learn: 0.883500

804:	learn: 0.9025000	total: 37.4s	remaining: 9.05s
805:	learn: 0.9030000	total: 37.4s	remaining: 9s
806:	learn: 0.9030000	total: 37.5s	remaining: 8.96s
807:	learn: 0.9030000	total: 37.5s	remaining: 8.91s
808:	learn: 0.9035000	total: 37.5s	remaining: 8.86s
809:	learn: 0.9035000	total: 37.6s	remaining: 8.82s
810:	learn: 0.9035000	total: 37.6s	remaining: 8.77s
811:	learn: 0.9045000	total: 37.7s	remaining: 8.72s
812:	learn: 0.9045000	total: 37.7s	remaining: 8.68s
813:	learn: 0.9050000	total: 37.8s	remaining: 8.63s
814:	learn: 0.9050000	total: 37.8s	remaining: 8.59s
815:	learn: 0.9055000	total: 37.9s	remaining: 8.54s
816:	learn: 0.9050000	total: 37.9s	remaining: 8.49s
817:	learn: 0.9055000	total: 38s	remaining: 8.45s
818:	learn: 0.9050000	total: 38s	remaining: 8.4s
819:	learn: 0.9050000	total: 38.1s	remaining: 8.35s
820:	learn: 0.9060000	total: 38.1s	remaining: 8.31s
821:	learn: 0.9060000	total: 38.1s	remaining: 8.26s
822:	learn: 0.9060000	total: 38.2s	remaining: 8.21s
823:	learn: 0.906500

963:	learn: 0.9250000	total: 44.7s	remaining: 1.67s
964:	learn: 0.9250000	total: 44.7s	remaining: 1.62s
965:	learn: 0.9245000	total: 44.8s	remaining: 1.57s
966:	learn: 0.9250000	total: 44.8s	remaining: 1.53s
967:	learn: 0.9240000	total: 44.9s	remaining: 1.48s
968:	learn: 0.9240000	total: 44.9s	remaining: 1.44s
969:	learn: 0.9240000	total: 44.9s	remaining: 1.39s
970:	learn: 0.9245000	total: 45s	remaining: 1.34s
971:	learn: 0.9245000	total: 45s	remaining: 1.3s
972:	learn: 0.9245000	total: 45.1s	remaining: 1.25s
973:	learn: 0.9245000	total: 45.1s	remaining: 1.2s
974:	learn: 0.9245000	total: 45.2s	remaining: 1.16s
975:	learn: 0.9255000	total: 45.2s	remaining: 1.11s
976:	learn: 0.9260000	total: 45.3s	remaining: 1.06s
977:	learn: 0.9260000	total: 45.3s	remaining: 1.02s
978:	learn: 0.9260000	total: 45.4s	remaining: 973ms
979:	learn: 0.9260000	total: 45.4s	remaining: 927ms
980:	learn: 0.9270000	total: 45.4s	remaining: 880ms
981:	learn: 0.9265000	total: 45.5s	remaining: 834ms
982:	learn: 0.9275

In [19]:
y_pred = eclf4.predict(X_test)

In [20]:
#Выгружаем Submission. На публичном лидерборде дало 0,80888
submission = pd.DataFrame(y_pred, index = test.index, columns=['y'])
submission.to_csv('submission_veo_kaggle_1805.csv')

In [14]:
#import pickle

In [15]:
#pickle.dump(bow_vectorizer,open('bow_vectorizer.pickle','wb'))

In [16]:
#pickle.dump(eclf4,open('model.pickle','wb'))