## Импрорт датасета

In [20]:
import pandas as pd

In [21]:
df_positive = pd.read_csv('data/positive.csv', sep=',', usecols=[3], names=['text'], skiprows=1)
df_positive['label'] = 'positive'
df_negative = pd.read_csv('data/negative.csv', sep=',', usecols=[3], names=['text'], skiprows=1)
df_negative['label'] = 'negative'
df = pd.concat([df_positive, df_negative], ignore_index=True)


In [3]:
df.sample(10)

Unnamed: 0,text,label
182518,"Все покупают подарки, а я дома с температурой!...",negative
101872,Мне много не надо... Власть над миром и чего-н...,positive
222805,@annett_14 @Vukadinovich дааа уж...просто слов...,negative
190257,расстроеная и заплаканая иду спать :( всем сла...,negative
34813,"@akhitruk :)) просто переживаю, чтобы они себе...",positive
61388,"Сидим на работе в темноте все утро, отдыхаем))...",positive
187397,"RT @sd0107: «Наша страна стала лучше, богаче, ...",negative
58970,"Сегодня ""напросился"" к одногрупнику на поесть,...",positive
188131,"@Nastya_Ertulova ВЕЗЁТ ТЕБЕ, А Я В ЧЕТЫРЕ УТРА...",negative
98009,@Hey_hey_bitch этой традиции уже несколько век...,positive


In [4]:
df['label'].value_counts()

label
positive    114911
negative    111923
Name: count, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.text, df.label, random_state=42, test_size=0.2)

## Чистка текста

In [6]:
import re
from string import punctuation

def clean_text(text):
    text = re.sub(r'@\w+', '', text) #remove mentions
    text = re.sub(r'#\w+', '', text) #remove hashtags
    text = re.sub(r'http\S+', '', text) #remove urls
    text = re.sub('[{}]'.format(punctuation), ' ', text) #remove punctuation
    text = re.sub(r'\d+', '', text) #remove digits
    text = re.sub(r'\s+', ' ', text) #remove extra whitespaces
    return text.strip()

## Лемматизация

In [7]:
from pymystem3 import Mystem
import nltk
nltk.download('stopwords')


from nltk.corpus import stopwords

to_remove = stopwords.words('russian') + ['RT', '', ' ', '\n']
#to_remove = ['RT', '', ' ', '\n']


def lemmatize(text):
    mystem_analyzer = Mystem(grammar_info=False)
    lemmas = mystem_analyzer.lemmatize(text)
    lemmas[-1] = lemmas[-1].rstrip()
    return [word for word in lemmas if word not in to_remove]

def preprocess(text):
    text = clean_text(text)
    text = lemmatize(text)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pavel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
from tqdm import tqdm

from joblib import Parallel, delayed

def process_batch(text):
    merged_text = " sep ".join(text)

    doc = []
    res = []

    for t in preprocess(merged_text):
        if t.strip() != 'sep':
            doc.append(t)
        else:
            res.append(doc)
            doc = []
    res.append(doc)
    return res

def parallel_preprocess(data, batch_size=1000):
    texts = data.values

    text_batch = [texts[i: i + batch_size] for i in range(0, len(texts), batch_size)]
    processed_texts = Parallel(n_jobs=-1, backend="threading")(delayed(process_batch)(t) for t in tqdm(text_batch))
    combined_texts = [' '.join(text) for batch in processed_texts for text in batch]
    return pd.Series(combined_texts, index=data.index)


In [19]:
x_train = parallel_preprocess(x_train)
x_test = parallel_preprocess(x_test)


In [10]:
x_train

107097             немного выпивать говорить ничто говорить
224163     твой общение шутка соскучиться че давно видеться
63992                              ахахаахх собака называть
108549    лично вообще барабан гиа написать заморачивать...
67962            наташа реальный китаец писать хер понимать
                                ...                        
119879    хотеть графический планшетик скоро приходить х...
103694                                           работяга D
131932           дом год приходиться второй раковина менять
146867                                 представлять бояться
121958           день месяц общаться мм время быстро лететь
Length: 181467, dtype: object

## Векторизация текста

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec_unigram = TfidfVectorizer(ngram_range=(1, 1))
vec_bigram = TfidfVectorizer(ngram_range=(2, 2))
vec_multigram = TfidfVectorizer(ngram_range=(1, 3))
#select vectorizer
vec = vec_unigram
vec_train = vec.fit_transform(x_train)
vec_test = vec.transform(x_test)

import joblib
joblib.dump(vec, 'vectorizer.pkl')

['vectorizer.pkl']

## Построение моделей

In [None]:
from sklearn.metrics import classification_report

In [13]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, min_samples_leaf=3, n_jobs=-1)
clf.fit(vec_train, y_train)
pred = clf.predict(vec_test)
print('classification report for Random Forest:')
print(classification_report(pred, y_test))
joblib.dump(clf, 'RandomForest.pkl')

classification report for Random Forest:
              precision    recall  f1-score   support

    negative       0.70      0.71      0.71     22131
    positive       0.72      0.71      0.71     23236

    accuracy                           0.71     45367
   macro avg       0.71      0.71      0.71     45367
weighted avg       0.71      0.71      0.71     45367



['RandomForest.pkl']

In [14]:
from sklearn.svm import LinearSVC
clf = LinearSVC(dual=True)
clf.fit(vec_train, y_train)
pred = clf.predict(vec_test)
print('classification report for SVM:')
print(classification_report(pred, y_test))
joblib.dump(clf, 'SVM.pkl')

classification report for SVM:
              precision    recall  f1-score   support

    negative       0.71      0.72      0.71     22223
    positive       0.72      0.72      0.72     23144

    accuracy                           0.72     45367
   macro avg       0.72      0.72      0.72     45367
weighted avg       0.72      0.72      0.72     45367



['SVM.pkl']

In [15]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(vec_train, y_train)
pred = clf.predict(vec_test)
print('classification report for Naive Bayes:')
print(classification_report(pred, y_test))
joblib.dump(clf, 'NaiveBayes.pkl')

classification report for Naive Bayes:
              precision    recall  f1-score   support

    negative       0.71      0.72      0.71     22242
    positive       0.73      0.72      0.72     23125

    accuracy                           0.72     45367
   macro avg       0.72      0.72      0.72     45367
weighted avg       0.72      0.72      0.72     45367



['NaiveBayes.pkl']

## Ансамбль

In [16]:
from sklearn.ensemble import VotingClassifier

clf1 = RandomForestClassifier(n_estimators=100, min_samples_leaf=3, n_jobs=-1)
clf2 = LinearSVC()
clf3 = MultinomialNB()

voting_ensemble = VotingClassifier(estimators=[('rf', clf1), ('svm', clf2), ('nb', clf3)], voting='hard', n_jobs=-1)
voting_ensemble.fit(vec_train, y_train)
pred = voting_ensemble.predict(vec_test)
print('classification report for Ensemble:')
print(classification_report(pred, y_test))
joblib.dump(voting_ensemble, 'Ensemble.pkl')

classification report for Ensemble:
              precision    recall  f1-score   support

    negative       0.71      0.73      0.72     22046
    positive       0.74      0.72      0.73     23321

    accuracy                           0.72     45367
   macro avg       0.72      0.72      0.72     45367
weighted avg       0.72      0.72      0.72     45367



['Ensemble.pkl']