In [196]:
import pandas as pd
import numpy as np
import re

from nltk.corpus import stopwords

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from pymystem3 import Mystem

RANDOM_SEED = 42
from sklearn.metrics import roc_auc_score

In [197]:
df_train = pd.read_csv('rusentitweet_train.csv')
df_test = pd.read_csv('rusentitweet_test.csv')

#–æ—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ label = positive/negative
df_train = df_train[df_train['label'].isin(['positive', 'negative'])]
df_test = df_test[df_test['label'].isin(['positive', 'negative'])]
df_train

Unnamed: 0,text,label,id
0,–ü–æ–º–æ–π–º—É —è –≤–∫—Ä–∞—à–∏–ª–∞—Å—å –≤ –ß–∏–º–∏–Ω–∞ü§ß https://t.co/t2...,positive,1282311169534038016
5,@buybread_ —è –Ω–µ —Å –ø–æ—Ä—è–¥–∫–µ!!!!,negative,1335130757044563971
10,@ange1flyhigh –í —Å–ª–µ–¥—É—é—â–∏–π —Ä–∞–∑ –±—É–¥—É –¥–æ –ø–æ–±–µ–¥–Ω–æ–≥...,positive,1215370396465291267
15,@LimitaVIP –£–¥–∏–≤–∏—Ç–µ–ª—å–Ω—ã–π –≥i–º–Ω...\r\n–£–¥–∏–≤–∏—Ç–µ–ª—å–Ω–æ...,negative,1253799540848762887
17,—è —Å—Ä–∞–ª–∞ –Ω–∞ —ç—Ç—É –±–∏–æ–ª–æ–≥–∏—é,negative,1339418979887173632
...,...,...,...
10704,"–î–µ–¥—Ä–∞–¥–∏–æ5 —Ç–æ –µ—Å—Ç—å —Ç—ã —Ö–æ—á–µ—à—å —Å–∫–∞–∑–∞—Ç—å, —á—Ç–æ —Ç—ã –ª–∞...",negative,1315037255833092098
10705,"@kmoo_m –î–ï–ô–°–¢–í–ò–¢–ï–õ–¨–ù–û\r\n–µ—Å–ª–∏ –ª—é–±–æ–≤—å, —Ç–æ —Ç–æ–ª—å–∫...",positive,1323606772578459648
10708,–° –•–æ–±–∏ —É—Ç—Ä–æ –º–æ–∂–µ—Ç –±—ã—Ç—å —Ç–æ–ª—å–∫–æ –¥–æ–±—Ä—ã–ºüòÇ https://...,positive,1310485706213666816
10711,"–ù–µ —É—Å–ø–µ–ª–∞ –≤—Å—Ç–∞—Ç—å, –∞ —É–∂–µ –∑–∞–µ–±–∞–ª–∞—Å—å, –≤–ø—Ä–æ—á–µ–º –Ω–∏—á...",negative,1343031810746425344


In [198]:
russian_stopwords = stopwords.words("russian")

def clean_tweet(tweet):
    tweet = re.sub(r'http\S+', '', tweet)   # –£–¥–∞–ª–µ–Ω–∏–µ —Å—Å—ã–ª–æ–∫
    tweet = re.sub(r'@\w+', '', tweet)      # –£–¥–∞–ª–µ–Ω–∏–µ —É–ø–æ–º–∏–Ω–∞–Ω–∏–π –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π
    tweet = re.sub(r'#\w+', '', tweet)      # –£–¥–∞–ª–µ–Ω–∏–µ —Ö–µ—à—Ç–µ–≥–æ–≤
    tweet = re.sub(r'\d+', '', tweet)       # –£–¥–∞–ª–µ–Ω–∏–µ —á–∏—Å–µ–ª
    tweet = re.sub(r'[^\w\s]', '', tweet)   # –£–¥–∞–ª–µ–Ω–∏–µ –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏
    tweet = re.sub(r'[a-zA-Z]+', '', tweet) # –£–¥–∞–ª–µ–Ω–∏–µ —Å–ª–æ–≤ –Ω–∞ –ª–∞—Ç–∏–Ω–∏—Ü–µ
    tweet = re.sub(r'[^\w\s,]', '', tweet)  # –£–¥–∞–ª–µ–Ω–∏–µ —ç–º–æ–¥–∑–∏
    tweet = tweet.lower()                   # –ü—Ä–∏–≤–µ–¥–µ–Ω–∏–µ –∫ –Ω–∏–∂–Ω–µ–º—É —Ä–µ–≥–∏—Å—Ç—Ä—É
    tweet = " ".join([word for word in tweet.split() if word not in russian_stopwords]) # –£–¥–∞–ª–µ–Ω–∏–µ —Å—Ç–æ–ø-—Å–ª–æ–≤
    tweet = re.sub(r'\s+', ' ', tweet).strip() # –£–¥–∞–ª–µ–Ω–∏–µ –ª–∏—à–Ω–∏—Ö –ø—Ä–æ–±–µ–ª–æ–≤
    tweet = re.sub(r'\b\w\b', '', tweet)   # –£–¥–∞–ª–µ–Ω–∏–µ —Å–ª–æ–≤ –∏–∑ –æ–¥–Ω–æ–π –±—É–∫–≤—ã
    return tweet

# –ü—Ä–∏–º–µ–Ω–µ–Ω–∏–µ —Ñ—É–Ω–∫—Ü–∏–∏ –æ—á–∏—Å—Ç–∫–∏ –∫ –∫–∞–∂–¥–æ–º—É —Ç–≤–∏—Ç—É –≤ –Ω–∞–±–æ—Ä–∞—Ö –¥–∞–Ω–Ω—ã—Ö
df_train['text'] = df_train['text'].apply(clean_tweet)
df_test['text'] = df_test['text'].apply(clean_tweet)

# –ü—Ä–∏–º–µ–Ω–µ–Ω–∏–µ —Ñ—É–Ω–∫—Ü–∏–∏ –æ—á–∏—Å—Ç–∫–∏ –∫ –∫–∞–∂–¥–æ–º—É —Ç–≤–∏—Ç—É –≤ –Ω–∞–±–æ—Ä–∞—Ö –¥–∞–Ω–Ω—ã—Ö
df_train['text'] = df_train['text'].apply(clean_tweet)
df_test['text'] = df_test['text'].apply(clean_tweet)

# –£–¥–∞–ª–µ–Ω–∏–µ –ø—É—Å—Ç—ã—Ö —Ç–≤–∏—Ç–æ–≤
df_train = df_train[df_train['text'].str.strip().astype(bool)]
df_test = df_test[df_test['text'].str.strip().astype(bool)]

In [199]:
# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä—É–µ–º —Å—Ç–µ–º–º–µ—Ä –¥–ª—è —Ä—É—Å—Å–∫–æ–≥–æ —è–∑—ã–∫–∞
stemmer = SnowballStemmer("russian")

def stem_tweet(tweet):
    # –†–∞–∑–¥–µ–ª—è–µ–º —Ç–≤–∏—Ç –Ω–∞ —Å–ª–æ–≤–∞
    words = tweet.split()
    # –ü—Ä–∏–º–µ–Ω—è–µ–º —Å—Ç–µ–º–º–∏–Ω–≥ –∫ –∫–∞–∂–¥–æ–º—É —Å–ª–æ–≤—É
    stemmed_words = [stemmer.stem(word) for word in words]
    # –°–æ–±–∏—Ä–∞–µ–º –æ–±—Ä–∞—Ç–Ω–æ –≤ —Å—Ç—Ä–æ–∫—É
    return ' '.join(stemmed_words)

df_train_stem = df_train.copy()
df_test_stem = df_test.copy()

# –ü—Ä–∏–º–µ–Ω—è–µ–º —Ñ—É–Ω–∫—Ü–∏—é —Å—Ç–µ–º–º–∏–Ω–≥–∞ –∫ –∫–∞–∂–¥–æ–º—É —Ç–≤–∏—Ç—É
df_train_stem['text'] = df_train_stem['text'].apply(stem_tweet)
df_test_stem['text'] = df_test_stem['text'].apply(stem_tweet)

df_train_stem

Unnamed: 0,text,label,id
0,–ø–æ–º–æ–π–º –≤–∫—Ä–∞—à —á–∏–º–∏–Ω,positive,1282311169534038016
5,–ø–æ—Ä—è–¥–∫,negative,1335130757044563971
10,—Å–ª–µ–¥ –±—É–¥ –ø–æ–±–µ–¥–Ω –∑–∞–∫—Ä—ã–≤–∞ –ø–æ–∂–µ–ª–∞ —É–¥–∞—á,positive,1215370396465291267
15,—É–¥–∏–≤–∏—Ç–µ–ª—å–Ω –≥–º–Ω —É–¥–∏–≤–∏—Ç–µ–ª—å–Ω –ø–æ–∫ –µ—â —Å–¥–æ—Ö–ª —É–∫—Ä–∞–∏–Ω,negative,1253799540848762887
17,—Å—Ä–∞–ª –±–∏–æ–ª–æ–≥,negative,1339418979887173632
...,...,...,...
10704,–¥–µ–¥—Ä–∞–¥ —Ö–æ—á–µ—à —Å–∫–∞–∑–∞ –ª–∞–π–∫–∞ –º–æ —Ç–≤–∏—Ç –ø–∑–¥—Ü —É–¥–∞–ª—è –æ—Ç—Å—é–¥,negative,1315037255833092098
10705,–¥–µ–π—Å—Ç–≤–∏—Ç–µ–ª—å–Ω –ª—é–±–æ–≤ —Ç–∞–∫,positive,1323606772578459648
10708,—Ö–æ–± —É—Ç—Ä –¥–æ–±—Ä,positive,1310485706213666816
10711,—É—Å–ø–µ–ª –≤—Å—Ç–∞—Ç –∑–∞–µ–±–∞ –Ω–æ–≤,negative,1343031810746425344


In [200]:
# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è CountVectorizer
vectorizer = CountVectorizer()

# –û–±—É—á–µ–Ω–∏–µ –Ω–∞ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö –∏ –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –∏—Ö –≤ –º–µ—à–æ–∫ —Å–ª–æ–≤
X_train = vectorizer.fit_transform(df_train_stem['text'])

# –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö –≤ –º–µ—à–æ–∫ —Å–ª–æ–≤
X_test = vectorizer.transform(df_test_stem['text'])

In [201]:
# –°–æ–∑–¥–∞–Ω–∏–µ –æ–±—ä–µ–∫—Ç–∞ tf-idf transformer
tfidf_transformer = TfidfTransformer()

# –û–±—É—á–µ–Ω–∏–µ –Ω–∞ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö –∏ –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –∏—Ö
X_train_tfidf = tfidf_transformer.fit_transform(X_train)

# –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö
X_test_tfidf = tfidf_transformer.transform(X_test)

In [202]:
# –°–æ–∑–¥–∞–Ω–∏–µ –º–æ–¥–µ–ª–∏ –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏
logreg = LogisticRegression(random_state=RANDOM_SEED)
logreg.fit(X_train_tfidf, df_train['label'])

# –°–æ–∑–¥–∞–Ω–∏–µ –º–æ–¥–µ–ª–∏ —Å–ª—É—á–∞–π–Ω–æ–≥–æ –ª–µ—Å–∞
rf = RandomForestClassifier(random_state=RANDOM_SEED)
rf.fit(X_train_tfidf, df_train['label'])

# –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –Ω–∞ —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö
logreg_pred = logreg.predict(X_test_tfidf)
rf_pred = rf.predict(X_test_tfidf)

# –û—Ü–µ–Ω–∫–∞ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤
print("–õ–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–∞—è —Ä–µ–≥—Ä–µ—Å—Å–∏—è:\n", classification_report(df_test['label'], logreg_pred))
print("–°–ª—É—á–∞–π–Ω—ã–π –ª–µ—Å:\n", classification_report(df_test['label'], rf_pred))



# Convert labels in df_test_lem to numeric
numeric_labels = [0 if label == 'negative' else 1 for label in df_test_stem['label']]

# Convert predictions in logreg_pred_lem to numeric
numeric_predictions = [0 if pred == 'negative' else 1 for pred in logreg_pred]

roc_auc = roc_auc_score(numeric_labels, numeric_predictions)
print("ROC_AUC_SCORE –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–∞—è —Ä–µ–≥—Ä–µ—Å—Å–∏—è c —ç–º–æ–¥–∑–∏:\n", roc_auc)
logreg_pred.shape()

–õ–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–∞—è —Ä–µ–≥—Ä–µ—Å—Å–∏—è:
               precision    recall  f1-score   support

    negative       0.73      0.93      0.82       656
    positive       0.84      0.54      0.66       477

    accuracy                           0.76      1133
   macro avg       0.79      0.73      0.74      1133
weighted avg       0.78      0.76      0.75      1133

–°–ª—É—á–∞–π–Ω—ã–π –ª–µ—Å:
               precision    recall  f1-score   support

    negative       0.73      0.88      0.80       656
    positive       0.77      0.55      0.64       477

    accuracy                           0.74      1133
   macro avg       0.75      0.72      0.72      1133
weighted avg       0.75      0.74      0.73      1133

ROC_AUC_SCORE –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–∞—è —Ä–µ–≥—Ä–µ—Å—Å–∏—è c —ç–º–æ–¥–∑–∏:
 0.7320444725673672


In [203]:
# –î–ª—è –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏
feature_names = vectorizer.get_feature_names_out()
logreg_coef = logreg.coef_[0]
sorted_features = sorted(zip(logreg_coef, feature_names), reverse=True)
print("–ù–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã–µ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏:")
sorted_words = [word for coef, word in sorted_features]
sorted_words_string = ' '.join(sorted_words[:10])
print(sorted_words_string)

sorted_features = sorted(zip(logreg_coef, feature_names))
print("–ù–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã–µ –æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏:")
sorted_words = [word for coef, word in sorted_features]
sorted_words_string = ' '.join(sorted_words[:10])
print(sorted_words_string)

# –î–ª—è —Å–ª—É—á–∞–π–Ω–æ–≥–æ –ª–µ—Å–∞
rf_importances = rf.feature_importances_
# –ü–æ–ª—É—á–∞–µ–º –∏–Ω–¥–µ–∫—Å—ã –Ω–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –≤ –ø–æ—Ä—è–¥–∫–µ —É–±—ã–≤–∞–Ω–∏—è –∏—Ö –≤–∞–∂–Ω–æ—Å—Ç–∏
sorted_indices = np.argsort(rf_importances)[::-1]
# –ò–∑–≤–ª–µ–∫–∞–µ–º —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É—é—â–∏–µ —Å–ª–æ–≤–∞
top_words = [feature_names[i] for i in sorted_indices[:20]]
# –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º —Å–ø–∏—Å–æ–∫ —Å–ª–æ–≤ –≤ —Å—Ç—Ä–æ–∫—É, —Ä–∞–∑–¥–µ–ª—è—è —Å–ª–æ–≤–∞ –ø—Ä–æ–±–µ–ª–∞–º–∏
top_words_string = ' '.join(top_words)
print("–ù–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è —Å–ª—É—á–∞–π–Ω–æ–≥–æ –ª–µ—Å–∞:")
print(top_words_string)

–ù–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã–µ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏:
–ª—é–±–ª –∫—Ä–∞—Å–∏–≤ –ø—Ä–µ–∫—Ä–∞—Å–Ω –ª—É—á—à –º–∏–ª –∫–ª–∞—Å—Å–Ω –∫—Ä—É—Ç –Ω—Ä–∞–≤ –≤–∞ —Ä–∞–¥
–ù–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã–µ –æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏:
–±–ª—è—Ç –ø–∏–∑–¥–µ—Ü –Ω–∞—Ö —Å—É–∫ —Ö—É–π–Ω –≤–æ–æ–±—â –∑–∞–µ–±–∞ –Ω–µ–Ω–∞–≤–∏–∂ –≥—Ä—É—Å—Ç–Ω —É–∂–∞—Å–Ω
–ù–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è —Å–ª—É—á–∞–π–Ω–æ–≥–æ –ª–µ—Å–∞:
–ª—é–±–ª –∫—Ä–∞—Å–∏–≤ –±–ª—è—Ç –ø—Ä–µ–∫—Ä–∞—Å–Ω –∫—Ä—É—Ç –ª—É—á—à –º–∏–ª —ç—Ç –∫–ª–∞—Å—Å–Ω –ø–∏–∑–¥–µ—Ü –Ω—Ä–∞–≤ —Ö–æ—Ä–æ—à —Ä–∞–¥ –≤–∞ —Ö–æ—á —Å—É–∫ –æ—á–µ–Ω –ª—é–±–∏–º –æ–±–æ–∂–∞ –Ω–∞—Ö


In [204]:
# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä—É–µ–º Mystem
mystem = Mystem()

def lemmatize_text(text):
    # –ü—Ä–∏–º–µ–Ω—è–µ–º –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏—é –∫ —Ç–µ–∫—Å—Ç—É –∏ –æ–±—ä–µ–¥–∏–Ω—è–µ–º –æ–±—Ä–∞—Ç–Ω–æ –≤ —Å—Ç—Ä–æ–∫—É
    lemmas = mystem.lemmatize(text)
    return ''.join(lemmas).strip()

df_train_lem = df_train.copy()
df_test_lem = df_test.copy()

# –ü—Ä–∏–º–µ–Ω—è–µ–º —Ñ—É–Ω–∫—Ü–∏—é –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏–∏ –∫ —Å—Ç–æ–ª–±—Ü—É —Å —Ç–µ–∫—Å—Ç–æ–º –≤ DataFrame
df_train_lem['text'] = df_train_lem['text'].apply(lemmatize_text)
df_test_lem['text'] = df_test_lem['text'].apply(lemmatize_text)

In [205]:
# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è CountVectorizer
vectorizer = CountVectorizer()

# –û–±—É—á–µ–Ω–∏–µ –Ω–∞ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö –∏ –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –∏—Ö –≤ –º–µ—à–æ–∫ —Å–ª–æ–≤
X_train_lem = vectorizer.fit_transform(df_train_lem['text'])

# –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö –≤ –º–µ—à–æ–∫ —Å–ª–æ–≤
X_test_lem = vectorizer.transform(df_test_lem['text'])

In [206]:
# –°–æ–∑–¥–∞–Ω–∏–µ –æ–±—ä–µ–∫—Ç–∞ tf-idf transformer
tfidf_transformer = TfidfTransformer()

# –û–±—É—á–µ–Ω–∏–µ –Ω–∞ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö –∏ –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –∏—Ö
X_train_tfidf_lem = tfidf_transformer.fit_transform(X_train_lem)

# –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö
X_test_tfidf_lem = tfidf_transformer.transform(X_test_lem)

In [207]:
# –°–æ–∑–¥–∞–Ω–∏–µ –º–æ–¥–µ–ª–∏ –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏
logreg = LogisticRegression(random_state=RANDOM_SEED)
logreg.fit(X_train_tfidf_lem, df_train_lem['label'])

# –°–æ–∑–¥–∞–Ω–∏–µ –º–æ–¥–µ–ª–∏ —Å–ª—É—á–∞–π–Ω–æ–≥–æ –ª–µ—Å–∞
rf = RandomForestClassifier(random_state=RANDOM_SEED)
rf.fit(X_train_tfidf_lem, df_train_lem['label'])

# –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –Ω–∞ —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö
logreg_pred_lem = logreg.predict(X_test_tfidf_lem)
rf_pred_lem = rf.predict(X_test_tfidf_lem)

# –û—Ü–µ–Ω–∫–∞ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤
print("–õ–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–∞—è —Ä–µ–≥—Ä–µ—Å—Å–∏—è:\n", classification_report(df_test_lem['label'], logreg_pred_lem))
print("–°–ª—É—á–∞–π–Ω—ã–π –ª–µ—Å:\n", classification_report(df_test_lem['label'], rf_pred_lem))

# Convert labels in df_test_lem to numeric
numeric_labels = [0 if label == 'negative' else 1 for label in df_test_lem['label']]

# Convert predictions in logreg_pred_lem to numeric
numeric_predictions = [0 if pred == 'negative' else 1 for pred in logreg_pred_lem]

roc_auc = roc_auc_score(numeric_labels, numeric_predictions)
print("ROC_AUC_SCORE –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–∞—è —Ä–µ–≥—Ä–µ—Å—Å–∏—è –±–µ–∑ —ç–º–æ–¥–∑–∏:\n", roc_auc)

–õ–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–∞—è —Ä–µ–≥—Ä–µ—Å—Å–∏—è:
               precision    recall  f1-score   support

    negative       0.74      0.92      0.82       656
    positive       0.84      0.56      0.67       477

    accuracy                           0.77      1133
   macro avg       0.79      0.74      0.75      1133
weighted avg       0.79      0.77      0.76      1133

–°–ª—É—á–∞–π–Ω—ã–π –ª–µ—Å:
               precision    recall  f1-score   support

    negative       0.73      0.87      0.79       656
    positive       0.76      0.55      0.63       477

    accuracy                           0.74      1133
   macro avg       0.74      0.71      0.71      1133
weighted avg       0.74      0.74      0.73      1133

ROC_AUC_SCORE –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–∞—è —Ä–µ–≥—Ä–µ—Å—Å–∏—è –±–µ–∑ —ç–º–æ–¥–∑–∏:
 0.7428126757682671


In [208]:
# –î–ª—è –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏
feature_names = vectorizer.get_feature_names_out()
logreg_coef = logreg.coef_[0]
sorted_features = sorted(zip(logreg_coef, feature_names), reverse=True)
print("–ù–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã–µ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏:")
sorted_words = [word for coef, word in sorted_features]
sorted_words_string = ' '.join(sorted_words[:10])
print(sorted_words_string)

sorted_features = sorted(zip(logreg_coef, feature_names))
print("–ù–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã–µ –æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏:")
sorted_words = [word for coef, word in sorted_features]
sorted_words_string = ' '.join(sorted_words[:10])
print(sorted_words_string)

# –î–ª—è —Å–ª—É—á–∞–π–Ω–æ–≥–æ –ª–µ—Å–∞
rf_importances = rf.feature_importances_
# –ü–æ–ª—É—á–∞–µ–º –∏–Ω–¥–µ–∫—Å—ã –Ω–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –≤ –ø–æ—Ä—è–¥–∫–µ —É–±—ã–≤–∞–Ω–∏—è –∏—Ö –≤–∞–∂–Ω–æ—Å—Ç–∏
sorted_indices = np.argsort(rf_importances)[::-1]
# –ò–∑–≤–ª–µ–∫–∞–µ–º —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É—é—â–∏–µ —Å–ª–æ–≤–∞
top_words = [feature_names[i] for i in sorted_indices[:20]]
# –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º —Å–ø–∏—Å–æ–∫ —Å–ª–æ–≤ –≤ —Å—Ç—Ä–æ–∫—É, —Ä–∞–∑–¥–µ–ª—è—è —Å–ª–æ–≤–∞ –ø—Ä–æ–±–µ–ª–∞–º–∏
top_words_string = ' '.join(top_words)
print("–ù–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è —Å–ª—É—á–∞–π–Ω–æ–≥–æ –ª–µ—Å–∞:")
print(top_words_string)

–ù–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã–µ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏:
–ª—é–±–∏—Ç—å —Ö–æ—Ä–æ—à–∏–π –∫—Ä–∞—Å–∏–≤—ã–π –º–∏–ª—ã–π –ø—Ä–µ–∫—Ä–∞—Å–Ω—ã–π –∫–ª–∞—Å—Å–Ω—ã–π –≤–∞—É –Ω—Ä–∞–≤–∏—Ç—å—Å—è –∫—Ä—É—Ç–æ–π –ª—é–±–æ–≤—å
–ù–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã–µ –æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏:
–±–ª—è—Ç—å –ø–∏–∑–¥–µ—Ü –Ω–∞—Ö—É–π —Å—É–∫–∞ —É–º–∏—Ä–∞—Ç—å –≤–æ–æ–±—â–µ —Ö—É–π–Ω—è –Ω–µ–Ω–∞–≤–∏–¥–µ—Ç—å —É—Ö–æ–¥–∏—Ç—å —Å–¥—ã—Ö–∞—Ç—å
–ù–∞–∏–±–æ–ª–µ–µ –≤–∞–∂–Ω—ã–µ —Å–ª–æ–≤–∞ –¥–ª—è —Å–ª—É—á–∞–π–Ω–æ–≥–æ –ª–µ—Å–∞:
–ª—é–±–∏—Ç—å —Ö–æ—Ä–æ—à–∏–π –±–ª—è—Ç—å –∫—Ä–∞—Å–∏–≤—ã–π —ç—Ç–æ –ø–∏–∑–¥–µ—Ü –Ω—Ä–∞–≤–∏—Ç—å—Å—è –≤–∞—É –ø—Ä–µ–∫—Ä–∞—Å–Ω—ã–π —Ö–æ—Ç–µ—Ç—å –ª—é–±–æ–≤—å –º–∏–ª—ã–π –∫–ª–∞—Å—Å–Ω—ã–π –∫—Ä—É—Ç–æ–π —Å–∞–º—ã–π –æ—á–µ–Ω—å –≤–æ–æ–±—â–µ –∫—Ä–∞—Å–∏–≤–æ —É–º–∏—Ä–∞—Ç—å —Ä–∞–¥
