In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.float_format', '{:.2f}'.format)

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer
from sklearn.feature_extraction.text import CountVectorizer,  TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

In [11]:
# Load the train/test splits from CSV files in the working directory.
X_train = pd.read_csv('rusentitweet_train.csv')
X_test = pd.read_csv('rusentitweet_test.csv')

# Binary task: keep only the rows labelled positive or negative.
X_train = X_train[X_train.label.isin(['positive','negative'])]
# Encode the label column only (positive -> 1, negative -> 0).
# A frame-wide .replace() would also rewrite any other cell whose whole value is
# 'positive'/'negative' (e.g. a tweet text), and it relies on pandas' deprecated
# object-downcasting to end up with an integer label column.
X_train = X_train.drop('id', axis=1).assign(
    label=lambda frame: frame.label.map({'positive': 1, 'negative': 0})
)
X_test = X_test[X_test.label.isin(['positive','negative'])]
# NOTE(review): the loaded test split is discarded here and replaced by a single
# hand-written phrase stored in column 0. Later cells that read X_test.label or
# X_test.text cannot work with this frame on a fresh Restart & Run All - confirm
# whether the phrase should live in its own variable instead.
X_test = pd.DataFrame([["Люблю хороших пиздецов"]])
print(X_test)

                        0
0  Люблю хороших пиздецов


In [12]:
# Lower-casing plus removal of links, mentions, punctuation and special characters
def cleaning(str):
    """Normalize a raw tweet before tokenization.

    The text is lower-cased and then passed through an ordered series of
    regex substitutions. Order matters: links and @-mentions are removed
    before the symbol filter would strip the characters that delimit them.

    Parameters
    ----------
    str : str
        Raw tweet text. (The name shadows the builtin; it is kept because it
        is the function's existing parameter name.)

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so runs of spaces may
        remain where tokens were removed.
    """
    substitutions = (
        # Links
        (r'http\S+', ''),
        # Carriage returns, newlines, tabs, underscores and literal "\u" -> space
        (r'\r|\n|\t|_|\\u', ' '),
        # @-mentions
        (r'\@\S*', ''),
        # Symbols and digits. Inside the class, `#-?` is a character RANGE
        # (from '#' through '?'), so it also removes $ & * + - : ; < = > etc.
        (r'[()/,.0-9%#-?!`\'—]', ''),
    )

    text = str.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text



In [13]:
# Stemming: a single module-level Russian stemmer instance reused for every token
stemmer = RussianStemmer()


def stem_words(words):
    """Return a list with the stem of each item in ``words``, in the same order."""
    return list(map(stemmer.stem, words))

def preprocess_tweet(tweet):
    """Clean, tokenize and stem one tweet.

    Runs ``cleaning`` on the text, splits it with ``nltk.word_tokenize``,
    stems every token via ``stem_words`` and returns the stems joined by
    single spaces.
    """
    tokens = nltk.word_tokenize(cleaning(tweet))
    return ' '.join(stem_words(tokens))

In [20]:
# Sanity check: show column 0 of X_test, the same column that is passed to
# preprocess_tweet when the test input is preprocessed.
print(X_test[0])

0    Люблю хороших пиздецов
Name: 0, dtype: object


In [21]:
# Apply the preprocessing pipeline to every training tweet and to the test input
# (the test texts are read from column 0 of X_test).
milled_train = [preprocess_tweet(tweet) for tweet in X_train.text]
milled_test = [preprocess_tweet(tweet) for tweet in X_test[0]]

In [22]:
# Bag of words: the vocabulary is fitted on the preprocessed training tweets
# and then reused unchanged to encode the test tweets.
vectorizer = CountVectorizer()
train_tweets_bow = vectorizer.fit_transform(milled_train)
test_tweets_bow = vectorizer.transform(milled_test)

# TF-IDF weighting fitted on the training counts only.
# `tfidf_matrix` is whatever `.fit()` returns; it is the object `.transform`
# is called on for both splits.
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit(train_tweets_bow)

x_tf_idf_train, x_tf_idf_test = (
    tfidf_matrix.transform(bow) for bow in (train_tweets_bow, test_tweets_bow)
)

In [27]:
# Train both models on the TF-IDF features of the training tweets
lr = LogisticRegression()
lr.fit(x_tf_idf_train, X_train.label)

# Random forests are stochastic; a fixed seed makes the scores below
# reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_tf_idf_train, X_train.label)

# Mean accuracy on the training data
y_lr_train = lr.score(x_tf_idf_train, X_train.label)
y_rf_train = rf.score(x_tf_idf_train, X_train.label)
# No test-set score is computed in this cell: scoring would need X_test.label.

# Predicted labels for the training data
y_pred_lr_train = lr.predict(x_tf_idf_train)
y_pred_rf_train = rf.predict(x_tf_idf_train)
# For the test input these hold class PROBABILITIES (predict_proba), not labels,
# so they are not directly usable with accuracy_score / precision_score.
y_pred_lr_test = lr.predict_proba(x_tf_idf_test)
y_pred_rf_test = rf.predict_proba(x_tf_idf_test)

# Report: despite the heading text, what is printed is the training accuracy
# followed by the predicted class probabilities for the test input.
print('Classification Report of Logistic Regression:')
print(y_lr_train, y_pred_lr_test)

print('Classification Report of Random Forest:')
print(y_rf_train, y_pred_rf_test)

Classification Report of Logistic Regression:
0.9096082293718538 [[0.10883031 0.89116969]]
Classification Report of Random Forest:
0.9991245349091705 [[0.33 0.67]]


In [8]:
# Logistic-regression weight of every vocabulary term, largest first
print(
    pd.Series(lr.coef_[0], index=vectorizer.get_feature_names_out())
    .sort_values(ascending=False)
)

любл      3.51
хорош     3.38
красив    3.07
нрав      2.74
крут      2.66
          ... 
нах      -2.43
пиздец   -2.85
нет      -2.89
блят     -3.32
не       -3.46
Length: 8997, dtype: float64


In [9]:
# NOTE(review): this cell needs X_test.label and label-valued test predictions.
# In file order the preceding cells leave X_test as a one-column frame without
# `label` and the *_test predictions as predict_proba output - confirm the
# intended execution order.

# Metrics for logistic regression: (true labels, predictions) for train and test
lr_eval_pairs = [(X_train.label, y_pred_lr_train), (X_test.label, y_pred_lr_test)]
data_logreg = {
    'Type': ['logreg_train', 'logreg_test'],
    'Accuracy': [accuracy_score(y_true, y_pred) for y_true, y_pred in lr_eval_pairs],
    'Precision': [
        precision_score(y_true, y_pred, average='macro')
        for y_true, y_pred in lr_eval_pairs
    ],
}

# Metrics for the random forest, same layout
rf_eval_pairs = [(X_train.label, y_pred_rf_train), (X_test.label, y_pred_rf_test)]
data_forest = {
    'Type': ['randf_train', 'randf_test'],
    'Accuracy': [accuracy_score(y_true, y_pred) for y_true, y_pred in rf_eval_pairs],
    'Precision': [
        precision_score(y_true, y_pred, average='macro')
        for y_true, y_pred in rf_eval_pairs
    ],
}

# One table: random-forest rows first, then logistic-regression rows
print(pd.concat([pd.DataFrame(data_forest), pd.DataFrame(data_logreg)]))

           Type  Accuracy  Precision
0   randf_train      1.00       1.00
1    randf_test      0.75       0.75
0  logreg_train      0.91       0.92
1   logreg_test      0.77       0.80


In [10]:
import pymystem3

# Keep only the first 100 rows of each split for the lemmatization experiment.
# NOTE(review): both names are rebound in place, so every later cell (and any
# re-run of earlier cells) sees the truncated frames; presumably done to keep
# lemmatization fast - confirm. The cells below also expect X_test to have
# `text` and `label` columns.
X_train = X_train.iloc[:100]
X_test = X_test.iloc[:100]

# Lemmatizer instance shared by all preprocess_tweet calls
mystem = pymystem3.Mystem()

def preprocess_tweet(tweet):
    """Clean and lemmatize one tweet; return its lemmas joined by single spaces.

    NOTE: this redefines ``preprocess_tweet`` from the earlier stemming cell;
    after this cell runs, the name refers to the lemmatizing version.

    ``Mystem.lemmatize`` returns the separators between words (spaces and a
    trailing newline) as items of its result list. Joining that list with
    ``' '`` directly therefore produces padded strings with an embedded
    newline, so whitespace-only items are dropped and the rest are stripped
    before joining.
    """
    lemmas = mystem.lemmatize(cleaning(tweet))
    stripped = (lemma.strip() for lemma in lemmas)
    return ' '.join(token for token in stripped if token)

# Apply the lemmatizing pipeline to the `text` column of both splits
milled_train = [preprocess_tweet(tweet) for tweet in X_train.text]
milled_test = [preprocess_tweet(tweet) for tweet in X_test.text]

# Bag of words: the vocabulary is fitted on the preprocessed training tweets
# and then reused unchanged to encode the test tweets.
vectorizer = CountVectorizer()
train_tweets_bow = vectorizer.fit_transform(milled_train)
test_tweets_bow = vectorizer.transform(milled_test)

# TF-IDF weighting fitted on the training counts only.
# `tfidf_matrix` is whatever `.fit()` returns; it is the object `.transform`
# is called on for both splits.
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit(train_tweets_bow)

x_tf_idf_train, x_tf_idf_test = (
    tfidf_matrix.transform(bow) for bow in (train_tweets_bow, test_tweets_bow)
)
# Train both models on the TF-IDF features of the training tweets
lr = LogisticRegression()
lr.fit(x_tf_idf_train, X_train.label)

# Random forests are stochastic; a fixed seed makes the scores below
# reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_tf_idf_train, X_train.label)

# Mean accuracy on the training data
y_lr_train = lr.score(x_tf_idf_train, X_train.label)
y_rf_train = rf.score(x_tf_idf_train, X_train.label)
# Mean accuracy on the test data
y_lr_test = lr.score(x_tf_idf_test, X_test.label)
y_rf_test = rf.score(x_tf_idf_test, X_test.label)

# Predicted labels for the training data
y_pred_lr_train = lr.predict(x_tf_idf_train)
y_pred_rf_train = rf.predict(x_tf_idf_train)
# Predicted labels for the test data
y_pred_lr_test = lr.predict(x_tf_idf_test)
y_pred_rf_test = rf.predict(x_tf_idf_test)

# Report: despite the heading text, what is printed is the pair
# (training accuracy, test accuracy) for each model.
print('Classification Report of Logistic Regression:')
print(y_lr_train, y_lr_test)

print('Classification Report of Random Forest:')
print(y_rf_train, y_rf_test)

Classification Report of Logistic Regression:
0.98 0.6
Classification Report of Random Forest:
1.0 0.63


In [11]:
# Logistic-regression weight of every vocabulary term, largest first
print(
    pd.Series(lr.coef_[0], index=vectorizer.get_feature_names_out())
    .sort_values(ascending=False)
)

# Metrics for logistic regression: (true labels, predictions) for train and test
lr_eval_pairs = [(X_train.label, y_pred_lr_train), (X_test.label, y_pred_lr_test)]
data_logreg = {
    'Type': ['logreg_train', 'logreg_test'],
    'Accuracy': [accuracy_score(y_true, y_pred) for y_true, y_pred in lr_eval_pairs],
    'Precision': [
        precision_score(y_true, y_pred, average='macro')
        for y_true, y_pred in lr_eval_pairs
    ],
}

# Metrics for the random forest, same layout
rf_eval_pairs = [(X_train.label, y_pred_rf_train), (X_test.label, y_pred_rf_test)]
data_forest = {
    'Type': ['randf_train', 'randf_test'],
    'Accuracy': [accuracy_score(y_true, y_pred) for y_true, y_pred in rf_eval_pairs],
    'Precision': [
        precision_score(y_true, y_pred, average='macro')
        for y_true, y_pred in rf_eval_pairs
    ],
}

# One table: random-forest rows first, then logistic-regression rows
print(pd.concat([pd.DataFrame(data_forest), pd.DataFrame(data_logreg)]))

такой      0.48
мой        0.47
самый      0.47
ахах       0.46
ахаххах    0.46
           ... 
что       -0.46
вы        -0.53
как       -0.53
пиздец    -0.56
не        -0.59
Length: 492, dtype: float64
           Type  Accuracy  Precision
0   randf_train      1.00       1.00
1    randf_test      0.63       0.62
0  logreg_train      0.98       0.98
1   logreg_test      0.60       0.58
