In [98]:
!pip install pymorphy3



In [99]:
import re
import pandas as pd
import numpy as np
import pymorphy3
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.utils import to_categorical
import keras
from keras.models import Sequential
from keras.layers import Dense
import joblib
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input



nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [100]:
df = pd.read_csv('/content/rusentiment_random_posts.csv')
df.head()

Unnamed: 0,label,text
0,negative,"А попа подозревала давно,что ты с кавказа..пер..."
1,speech,З прошедшим Днем Ангела))))))))
2,skip,Два дня до отлёта с острова!!!!!!!
3,negative,"Блин, почему эта жизнь столь не справедлива (((("
4,skip,где еще встречать свой день рождения как не на...


In [101]:
df.drop(df[df['label'].isin(['skip', 'speech'])].index, inplace=True)
df = df.reset_index(drop=True)

In [102]:
morph = pymorphy3.MorphAnalyzer()

def preprocessing_text(str_row):
    if not isinstance(str_row, str):
        str_row = ""
    s1 = re.sub(r'[^\w\s]+|[\d]+', r'', str_row).strip()
    s1 = s1.lower()
    word_arr = word_tokenize(s1)
    words = []
    for i in word_arr:
        pv = morph.parse(i)
        words.append(pv[0].normal_form)
    sentence = ' '.join(words)
    return sentence

In [103]:
df['preprocess_text'] = df['text'].apply(preprocessing_text)
russian_stopwords = stopwords.words("russian")

In [104]:
vectorizer_tfidf = TfidfVectorizer(max_features=1000, min_df=20, max_df=0.7, stop_words=russian_stopwords)
text_tfidf = vectorizer_tfidf.fit_transform(df['preprocess_text'])
text_tfidf = pd.DataFrame(text_tfidf.toarray(), columns=vectorizer_tfidf.get_feature_names_out())

In [105]:
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(df['label'])
y_cat = to_categorical(y_encoded)

In [106]:
X_train_full, X_test, y_train_full, y_test = train_test_split(text_tfidf, y_cat, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=0)

In [107]:
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(y_cat.shape[1], activation='softmax')
])


model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["accuracy"])

In [108]:
history = model.fit(X_train, y_train, epochs=15, batch_size=32, validation_data=(X_valid, y_valid))
loss, acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {acc:.4f}")

Epoch 1/15
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.4529 - loss: 1.3231 - val_accuracy: 0.5662 - val_loss: 0.9309
Epoch 2/15
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.6392 - loss: 0.8189 - val_accuracy: 0.6284 - val_loss: 0.8536
Epoch 3/15
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.6844 - loss: 0.7449 - val_accuracy: 0.6223 - val_loss: 0.8683
Epoch 4/15
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.7191 - loss: 0.6688 - val_accuracy: 0.6149 - val_loss: 0.8901
Epoch 5/15
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7403 - loss: 0.6331 - val_accuracy: 0.6219 - val_loss: 0.9221
Epoch 6/15
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.7594 - loss: 0.5839 - val_accuracy: 0.6321 - val_loss: 0.9326
Epoch 7/15
[1m305/305[0m 

In [110]:
model.save('sentiment_model.h5')
joblib.dump(vectorizer_tfidf, 'tfidf_vectorizer.joblib')
joblib.dump(encoder, 'label_encoder.joblib')



['label_encoder.joblib']