In [1]:
import spacy
import pandas as pd
import pickle
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from spacy.lang.en.stop_words import STOP_WORDS 

In [2]:
polarity = ['Злость', 'Нейтральность', 'Восторженность']

In [3]:
nlp = spacy.load("en_core_web_md")

#Предобработка текста
def preprocess(text):
    text = text.lower().strip() #приводим к нижнему регистру и избавляеся от лишних пробелов
    text = re.sub(r'<.*?>', '', text)#убираем htlm таги
    text = re.sub(r'[^\w\s]', ' ', text) #убираем пунктицию и символы как ',", # итд
    text = re.sub(r'\s+', ' ', text)# убираем доп пробелы
    return text

#Убирает Шумовые слова как: этом, пока, одном итд
def stopword(string):
    a = [word for word in string.split() if word not in STOP_WORDS]
    return ' '.join(a)

#Приводит слово к изначальному типу
def lemmatizer(string):
    a = nlp(string)
    a = [word.lemma_ for word in a]
    return " ".join(a)

In [4]:
df = pd.read_csv('./data/reddit_processed.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37149 entries, 0 to 37148
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  37149 non-null  int64 
 1   index       37149 non-null  int64 
 2   text        37149 non-null  object
 3   sentiment   37149 non-null  int64 
 4   new_text    36872 non-null  object
dtypes: int64(3), object(2)
memory usage: 1.4+ MB


In [5]:
df_train = df.drop(columns=['index'])
df_train["new_text"], df_train["sentiment"] = df_train["new_text"].fillna(' '), df_train["sentiment"]


In [6]:
X_train, X_val, y_train, y_val = train_test_split(df_train["new_text"],
                                                  df_train["sentiment"],
                                                  test_size=0.20,
                                                  shuffle=True)

In [7]:
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val)

In [8]:
#обучаем модель используя Логическую регрессию(tf-idf)  
lr_tfidf=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)

y_predict = lr_tfidf.predict(X_val_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_val_vectors_tfidf)[:,1]
 

print(classification_report(y_val,y_predict))

              precision    recall  f1-score   support

          -1       0.77      0.64      0.70      1616
           0       0.78      0.88      0.82      2610
           1       0.82      0.80      0.81      3204

    accuracy                           0.80      7430
   macro avg       0.79      0.78      0.78      7430
weighted avg       0.80      0.80      0.79      7430



In [9]:
#обучаем модель используя Наивной Баевской классификации(tf-idf)  
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_vectors_tfidf, y_train) 

y_predict = nb_tfidf.predict(X_val_vectors_tfidf)
y_prob = nb_tfidf.predict_proba(X_val_vectors_tfidf)[:,1]
 
print(classification_report(y_val,y_predict))

              precision    recall  f1-score   support

          -1       0.94      0.09      0.17      1616
           0       0.83      0.29      0.43      2610
           1       0.49      0.97      0.65      3204

    accuracy                           0.54      7430
   macro avg       0.75      0.45      0.42      7430
weighted avg       0.71      0.54      0.47      7430



In [10]:
#Сохраняем модели
with open('./models/model_lr.pkl','wb') as f:
    pickle.dump(lr_tfidf,f)
with open('./models/model_nb.pkl','wb') as f:
    pickle.dump(nb_tfidf,f)

In [11]:
#Тестирум модель
test = input('enter text', )
test_processed = lemmatizer(stopword(preprocess(test)))
X_test = [test_processed]
X_vector = tfidf_vectorizer.transform(X_test)
lr_predict = lr_tfidf.predict(X_vector)
lr_prob =  lr_tfidf.predict_proba(X_vector)
nb_predict = nb_tfidf.predict(X_vector)
nb_prob =  nb_tfidf.predict_proba(X_vector)
print('Результат логической регрессии -', polarity[lr_predict.item() + 1])
print('Результат Наивной Баевской классификации -', polarity[nb_predict.item() + 1])

Результат логической регрессии - Нейтральность
Результат Наивной Баевской классификации - Восторженность
