In [1]:
import pandas as pd
import pymorphy3

from tqdm import tqdm
tqdm.pandas()

from nltk.tokenize import word_tokenize 
import re

from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import eli5

In [2]:
# drop by the class neutral
df = pd.read_csv('women_clothing.csv', encoding="utf8", sep='\t')
print(df['sentiment'].value_counts())

sentiment
negative    30000
neautral    30000
positive    30000
Name: count, dtype: int64


In [3]:
# to delete all
df = df[df['sentiment'] != 'neautral']
df['review_processed'] = df['review'].apply(lambda x: re.sub(r'[^\w\s]', '', x)).values
df['review_processed'] = df['review_processed'].apply(lambda x: x.lower())
#df['review_processed'] = df['review_processed'].progress_apply(lambda x: word_tokenize(x))

dict = {'negative':0, 'positive':1}

df['label'] = df['sentiment'].apply(lambda x: dict[x])

In [4]:
# Morthitization
morthy = pymorphy3.MorphAnalyzer()
# Example of morhpy work
morthy.parse("Пример")[0].normal_form # Normilise form of thr words

'пример'

In [5]:
df['review_processed'] = df['review_processed'].progress_apply(lambda x: [morthy.parse(w)[0].normal_form for w in x])

100%|██████████| 60000/60000 [37:45<00:00, 26.48it/s]  


In [13]:
from sklearn.model_selection import GridSearchCV
import warnings
from sklearn.exceptions import ConvergenceWarning


In [20]:
parsed = morthy.parse('замочная')[0]
print(parsed)

Parse(word='замочная', tag=OpencorporaTag('ADJF,Qual femn,sing,nomn'), normal_form='замочный', score=1.0, methods_stack=((DictionaryAnalyzer(), 'замочная', 12, 7),))


In [14]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))
#X = vectorizer.fit_transform(df['review_processed'])
#y = df['sentiment']
X = vectorizer.fit_transform(df['review_processed'].apply(lambda x: ''.join(x)))
y = df['label']

In [15]:
# Classification moment
'''
    1) Prepair our date(like before)
    2) Also use train_test_split or something like that
    3) Classifire (LogRegression)
    4) fit & predict
    5) roc_acc_score
'''
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, random_state=10
        )
    model = LogisticRegression()
    model.fit(X_train, y_train)
    display(model.score(X_test, y_test))

0.9249444444444445

In [16]:
y_pred = model.predict(X_test)
conf_matrix_baseline = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ['actual 0', 'actual 1'], columns = ['predicted 0', 'predicted 1'])
display(conf_matrix_baseline)
display('accuracy = ', accuracy_score(y_test, y_pred))
display('Roc = ', roc_auc_score(y_test, y_pred))


Unnamed: 0,predicted 0,predicted 1
actual 0,8330,602
actual 1,749,8319


'accuracy = '

0.9249444444444445

'Roc = '

0.9250018667732341

In [17]:
eli5.show_weights(
    estimator=model,
    feature_names = list(vectorizer.get_feature_names_out()),
    top =(50, 50)
)

Weight?,Feature
+8.326,спасибо
+8.104,супбытьр
+7.729,хороширотао
+7.386,отаклично
+7.335,нбытьмного
+6.795,хороширотаая
+6.497,хороширотабытьбыть
+6.256,отакличная
+6.131,далееовольна
+5.868,отакличнобыть


In [18]:
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    parameters = [{'penalty':['l1','l2']}, 
                {'C':[1, 10, 100, 1000]}]
    grid_search = GridSearchCV(estimator = model,  
                            param_grid = parameters,
                            scoring = 'accuracy',
                            cv = 5,
                            verbose=0)

    grid_search.fit(X_train, y_train)

5 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nikgo\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\nikgo\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nikgo\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 5

In [21]:
y_pred = grid_search.predict(X_test)
conf_matrix_baseline = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ['actual 0', 'actual 1'], columns = ['predicted 0', 'predicted 1'])
display(conf_matrix_baseline)
display('accuracy = ', accuracy_score(y_test, y_pred))
display('Roc = ', roc_auc_score(y_test, y_pred))

Unnamed: 0,predicted 0,predicted 1
actual 0,8391,541
actual 1,698,8370


'accuracy = '

0.9311666666666667

'Roc = '

0.9312286419906243