In [1]:
import pandas as pd
import pymorphy2

from tqdm import tqdm
tqdm.pandas()

from nltk.tokenize import word_tokenize 
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
# import eli5

In [2]:
df = pd.read_csv('rureviews/women-clothing-accessories.3-class.balanced.csv', encoding="utf8", sep='\t')

In [3]:
df['sentiment'].value_counts()

positive    30000
neautral    30000
negative    30000
Name: sentiment, dtype: int64

In [4]:
df = df[df['sentiment'] != 'neautral']

In [5]:
df.iloc[0]['review']

'качество плохое пошив ужасный (горловина наперекос) Фото не соответствует Ткань ужасная рисунок блеклый маленький рукав не такой УЖАС!!!!! не стоит за такие деньги г.......'

In [6]:
# убрали знаки препинания
df['review_processed'] = df['review'].apply(lambda x: re.sub(r'[^\w\s]', '', x)).values

In [7]:
df['review_processed'] = df['review_processed'].apply(lambda x: x.lower())

In [8]:
df['review_processed'] = df['review_processed'].progress_apply(lambda x: word_tokenize(x))

100%|██████████| 60000/60000 [00:06<00:00, 9879.70it/s] 


In [9]:
df['review_processed'].iloc[0]

['качество',
 'плохое',
 'пошив',
 'ужасный',
 'горловина',
 'наперекос',
 'фото',
 'не',
 'соответствует',
 'ткань',
 'ужасная',
 'рисунок',
 'блеклый',
 'маленький',
 'рукав',
 'не',
 'такой',
 'ужас',
 'не',
 'стоит',
 'за',
 'такие',
 'деньги',
 'г']

In [10]:
morph = pymorphy2.MorphAnalyzer()

In [11]:
morph.parse("сделали")[0].normal_form

'сделать'

In [12]:
df['review_lemmatized'] = df['review_processed'].progress_apply(lambda x: [morph.parse(w)[0].normal_form for w in x])

100%|██████████| 60000/60000 [02:18<00:00, 433.28it/s]


In [13]:
vectorizer = TfidfVectorizer(ngram_range=(1, 1)) 

In [14]:
X = vectorizer.fit_transform(df['review_lemmatized'].apply(lambda x: ' '.join(x)))

In [15]:
X.shape

(60000, 51549)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X, df['sentiment'], test_size=0.3, random_state=42
)

1. train_test_split <br> 2. LogisticRegression <br> 3. fit & predict <br> 4. roc_auc_score

In [17]:
logit = LogisticRegression()

In [18]:
logit.fit(X_train, y_train)

LogisticRegression()

In [19]:
y_pred = logit.predict_proba(X_test)[:, 1]

In [20]:
roc_auc_score(y_test, y_pred)

0.9735222897024753