In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
stop = stopwords.words('russian')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from collections import Counter
from imblearn.datasets import fetch_datasets
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
import ipywidgets as widgets

In [2]:
def print_results(headline, true_value, pred):
    print(headline)
    print("accuracy: {}".format(accuracy_score(true_value, pred)))
    print("precision: {}".format(precision_score(true_value, pred)))
    print("recall: {}".format(recall_score(true_value, pred)))
    print("f1: {}".format(f1_score(true_value, pred)))

In [3]:
df =  pd.read_csv(r"C:\Users\coolz\assignments\data.csv", names = ['Review', 'Rating'], sep=',', dtype = {"Review": str, 'Rating': 'int32'})
df = df.dropna(axis=0)
np.random.seed(42)
df = df.drop_duplicates(subset=['Review', 'Rating'], keep=False)
df = df.reset_index(drop=True)

In [4]:
df['Rating']=df['Rating'].astype(int)
df=df[df['Rating']!=3]
df['label']=np.where(df['Rating']>=3,1,0)

In [5]:
df['Rating'].value_counts()

5    107885
4     50785
2     16121
1     15511
Name: Rating, dtype: int64

In [6]:
#take some amount of labels
df = df.sample(frac=1).reset_index(drop=True) #shuffle
data = df[df['label']==0][:30000]
data = data.append(df[df['label']==1][:150000])
data = data.reset_index(drop=True)
display(data['label'].value_counts())

1    150000
0     30000
Name: label, dtype: int64

In [7]:
data['preprocess'] = data['Review'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
data['preprocess'] = data['preprocess'].apply(lambda x: re.sub(r"http\S+", "", x))
data["preprocess"] = data['preprocess'].str.replace('[^\w\s]','')
data['preprocess'] = data['preprocess'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))

  data["preprocess"] = data['preprocess'].str.replace('[^\w\s]','')


In [8]:
#making train/test split
X_train,X_test,Y_train, Y_test = train_test_split(data['preprocess'], data['label'], test_size = 0.2, random_state = 142)
print("Train:",X_train.shape,Y_train.shape,"Test: ",(X_test.shape,Y_test.shape))

Train: (144000,) (144000,) Test:  ((36000,), (36000,))


In [9]:
#choosing vectorizer
vectorizer = TfidfVectorizer(dtype = np.float32)
vectorizer.fit(X_train.values.ravel())
tf_x_train = vectorizer.transform(X_train.values.ravel())
tf_x_test = vectorizer.transform(X_test.values.ravel())

In [10]:
#choosing classifier
svm = LinearSVC(random_state=9742)
сlassifier = CalibratedClassifierCV(svm) 

In [11]:
# build normal model with vectorizer
vectorized_normal_pipeline = make_pipeline(сlassifier)
vectorized_normal_model = vectorized_normal_pipeline.fit(tf_x_train, Y_train)
vectorized_normal_prediction = vectorized_normal_model.predict(tf_x_test)

In [12]:
# build model with SMOTE imblearn
smote_pipeline = make_pipeline_imb(SMOTE(random_state=42, k_neighbors = 5, n_jobs = 8), сlassifier)
smote_model = smote_pipeline.fit(tf_x_train, Y_train)
smote_prediction = smote_model.predict(tf_x_test)

In [13]:
# print information about both models
print("normal data distribution: {}".format(Counter(data['label'])))
X_smote, y_smote = SMOTE().fit_resample(tf_x_train, Y_train)
print("SMOTE data distribution: {}".format(Counter(y_smote)))

normal data distribution: Counter({1: 150000, 0: 30000})
SMOTE data distribution: Counter({1: 119962, 0: 119962})


In [14]:
# classification report
print(classification_report(Y_test, vectorized_normal_prediction))
print(classification_report_imbalanced(Y_test, smote_prediction))

              precision    recall  f1-score   support

           0       0.85      0.75      0.80      5962
           1       0.95      0.97      0.96     30038

    accuracy                           0.94     36000
   macro avg       0.90      0.86      0.88     36000
weighted avg       0.93      0.94      0.94     36000

                   pre       rec       spe        f1       geo       iba       sup

          0       0.75      0.77      0.95      0.76      0.85      0.72      5962
          1       0.95      0.95      0.77      0.95      0.85      0.74     30038

avg / total       0.92      0.92      0.80      0.92      0.85      0.74     36000



In [15]:
print('vectorized_normal Pipline Score {}'.format(vectorized_normal_pipeline.score(tf_x_test, Y_test)))
print('SMOTE Pipeline Score {}'.format(smote_pipeline.score(tf_x_test, Y_test)))

vectorized_normal Pipline Score 0.9192222222222223
SMOTE Pipeline Score 0.9192222222222223


In [16]:
print_results("vectorized normal classification", Y_test, vectorized_normal_prediction)
print_results("SMOTE classification", Y_test, smote_prediction)

vectorized normal classification
accuracy: 0.9369166666666666
precision: 0.9519221379512386
recall: 0.9735668153672016
f1: 0.9626228213104231
SMOTE classification
accuracy: 0.9192222222222223
precision: 0.9538914541925986
recall: 0.9490645182768493
f1: 0.9514718643615246


In [17]:
roc_auc_score(Y_test, smote_prediction)

0.8589670125768681

In [18]:
roc_auc_score(Y_test, vectorized_normal_prediction)

0.862915578096214

In [35]:
user = widgets.Text(
    value=' ',
    placeholder='Введите отзыв',
    description='Отзыв:',
    disabled=False
)
display (user)
custom_text = user.value

Text(value=' ', description='Отзыв:', placeholder='Введите отзыв')

In [38]:
custom_text = ["хороший телефон но греется"]
custom_text = vectorizer.transform(custom_text)

In [39]:
vectorized_normal_model.predict_proba(custom_text)

array([[0.00192984, 0.99807016]])