In [1]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import nltk
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
stop = stopwords.words('russian')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from collections import Counter
from imblearn.datasets import fetch_datasets
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [3]:
def print_results(headline, true_value, pred):
    print(headline)
    print("accuracy: {}".format(accuracy_score(true_value, pred)))
    print("precision: {}".format(precision_score(true_value, pred)))
    print("recall: {}".format(recall_score(true_value, pred)))
    print("f1: {}".format(f1_score(true_value, pred)))

In [4]:
df =  pd.read_csv(r"C:\Users\coolz\assignments\data.csv", names = ['Review', 'Rating'], sep=',', dtype = {"Review": str, 'Rating': 'int32'})
df = df.dropna(axis=0)
np.random.seed(42)
df = df.drop_duplicates(subset=['Review', 'Rating'], keep=False)
df = df.reset_index(drop=True)

In [5]:
df['Rating']=df['Rating'].astype(int)
df=df[df['Rating']!=3]
df['label']=np.where(df['Rating']>=3,1,0)

In [6]:
df['Rating'].value_counts()

5    107885
4     50785
2     16121
1     15511
Name: Rating, dtype: int64

In [195]:
df = df.sample(frac=1).reset_index(drop=True) #shuffle
data = df[df['label']==0][:30000]
data = data.append(df[df['label']==1][:150000])
data = data.reset_index(drop=True)
display(data['label'].value_counts())
data

1    150000
0     30000
Name: label, dtype: int64

Unnamed: 0,Review,Rating,label
0,1500 грн в урну!,1,0
1,"Так получилось, что телефон купила мне компани...",1,0
2,Если терпеливо относиться к периодическим зави...,2,0
3,"Купил, т. к. нужны были обязательно 2 симки. Р...",1,0
4,телефон был куплен в январе так за эти полгода...,1,0
...,...,...,...
179995,"Хороший телефон, нареканий не вызывает, корпус...",5,1
179996,звездолет ))),5,1
179997,Очень доволен покупкой . Долго выбирал что куп...,5,1
179998,"я доплачу 250 рублей, пойду в днс и куплю bq m...",5,1


In [196]:
data['preprocess'] = data['Review'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
#data['preprocess'] = data['preprocess'].apply(lambda x: BeautifulSoup(x).get_text())
data['preprocess'] = data['preprocess'].apply(lambda x: re.sub(r"http\S+", "", x))
data["preprocess"] = data['preprocess'].str.replace('[^\w\s]','')
#data['preprocess'] = data['preprocess'].apply(lambda x: "".join([re.sub('[^A-Za-z]+','', x) for x in nltk.word_tokenize(x)]))
data['preprocess'] = data['preprocess'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))
data

  data["preprocess"] = data['preprocess'].str.replace('[^\w\s]','')


Unnamed: 0,Review,Rating,label,preprocess
0,1500 грн в урну!,1,0,1500 грн урну
1,"Так получилось, что телефон купила мне компани...",1,0,получилось телефон купила компания рабочий без...
2,Если терпеливо относиться к периодическим зави...,2,0,терпеливо относиться периодическим зависаниям ...
3,"Купил, т. к. нужны были обязательно 2 симки. Р...",1,0,купил т нужны обязательно 2 симки 15 хотел раз...
4,телефон был куплен в январе так за эти полгода...,1,0,телефон куплен январе полгода вылетело 2 кнопк...
...,...,...,...,...
179995,"Хороший телефон, нареканий не вызывает, корпус...",5,1,хороший телефон нареканий вызывает корпус крыш...
179996,звездолет ))),5,1,звездолет
179997,Очень доволен покупкой . Долго выбирал что куп...,5,1,очень доволен покупкой долго выбирал купить хо...
179998,"я доплачу 250 рублей, пойду в днс и куплю bq m...",5,1,доплачу 250 рублей пойду днс куплю bq magic


In [197]:
X_train,X_test,Y_train, Y_test = train_test_split(data['preprocess'], data['label'], test_size = 0.2, random_state = 142)
print("Train:",X_train.shape,Y_train.shape,"Test: ",(X_test.shape,Y_test.shape))

Train: (144000,) (144000,) Test:  ((36000,), (36000,))


In [198]:
vectorizer = TfidfVectorizer(dtype = np.float32)
vectorizer.fit(X_train.values.ravel())
tf_x_train = vectorizer.transform(X_train.values.ravel())
tf_x_test = vectorizer.transform(X_test.values.ravel())
#tf_x_train = tf_x_train.toarray()
#tf_x_test = tf_x_test.toarray()

In [199]:
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Review      180000 non-null  object
 1   Rating      180000 non-null  int32 
 2   label       180000 non-null  int32 
 3   preprocess  180000 non-null  object
dtypes: int32(2), object(2)
memory usage: 485.0 MB


In [200]:
classifier = LinearSVC(random_state=9742)
#classifier = RandomForestClassifier

In [201]:
# build normal model with vectorizer
vectorized_normal_pipeline = make_pipeline(LinearSVC(random_state=4792))
vectorized_normal_model = vectorized_normal_pipeline.fit(tf_x_train, Y_train)
vectorized_normal_prediction = vectorized_normal_model.predict(tf_x_test)

In [202]:
# build model with SMOTE imblearn
smote_pipeline = make_pipeline_imb(SMOTE(random_state=42, k_neighbors = 5, n_jobs = 8), LinearSVC(random_state=1242))
smote_model = smote_pipeline.fit(tf_x_train, Y_train)
smote_prediction = smote_model.predict(tf_x_test)

In [203]:
# print information about both models
print("normal data distribution: {}".format(Counter(data['label'])))
X_smote, y_smote = SMOTE().fit_resample(tf_x_train, Y_train) #ругается на то что тут буквавки а не циферки, надо заранее векторизировать
print("SMOTE data distribution: {}".format(Counter(y_smote)))
#X_nearmiss, y_nearmiss = NearMiss().fit_sample(data['data'], data['target'])
#print("NearMiss data distribution: {}".format(Counter(y_nearmiss)))

normal data distribution: Counter({1: 150000, 0: 30000})
SMOTE data distribution: Counter({1: 119962, 0: 119962})


In [204]:
#from sklearn.linear_model import LogisticRegression

In [205]:
# build model with undersampling
#nr = NearMiss()
#X_train_miss, y_train_miss = nr.fit_resample(tf_x_train,Y_train.ravel())
#lr2 = LogisticRegression()
#lr2.fit(X_train_miss, y_train_miss.ravel())
#nearmiss_prediction = lr2.predict(tf_x_test)

In [206]:
# classification report
print(classification_report(Y_test, vectorized_normal_prediction))
print(classification_report_imbalanced(Y_test, smote_prediction))

              precision    recall  f1-score   support

           0       0.86      0.74      0.79      5962
           1       0.95      0.98      0.96     30038

    accuracy                           0.94     36000
   macro avg       0.90      0.86      0.88     36000
weighted avg       0.93      0.94      0.93     36000

                   pre       rec       spe        f1       geo       iba       sup

          0       0.68      0.84      0.92      0.75      0.88      0.77      5962
          1       0.97      0.92      0.84      0.94      0.88      0.78     30038

avg / total       0.92      0.91      0.85      0.91      0.88      0.78     36000



In [207]:
print('vectorized_normal Pipline Score {}'.format(vectorized_normal_pipeline.score(tf_x_test, Y_test)))
print('SMOTE Pipeline Score {}'.format(smote_pipeline.score(tf_x_test, Y_test)))
#print('NearMiss Pipeline Score {}'.format(nearmiss_pipeline.score(tf_x_test, Y_test)))

vectorized_normal Pipline Score 0.9362777777777778
SMOTE Pipeline Score 0.909


In [208]:
print_results("vectorized normal classification", Y_test, vectorized_normal_prediction)
print_results("SMOTE classification", Y_test, smote_prediction)
#print_results("NearMiss classification", Y_test, nearmiss_prediction)

vectorized normal classification
accuracy: 0.9362777777777778
precision: 0.9497762790999287
recall: 0.9751980824289234
f1: 0.9623193166885677
SMOTE classification
accuracy: 0.909
precision: 0.9660420730008359
recall: 0.9233970304281244
f1: 0.9442382978723404


In [209]:
roc_auc_score(Y_test, smote_prediction)

0.8799306520808854

In [210]:
roc_auc_score(Y_test, vectorized_normal_prediction)

0.857692969426471