In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/NLP/Handin/Data/df_tokenized.csv')

In [4]:
df.shape

(239031, 2)

In [5]:
df.head()

Unnamed: 0,tokens,sentiment
0,"['everyth', 'perfect', 'nice', 'clean', 'every...",positive
1,"['appart', 'beautiful', 'veri', 'friendli', 'h...",positive
2,"['spent', 'excel', 'night', 'thi', 'apart', 'g...",positive
3,"['great', 'host', 'super', 'respons', 'make', ...",positive
4,"['properti', 'great', 'locat', 'base', 'look',...",positive


In [6]:
df['sentiment'].value_counts()

sentiment
positive    237589
negative      1442
Name: count, dtype: int64

In [7]:
y = np.where(df['sentiment']=='positive',1,0)

##### Сплит 60:20:20

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['tokens'], y, test_size=0.2, stratify=y, random_state=42)

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [10]:
print(y_train.sum()/y_train.shape[0])
print(y_val.sum()/y_val.shape[0])
print(y_test.sum()/y_test.shape[0])

0.9940174873446848
0.993808308580513
0.9939757776057899


Видим сильный бисбаланс классов

In [11]:
X_train = pd.concat([X_train,X_val])
y_train = np.append(y_train,y_val)

### Создать пространство функций (DTM)

Компьютер не может иметь дело с необработанными текстовыми данными, тексты должны быть преобразованы в пространство функций.

In [12]:
vectorizer = CountVectorizer()

In [13]:
vectorizer.fit(X_train)

In [14]:
X_train = vectorizer.transform(X_train)

In [15]:
X_test = vectorizer.transform(X_test)

In [16]:
vectorizer.get_feature_names_out()[1000:1010]

array(['accommod_request_check', 'accommod_request_earli',
       'accommod_respect', 'accommod_respond', 'accommod_respons',
       'accommod_respons_host', 'accommod_respons_would',
       'accommod_return', 'accommod_right', 'accommod_room'], dtype=object)

In [17]:
vectorizer.get_feature_names_out().shape

(117459,)

In [18]:
difference = y_train.shape[0]-y_train.sum()

In [19]:
y_train.shape

(191224,)

Используем оверсэмплинг для обучающей выборки

In [20]:
oversampler = SMOTE(sampling_strategy='minority', random_state=42)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [21]:
print(y_train.sum()/y_train.shape[0])

0.5


Обучаем классификатор

In [22]:
nb_classifier = MultinomialNB(fit_prior=True)
nb_classifier.fit(X_train, y_train)

In [23]:
y_pred = nb_classifier.predict(X_test)

In [24]:
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred, average='macro'))
print("Recall: ", recall_score(y_test, y_pred, average='macro'))
print("F1: ", f1_score(y_test, y_pred, average='macro'))

Accuracy:  0.9939548601669211
Precision:  0.7486627222950272
Recall:  0.8727167027925672
F1:  0.7980611434854583


In [25]:
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.75      0.60       288
           1       1.00      1.00      1.00     47519

    accuracy                           0.99     47807
   macro avg       0.75      0.87      0.80     47807
weighted avg       1.00      0.99      0.99     47807



In [26]:
confusion_matrix(y_test, y_pred)

array([[  216,    72],
       [  217, 47302]])

In [27]:
for i, label in enumerate(nb_classifier.classes_):
    top = np.argsort(nb_classifier.feature_log_prob_[i])[-20:]
    print("Label %s: %s" % (label, ", ".join(vectorizer.get_feature_names_out()[top])))
    print()

Label 0: night, airbnb, check, clean, bed, day, get, would, us, locat, host, room, veri, place, no, apart, stay, but, thi, not

Label 1: but, help, comfort, well, everyth, us, walk, nice, thi, would, love, clean, recommend, host, apart, place, locat, great, stay, veri



##### Подбираем оптимальные гиперпараметры по сетке

In [28]:
nb_classifier = MultinomialNB()

In [29]:
param_grid = {
    'alpha': [1,5,10,15],
    'fit_prior': [True]
}
grid_search = GridSearchCV(nb_classifier, param_grid, cv=5, scoring='f1_macro', refit=True, return_train_score=True, verbose=3)
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END alpha=1, fit_prior=True;, score=(train=0.981, test=0.975) total time=   0.5s
[CV 2/5] END alpha=1, fit_prior=True;, score=(train=0.974, test=0.974) total time=   0.5s
[CV 3/5] END alpha=1, fit_prior=True;, score=(train=0.974, test=0.975) total time=   0.5s
[CV 4/5] END alpha=1, fit_prior=True;, score=(train=0.975, test=0.974) total time=   0.4s
[CV 5/5] END alpha=1, fit_prior=True;, score=(train=0.974, test=0.974) total time=   0.4s
[CV 1/5] END alpha=5, fit_prior=True;, score=(train=0.971, test=0.968) total time=   0.4s
[CV 2/5] END alpha=5, fit_prior=True;, score=(train=0.968, test=0.968) total time=   0.5s
[CV 3/5] END alpha=5, fit_prior=True;, score=(train=0.968, test=0.969) total time=   0.4s
[CV 4/5] END alpha=5, fit_prior=True;, score=(train=0.968, test=0.968) total time=   0.6s
[CV 5/5] END alpha=5, fit_prior=True;, score=(train=0.968, test=0.968) total time=   0.7s
[CV 1/5] END alpha=10, fit_prior=True;, 

In [30]:
pd.concat([pd.DataFrame(grid_search.cv_results_["params"]),pd.DataFrame(grid_search.cv_results_["mean_train_score"], columns=["Training Mean F1 Score"]),pd.DataFrame(grid_search.cv_results_["mean_test_score"], columns=["Validation Mean F1 Score"])], axis=1)

Unnamed: 0,alpha,fit_prior,Training Mean F1 Score,Validation Mean F1 Score
0,1,True,0.975623,0.974673
1,5,True,0.96858,0.968168
2,10,True,0.956161,0.955738
3,15,True,0.946704,0.946334


In [31]:
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best F1 Score: ", grid_search.best_score_)

Best Hyperparameters:  {'alpha': 1, 'fit_prior': True}
Best F1 Score:  0.9746727972091727


In [32]:
y_pred = grid_search.predict(X_test)

In [33]:
report = classification_report(y_test, y_pred, digits=4)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0     0.4988    0.7500    0.5992       288
           1     0.9985    0.9954    0.9970     47519

    accuracy                         0.9940     47807
   macro avg     0.7487    0.8727    0.7981     47807
weighted avg     0.9955    0.9940    0.9946     47807



In [34]:
confusion_matrix(y_test, y_pred)

array([[  216,    72],
       [  217, 47302]])