In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import RandomOverSampler

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/NLP/Handin/Data/df_tokenized.csv')

In [4]:
df.shape

(239031, 2)

In [5]:
df.head()

Unnamed: 0,tokens,sentiment
0,"['everyth', 'perfect', 'nice', 'clean', 'every...",positive
1,"['appart', 'beautiful', 'veri', 'friendli', 'h...",positive
2,"['spent', 'excel', 'night', 'thi', 'apart', 'g...",positive
3,"['great', 'host', 'super', 'respons', 'make', ...",positive
4,"['properti', 'great', 'locat', 'base', 'look',...",positive


In [6]:
df['sentiment'].value_counts()

sentiment
positive    237589
negative      1442
Name: count, dtype: int64

In [7]:
y = np.where(df['sentiment']=='positive',1,0)

##### Сплит 60:20:20

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['tokens'], y, test_size=0.2, stratify=y, random_state=42)

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [10]:
print(y_train.sum()/y_train.shape[0])
print(y_val.sum()/y_val.shape[0])
print(y_test.sum()/y_test.shape[0])

0.9940174873446848
0.993808308580513
0.9939757776057899


Видим сильный дисбаланс классов

In [11]:
X_train = pd.concat([X_train,X_val])
y_train = np.append(y_train,y_val)

In [12]:
vectorizer = CountVectorizer()

In [13]:
vectorizer.fit(X_train)

In [14]:
X_train = vectorizer.transform(X_train)

In [15]:
X_test = vectorizer.transform(X_test)

In [16]:
vectorizer.get_feature_names_out()[1000:1010]

array(['accommod_request_check', 'accommod_request_earli',
       'accommod_respect', 'accommod_respond', 'accommod_respons',
       'accommod_respons_host', 'accommod_respons_would',
       'accommod_return', 'accommod_right', 'accommod_room'], dtype=object)

In [17]:
vectorizer.get_feature_names_out().shape

(117459,)

Используем оверсэмплинг для обучающей выборки

In [18]:
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [19]:
print(y_train.sum()/y_train.shape[0])

0.5


### Логистическая регрессия

In [20]:
logistic_classifier = LogisticRegression(C=1, max_iter=1000, random_state=42)

In [21]:
logistic_classifier.fit(X_train, y_train)

In [22]:
y_pred = logistic_classifier.predict(X_test)

In [23]:
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred, average='macro'))
print("Recall: ", recall_score(y_test, y_pred, average='macro'))
print("F1: ", f1_score(y_test, y_pred, average='macro'))

Accuracy:  0.9943732089442968
Precision:  0.7611711023326894
Recall:  0.8573968438940214
F1:  0.8016578520396558


In [24]:
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.72      0.61       288
           1       1.00      1.00      1.00     47519

    accuracy                           0.99     47807
   macro avg       0.76      0.86      0.80     47807
weighted avg       1.00      0.99      0.99     47807



In [25]:
confusion_matrix(y_test, y_pred)

array([[  207,    81],
       [  188, 47331]])

##### Подбираем оптимальные гиперпараметры по сетке

In [26]:
logistic_classifier = LogisticRegression(random_state=42)

In [27]:
param_grid = {
    'C': [0.005,0.01,0.025,0.1,1],
    'max_iter': [1000]
}
grid_search = GridSearchCV(logistic_classifier, param_grid, cv=5, scoring='f1_macro', refit=True, return_train_score=True, verbose=3)
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END C=0.005, max_iter=1000;, score=(train=0.988, test=0.988) total time=  11.3s
[CV 2/5] END C=0.005, max_iter=1000;, score=(train=0.988, test=0.987) total time=   8.9s
[CV 3/5] END C=0.005, max_iter=1000;, score=(train=0.988, test=0.988) total time=  11.0s
[CV 4/5] END C=0.005, max_iter=1000;, score=(train=0.988, test=0.988) total time=  10.2s
[CV 5/5] END C=0.005, max_iter=1000;, score=(train=0.988, test=0.988) total time=   8.4s
[CV 1/5] END C=0.01, max_iter=1000;, score=(train=0.990, test=0.990) total time=  12.7s
[CV 2/5] END C=0.01, max_iter=1000;, score=(train=0.990, test=0.989) total time=  12.5s
[CV 3/5] END C=0.01, max_iter=1000;, score=(train=0.990, test=0.990) total time=  12.3s
[CV 4/5] END C=0.01, max_iter=1000;, score=(train=0.990, test=0.990) total time=   9.9s
[CV 5/5] END C=0.01, max_iter=1000;, score=(train=0.990, test=0.990) total time=  11.6s
[CV 1/5] END C=0.025, max_iter=1000;, score=(train=0.99

In [28]:
pd.concat([pd.DataFrame(grid_search.cv_results_["params"]),pd.DataFrame(grid_search.cv_results_["mean_train_score"], columns=["Training Mean F1 Score"]),pd.DataFrame(grid_search.cv_results_["mean_test_score"], columns=["Validation Mean F1 Score"])], axis=1)

Unnamed: 0,C,max_iter,Training Mean F1 Score,Validation Mean F1 Score
0,0.005,1000,0.988227,0.987681
1,0.01,1000,0.990221,0.98968
2,0.025,1000,0.992746,0.992013
3,0.1,1000,0.996099,0.994883
4,1.0,1000,0.998178,0.996409


In [29]:
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best F1 Score: ", grid_search.best_score_)

Best Hyperparameters:  {'C': 1, 'max_iter': 1000}
Best F1 Score:  0.9964092142992469


In [30]:
y_pred = grid_search.predict(X_test)

In [31]:
report = classification_report(y_test, y_pred , digits=4)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0     0.5241    0.7188    0.6061       288
           1     0.9983    0.9960    0.9972     47519

    accuracy                         0.9944     47807
   macro avg     0.7612    0.8574    0.8017     47807
weighted avg     0.9954    0.9944    0.9948     47807



In [32]:
confusion_matrix(y_test, y_pred)

array([[  207,    81],
       [  188, 47331]])