<div style="background-color:white; text-align:center; font-family:Arial, Helvetica, sans-serif; padding:50px;">
  <!-- Title -->
  <div style="color:#993520; font-size:60px; font-weight:bold; margin-bottom:20px;">
    SENTIMENT ANALYSIS
  </div>

  <!-- Subtitle -->
  <div style="color:#993520; font-size:35px; margin-bottom:40px;">
    Supervised sentiment analysis: regression, binary &amp; multiclass classification
  </div>

  <!-- Author -->
  <div style="color:black; font-size:30px; margin-bottom:10px;">
    Maciej Świtała, PhD
  </div>

  <!-- Date / semester -->
  <div style="color:black; font-size:30px; margin-bottom:50px;">
    Autumn 2025
  </div>

  <!-- Logo -->
  <div>
    <img src="img/wne-logo-new-en.jpg" alt="WNE Logo" style="max-width:400px; height:auto;">
  </div>
</div>


### 1. Intro

In [1]:
# Optional one-off environment setup: uncomment the line below to install the
# packages this notebook imports (%pip targets the running kernel's environment).
# %pip install pandas numpy matplotlib nltk scikit-learn xgboost

In [2]:
import pandas as pd  # for working with data in DataFrames
import numpy as np  # numerical operations and arrays

import matplotlib.pyplot as plt  # data visualization

import pickle  # data loading
import math  # mathematical functions
import ast

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

from collections import Counter  # counting occurrences of elements
from itertools import islice

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer # VADER algorithm

from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import make_scorer, roc_auc_score, mean_squared_error

In [3]:
# Load the pre-built train/test splits for the three tasks. Each pickle is read
# as a mapping with the keys 'X_train', 'X_test', 'y_train' and 'y_test'.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# open pickle files that come from a trusted source.

# --- regression ---
with open("data/regression_model_data.pkl", "rb") as f:
    regression_model_data = pickle.load(f)

(
    X_train_regression,
    X_test_regression,
    y_train_regression,
    y_test_regression,
) = (regression_model_data[key] for key in ('X_train', 'X_test', 'y_train', 'y_test'))

# --- binary classification ---
with open("data/binary_classification_model_data.pkl", "rb") as f:
    binary_classification_model_data = pickle.load(f)

(
    X_train_binary_classification,
    X_test_binary_classification,
    y_train_binary_classification,
    y_test_binary_classification,
) = (binary_classification_model_data[key] for key in ('X_train', 'X_test', 'y_train', 'y_test'))

# --- multinomial classification ---
with open("data/multinomial_classification_model_data.pkl", "rb") as f:
    multinomial_classification_model_data = pickle.load(f)

(
    X_train_multinomial_classification,
    X_test_multinomial_classification,
    y_train_multinomial_classification,
    y_test_multinomial_classification,
) = (multinomial_classification_model_data[key] for key in ('X_train', 'X_test', 'y_train', 'y_test'))

### 2. Regression models

In [4]:
# Rows of [model name, CV RMSE, test RMSE]; one row is added per model below.
# The first row is the LASSO baseline (see: previous materials).
regression_models_comparison = [['LASSO', 0.4294, 0.5275]]

In [5]:
# Baseline: ordinary least squares. No hyperparameters are tuned here; the grid
# is empty (a single candidate), so GridSearchCV is used only to obtain a
# cross-validated RMSE through the same machinery as the other models.

# 5-fold CV with a seeded shuffle
cv = KFold(n_splits=5, shuffle=True, random_state=42)

reg = LinearRegression()
param_grid = {}

grid = GridSearchCV(
    reg,
    param_grid,
    scoring='neg_root_mean_squared_error',  # negated RMSE, since GridSearchCV maximises the score
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_regression, y_train_regression)

# flip the sign back to report a positive RMSE
cv_rmse = np.round(-grid.best_score_, 4)
print("Best CV RMSE:", cv_rmse)

# hold-out evaluation using the best estimator (refitted by GridSearchCV)
y_test_pred = grid.predict(X_test_regression)
test_rmse = np.sqrt(mean_squared_error(y_test_regression, y_test_pred))
print("Test RMSE:", np.round(test_rmse, 4))

regression_models_comparison.append(['linear regression', cv_rmse, np.round(test_rmse, 4)])

Best CV RMSE: 0.4474
Test RMSE: 0.5624


In [6]:
# k-nearest-neighbours regression. Scaling lives inside the pipeline so it is
# re-fitted together with the model on every CV training split.
pipeline = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("reg", KNeighborsRegressor()),
])

# 7 x 2 x 2 = 28 candidate settings (the "reg__" prefix routes them to the KNN step)
param_grid = dict(
    reg__n_neighbors=[3, 5, 7, 9, 11, 15, 20],
    reg__weights=["uniform", "distance"],
    reg__p=[1, 2],  # 1 = Manhattan, 2 = Euclidean
)

# 5-fold CV with a seeded shuffle
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',  # negated RMSE, since GridSearchCV maximises the score
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_regression, y_train_regression)

# report the winning setting and its (sign-corrected) CV error
cv_rmse = np.round(-grid.best_score_, 4)
print("Best params:", grid.best_params_)
print("Best CV RMSE:", cv_rmse)

# hold-out evaluation using the best pipeline (refitted by GridSearchCV)
y_test_pred = grid.predict(X_test_regression)
test_rmse = np.sqrt(mean_squared_error(y_test_regression, y_test_pred))
print("Test RMSE:", np.round(test_rmse, 4))

regression_models_comparison.append(['KNN', cv_rmse, np.round(test_rmse, 4)])

Best params: {'reg__n_neighbors': 7, 'reg__p': 2, 'reg__weights': 'distance'}
Best CV RMSE: 0.4489
Test RMSE: 0.5435


In [7]:
# Support vector regression on standardised features (scaling is part of the
# pipeline, so it is re-fitted on every CV training split).
pipeline = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("reg", SVR()),
])

# 3 x 3 x 3 x 2 = 54 candidate settings
param_grid = dict(
    reg__kernel=["linear", "rbf", "poly"],
    reg__C=[0.1, 1, 10],
    reg__epsilon=[0.01, 0.1, 1.0],
    reg__gamma=["scale", "auto"],
)

# 5-fold CV with a seeded shuffle
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',  # negated RMSE, since GridSearchCV maximises the score
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_regression, y_train_regression)

# report the winning setting and its (sign-corrected) CV error
cv_rmse = np.round(-grid.best_score_, 4)
print("Best params:", grid.best_params_)
print("Best CV RMSE:", cv_rmse)

# hold-out evaluation using the best pipeline (refitted by GridSearchCV)
y_test_pred = grid.predict(X_test_regression)
test_rmse = np.sqrt(mean_squared_error(y_test_regression, y_test_pred))
print("Test RMSE:", np.round(test_rmse, 4))

regression_models_comparison.append(['SVR', cv_rmse, np.round(test_rmse, 4)])

Best params: {'reg__C': 1, 'reg__epsilon': 0.1, 'reg__gamma': 'scale', 'reg__kernel': 'rbf'}
Best CV RMSE: 0.4345
Test RMSE: 0.507


In [8]:
# Single decision tree regressor (seeded so tie-breaking between splits is repeatable).
reg = DecisionTreeRegressor(random_state=42)

# 4 x 4 x 4 = 64 candidate settings controlling tree size
param_grid = dict(
    max_depth=[3, 5, 10, 20],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
)

# 5-fold CV with a seeded shuffle
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    reg,
    param_grid,
    scoring='neg_root_mean_squared_error',  # negated RMSE, since GridSearchCV maximises the score
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_regression, y_train_regression)

# report the winning setting and its (sign-corrected) CV error
cv_rmse = np.round(-grid.best_score_, 4)
print("Best params:", grid.best_params_)
print("Best CV RMSE:", cv_rmse)

# hold-out evaluation using the best tree (refitted by GridSearchCV)
y_test_pred = grid.predict(X_test_regression)
test_rmse = np.sqrt(mean_squared_error(y_test_regression, y_test_pred))
print("Test RMSE:", np.round(test_rmse, 4))

regression_models_comparison.append(['decision tree', cv_rmse, np.round(test_rmse, 4)])

Best params: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 5}
Best CV RMSE: 0.4776
Test RMSE: 0.5492


In [9]:
# Random forest regressor with a fixed seed.
# NOTE(review): both the forest and the grid search below request n_jobs=-1;
# confirm this nested parallelism does not oversubscribe the machine.
reg = RandomForestRegressor(random_state=42, n_jobs=-1)

# 3 x 4 x 3 x 3 = 108 candidate settings, i.e. 540 fits over the 5 folds
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
)

# 5-fold CV with a seeded shuffle
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    reg,
    param_grid,
    scoring='neg_root_mean_squared_error',  # negated RMSE, since GridSearchCV maximises the score
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_regression, y_train_regression)

# report the winning setting and its (sign-corrected) CV error
cv_rmse = np.round(-grid.best_score_, 4)
print("Best params:", grid.best_params_)
print("Best CV RMSE:", cv_rmse)

# hold-out evaluation using the best forest (refitted by GridSearchCV)
y_test_pred = grid.predict(X_test_regression)
test_rmse = np.sqrt(mean_squared_error(y_test_regression, y_test_pred))
print("Test RMSE:", np.round(test_rmse, 4))

regression_models_comparison.append(['random forest', cv_rmse, np.round(test_rmse, 4)])

Best params: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Best CV RMSE: 0.4394
Test RMSE: 0.5375


In [10]:
# Gradient-boosted trees (XGBoost) with squared-error objective and a fixed seed.
# NOTE(review): both the booster and the grid search below request n_jobs=-1;
# confirm this nested parallelism does not oversubscribe the machine.
reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1)

# 3 x 4 x 3 x 3 x 3 x 3 = 972 candidate settings, i.e. 4860 fits over the 5 folds,
# so this is by far the slowest regression cell
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[3, 5, 10, 20],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2],
)

# 5-fold CV with a seeded shuffle
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    reg,
    param_grid,
    scoring='neg_root_mean_squared_error',  # negated RMSE, since GridSearchCV maximises the score
    cv=cv,
    n_jobs=-1,
)

# fit the model on the training set
grid.fit(X_train_regression, y_train_regression)

# best hyperparameters and (sign-corrected) CV score
cv_rmse = np.round(-grid.best_score_, 4)
print("Najlepsze parametry:", grid.best_params_)
print("Najlepszy RMSE CV:", cv_rmse)

# hold-out evaluation using the best booster (refitted by GridSearchCV)
y_test_pred = grid.predict(X_test_regression)
test_rmse = np.sqrt(mean_squared_error(y_test_regression, y_test_pred))
print("RMSE testowe:", np.round(test_rmse, 4))

regression_models_comparison.append(['xgboost', cv_rmse, np.round(test_rmse, 4)])

Najlepsze parametry: {'colsample_bytree': 0.9, 'gamma': 0.2, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Najlepszy RMSE CV: 0.4305
RMSE testowe: 0.528


In [11]:
# Turn the collected [model, CV RMSE, test RMSE] rows into a table and display
# it ranked by cross-validated RMSE, smallest value first.
regression_models_comparison_df = pd.DataFrame(
    regression_models_comparison,
    columns=['model', 'rmse (cv)', 'rmse (test)'],
)
regression_models_comparison_df.sort_values('rmse (cv)')

Unnamed: 0,model,rmse (cv),rmse (test)
0,LASSO,0.4294,0.5275
6,xgboost,0.4305,0.528
3,SVR,0.4345,0.507
5,random forest,0.4394,0.5375
1,linear regression,0.4474,0.5624
2,KNN,0.4489,0.5435
4,decision tree,0.4776,0.5492


### 3. Binary classification

In [12]:
# Rows of [model name, CV ROC AUC, test ROC AUC]; one row is added per model below.
# The first row is the LASSO baseline (see: previous materials).
binary_classification_models_comparison = [['LASSO', 0.8167, 0.6117]]

In [13]:
# Baseline classifier: logistic regression with the liblinear solver.
# No hyperparameters are tuned; the grid is empty (a single candidate), so
# GridSearchCV is used only to obtain a cross-validated ROC AUC through the
# same machinery as the other models.
log_reg = LogisticRegression(solver='liblinear', random_state=42)
param_grid = {}

# stratified 5-fold CV with a seeded shuffle
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    log_reg,
    param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_binary_classification, y_train_binary_classification)

cv_roc_auc = np.round(grid.best_score_, 4)
print("Best CV ROC AUC:", cv_roc_auc)

# hold-out evaluation: column 1 of predict_proba is the probability of the second class
y_test_pred_proba = grid.predict_proba(X_test_binary_classification)[:, 1]
test_roc_auc = roc_auc_score(y_test_binary_classification, y_test_pred_proba)
print("Test ROC AUC:", np.round(test_roc_auc, 4))

binary_classification_models_comparison.append(['logistic regression', cv_roc_auc, np.round(test_roc_auc, 4)])

Best CV ROC AUC: 0.8089
Test ROC AUC: 0.6618


In [14]:
# k-nearest-neighbours classifier. Scaling lives inside the pipeline so it is
# re-fitted together with the model on every CV training split.
pipeline = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("clf", KNeighborsClassifier()),
])

# 7 x 2 x 2 = 28 candidate settings (the "clf__" prefix routes them to the KNN step)
param_grid = dict(
    clf__n_neighbors=[3, 5, 7, 9, 11, 15, 20],
    clf__weights=["uniform", "distance"],
    clf__p=[1, 2],  # 1 = Manhattan, 2 = Euclidean
)

# stratified 5-fold CV with a seeded shuffle
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_binary_classification, y_train_binary_classification)

# report the winning setting and its CV score
cv_roc_auc = np.round(grid.best_score_, 4)
print("Best params:", grid.best_params_)
print("Best CV ROC AUC:", cv_roc_auc)

# hold-out evaluation: column 1 of predict_proba is the probability of the second class
y_test_pred_proba = grid.predict_proba(X_test_binary_classification)[:, 1]
test_roc_auc = roc_auc_score(y_test_binary_classification, y_test_pred_proba)
print("Test ROC AUC:", np.round(test_roc_auc, 4))

binary_classification_models_comparison.append(['KNN', cv_roc_auc, np.round(test_roc_auc, 4)])

Best params: {'clf__n_neighbors': 3, 'clf__p': 2, 'clf__weights': 'distance'}
Best CV ROC AUC: 0.793
Test ROC AUC: 0.5997


In [15]:
# pipeline: scale -> SVC
# probability=True makes SVC fit an extra, internally cross-validated
# calibration step whose data shuffling is random. It is seeded here (like
# every other stochastic model in this notebook) so that predict_proba, and
# therefore the test ROC AUC below, is reproducible between runs.
pipeline = Pipeline([
    ("scale", StandardScaler()),
    ("clf", SVC(probability=True, random_state=42))
])

# parameters to be optimised: 3 x 3 x 2 = 18 candidate settings
param_grid = {
    "clf__kernel": ["linear", "rbf", "poly"],
    "clf__C": [0.1, 1, 10],
    "clf__gamma": ["scale", "auto"]
}

# stratified k-fold cross-validation with a seeded shuffle
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1
)

# fit
grid.fit(X_train_binary_classification, y_train_binary_classification)

# results
print("Best params:", grid.best_params_)
print("Best CV ROC AUC:", np.round(grid.best_score_, 4))

# evaluate on the test set; column 1 of predict_proba is the probability of the second class
y_test_pred_proba = grid.best_estimator_.predict_proba(X_test_binary_classification)[:,1]
test_roc_auc = roc_auc_score(y_test_binary_classification, y_test_pred_proba)
print("Test ROC AUC:", np.round(test_roc_auc,4))

binary_classification_models_comparison.append(['SVC',np.round(grid.best_score_,4), np.round(test_roc_auc,4)])

Best params: {'clf__C': 0.1, 'clf__gamma': 'auto', 'clf__kernel': 'poly'}
Best CV ROC AUC: 0.8968
Test ROC AUC: 0.6558


In [16]:
# Single decision tree classifier (seeded so tie-breaking between splits is repeatable).
clf = DecisionTreeClassifier(random_state=42)

# 4 x 4 x 4 = 64 candidate settings controlling tree size
param_grid = dict(
    max_depth=[3, 5, 10, 20],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
)

# stratified 5-fold CV with a seeded shuffle
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    clf,
    param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_binary_classification, y_train_binary_classification)

# report the winning setting and its CV score
cv_roc_auc = np.round(grid.best_score_, 4)
print("Best params:", grid.best_params_)
print("Best CV ROC AUC:", cv_roc_auc)

# hold-out evaluation: column 1 of predict_proba is the probability of the second class
y_test_pred_proba = grid.predict_proba(X_test_binary_classification)[:, 1]
test_roc_auc = roc_auc_score(y_test_binary_classification, y_test_pred_proba)
print("Test ROC AUC:", np.round(test_roc_auc, 4))

binary_classification_models_comparison.append(['decision tree', cv_roc_auc, np.round(test_roc_auc, 4)])

Best params: {'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 2}
Best CV ROC AUC: 0.6452
Test ROC AUC: 0.5038


In [17]:
# Random forest classifier with a fixed seed.
# NOTE(review): both the forest and the grid search below request n_jobs=-1;
# confirm this nested parallelism does not oversubscribe the machine.
clf = RandomForestClassifier(random_state=42, n_jobs=-1)

# 3 x 4 x 3 x 3 = 108 candidate settings, i.e. 540 fits over the 5 folds
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
)

# stratified 5-fold CV with a seeded shuffle
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    clf,
    param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_binary_classification, y_train_binary_classification)

# report the winning setting and its CV score
cv_roc_auc = np.round(grid.best_score_, 4)
print("Best params:", grid.best_params_)
print("Best CV ROC AUC:", cv_roc_auc)

# hold-out evaluation: column 1 of predict_proba is the probability of the second class
y_test_pred_proba = grid.predict_proba(X_test_binary_classification)[:, 1]
test_roc_auc = roc_auc_score(y_test_binary_classification, y_test_pred_proba)
print("Test ROC AUC:", np.round(test_roc_auc, 4))

binary_classification_models_comparison.append(['random forest', cv_roc_auc, np.round(test_roc_auc, 4)])

Best params: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Best CV ROC AUC: 0.8374
Test ROC AUC: 0.6296


In [18]:
# Gradient-boosted trees (XGBoost) with logistic objective and a fixed seed.
# NOTE(review): both the booster and the grid search below request n_jobs=-1;
# confirm this nested parallelism does not oversubscribe the machine.
clf = xgb.XGBClassifier(objective='binary:logistic', random_state=42, n_jobs=-1)

# 3 x 4 x 3 x 3 x 3 x 3 = 972 candidate settings, i.e. 4860 fits over the 5 folds,
# so this is by far the slowest binary-classification cell
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[3, 5, 10, 20],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2],
)

# stratified 5-fold CV with a seeded shuffle
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    clf,
    param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_binary_classification, y_train_binary_classification)

# report the winning setting and its CV score
cv_roc_auc = np.round(grid.best_score_, 4)
print("Best params:", grid.best_params_)
print("Best CV ROC AUC:", cv_roc_auc)

# hold-out evaluation: column 1 of predict_proba is the probability of the second class
y_test_pred_proba = grid.predict_proba(X_test_binary_classification)[:, 1]
test_roc_auc = roc_auc_score(y_test_binary_classification, y_test_pred_proba)
print("Test ROC AUC:", np.round(test_roc_auc, 4))

binary_classification_models_comparison.append(['xgboost', cv_roc_auc, np.round(test_roc_auc, 4)])

Best params: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
Best CV ROC AUC: 0.6964
Test ROC AUC: 0.6416


In [19]:
# Turn the collected [model, CV ROC AUC, test ROC AUC] rows into a table and
# display it ranked by cross-validated ROC AUC, largest value first.
binary_classification_models_comparison_df = pd.DataFrame(
    binary_classification_models_comparison,
    columns=['model', 'ROC AUC (cv)', 'ROC AUC (test)'],
)
binary_classification_models_comparison_df.sort_values('ROC AUC (cv)', ascending=False)

Unnamed: 0,model,ROC AUC (cv),ROC AUC (test)
3,SVC,0.8968,0.6558
5,random forest,0.8374,0.6296
0,LASSO,0.8167,0.6117
1,logistic regression,0.8089,0.6618
2,KNN,0.793,0.5997
6,xgboost,0.6964,0.6416
4,decision tree,0.6452,0.5038


### 4. Multinomial classification

In [4]:
# Rows of [model name, CV ROC AUC OVR, test ROC AUC OVR]; one row is added per model below.
# The first row is the LASSO baseline (see: previous materials).
multinomial_classification_models_comparison = [['LASSO', 0.7041, 0.5469]]

In [5]:
# Baseline classifier: logistic regression with the liblinear solver.
# No hyperparameters are tuned; the grid is empty (a single candidate), so
# GridSearchCV is used only to obtain a cross-validated score through the
# same machinery as the other models.
# NOTE(review): liblinear handles more than two classes with a one-vs-rest
# scheme rather than a true multinomial loss, and recent scikit-learn releases
# appear to deprecate it for multiclass targets - confirm against the installed
# version before relying on this cell.
log_reg = LogisticRegression(solver='liblinear', random_state=42)
param_grid = {}

# stratified 5-fold CV with a seeded shuffle
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    log_reg,
    param_grid,
    scoring='roc_auc_ovr',  # one-vs-rest ROC AUC
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_multinomial_classification, y_train_multinomial_classification)

cv_roc_auc = np.round(grid.best_score_, 4)
print("Best CV ROC AUC OVR:", cv_roc_auc)

# hold-out evaluation on the full class-probability matrix (one column per class)
y_test_pred_proba = grid.predict_proba(X_test_multinomial_classification)
test_roc_auc = roc_auc_score(y_test_multinomial_classification, y_test_pred_proba, multi_class='ovr')
print("Test ROC AUC OVR:", np.round(test_roc_auc, 4))

multinomial_classification_models_comparison.append(['logistic regression', cv_roc_auc, np.round(test_roc_auc, 4)])

Best CV ROC AUC OVR: 0.6754
Test ROC AUC OVR: 0.5969




In [6]:
# k-nearest-neighbours classifier. Scaling lives inside the pipeline so it is
# re-fitted together with the model on every CV training split.
pipeline = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("clf", KNeighborsClassifier()),
])

# 7 x 2 x 2 = 28 candidate settings (the "clf__" prefix routes them to the KNN step)
param_grid = dict(
    clf__n_neighbors=[3, 5, 7, 9, 11, 15, 20],
    clf__weights=["uniform", "distance"],
    clf__p=[1, 2],  # 1 = Manhattan, 2 = Euclidean
)

# stratified 5-fold CV with a seeded shuffle
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc_ovr',  # one-vs-rest ROC AUC
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_multinomial_classification, y_train_multinomial_classification)

# report the winning setting and its CV score
cv_roc_auc = np.round(grid.best_score_, 4)
print("Best params:", grid.best_params_)
print("Best CV ROC AUC OVR:", cv_roc_auc)

# hold-out evaluation on the full class-probability matrix (one column per class)
y_test_pred_proba = grid.predict_proba(X_test_multinomial_classification)
test_roc_auc = roc_auc_score(y_test_multinomial_classification, y_test_pred_proba, multi_class='ovr')
print("Test ROC AUC OVR:", np.round(test_roc_auc, 4))

multinomial_classification_models_comparison.append(['KNN', cv_roc_auc, np.round(test_roc_auc, 4)])

Best params: {'clf__n_neighbors': 20, 'clf__p': 2, 'clf__weights': 'distance'}
Best CV ROC AUC OVR: 0.6527
Test ROC AUC OVR: 0.5461


In [7]:
# pipeline: scale -> SVC
# probability=True makes SVC fit an extra, internally cross-validated
# calibration step whose data shuffling is random. Both the 'roc_auc_ovr' CV
# score and the test score below are computed from predict_proba, so the
# classifier is seeded (like every other stochastic model in this notebook)
# to make the model selection and the reported numbers reproducible.
pipeline = Pipeline([
    ("scale", StandardScaler()),
    ("clf", SVC(probability=True, random_state=42))
])

# parameters to be optimised: 3 x 3 x 2 = 18 candidate settings
param_grid = {
    "clf__kernel": ["linear", "rbf", "poly"],
    "clf__C": [0.1, 1, 10],
    "clf__gamma": ["scale", "auto"]
}

# stratified k-fold cross-validation with a seeded shuffle
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search (one-vs-rest ROC AUC)
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc_ovr',
    n_jobs=-1
)

# fit the model on the training set
grid.fit(X_train_multinomial_classification, y_train_multinomial_classification)

# print best hyperparameters and CV score
print("Best params:", grid.best_params_)
print("Best CV ROC AUC OVR:", np.round(grid.best_score_,4))

# evaluate on the test set using the full class-probability matrix (one column per class)
y_test_pred_proba = grid.best_estimator_.predict_proba(X_test_multinomial_classification)
test_roc_auc = roc_auc_score(y_test_multinomial_classification, y_test_pred_proba, multi_class='ovr')
print("Test ROC AUC OVR:", np.round(test_roc_auc,4))

multinomial_classification_models_comparison.append(['SVC', np.round(grid.best_score_,4), np.round(test_roc_auc,4)])

Best params: {'clf__C': 0.1, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}
Best CV ROC AUC OVR: 0.7111
Test ROC AUC OVR: 0.5827


In [8]:
# Single decision tree classifier (seeded so tie-breaking between splits is repeatable).
clf = DecisionTreeClassifier(random_state=42)

# 4 x 4 x 4 = 64 candidate settings controlling tree size
param_grid = dict(
    max_depth=[3, 5, 10, 20],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
)

# stratified 5-fold CV with a seeded shuffle
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    clf,
    param_grid,
    scoring='roc_auc_ovr',  # one-vs-rest ROC AUC
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_multinomial_classification, y_train_multinomial_classification)

# report the winning setting and its CV score
cv_roc_auc = np.round(grid.best_score_, 4)
print("Best params:", grid.best_params_)
print("Best CV ROC AUC OVR:", cv_roc_auc)

# hold-out evaluation on the full class-probability matrix (one column per class)
y_test_pred_proba = grid.predict_proba(X_test_multinomial_classification)
test_roc_auc = roc_auc_score(y_test_multinomial_classification, y_test_pred_proba, multi_class='ovr')
print("Test ROC AUC OVR:", np.round(test_roc_auc, 4))

multinomial_classification_models_comparison.append(['decision tree', cv_roc_auc, np.round(test_roc_auc, 4)])

Best params: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 20}
Best CV ROC AUC OVR: 0.5977
Test ROC AUC OVR: 0.4721


In [9]:
# Random forest classifier with a fixed seed.
# NOTE(review): both the forest and the grid search below request n_jobs=-1;
# confirm this nested parallelism does not oversubscribe the machine.
clf = RandomForestClassifier(random_state=42, n_jobs=-1)

# 3 x 4 x 3 x 3 = 108 candidate settings, i.e. 540 fits over the 5 folds
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
)

# stratified 5-fold CV with a seeded shuffle
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    clf,
    param_grid,
    scoring='roc_auc_ovr',  # one-vs-rest ROC AUC
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_multinomial_classification, y_train_multinomial_classification)

# report the winning setting and its CV score
cv_roc_auc = np.round(grid.best_score_, 4)
print("Best params:", grid.best_params_)
print("Best CV ROC AUC OVR:", cv_roc_auc)

# hold-out evaluation on the full class-probability matrix (one column per class)
y_test_pred_proba = grid.predict_proba(X_test_multinomial_classification)
test_roc_auc = roc_auc_score(y_test_multinomial_classification, y_test_pred_proba, multi_class='ovr')
print("Test ROC AUC OVR:", np.round(test_roc_auc, 4))

multinomial_classification_models_comparison.append(['random forest', cv_roc_auc, np.round(test_roc_auc, 4)])

Best params: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best CV ROC AUC OVR: 0.7073
Test ROC AUC OVR: 0.5448


In [10]:
# XGBoost needs the target as consecutive integers starting from 0;
# LabelEncoder maps the class labels to 0 .. n_classes-1. The encoder is fitted
# on the training labels only and then applied to the test labels.
# NOTE: this rebinds both y_* variables, so any cell run after this one sees
# the encoded labels instead of the original ones.
le = LabelEncoder()
y_train_multinomial_classification = le.fit_transform(y_train_multinomial_classification)
y_test_multinomial_classification = le.transform(y_test_multinomial_classification)

# Gradient-boosted trees (XGBoost) with softmax-probability objective and a fixed seed.
# NOTE(review): both the booster and the grid search below request n_jobs=-1;
# confirm this nested parallelism does not oversubscribe the machine.
clf = xgb.XGBClassifier(objective='multi:softprob', random_state=42, n_jobs=-1)

# 3 x 4 x 3 x 3 x 3 x 3 = 972 candidate settings, i.e. 4860 fits over the 5 folds,
# so this is by far the slowest multiclass cell
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[3, 5, 10, 20],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2],
)

# stratified 5-fold CV with a seeded shuffle
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    clf,
    param_grid,
    scoring='roc_auc_ovr',  # one-vs-rest ROC AUC
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_multinomial_classification, y_train_multinomial_classification)

# report the winning setting and its CV score
cv_roc_auc = np.round(grid.best_score_, 4)
print("Best params:", grid.best_params_)
print("Best CV ROC AUC OVR:", cv_roc_auc)

# hold-out evaluation on the full class-probability matrix (one column per class)
y_test_pred_proba = grid.predict_proba(X_test_multinomial_classification)
test_roc_auc = roc_auc_score(y_test_multinomial_classification, y_test_pred_proba, multi_class='ovr')
print("Test ROC AUC OVR:", np.round(test_roc_auc, 4))

multinomial_classification_models_comparison.append(['xgboost', cv_roc_auc, np.round(test_roc_auc, 4)])

Best params: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
Best CV ROC AUC OVR: 0.6618
Test ROC AUC OVR: 0.5551


In [11]:
# Turn the collected [model, CV ROC AUC OVR, test ROC AUC OVR] rows into a table
# and display it ranked by cross-validated score, largest value first.
multinomial_classification_models_comparison_df = pd.DataFrame(
    multinomial_classification_models_comparison,
    columns=['model', 'ROC AUC OVR (cv)', 'ROC AUC OVR (test)'],
)
multinomial_classification_models_comparison_df.sort_values('ROC AUC OVR (cv)', ascending=False)

Unnamed: 0,model,ROC AUC OVR (cv),ROC AUC OVR (test)
3,SVC,0.7111,0.5827
5,random forest,0.7073,0.5448
0,LASSO,0.7041,0.5469
1,logistic regression,0.6754,0.5969
6,xgboost,0.6618,0.5551
2,KNN,0.6527,0.5461
4,decision tree,0.5977,0.4721
