# Standard Machine Learning models

In [28]:
import importlib
import sys
import os

sys.path.append("..")  # Ensure the parent directory is in the path

from sklearn.metrics import classification_report

# --- Local Application/Module Imports ---
import data_loader.data_loader
import data_preprocessing.data_preprocessing
import models.standard_ml_models
import visualizations.visualizations
import utils.utils

importlib.reload(data_loader.data_loader)
from data_loader.data_loader import *

importlib.reload(data_preprocessing.data_preprocessing)
from data_preprocessing.data_preprocessing import *

importlib.reload(models.standard_ml_models)
from models.standard_ml_models import *

importlib.reload(visualizations.visualizations)
from visualizations.visualizations import *

importlib.reload(utils.utils)
from utils.utils import *

# --- Notebook Configuration ---
%matplotlib inline
%config InlineBackend.figure_format='retina'

# --- Global Settings ---
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
os.environ["TOKENIZERS_PARALLELISM"] = "false"


## Loading data and splitting into train, validation, and test sets

In [2]:
# Load the dataset through the project helper, which hands back the three
# splits used throughout the rest of the notebook.
train_df, val_df, test_df = load_and_split_data()

# Report the number of rows in each split, one per line.
print(
    f"Train size: {len(train_df)}",
    f"Validation size: {len(val_df)}",
    f"Test size: {len(test_df)}",
    sep="\n",
)

Train size: 81677
Validation size: 10210
Test size: 10210


# Bag-of-words - Baseline models

In [3]:
# Turn the three splits into bag-of-words inputs with the project's
# count-vectorizer helper. The result is unpacked as features and labels per
# split, plus the vectorizer object itself.
(
    X_train,
    X_val,
    X_test,
    y_train,
    y_val,
    y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(train_df, val_df, test_df)

In [4]:
# Fit the Logistic Regression baseline on the bag-of-words features; the
# helper's result is unpacked as train/validation/test predictions and the model.
# NOTE(review): the recorded output of this cell shows the solver stopping at
# its iteration limit — consider raising max_iter (or scaling the features)
# inside the helper.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# Logistic Regression: per-class precision / recall / F1 on the test split.
# NOTE(review): in the visible cells every model is compared on the test
# split and the validation predictions (y_val_pred_*) are never used —
# consider selecting models on validation and reporting test only once.
print_evaluation(
    y_test,
    y_test_pred_lr,
)

              precision    recall  f1-score   support

    negative       0.58      0.45      0.51      2191
     neutral       0.69      0.82      0.75      4915
    positive       0.68      0.59      0.63      3104

    accuracy                           0.67     10210
   macro avg       0.65      0.62      0.63     10210
weighted avg       0.66      0.67      0.66     10210



In [29]:
# Logistic Regression: project-defined L_score on the test split
# (the metric's definition is not visible in this section).
print(
    L_score(y_test, y_test_pred_lr)
)

0.8007345739471107


In [5]:
# Fit the Random Forest baseline on the same bag-of-words features; the
# helper's result is unpacked as train/validation/test predictions and the model.
(
    y_train_pred_rf,
    y_val_pred_rf,
    y_test_pred_rf,
    model_rf,
) = train_and_predict_random_forest(X_train, y_train, X_val, X_test)

In [25]:
# Random Forest: per-class precision / recall / F1 on the test split.
print_evaluation(
    y_test,
    y_test_pred_rf,
)

              precision    recall  f1-score   support

    negative       0.61      0.28      0.39      2191
     neutral       0.63      0.89      0.73      4915
    positive       0.68      0.49      0.57      3104

    accuracy                           0.64     10210
   macro avg       0.64      0.55      0.57     10210
weighted avg       0.64      0.64      0.61     10210



In [30]:
# Random Forest: project-defined L_score on the test split
# (the metric's definition is not visible in this section).
print(
    L_score(y_test, y_test_pred_rf)
)

0.7923604309500489


In [6]:
# Fit an XGBoost baseline on the same bag-of-words features; the helper's
# result is unpacked as train/validation/test predictions and the model.
(
    y_train_pred_xgb,
    y_val_pred_xgb,
    y_test_pred_xgb,
    model_xgb,
) = train_and_predict_xgboost(X_train, y_train, X_val, X_test)

In [26]:
# XGBoost: per-class precision / recall / F1 on the test split.
print_evaluation(
    y_test,
    y_test_pred_xgb,
)

              precision    recall  f1-score   support

    negative       0.67      0.29      0.41      2191
     neutral       0.62      0.92      0.74      4915
    positive       0.74      0.47      0.57      3104

    accuracy                           0.65     10210
   macro avg       0.68      0.56      0.58     10210
weighted avg       0.67      0.65      0.62     10210



In [31]:
# XGBoost: project-defined L_score on the test split
# (the metric's definition is not visible in this section).
print(
    L_score(y_test, y_test_pred_xgb)
)

0.8027913809990206


In [7]:
# Fit an MLP baseline on the same bag-of-words features; the helper's
# result is unpacked as train/validation/test predictions and the model.
(
    y_train_pred_mlp,
    y_val_pred_mlp,
    y_test_pred_mlp,
    model_mlp,
) = train_and_predict_mlp(X_train, y_train, X_val, X_test)



In [27]:
# MLP: per-class precision / recall / F1 on the test split.
print_evaluation(
    y_test,
    y_test_pred_mlp,
)

              precision    recall  f1-score   support

    negative       0.51      0.47      0.49      2191
     neutral       0.71      0.75      0.73      4915
    positive       0.62      0.60      0.61      3104

    accuracy                           0.64     10210
   macro avg       0.61      0.61      0.61     10210
weighted avg       0.64      0.64      0.64     10210



In [32]:
# MLP: project-defined L_score on the test split
# (the metric's definition is not visible in this section).
print(
    L_score(y_test, y_test_pred_mlp)
)

0.7789911851126347


# Data preprocessing