# Standard Machine Learning models

In [1]:
# Standard-library modules: importlib for the reload calls below, sys/os for
# the path and environment setup.
import importlib
import sys
import os

sys.path.append("..")  # Ensure the parent directory is in the path

from sklearn.metrics import classification_report

# --- Local Application/Module Imports ---
# Each project module is imported, reloaded, and then star-imported, so that
# re-running this cell picks up edits to the module source without a kernel
# restart. The order matters: the star import must come after the reload.
import data_loader.data_loader
import data_preprocessing.data_preprocessing
import models.standard_ml_models
import visualizations.visualizations
import utils.utils

importlib.reload(data_loader.data_loader)
from data_loader.data_loader import *

importlib.reload(data_preprocessing.data_preprocessing)
from data_preprocessing.data_preprocessing import *

importlib.reload(models.standard_ml_models)
from models.standard_ml_models import *

importlib.reload(visualizations.visualizations)
from visualizations.visualizations import *

importlib.reload(utils.utils)
from utils.utils import *

# --- Notebook Configuration ---
# IPython magics: render figures inline, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# --- Global Settings ---
# NOTE(review): `sns` is not imported explicitly in this cell; presumably it
# arrives through one of the star imports above — confirm, or import it directly.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# NOTE(review): presumably silences the Hugging Face tokenizers parallelism
# warning; nothing in this notebook reads the variable directly — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tnorlha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tnorlha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/tnorlha/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/tnorlha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tnorlha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tnorlha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/tnorlha/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/tnorlha/nltk_data...
[nltk_data]   Package wordnet is alre

## Loading data and splitting into train, validation, and test sets

In [2]:
# Split the dataset once; every experiment below reuses these three splits.
train_df, val_df, test_df = load_and_split_data()

# Report the number of rows in each split as a quick sanity check.
for split_label, split_df in (
    ("Train size:", train_df),
    ("Validation size:", val_df),
    ("Test size:", test_df),
):
    print(split_label, len(split_df))

Train size: 81677
Validation size: 10210
Test size: 10210


# Bag-of-words - Baseline models

In [106]:
# Vectorize the untouched splits for the bag-of-words baseline; the helper
# returns the three feature sets, the three label sets, and the vectorizer.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(train_df, val_df, test_df)

In [107]:
# Logistic Regression baseline. The helper returns four values, bound here as
# the train/validation/test predictions and the model object.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

In [108]:
# Per-class precision/recall/F1 report for Logistic Regression on the test split.
print_evaluation(y_test, y_test_pred_lr)

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210



In [109]:
# L_score is a project helper (its definition is not shown in this notebook);
# here it scores the Logistic Regression predictions on the test split.
print(L_score(y_test, y_test_pred_lr))

0.8040646425073458


In [86]:
# Random Forest on the same features as the Logistic Regression baseline.
# NOTE(review): X_train/X_val/X_test are rebound by the preprocessing cells
# further down; run this cell right after the baseline vectorization.
(
    y_train_pred_rf,
    y_val_pred_rf,
    y_test_pred_rf,
    model_rf,
) = train_and_predict_random_forest(X_train, y_train, X_val, X_test)

In [87]:
# Per-class precision/recall/F1 report for Random Forest on the test split.
print_evaluation(y_test, y_test_pred_rf)

              precision    recall  f1-score   support

    negative       0.63      0.28      0.38      2191
     neutral       0.63      0.89      0.74      4915
    positive       0.70      0.50      0.58      3104

    accuracy                           0.64     10210
   macro avg       0.65      0.56      0.57     10210
weighted avg       0.65      0.64      0.61     10210



In [88]:
# L_score is a project helper (its definition is not shown in this notebook);
# here it scores the Random Forest predictions on the test split.
print(L_score(y_test, y_test_pred_rf))

0.7954946131243879


In [10]:
# XGBoost on the same features as the Logistic Regression baseline.
# NOTE(review): X_train/X_val/X_test are rebound by the preprocessing cells
# further down; run this cell right after the baseline vectorization.
(
    y_train_pred_xgb,
    y_val_pred_xgb,
    y_test_pred_xgb,
    model_xgb,
) = train_and_predict_xgboost(X_train, y_train, X_val, X_test)

In [11]:
# Per-class precision/recall/F1 report for XGBoost on the test split.
print_evaluation(y_test, y_test_pred_xgb)

              precision    recall  f1-score   support

    negative       0.67      0.29      0.41      2191
     neutral       0.62      0.92      0.74      4915
    positive       0.74      0.47      0.57      3104

    accuracy                           0.65     10210
   macro avg       0.68      0.56      0.58     10210
weighted avg       0.67      0.65      0.62     10210



In [12]:
# L_score is a project helper (its definition is not shown in this notebook);
# here it scores the XGBoost predictions on the test split.
print(L_score(y_test, y_test_pred_xgb))

0.8027913809990206


In [13]:
# MLP on the same features as the Logistic Regression baseline.
# NOTE(review): X_train/X_val/X_test are rebound by the preprocessing cells
# further down; run this cell right after the baseline vectorization.
(
    y_train_pred_mlp,
    y_val_pred_mlp,
    y_test_pred_mlp,
    model_mlp,
) = train_and_predict_mlp(X_train, y_train, X_val, X_test)

In [14]:
# Per-class precision/recall/F1 report for the MLP on the test split.
print_evaluation(y_test, y_test_pred_mlp)

              precision    recall  f1-score   support

    negative       0.53      0.40      0.46      2191
     neutral       0.69      0.77      0.73      4915
    positive       0.61      0.60      0.60      3104

    accuracy                           0.64     10210
   macro avg       0.61      0.59      0.60     10210
weighted avg       0.63      0.64      0.63     10210



In [15]:
# L_score is a project helper (its definition is not shown in this notebook);
# here it scores the MLP predictions on the test split.
print(L_score(y_test, y_test_pred_mlp))

0.7782076395690499


# Data preprocessing

In [110]:
# Ablation: apply only the "lowercase" step to every split, then rerun the
# bag-of-words + Logistic Regression baseline on the result.
# NOTE(review): the steps are compared on the test split; consider selecting
# preprocessing steps on the validation split and keeping test for the final report.
train_df_lowercase, val_df_lowercase, test_df_lowercase = (
    preprocess_ml_pipeline(split_df, ["lowercase"])
    for split_df in (train_df, val_df, test_df)
)

(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_lowercase, val_df_lowercase, test_df_lowercase
)

# Refit Logistic Regression on the re-vectorized features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Report on the test split: classification report first, then L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8040646425073458


The bag-of-words vectorizer lowercases text by default, so this step is redundant: the scores are identical to the baseline.

In [111]:
# Ablation: apply only the "expand_contractions" step to every split, then
# rerun the bag-of-words + Logistic Regression baseline on the result.
train_df_expand_contractions, val_df_expand_contractions, test_df_expand_contractions = (
    preprocess_ml_pipeline(split_df, ["expand_contractions"])
    for split_df in (train_df, val_df, test_df)
)

(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_expand_contractions,
    val_df_expand_contractions,
    test_df_expand_contractions,
)

# Refit Logistic Regression on the re-vectorized features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Report on the test split: classification report first, then L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.62      0.40      0.49      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.70      0.56      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8057296767874633


In [112]:
# Ablation: apply only the "remove_emails" step to every split, then rerun the
# bag-of-words + Logistic Regression baseline on the result.
train_df_remove_emails, val_df_remove_emails, test_df_remove_emails = (
    preprocess_ml_pipeline(split_df, ["remove_emails"])
    for split_df in (train_df, val_df, test_df)
)

(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_remove_emails, val_df_remove_emails, test_df_remove_emails
)

# Refit Logistic Regression on the re-vectorized features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Report on the test split: classification report first, then L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.70      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.804407443682664


In [113]:
# Ablation: apply only the "remove_urls" step to every split, then rerun the
# bag-of-words + Logistic Regression baseline on the result.
train_df_remove_urls, val_df_remove_urls, test_df_remove_urls = (
    preprocess_ml_pipeline(split_df, ["remove_urls"])
    for split_df in (train_df, val_df, test_df)
)

(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_remove_urls, val_df_remove_urls, test_df_remove_urls
)

# Refit Logistic Regression on the re-vectorized features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Report on the test split: classification report first, then L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8041625857002939


In [114]:
# Ablation: apply only the "remove_digits" step to every split, then rerun the
# bag-of-words + Logistic Regression baseline on the result.
train_df_remove_digits, val_df_remove_digits, test_df_remove_digits = (
    preprocess_ml_pipeline(split_df, ["remove_digits"])
    for split_df in (train_df, val_df, test_df)
)

(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_remove_digits, val_df_remove_digits, test_df_remove_digits
)

# Refit Logistic Regression on the re-vectorized features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Report on the test split: classification report first, then L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.47      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.63      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8036728697355534


In [115]:
# Ablation: apply only the "remove_special_chars" step to every split, then
# rerun the bag-of-words + Logistic Regression baseline on the result.
train_df_remove_special_chars, val_df_remove_special_chars, test_df_remove_special_chars = (
    preprocess_ml_pipeline(split_df, ["remove_special_chars"])
    for split_df in (train_df, val_df, test_df)
)

(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_remove_special_chars,
    val_df_remove_special_chars,
    test_df_remove_special_chars,
)

# Refit Logistic Regression on the re-vectorized features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Report on the test split: classification report first, then L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8040646425073458


In [116]:
# Ablation: apply only the "collapse_spaces" step to every split, then rerun
# the bag-of-words + Logistic Regression baseline on the result.
train_df_collapse_spaces, val_df_collapse_spaces, test_df_collapse_spaces = (
    preprocess_ml_pipeline(split_df, ["collapse_spaces"])
    for split_df in (train_df, val_df, test_df)
)

(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_collapse_spaces, val_df_collapse_spaces, test_df_collapse_spaces
)

# Refit Logistic Regression on the re-vectorized features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Report on the test split: classification report first, then L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8040646425073458


In [117]:
# Ablation: apply only the "remove_accented_chars" step to every split, then
# rerun the bag-of-words + Logistic Regression baseline on the result.
train_df_remove_accented_chars, val_df_remove_accented_chars, test_df_remove_accented_chars = (
    preprocess_ml_pipeline(split_df, ["remove_accented_chars"])
    for split_df in (train_df, val_df, test_df)
)

(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_remove_accented_chars,
    val_df_remove_accented_chars,
    test_df_remove_accented_chars,
)

# Refit Logistic Regression on the re-vectorized features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Report on the test split: classification report first, then L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8037708129285015


In [118]:
# Ablation: apply only the "remove_stopwords" step to every split, then rerun
# the bag-of-words + Logistic Regression baseline on the result.
train_df_remove_stopwords, val_df_remove_stopwords, test_df_remove_stopwords = (
    preprocess_ml_pipeline(split_df, ["remove_stopwords"])
    for split_df in (train_df, val_df, test_df)
)

(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_remove_stopwords, val_df_remove_stopwords, test_df_remove_stopwords
)

# Refit Logistic Regression on the re-vectorized features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Report on the test split: classification report first, then L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.60      0.34      0.43      2191
     neutral       0.66      0.87      0.75      4915
    positive       0.69      0.55      0.62      3104

    accuracy                           0.66     10210
   macro avg       0.65      0.59      0.60     10210
weighted avg       0.66      0.66      0.64     10210

0.800048971596474


In [122]:
# Ablation: apply only the "lemmatize" step to every split, then rerun the
# bag-of-words + Logistic Regression baseline on the result.
train_df_lemmatize, val_df_lemmatize, test_df_lemmatize = (
    preprocess_ml_pipeline(split_df, ["lemmatize"])
    for split_df in (train_df, val_df, test_df)
)

(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_lemmatize, val_df_lemmatize, test_df_lemmatize
)

# Refit Logistic Regression on the re-vectorized features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Report on the test split: classification report first, then L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.60      0.39      0.47      2191
     neutral       0.67      0.86      0.76      4915
    positive       0.70      0.56      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.60      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8032321253672869


In [120]:
# Ablation: apply only the "filter_valid_words" step to every split, then
# rerun the bag-of-words + Logistic Regression baseline on the result.
train_df_filter_valid_words, val_df_filter_valid_words, test_df_filter_valid_words = (
    preprocess_ml_pipeline(split_df, ["filter_valid_words"])
    for split_df in (train_df, val_df, test_df)
)

(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_filter_valid_words,
    val_df_filter_valid_words,
    test_df_filter_valid_words,
)

# Refit Logistic Regression on the re-vectorized features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Report on the test split: classification report first, then L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.59      0.36      0.45      2191
     neutral       0.65      0.87      0.74      4915
    positive       0.68      0.51      0.59      3104

    accuracy                           0.65     10210
   macro avg       0.64      0.58      0.59     10210
weighted avg       0.65      0.65      0.63     10210

0.794466209598433


# Results

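| Preprocessing Step     | L_score               |
|------------------------|-----------------------|
| baseline               | 0.8040646425073458    |
| lowercase              | 0.8040646425073458    |
| expand_contractions    | 0.8057296767874633    |
| remove_emails          | 0.804407443682664     |
| remove_urls            | 0.8041625857002939    |
| remove_digits          | 0.8036728697355534    |
| remove_special_chars   | 0.8040646425073458    |
| collapse_spaces        | 0.8040646425073458    |
| remove_accented_chars  | 0.8037708129285015    |
| remove_stopwords       | 0.800048971596474     |
| lemmatize              | 0.8032321253672869    |
| filter_valid_words     | 0.794466209598433     |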

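The preprocessing steps have only a small impact on the performance of the Logistic Regression model (all scores are on the test set). The best result comes from the "expand_contractions" step (0.8057), a slight improvement over the baseline (0.8041). "remove_emails" and "remove_urls" are within 0.0004 of the baseline, so their effect is negligible. Not every step is harmless, however: "remove_stopwords" (0.8000) and "filter_valid_words" (0.7945) lower the score.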

To attain better results, we will explore fine-tuning RoBERTa in the next notebook, along with ways to sanitize and augment the dataset using LLMs and various other techniques.