# Standard Machine Learning models

In [1]:
import importlib
import sys

sys.path.append("..")  # Ensure the parent directory is in the path

# --- Local Application/Module Imports ---
# Import each project module by its dotted path first, so that
# `importlib.reload` below has a module object to operate on.
import data_loader.data_loader
import data_preprocessing.data_preprocessing
import models.standard_ml_models
import visualizations.visualizations
import utils.utils

# Reload every module right before star-importing from it, so edits made to the
# project sources since the kernel started are picked up without a restart.
# NOTE(review): no other import in this notebook provides `pd`, `sns`, `os`,
# `print_evaluation`, `L_score`, ... - later cells rely on these wildcard
# imports to bring those names into scope.
importlib.reload(data_loader.data_loader)
from data_loader.data_loader import *

importlib.reload(data_preprocessing.data_preprocessing)
from data_preprocessing.data_preprocessing import *

importlib.reload(models.standard_ml_models)
from models.standard_ml_models import *

importlib.reload(visualizations.visualizations)
from visualizations.visualizations import *

importlib.reload(utils.utils)
from utils.utils import *

# --- Notebook Configuration ---
# Render matplotlib figures inline in the cell output ...
%matplotlib inline
# ... using the high-resolution "retina" figure format.
%config InlineBackend.figure_format='retina'

# --- Global Settings ---
# Import `os` and `seaborn` explicitly so this cell does not depend on one of
# the project modules happening to re-export them through `import *`.
import os

import seaborn as sns

# Plot styling applied to every figure drawn in this notebook.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# Environment switch read by the Hugging Face `tokenizers` library; "false"
# turns its internal parallelism off.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tnorlha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tnorlha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/tnorlha/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/tnorlha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tnorlha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tnorlha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/tnorlha/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/tnorlha/nltk_data...
[nltk_data]   Package wordnet is alre

## Loading data and splitting into train, validation, and test sets

In [2]:
# Load the dataset already partitioned into train / validation / test frames.
train_df, val_df, test_df = load_and_split_data()

# Report the number of rows in each split, one line per split.
print(
    f"Train size: {len(train_df)}",
    f"Validation size: {len(val_df)}",
    f"Test size: {len(test_df)}",
    sep="\n",
)

Train size: 81677
Validation size: 10210
Test size: 10210


# Bag-of-words - Baseline models

In [3]:
# Build the bag-of-words features and label vectors for all three splits with
# the project's count-vectorizer helper; it also hands back the vectorizer.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(train_df, val_df, test_df)

### Logistic Regression

In [4]:
# Fit the Logistic Regression baseline; the helper returns the predictions for
# the train / validation / test features together with the model object.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

In [5]:
# Per-class precision / recall / F1 report for Logistic Regression on the test split.
print_evaluation(y_test, y_test_pred_lr)

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210



In [6]:
# Project-specific L_score (provided by the wildcard-imported project modules)
# for Logistic Regression on the test split.
print(L_score(y_test, y_test_pred_lr))

0.8040646425073458


### Random Forest

In [7]:
# Fit the Random Forest on the same bag-of-words features; the helper returns
# the per-split predictions together with the model object.
(
    y_train_pred_rf,
    y_val_pred_rf,
    y_test_pred_rf,
    model_rf,
) = train_and_predict_random_forest(X_train, y_train, X_val, X_test)

In [8]:
# Per-class precision / recall / F1 report for Random Forest on the test split.
print_evaluation(y_test, y_test_pred_rf)

              precision    recall  f1-score   support

    negative       0.61      0.28      0.39      2191
     neutral       0.63      0.89      0.73      4915
    positive       0.68      0.49      0.57      3104

    accuracy                           0.64     10210
   macro avg       0.64      0.55      0.57     10210
weighted avg       0.64      0.64      0.61     10210



In [9]:
# Project-specific L_score for Random Forest on the test split.
print(L_score(y_test, y_test_pred_rf))

0.7923604309500489


### XGBoost

In [10]:
# Fit the XGBoost model on the same bag-of-words features; the helper returns
# the per-split predictions together with the model object.
(
    y_train_pred_xgb,
    y_val_pred_xgb,
    y_test_pred_xgb,
    model_xgb,
) = train_and_predict_xgboost(X_train, y_train, X_val, X_test)

In [11]:
# Per-class precision / recall / F1 report for XGBoost on the test split.
print_evaluation(y_test, y_test_pred_xgb)

              precision    recall  f1-score   support

    negative       0.67      0.29      0.41      2191
     neutral       0.62      0.92      0.74      4915
    positive       0.74      0.47      0.57      3104

    accuracy                           0.65     10210
   macro avg       0.68      0.56      0.58     10210
weighted avg       0.67      0.65      0.62     10210



In [12]:
# Project-specific L_score for XGBoost on the test split.
print(L_score(y_test, y_test_pred_xgb))

0.8027913809990206


### MLP

In [13]:
# Fit the MLP on the same bag-of-words features; the helper returns the
# per-split predictions together with the model object.
(
    y_train_pred_mlp,
    y_val_pred_mlp,
    y_test_pred_mlp,
    model_mlp,
) = train_and_predict_mlp(X_train, y_train, X_val, X_test)

In [14]:
# Per-class precision / recall / F1 report for the MLP on the test split.
print_evaluation(y_test, y_test_pred_mlp)

              precision    recall  f1-score   support

    negative       0.53      0.40      0.46      2191
     neutral       0.69      0.77      0.73      4915
    positive       0.61      0.60      0.60      3104

    accuracy                           0.64     10210
   macro avg       0.61      0.59      0.60     10210
weighted avg       0.63      0.64      0.63     10210



In [15]:
# Project-specific L_score for the MLP on the test split.
print(L_score(y_test, y_test_pred_mlp))

0.7782076395690499


# Data preprocessing

### Lowercase

In [16]:
# Apply only the "lowercase" step to each split (train, validation, test - in
# that order), leaving the original frames untouched.
train_df_lowercase, val_df_lowercase, test_df_lowercase = (
    preprocess_ml_pipeline(split_df, ["lowercase"])
    for split_df in (train_df, val_df, test_df)
)

# Rebuild the bag-of-words features from the preprocessed splits.
# NOTE(review): this rebinds the notebook-wide X_* / y_* / vectorizer names.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_lowercase, val_df_lowercase, test_df_lowercase
)

# Refit the Logistic Regression baseline on the new features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Test-split metrics: classification report first, then the L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8040646425073458


The bag-of-words vectorizer already lowercases text by default, so this step is redundant (the results are identical to the baseline).

### Expand contractions

In [17]:
# Apply only the "expand_contractions" step to each split (train, validation,
# test - in that order), leaving the original frames untouched.
train_df_expand_contractions, val_df_expand_contractions, test_df_expand_contractions = (
    preprocess_ml_pipeline(split_df, ["expand_contractions"])
    for split_df in (train_df, val_df, test_df)
)

# Rebuild the bag-of-words features from the preprocessed splits.
# NOTE(review): this rebinds the notebook-wide X_* / y_* / vectorizer names.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_expand_contractions,
    val_df_expand_contractions,
    test_df_expand_contractions,
)

# Refit the Logistic Regression baseline on the new features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Test-split metrics: classification report first, then the L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.62      0.40      0.49      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.70      0.56      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8057296767874633


### Remove emails

In [18]:
# Apply only the "remove_emails" step to each split (train, validation, test -
# in that order), leaving the original frames untouched.
train_df_remove_emails, val_df_remove_emails, test_df_remove_emails = (
    preprocess_ml_pipeline(split_df, ["remove_emails"])
    for split_df in (train_df, val_df, test_df)
)

# Rebuild the bag-of-words features from the preprocessed splits.
# NOTE(review): this rebinds the notebook-wide X_* / y_* / vectorizer names.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_remove_emails, val_df_remove_emails, test_df_remove_emails
)

# Refit the Logistic Regression baseline on the new features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Test-split metrics: classification report first, then the L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.70      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.804407443682664


### Remove urls

In [19]:
# Apply only the "remove_urls" step to each split (train, validation, test -
# in that order), leaving the original frames untouched.
train_df_remove_urls, val_df_remove_urls, test_df_remove_urls = (
    preprocess_ml_pipeline(split_df, ["remove_urls"])
    for split_df in (train_df, val_df, test_df)
)

# Rebuild the bag-of-words features from the preprocessed splits.
# NOTE(review): this rebinds the notebook-wide X_* / y_* / vectorizer names.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_remove_urls, val_df_remove_urls, test_df_remove_urls
)

# Refit the Logistic Regression baseline on the new features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Test-split metrics: classification report first, then the L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8041625857002939


### Remove digits

In [20]:
# Apply only the "remove_digits" step to each split (train, validation, test -
# in that order), leaving the original frames untouched.
train_df_remove_digits, val_df_remove_digits, test_df_remove_digits = (
    preprocess_ml_pipeline(split_df, ["remove_digits"])
    for split_df in (train_df, val_df, test_df)
)

# Rebuild the bag-of-words features from the preprocessed splits.
# NOTE(review): this rebinds the notebook-wide X_* / y_* / vectorizer names.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_remove_digits, val_df_remove_digits, test_df_remove_digits
)

# Refit the Logistic Regression baseline on the new features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Test-split metrics: classification report first, then the L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.47      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.63      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8036728697355534


### Remove special characters

In [21]:
# Apply only the "remove_special_chars" step to each split (train, validation,
# test - in that order), leaving the original frames untouched.
train_df_remove_special_chars, val_df_remove_special_chars, test_df_remove_special_chars = (
    preprocess_ml_pipeline(split_df, ["remove_special_chars"])
    for split_df in (train_df, val_df, test_df)
)

# Rebuild the bag-of-words features from the preprocessed splits.
# NOTE(review): this rebinds the notebook-wide X_* / y_* / vectorizer names.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_remove_special_chars,
    val_df_remove_special_chars,
    test_df_remove_special_chars,
)

# Refit the Logistic Regression baseline on the new features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Test-split metrics: classification report first, then the L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8040646425073458


### Collapse spaces

In [22]:
# Apply only the "collapse_spaces" step to each split (train, validation,
# test - in that order), leaving the original frames untouched.
train_df_collapse_spaces, val_df_collapse_spaces, test_df_collapse_spaces = (
    preprocess_ml_pipeline(split_df, ["collapse_spaces"])
    for split_df in (train_df, val_df, test_df)
)

# Rebuild the bag-of-words features from the preprocessed splits.
# NOTE(review): this rebinds the notebook-wide X_* / y_* / vectorizer names.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_collapse_spaces, val_df_collapse_spaces, test_df_collapse_spaces
)

# Refit the Logistic Regression baseline on the new features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Test-split metrics: classification report first, then the L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8040646425073458


This step is also redundant: the results are identical to the baseline, so extra whitespace is evidently already ignored when the text is vectorized.

### Remove accented characters

In [23]:
# Apply only the "remove_accented_chars" step to each split (train, validation,
# test - in that order), leaving the original frames untouched.
train_df_remove_accented_chars, val_df_remove_accented_chars, test_df_remove_accented_chars = (
    preprocess_ml_pipeline(split_df, ["remove_accented_chars"])
    for split_df in (train_df, val_df, test_df)
)

# Rebuild the bag-of-words features from the preprocessed splits.
# NOTE(review): this rebinds the notebook-wide X_* / y_* / vectorizer names.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_remove_accented_chars,
    val_df_remove_accented_chars,
    test_df_remove_accented_chars,
)

# Refit the Logistic Regression baseline on the new features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Test-split metrics: classification report first, then the L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.69      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8037708129285015


### Remove stopwords

In [24]:
# Apply only the "remove_stopwords" step to each split (train, validation,
# test - in that order), leaving the original frames untouched.
train_df_remove_stopwords, val_df_remove_stopwords, test_df_remove_stopwords = (
    preprocess_ml_pipeline(split_df, ["remove_stopwords"])
    for split_df in (train_df, val_df, test_df)
)

# Rebuild the bag-of-words features from the preprocessed splits.
# NOTE(review): this rebinds the notebook-wide X_* / y_* / vectorizer names.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_remove_stopwords, val_df_remove_stopwords, test_df_remove_stopwords
)

# Refit the Logistic Regression baseline on the new features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Test-split metrics: classification report first, then the L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.60      0.34      0.43      2191
     neutral       0.66      0.87      0.75      4915
    positive       0.69      0.55      0.62      3104

    accuracy                           0.66     10210
   macro avg       0.65      0.59      0.60     10210
weighted avg       0.66      0.66      0.64     10210

0.800048971596474


### Lemmatize

In [25]:
# Apply only the "lemmatize" step to each split (train, validation, test - in
# that order), leaving the original frames untouched.
train_df_lemmatize, val_df_lemmatize, test_df_lemmatize = (
    preprocess_ml_pipeline(split_df, ["lemmatize"])
    for split_df in (train_df, val_df, test_df)
)

# Rebuild the bag-of-words features from the preprocessed splits.
# NOTE(review): this rebinds the notebook-wide X_* / y_* / vectorizer names.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_lemmatize, val_df_lemmatize, test_df_lemmatize
)

# Refit the Logistic Regression baseline on the new features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Test-split metrics: classification report first, then the L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.60      0.39      0.47      2191
     neutral       0.67      0.86      0.76      4915
    positive       0.70      0.56      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.60      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8032321253672869


### Filter valid words

In [26]:
# Apply only the "filter_valid_words" step to each split (train, validation,
# test - in that order), leaving the original frames untouched.
train_df_filter_valid_words, val_df_filter_valid_words, test_df_filter_valid_words = (
    preprocess_ml_pipeline(split_df, ["filter_valid_words"])
    for split_df in (train_df, val_df, test_df)
)

# Rebuild the bag-of-words features from the preprocessed splits.
# NOTE(review): this rebinds the notebook-wide X_* / y_* / vectorizer names.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(
    train_df_filter_valid_words,
    val_df_filter_valid_words,
    test_df_filter_valid_words,
)

# Refit the Logistic Regression baseline on the new features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Test-split metrics: classification report first, then the L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.59      0.36      0.45      2191
     neutral       0.65      0.87      0.74      4915
    positive       0.68      0.51      0.59      3104

    accuracy                           0.65     10210
   macro avg       0.64      0.58      0.59     10210
weighted avg       0.65      0.65      0.63     10210

0.794466209598433


### Removing non-English inputs in the training set

In [28]:
# The language labels are read back from a CSV instead of being recomputed;
# the disabled line below is presumably how the `language` column was produced.
# train_df['language'] = train_df['text'].apply(safe_detect)
# NOTE(review): this rebinds `train_df` to the CSV contents, so every later
# cell that uses `train_df` works on this frame rather than on the one returned
# by load_and_split_data().
train_df = pd.read_csv('../generated/language_detected/train_df_with_languages.csv')

# Map each detected language code to the list of index labels of its rows.
language_indices = {
    language_code: train_df.index[train_df['language'] == language_code].tolist()
    for language_code in train_df['language'].unique()
}

In [29]:
# Number of training rows per detected language, in the same key order as
# `language_indices`.
language_counts = {
    language_code: len(language_indices[language_code])
    for language_code in language_indices
}
print(language_counts)

{'en': 77388, 'it': 223, 'fr': 533, 'so': 254, 'af': 504, 'unknown': 44, 'nl': 270, 'ca': 137, 'tl': 173, 'no': 279, 'sw': 62, 'et': 122, 'da': 304, 'sv': 78, 'id': 109, 'de': 293, 'sq': 46, 'fi': 38, 'pl': 64, 'pt': 82, 'vi': 69, 'zh-cn': 4, 'cy': 192, 'ro': 112, 'es': 124, 'hu': 33, 'sl': 22, 'tr': 34, 'lt': 13, 'hr': 15, 'cs': 24, 'lv': 10, 'sk': 17, 'ja': 3, 'uk': 2}


In [30]:
# Display the first five rows labelled Italian ('it') to sanity-check the detector.
train_df.loc[language_indices['it'][:5]].copy()

Unnamed: 0,text,labels,language,transformer_language
15,I give a 5 on ambiance.,2,it,en-US
76,Hello!,1,it,en-US
514,Second: I've never had tomatoes ll pizza.,1,it,en-US
549,I'm a carnivore.,1,it,en-US
563,I prefer Spinatos,1,it,ro-RO


Using this library leads to many classification errors: the samples above are clearly English but were detected as Italian.


However, let's keep only the English samples for training and drop the rest, just to get an idea of whether we are heading in the right direction.

In [31]:
# Keep only the rows detected as English ('en'); .copy() yields an independent frame.
english_train_df = train_df.loc[language_indices['en']].copy()

In [32]:
# Rebuild the bag-of-words features with the English-only training subset; the
# validation and test splits are passed through unfiltered.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(english_train_df, val_df, test_df)

In [33]:
# Refit Logistic Regression on the English-only training features; the helper
# returns the per-split predictions together with the model object.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

In [34]:
# Test-split classification report for the model trained on the English-only subset.
print_evaluation(y_test, y_test_pred_lr)

              precision    recall  f1-score   support

    negative       0.61      0.39      0.48      2191
     neutral       0.67      0.86      0.76      4915
    positive       0.69      0.56      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210



In [35]:
# Test-split L_score for the model trained on the English-only subset.
print(L_score(y_test, y_test_pred_lr))

0.8034280117531831


Let's try a different language-detection method: a fine-tuned classification model, which is more robust and performs better.

In [36]:
# The frame displayed earlier already carries a `transformer_language` column,
# so the detection call stays disabled and `train_df` is reused under a new
# name (an alias of the same object, not a copy).
# train_df_with_languages = add_detected_language_column(train_df)
train_df_with_languages = train_df

In [37]:
# Map each language code found in `transformer_language` to the list of index
# labels of its rows (this replaces the earlier `language_indices` mapping).
language_indices = {
    language_code: train_df_with_languages.index[
        train_df_with_languages['transformer_language'] == language_code
    ].tolist()
    for language_code in train_df_with_languages['transformer_language'].unique()
}

In [38]:
# Number of training rows per detected language, in the same key order as
# `language_indices`.
language_counts = {
    language_code: len(language_indices[language_code])
    for language_code in language_indices
}
print(language_counts)

{'en-US': 79095, 'fr-FR': 373, 'ar-SA': 815, 'nl-NL': 166, 'km-KH': 106, 'es-ES': 69, 'jv-ID': 45, 'ro-RO': 67, 'de-DE': 146, 'sv-SE': 36, 'tl-PH': 72, 'ms-MY': 18, 'pt-PT': 38, 'cy-GB': 51, 'lv-LV': 36, 'zh-CN': 5, 'fi-FI': 46, 'id-ID': 12, 'pl-PL': 38, 'sw-KE': 40, 'tr-TR': 10, 'hu-HU': 6, 'nb-NO': 18, 'zh-TW': 21, 'it-IT': 39, 'af-ZA': 124, 'te-IN': 3, 'mn-MN': 12, 'th-TH': 11, 'az-AZ': 22, 'kn-IN': 26, 'hi-IN': 9, 'bn-BD': 7, 'ur-PK': 3, 'ja-JP': 27, 'is-IS': 2, 'vi-VN': 9, 'sl-SL': 9, 'ka-GE': 11, 'da-DK': 8, 'ml-IN': 2, 'he-IL': 7, 'sq-AL': 6, 'hy-AM': 3, 'ru-RU': 2, 'ta-IN': 5, 'am-ET': 1}


In [39]:
# Display the first five rows labelled French ('fr-FR') to sanity-check the detector.
train_df_with_languages.loc[language_indices['fr-FR'][:5]].copy()

Unnamed: 0,text,labels,language,transformer_language
20,Une chance que les portions étaient aussi géné...,1,fr,fr-FR
271,Bon appétit !,2,fr,fr-FR
301,One ca n't deny its seriousness and quality .,2,en,fr-FR
354,2 for 1 coup-ons.,1,en,fr-FR
765,"They got your head, ear, neck, hands and your ...",1,en,fr-FR


In [40]:
# Keep only the rows labelled 'en-US'; .copy() yields an independent frame.
# This replaces the earlier `english_train_df`.
english_train_df = train_df_with_languages.loc[language_indices['en-US']].copy()

In [41]:
# Rebuild the bag-of-words features with the English-only training subset; the
# validation and test splits are passed through unfiltered.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(english_train_df, val_df, test_df)

In [42]:
# Refit Logistic Regression on the English-only training features; the helper
# returns the per-split predictions together with the model object.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

In [43]:
# Test-split classification report for the model trained on the 'en-US' subset.
print_evaluation(y_test, y_test_pred_lr)

              precision    recall  f1-score   support

    negative       0.60      0.39      0.47      2191
     neutral       0.67      0.86      0.76      4915
    positive       0.69      0.56      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.66      0.60      0.62     10210
weighted avg       0.66      0.67      0.65     10210



In [44]:
# Test-split L_score for the model trained on the 'en-US' subset.
print(L_score(y_test, y_test_pred_lr))

0.8024485798237022


### Remove emails + urls and expand contractions

In [45]:
# Apply the three steps that helped individually - email removal, URL removal,
# contraction expansion, in that order - to each split (train, validation,
# test).
# NOTE(review): `train_df` here is the frame re-read from the language CSV
# earlier in the notebook, not the one returned by load_and_split_data().
train_df_final, val_df_final, test_df_final = (
    preprocess_ml_pipeline(
        split_df, ["remove_emails", "remove_urls", "expand_contractions"]
    )
    for split_df in (train_df, val_df, test_df)
)

# Rebuild the bag-of-words features from the preprocessed splits.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = preprocess_text_with_count_vectorizer(train_df_final, val_df_final, test_df_final)

# Refit the Logistic Regression baseline on the new features.
(
    y_train_pred_lr,
    y_val_pred_lr,
    y_test_pred_lr,
    model_lr,
) = train_and_predict_logistic_regression(X_train, y_train, X_val, X_test)

# Test-split metrics: classification report first, then the L_score.
print_evaluation(y_test, y_test_pred_lr)
print(L_score(y_test, y_test_pred_lr))

              precision    recall  f1-score   support

    negative       0.62      0.40      0.49      2191
     neutral       0.68      0.86      0.76      4915
    positive       0.70      0.57      0.62      3104

    accuracy                           0.67     10210
   macro avg       0.67      0.61      0.62     10210
weighted avg       0.67      0.67      0.66     10210

0.8062683643486778


# Results

| Model                                      | L_score | Accuracy | Precision (N/Ne/P)   | Recall (N/Ne/P)       | F1-score (N/Ne/P)     | Weighted F1 |
|--------------------------------------------|---------|----------|---------------------|-----------------------|-----------------------|-------------|
| LogReg                                     | 0.8041  | 0.67     | 0.61 / 0.68 / 0.69  | 0.39 / 0.86 / 0.57    | 0.48 / 0.76 / 0.62    | 0.66        |
|--------------------------------------------|---------|----------|---------------------|-----------------------|-----------------------|-------------|
| LogReg + expand contractions                | 0.8057  | 0.67     | 0.62 / 0.68 / 0.70  | 0.40 / 0.86 / 0.56    | 0.49 / 0.76 / 0.62    | 0.66        |
| LogReg + remove emails                       | 0.8044  | 0.67     | 0.61 / 0.68 / 0.70  | 0.39 / 0.86 / 0.57    | 0.48 / 0.76 / 0.62    | 0.66        |
| LogReg + remove urls                         | 0.8042  | 0.67     | 0.61 / 0.68 / 0.69  | 0.39 / 0.86 / 0.57    | 0.48 / 0.76 / 0.62    | 0.66        |
| LogReg combined (expand contractions, remove urls + emails) | 0.8063  | 0.67     | 0.62 / 0.68 / 0.70  | 0.40 / 0.86 / 0.57    | 0.49 / 0.76 / 0.62    | 0.66        |
|--------------------------------------------|---------|----------|---------------------|-----------------------|-----------------------|-------------|
| RF                                         | 0.7924  | 0.64     | 0.61 / 0.63 / 0.68  | 0.28 / 0.89 / 0.49    | 0.39 / 0.73 / 0.57    | 0.61        |
| XGBoost                                    | 0.8028  | 0.65     | 0.67 / 0.62 / 0.74  | 0.29 / 0.92 / 0.47    | 0.41 / 0.74 / 0.57    | 0.62        |
| MLP                                        | 0.7782  | 0.64     | 0.53 / 0.69 / 0.61  | 0.40 / 0.77 / 0.60    | 0.46 / 0.73 / 0.60    | 0.63        |


We can see that the preprocessing steps have only a small impact on model performance. Among the individual steps, "expand_contractions" helps the most (L_score 0.8057 vs. 0.8041 for the baseline), and combining it with email and URL removal gives the best result overall (0.8063). The other steps have negligible or slightly negative effects on performance.

To attain better results, we will explore fine-tuning RoBERTa in the next notebook, along with ways to sanitize and augment the dataset using LLMs and various other techniques.