## 0 - Imports

In [24]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import scale
from sklearn.model_selection import GridSearchCV
import re
import string
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import re
import string

from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Text extraction 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec


# Models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier


import contractions
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import gensim.downloader as api

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess

import pickle

# Deep Learning libraries
from keras.models import Sequential,Model
from keras.layers import Dense, Activation, Dropout, Flatten, Input
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, LSTM, Bidirectional, Dropout, Flatten, GRU
from tensorflow.keras.optimizers import Adam

# Set pd options to display all columns and rows
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 30)
pd.set_option('display.max_colwidth', None)  # Show full text without truncation


# Download required resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alexg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alexg\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## 1 - EDA (missing, just copy the other notebook)

In [3]:
# Define the base directory (where the notebook is)
BASE_DIR = os.path.dirname(os.path.abspath("__file__"))

# Construct full paths to the CSV files
train_path = os.path.join(BASE_DIR, "data", "train.csv")
test_path = os.path.join(BASE_DIR, "data", "test.csv")

# Load the datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

## 2 - Pre-Processing

# missing oversampling

The effectiveness of pre-processing techniques is model-dependent. For classical machine learning approaches, the optimal combination from **Symeonidis, Effrosynidis, and Arampatzis (2018). A comparative evaluation of pre-processing techniques and their interactions for twitter sentiment analysis** includes:

- **URL/User Mention Replacement**: Replace URLs and user mentions with tags, as they do not contain sentiment information (Agarwal et al., 2011).
- **Contraction Handling**: Replacing contractions improves accuracy, as contractions are common in tweets and often exempt from sentiment lexicons (Chalil et al., 2015).
- **Number Removal**: While many researchers remove numbers (He, Lin, & Alani, 2011; Zhao, 2015), some argue that keeping numbers may improve classification effectiveness (Lin & He, 2009).
- **Replace Punctuation Repetition**: Normalizes language and generalizes vocabulary to represent sentiment (Balahur, 2013).
- **Lemmatization**: Passes baseline results for both datasets, especially for classic algorithms. However, it may ignore semantic information in large datasets (Shotaroo, Takamura, & Okumura, 2005).

In [4]:
lemmatizer = WordNetLemmatizer()

# Source: https://www.nltk.org/api/nltk.tokenize.casual.html
# Difference between TweetTokenizer and Word_Tokenize: https://stackoverflow.com/questions/61919670/how-nltk-tweettokenizer-different-from-nltk-word-tokenize
tokenizer = TweetTokenizer()

stemmer = PorterStemmer()

stop_words = set(stopwords.words('english'))

In [5]:
"""
standard_nltk = word_tokenize(text)
print(standard_nltk)
# output: ['The', 'quick', 'brown', 'fox', 'is', "n't", 'jumping', 'over', 
# 'the', 'lazy', 'dog', ',', 'co-founder', 'multi-word', 'expression', '.', 
# '#', 'yes', '!']

twitter_nltk = tweet_tokenizer.tokenize(text)
print(twitter_nltk)
# output: ['The', 'quick', 'brown', 'fox', "isn't", 'jumping', 'over', 
# 'the', 'lazy', 'dog', ',', 'co-founder', 'multi-word', 'expression', '.', 
# '#yes', '!']

"""

'\nstandard_nltk = word_tokenize(text)\nprint(standard_nltk)\n# output: [\'The\', \'quick\', \'brown\', \'fox\', \'is\', "n\'t", \'jumping\', \'over\', \n# \'the\', \'lazy\', \'dog\', \',\', \'co-founder\', \'multi-word\', \'expression\', \'.\', \n# \'#\', \'yes\', \'!\']\n\ntwitter_nltk = tweet_tokenizer.tokenize(text)\nprint(twitter_nltk)\n# output: [\'The\', \'quick\', \'brown\', \'fox\', "isn\'t", \'jumping\', \'over\', \n# \'the\', \'lazy\', \'dog\', \',\', \'co-founder\', \'multi-word\', \'expression\', \'.\', \n# \'#yes\', \'!\']\n\n'

We can try to remove stopwords and see how the model reacts, from **Symeonidis, Effrosynidis, and Arampatzis (2018). A comparative evaluation of pre-processing techniques and their interactions for twitter sentiment analysis**:

The technique of removing stopwords yielded ambiguous results. For the SS-Twitter dataset, none of the algorithms was over the baseline accuracy but for the SemEval dataset, on three classic algorithms, the results were satisfactory. 

The reasons for failure are: first, stopwords like ‘I’, ‘me’, ‘you’, present and are associated with expressions of sentiment ( Thelwall et al., 2012 ),second, the domain of Tweets for each dataset, and third the vocabulary and the age of users. According to Haas et al. (2011) young people tend to use more and more short text with slangs and many stopwords to express their feelings about themselves. 

In [6]:
def clean_text_column(text,lemmatizer=None, stemmer=None, remove_stopwords=None):
    text = text.lower()

    # Replace URLs and user mentions
    text = re.sub(r"http\S+|www\.\S+", "URL", text)
    text = re.sub(r"@\w+", "USER", text)

    # Expand contractions (we use contractions library for this)
    # Contractions library Source: https://pypi.org/project/contractions/
    text = contractions.fix(text)

    # # Replace numbers with [NUM]
    # text = re.sub(r"\d+(\.\d+)?", "[NUM]", text)

    # Convert to tickers (e.g., $AAPL to [TICKER])
    text = re.sub(r"\$[a-z]{1,5}", "[TICKER]", text)

    #Remove numbers
    text = re.sub(r"\d+", "", text)

    # Normalize punctuation repetitions
    text = re.sub(r"([!?\.])\1+", r"\1", text)

    # Tokenize
    tokens = tokenizer.tokenize(text)

    # Optionally remove stopwords and punctuation

    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    else:
        tokens = [token for token in tokens if token not in string.punctuation]
    
    # Lemmatization OR stemming (not both!)
    if lemmatizer is not None and stemmer is None:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif stemmer is not None and lemmatizer is None:
        tokens = [stemmer.stem(token) for token in tokens]
    elif lemmatizer is not None and stemmer is not None:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Else, leave tokens as is

    # Source: https://www.nltk.org/api/nltk.tokenize.treebank.html 
    # TreebankWordDetokenizer from NLTK takes care of the correct spacing and formatting, 
    # we you get a well-formed sentence that looks like natural English (e.g. without TreebankWordDetokinzer: This is an example tweet ! , With: This is an example tweet!)
    return TreebankWordDetokenizer().detokenize(tokens)

In [7]:
# Example usage of the contractions library
test = "Hello how're you doing? I'm doing fine!"
test_fix = contractions.fix(test)
print(test_fix)

Hello how are you doing? I am doing fine!


In [8]:
df_train_cleaned = df_train.copy()
df_test_cleaned = df_test.copy()

### Try the different combinations of pre-processing

In [9]:
# Define the combinations to try
combinations = [
    {'lemmatizer': None, 'stemmer': None, 'remove_stopwords': False, 'name': 'no_lemma_no_stem_with_stopwords'},
    {'lemmatizer': lemmatizer, 'stemmer': None, 'remove_stopwords': False, 'name': 'lemma_no_stem_with_stopwords'},
    {'lemmatizer': None, 'stemmer': stemmer, 'remove_stopwords': False, 'name': 'no_lemma_stem_with_stopwords'},
    {'lemmatizer': None, 'stemmer': None, 'remove_stopwords': True, 'name': 'no_lemma_no_stem_no_stopwords'},
    {'lemmatizer': lemmatizer, 'stemmer': None, 'remove_stopwords': True, 'name': 'lemma_no_stem_no_stopwords'},
    {'lemmatizer': None, 'stemmer': stemmer, 'remove_stopwords': True, 'name': 'no_lemma_stem_no_stopwords'},
    {'lemmatizer': lemmatizer, 'stemmer': stemmer, 'remove_stopwords': False, 'name': 'lemma_stem_with_stopwords'},
    {'lemmatizer': lemmatizer, 'stemmer': stemmer, 'remove_stopwords': True, 'name': 'lemma_stem_no_stopwords'}
]

# Process each combination and add to the dataframe
for combo in combinations:
    column_name = f"text_{combo['name']}"
    print(f"Processing {column_name}...")
    
    # Apply the clean_text_column function with the current combination
    df_train_cleaned[column_name] = df_train_cleaned['text'].apply(
        lambda x: clean_text_column(
            x, 
            lemmatizer=combo['lemmatizer'], 
            stemmer=combo['stemmer'], 
            remove_stopwords=combo['remove_stopwords']
        )
    )

# Also apply the best combination to the test set later after evaluation
print("Processing complete")

# Display the first few rows with all the combinations
df_train_cleaned.iloc[:10, :10].head()

Processing text_no_lemma_no_stem_with_stopwords...
Processing text_lemma_no_stem_with_stopwords...
Processing text_no_lemma_stem_with_stopwords...
Processing text_no_lemma_no_stem_no_stopwords...
Processing text_lemma_no_stem_no_stopwords...
Processing text_no_lemma_stem_no_stopwords...
Processing text_lemma_stem_with_stopwords...
Processing text_lemma_stem_no_stopwords...
Processing complete


Unnamed: 0,text,label,text_no_lemma_no_stem_with_stopwords,text_lemma_no_stem_with_stopwords,text_no_lemma_stem_with_stopwords,text_no_lemma_no_stem_no_stopwords,text_lemma_no_stem_no_stopwords,text_no_lemma_stem_no_stopwords,text_lemma_stem_with_stopwords,text_lemma_stem_no_stopwords
0,$BYND - JPMorgan reels in expectations on Beyond Meat https://t.co/bd0xbFGjkT,0,TICKER jpmorgan reels in expectations on beyond meat URL,TICKER jpmorgan reel in expectation on beyond meat URL,ticker jpmorgan reel in expect on beyond meat url,TICKER jpmorgan reels expectations beyond meat URL,TICKER jpmorgan reel expectation beyond meat URL,ticker jpmorgan reel expect beyond meat url,TICKER jpmorgan reel in expectation on beyond meat URL,TICKER jpmorgan reel expectation beyond meat URL
1,$CCL $RCL - Nomura points to bookings weakness at Carnival and Royal Caribbean https://t.co/yGjpT2ReD3,0,TICKER TICKER nomura points to bookings weakness at carnival and royal caribbean URL,TICKER TICKER nomura point to booking weakness at carnival and royal caribbean URL,ticker ticker nomura point to book weak at carniv and royal caribbean url,TICKER TICKER nomura points bookings weakness carnival royal caribbean URL,TICKER TICKER nomura point booking weakness carnival royal caribbean URL,ticker ticker nomura point book weak carniv royal caribbean url,TICKER TICKER nomura point to booking weakness at carnival and royal caribbean URL,TICKER TICKER nomura point booking weakness carnival royal caribbean URL
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan on weak building outlook https://t.co/KN1g4AWFIb",0,TICKER cemex cut at credit suisse j p morgan on weak building outlook URL,TICKER cemex cut at credit suisse j p morgan on weak building outlook URL,ticker cemex cut at credit suiss j p morgan on weak build outlook url,TICKER cemex cut credit suisse j p morgan weak building outlook URL,TICKER cemex cut credit suisse j p morgan weak building outlook URL,ticker cemex cut credit suiss j p morgan weak build outlook url,TICKER cemex cut at credit suisse j p morgan on weak building outlook URL,TICKER cemex cut credit suisse j p morgan weak building outlook URL
3,$ESS: BTIG Research cuts to Neutral https://t.co/MCyfTsXc2N,0,TICKER]: btig research cuts to neutral URL,TICKER]: btig research cut to neutral URL,ticker]: btig research cut to neutral url,TICKER]: btig research cuts neutral URL,TICKER]: btig research cut neutral URL,ticker]: btig research cut neutral url,TICKER]: btig research cut to neutral URL,TICKER]: btig research cut neutral URL
4,$FNKO - Funko slides after Piper Jaffray PT cut https://t.co/z37IJmCQzB,0,TICKER funko slides after piper jaffray pt cut URL,TICKER funko slide after piper jaffray pt cut URL,ticker funko slide after piper jaffray pt cut url,TICKER funko slides piper jaffray pt cut URL,TICKER funko slide piper jaffray pt cut URL,ticker funko slide piper jaffray pt cut url,TICKER funko slide after piper jaffray pt cut URL,TICKER funko slide piper jaffray pt cut URL


In [10]:
# Using stratify to maintain the distribution of classes in the train, validation, and test sets
train_df, val_test_df = train_test_split(df_train_cleaned, test_size=0.3, stratify=df_train_cleaned['label'], random_state=42)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, stratify=val_test_df['label'], random_state=42)

In [11]:
y_train = train_df['label']
y_val = val_df['label']
y_test = test_df['label']

## 3 - Feature Engineering

Note: (to remove later) 3.3 Sentence Encoders (using Transformers for Embeddings only)
Here, you use models like all-mpnet-base-v2, all-MiniLM-L6-v2, or USE, LASER, etc. to generate fixed-size sentence embeddings (vectors).

After you get these embeddings, you feed them to traditional ML models (like logistic regression, SVM, XGB, LSTM, etc.).

You do NOT fine-tune the transformer or use its classification head. You just use it as a feature extractor.

✅ This fits 3.3, because your core model is not a transformer. The transformer is just making better features.



We decided to follow a general pipeline, where based on the feature extraction technique we employ , and that it is adequate to the classification model we first define:

- 3.1. - Statistical Methods: Bag of Words, and TF-IDF -> 3.1.1 Classification models: SVC, XGB, Logistic Regression and KNN -> 3.1.2 Hyperparamter Tuning for the best feature extraction technique and for the best model

- 3.2. - Fixed Word Embedding Encoders -> Word2Vec, FastText , Glove-Twitter -> 3.2.1 Classification Models -> Keep the best traditional ML model from 3.1 and add BiLSTM , BiGRU , BiLSTM + Attention , BiGRU + Attention, CNN  (Source: https://sbert.net/docs/sentence_transformer/pretrained_models.html) 

- 3.3. - Contextual Word Embedding Encoders -> ELMO (mean and concat) -> 3.3.1 Classification Models -> Keep the best traditional ML model from 3.1 and add BiLSTM , BiGRU , BiLSTM + Attention , BiGRU + Attention, CNN 

- 3.4. - Sentence Encoders -> all-mpnet-base-v2 , all-distilroberta-v1 , all-MiniLM-L12-v2 , paraphrase-multilingual-mpnet-base-v2 -> 3.4.1 Classification Models -> Keep the best traditional ML model from 3.1 and add BiLSTM , BiGRU , BiLSTM + Attention , BiGRU + Attention, CNN

- 3.5 -> Transformers -> BERT base, BERT Large, XLNET base, XLNET large, Roberta Base, Roberta Large distilbert large, distilbert base, ALBERT x large-v1 , ALBERT-xxlarge-v2 , XLM-MLM-en-2048 , BART-LARGE  

- 3.6 -> Domain Specific Transformers: FinBert , BERTweet , FinTwitBERT (https://huggingface.co/StephanAkkerman/FinTwitBERT)


### 3.1 - Statistical Methods

### 3.1.1 - Bag of Words

In [12]:
combinations = [
    ("text_no_lemma_no_stem_with_stopwords", train_df["text_no_lemma_no_stem_with_stopwords"]),
    ("text_lemma_no_stem_with_stopwords", train_df["text_lemma_no_stem_with_stopwords"]),
    ("text_no_lemma_stem_with_stopwords", train_df["text_no_lemma_stem_with_stopwords"]),
    ("text_no_lemma_no_stem_no_stopwords", train_df["text_no_lemma_no_stem_no_stopwords"]),
    ("text_lemma_no_stem_no_stopwords", train_df["text_lemma_no_stem_no_stopwords"]),
    ("text_no_lemma_stem_no_stopwords", train_df["text_no_lemma_stem_no_stopwords"]),
    ("text_lemma_stem_with_stopwords", train_df["text_lemma_stem_with_stopwords"]),
    ("text_lemma_stem_no_stopwords", train_df["text_lemma_stem_no_stopwords"]),
]


In [72]:
# Fit vectorizer (BoW)
bow_vectorizer = CountVectorizer(ngram_range=(1,2), max_features=15_000)

bow_vectors = {}
for column_name, train_series in combinations:
    print(f"Fitting bow vectorizer for {column_name}...")
    
    bow_vectorizer.fit(train_series)
    X_train_bow = bow_vectorizer.transform(train_df[column_name])
    X_val_bow = bow_vectorizer.transform(val_df[column_name])
    X_test_bow = bow_vectorizer.transform(test_df[column_name])
    
    bow_vectors[column_name] = {
        "train": X_train_bow,
        "val": X_val_bow,
        "test": X_test_bow,
    }


Fitting bow vectorizer for text_no_lemma_no_stem_with_stopwords...
Fitting bow vectorizer for text_lemma_no_stem_with_stopwords...
Fitting bow vectorizer for text_no_lemma_stem_with_stopwords...
Fitting bow vectorizer for text_no_lemma_no_stem_no_stopwords...
Fitting bow vectorizer for text_lemma_no_stem_no_stopwords...
Fitting bow vectorizer for text_no_lemma_stem_no_stopwords...
Fitting bow vectorizer for text_lemma_stem_with_stopwords...
Fitting bow vectorizer for text_lemma_stem_no_stopwords...


### 3.1.2 - Traditional ML Classifiers using Bag of Words

#### SVC, XGB , Logistic Regression and KNN

### 3.1.2.1 - Without Oversampling 

In [14]:
models = {
    "SVC": SVC(class_weight='balanced', random_state=42),  # Add class_weight for imbalanced data
    "XGB": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=300, class_weight='balanced', random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}


In [13]:
results_bow = []


for col in combinations:
    column_name = col[0]
    print(f"\n=== Results for {column_name} ===")
    X_train = bow_vectors[column_name]["train"]
    X_val = bow_vectors[column_name]["val"]
    # (add X_test as needed)

    for name, model in models.items():
        print(f"\nTraining {name}...")
        # Fit model
        model.fit(X_train, y_train)
        # Predict
        y_pred = model.predict(X_val)

        # print(f"\n{name} - Validation set results:")
        # print(classification_report(y_val, y_pred, digits=3))

        report = classification_report(y_val, y_pred, output_dict=True)
        results_bow.append({
            "variant": column_name,
            "model": name,
            "accuracy": report["accuracy"],
            "macro_f1": report["macro avg"]["f1-score"],
            "macro_precision": report["macro avg"]["precision"],
            "macro_recall": report["macro avg"]["recall"],
            "weighted_f1": report["weighted avg"]["f1-score"],
            "weighted_precision": report["weighted avg"]["precision"],
            "weighted_recall": report["weighted avg"]["recall"],
        })

traditional_ml_bow = pd.DataFrame(results_bow)


=== Results for text_no_lemma_no_stem_with_stopwords ===


NameError: name 'bow_vectors' is not defined

In [None]:
logistic_regression_results = traditional_ml_bow[traditional_ml_bow['model'] == 'LogisticRegression']
logistic_regression_results.sort_values(by='macro_f1', ascending=False)


Unnamed: 0,variant,model,accuracy,macro_f1,macro_precision,macro_recall,weighted_f1,weighted_precision,weighted_recall
10,text_no_lemma_stem_with_stopwords,LogisticRegression,0.804333,0.729893,0.733927,0.726412,0.803083,0.802119,0.804333
2,text_no_lemma_no_stem_with_stopwords,LogisticRegression,0.804333,0.728305,0.734357,0.722844,0.802591,0.801212,0.804333
6,text_lemma_no_stem_with_stopwords,LogisticRegression,0.801537,0.72402,0.730592,0.718254,0.799584,0.798097,0.801537
26,text_lemma_stem_with_stopwords,LogisticRegression,0.801537,0.72402,0.730592,0.718254,0.799584,0.798097,0.801537
22,text_no_lemma_stem_no_stopwords,LogisticRegression,0.796646,0.722437,0.723031,0.721913,0.796537,0.79647,0.796646
18,text_lemma_no_stem_no_stopwords,LogisticRegression,0.789658,0.711204,0.7145,0.708107,0.788621,0.787707,0.789658
30,text_lemma_stem_no_stopwords,LogisticRegression,0.789658,0.711204,0.7145,0.708107,0.788621,0.787707,0.789658
14,text_no_lemma_no_stem_no_stopwords,LogisticRegression,0.786862,0.707967,0.712828,0.703438,0.78543,0.784258,0.786862


In [None]:
KNN_results = traditional_ml_bow[traditional_ml_bow['model'] == 'KNN']
KNN_results.sort_values(by='macro_f1', ascending=False)

Unnamed: 0,variant,model,accuracy,macro_f1,macro_precision,macro_recall,weighted_f1,weighted_precision,weighted_recall
11,text_no_lemma_stem_with_stopwords,KNN,0.691824,0.42519,0.77469,0.422158,0.608885,0.732466,0.691824
23,text_no_lemma_stem_no_stopwords,KNN,0.691824,0.424945,0.72905,0.422537,0.609742,0.710497,0.691824
19,text_lemma_no_stem_no_stopwords,KNN,0.689727,0.423141,0.701842,0.421497,0.608681,0.697303,0.689727
31,text_lemma_stem_no_stopwords,KNN,0.689727,0.423141,0.701842,0.421497,0.608681,0.697303,0.689727
7,text_lemma_no_stem_with_stopwords,KNN,0.691125,0.421091,0.770968,0.419825,0.606864,0.730511,0.691125
27,text_lemma_stem_with_stopwords,KNN,0.691125,0.421091,0.770968,0.419825,0.606864,0.730511,0.691125
15,text_no_lemma_no_stem_no_stopwords,KNN,0.686233,0.414729,0.679428,0.416129,0.60345,0.684915,0.686233
3,text_no_lemma_no_stem_with_stopwords,KNN,0.687631,0.413012,0.768899,0.414456,0.601183,0.728253,0.687631


In [None]:
SVC_results = traditional_ml_bow[traditional_ml_bow['model'] == 'SVC']
SVC_results.sort_values(by='macro_f1', ascending=False)

Unnamed: 0,variant,model,accuracy,macro_f1,macro_precision,macro_recall,weighted_f1,weighted_precision,weighted_recall
4,text_lemma_no_stem_with_stopwords,SVC,0.795248,0.708807,0.733814,0.690534,0.788879,0.78682,0.795248
24,text_lemma_stem_with_stopwords,SVC,0.795248,0.708807,0.733814,0.690534,0.788879,0.78682,0.795248
8,text_no_lemma_stem_with_stopwords,SVC,0.79385,0.706168,0.731985,0.686963,0.787265,0.785231,0.79385
16,text_lemma_no_stem_no_stopwords,SVC,0.791055,0.70311,0.719451,0.691943,0.786712,0.785097,0.791055
28,text_lemma_stem_no_stopwords,SVC,0.791055,0.70311,0.719451,0.691943,0.786712,0.785097,0.791055
20,text_no_lemma_stem_no_stopwords,SVC,0.78826,0.697505,0.710938,0.687732,0.784155,0.782031,0.78826
0,text_no_lemma_no_stem_with_stopwords,SVC,0.78826,0.696358,0.722241,0.677824,0.781179,0.778831,0.78826
12,text_no_lemma_no_stem_no_stopwords,SVC,0.783368,0.693948,0.713456,0.681265,0.778399,0.777025,0.783368


In [None]:
XGB_results = traditional_ml_bow[traditional_ml_bow['model'] == 'XGB']
XGB_results.sort_values(by='macro_f1', ascending=False)

Unnamed: 0,variant,model,accuracy,macro_f1,macro_precision,macro_recall,weighted_f1,weighted_precision,weighted_recall
9,text_no_lemma_stem_with_stopwords,XGB,0.803634,0.706439,0.793473,0.662359,0.788428,0.80125,0.803634
5,text_lemma_no_stem_with_stopwords,XGB,0.793152,0.684711,0.778643,0.64155,0.775179,0.788909,0.793152
25,text_lemma_stem_with_stopwords,XGB,0.793152,0.684711,0.778643,0.64155,0.775179,0.788909,0.793152
1,text_no_lemma_no_stem_with_stopwords,XGB,0.793152,0.683973,0.786771,0.637981,0.774109,0.791356,0.793152
21,text_no_lemma_stem_no_stopwords,XGB,0.791754,0.680158,0.781548,0.635747,0.772481,0.788734,0.791754
17,text_lemma_no_stem_no_stopwords,XGB,0.777079,0.654559,0.765019,0.612439,0.754866,0.773446,0.777079
29,text_lemma_stem_no_stopwords,XGB,0.777079,0.654559,0.765019,0.612439,0.754866,0.773446,0.777079
13,text_no_lemma_no_stem_no_stopwords,XGB,0.769392,0.643963,0.75689,0.601302,0.746,0.765352,0.769392


### 3.1.2.2 - With Oversampling 

### 3.1.3 - TF-IDF

In [74]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=15_000)


tfidf_vectors = {}

for column_name, train_series in combinations:
    print(f"Fitting tf idf vectorizer for {column_name}...")
    
    tfidf_vectorizer.fit(train_series)
    X_train_tfidf = tfidf_vectorizer.transform(train_df[column_name])
    X_val_tfidf = tfidf_vectorizer.transform(val_df[column_name])
    X_test_tfidf = tfidf_vectorizer.transform(test_df[column_name])
    
    tfidf_vectors[column_name] = {
        "train": X_train_tfidf,
        "val": X_val_tfidf,
        "test": X_test_tfidf,
    }


Fitting tf idf vectorizer for text_no_lemma_no_stem_with_stopwords...
Fitting tf idf vectorizer for text_lemma_no_stem_with_stopwords...
Fitting tf idf vectorizer for text_no_lemma_stem_with_stopwords...
Fitting tf idf vectorizer for text_no_lemma_no_stem_no_stopwords...
Fitting tf idf vectorizer for text_lemma_no_stem_no_stopwords...
Fitting tf idf vectorizer for text_no_lemma_stem_no_stopwords...
Fitting tf idf vectorizer for text_lemma_stem_with_stopwords...
Fitting tf idf vectorizer for text_lemma_stem_no_stopwords...


### 3.1.4 - Traditional ML Classifiers using TF-IDF

### 3.1.4.1 - Without Oversampling 

In [None]:
results_tfidf = []


for col in combinations:
    column_name = col[0]
    # print(f"\n=== Results for {column_name} ===")
    X_train = tfidf_vectors[column_name]["train"]
    X_val = tfidf_vectors[column_name]["val"]
    # (X_test if we needed)

    for name, model in models.items():
        # print(f"\nTraining {name}...")
        # Fit model
        model.fit(X_train, y_train)
        # Predict
        y_pred = model.predict(X_val)

        # print(f"\n{name} - Validation set results:")
        # print(classification_report(y_val, y_pred, digits=3))

        report = classification_report(y_val, y_pred, output_dict=True)
        results_tfidf.append({
            "variant": column_name,
            "model": name,
            "accuracy": report["accuracy"],
            "macro_f1": report["macro avg"]["f1-score"],
            "macro_precision": report["macro avg"]["precision"],
            "macro_recall": report["macro avg"]["recall"],
            "weighted_f1": report["weighted avg"]["f1-score"],
            "weighted_precision": report["weighted avg"]["precision"],
            "weighted_recall": report["weighted avg"]["recall"],
        })

traditional_ml_tfidf = pd.DataFrame(results_tfidf)


=== Results for text_no_lemma_no_stem_with_stopwords ===

Training SVC...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Training LogisticRegression...

Training KNN...

=== Results for text_lemma_no_stem_with_stopwords ===

Training SVC...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Training LogisticRegression...

Training KNN...

=== Results for text_no_lemma_stem_with_stopwords ===

Training SVC...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Training LogisticRegression...

Training KNN...

=== Results for text_no_lemma_no_stem_no_stopwords ===

Training SVC...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Training LogisticRegression...

Training KNN...

=== Results for text_lemma_no_stem_no_stopwords ===

Training SVC...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Training LogisticRegression...

Training KNN...

=== Results for text_no_lemma_stem_no_stopwords ===

Training SVC...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Training LogisticRegression...

Training KNN...

=== Results for text_lemma_stem_with_stopwords ===

Training SVC...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Training LogisticRegression...

Training KNN...

=== Results for text_lemma_stem_no_stopwords ===

Training SVC...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Training LogisticRegression...

Training KNN...


In [78]:
logreg_tfidf = traditional_ml_tfidf[traditional_ml_tfidf['model'] == 'LogisticRegression']
logreg_tfidf.sort_values(by='macro_f1', ascending=False)

Unnamed: 0,variant,model,accuracy,macro_f1,macro_precision,macro_recall,weighted_f1,weighted_precision,weighted_recall
10,text_no_lemma_stem_with_stopwords,LogisticRegression,0.794549,0.726905,0.717443,0.737918,0.797449,0.80158,0.794549
2,text_no_lemma_no_stem_with_stopwords,LogisticRegression,0.791055,0.722747,0.714299,0.73255,0.793628,0.797305,0.791055
6,text_lemma_no_stem_with_stopwords,LogisticRegression,0.787561,0.720361,0.710561,0.731926,0.790626,0.795116,0.787561
26,text_lemma_stem_with_stopwords,LogisticRegression,0.787561,0.720361,0.710561,0.731926,0.790626,0.795116,0.787561
22,text_no_lemma_stem_no_stopwords,LogisticRegression,0.781971,0.710177,0.70095,0.721531,0.785385,0.790555,0.781971
18,text_lemma_no_stem_no_stopwords,LogisticRegression,0.779175,0.703476,0.695583,0.712576,0.782105,0.786011,0.779175
30,text_lemma_stem_no_stopwords,LogisticRegression,0.779175,0.703476,0.695583,0.712576,0.782105,0.786011,0.779175
14,text_no_lemma_no_stem_no_stopwords,LogisticRegression,0.772187,0.693545,0.68631,0.701839,0.775084,0.77884,0.772187


In [79]:
svc_tfidf = traditional_ml_tfidf[traditional_ml_tfidf['model'] == 'SVC']
svc_tfidf.sort_values(by='macro_f1', ascending=False)

Unnamed: 0,variant,model,accuracy,macro_f1,macro_precision,macro_recall,weighted_f1,weighted_precision,weighted_recall
8,text_no_lemma_stem_with_stopwords,SVC,0.815514,0.731998,0.778293,0.703326,0.806934,0.808865,0.815514
0,text_no_lemma_no_stem_with_stopwords,SVC,0.809224,0.718657,0.778195,0.68669,0.798557,0.803592,0.809224
4,text_lemma_no_stem_with_stopwords,SVC,0.806429,0.716831,0.769087,0.687642,0.796493,0.799723,0.806429
24,text_lemma_stem_with_stopwords,SVC,0.806429,0.716831,0.769087,0.687642,0.796493,0.799723,0.806429
20,text_no_lemma_stem_no_stopwords,SVC,0.798742,0.702859,0.745125,0.67814,0.789097,0.789579,0.798742
16,text_lemma_no_stem_no_stopwords,SVC,0.797345,0.7022,0.74591,0.676204,0.787496,0.788126,0.797345
28,text_lemma_stem_no_stopwords,SVC,0.797345,0.7022,0.74591,0.676204,0.787496,0.788126,0.797345
12,text_no_lemma_no_stem_no_stopwords,SVC,0.791754,0.690145,0.745385,0.663118,0.780304,0.78425,0.791754


In [80]:
knn_tfidf = traditional_ml_tfidf[traditional_ml_tfidf['model'] == 'KNN']
knn_tfidf.sort_values(by='macro_f1', ascending=False)

Unnamed: 0,variant,model,accuracy,macro_f1,macro_precision,macro_recall,weighted_f1,weighted_precision,weighted_recall
7,text_lemma_no_stem_with_stopwords,KNN,0.686233,0.400782,0.832377,0.407397,0.594207,0.757238,0.686233
27,text_lemma_stem_with_stopwords,KNN,0.686233,0.400782,0.832377,0.407397,0.594207,0.757238,0.686233
3,text_no_lemma_no_stem_with_stopwords,KNN,0.684137,0.39942,0.798271,0.406317,0.592871,0.739143,0.684137
11,text_no_lemma_stem_with_stopwords,KNN,0.684137,0.397212,0.806693,0.405141,0.591853,0.744217,0.684137
19,text_lemma_no_stem_no_stopwords,KNN,0.681342,0.393211,0.735729,0.402485,0.589159,0.709517,0.681342
31,text_lemma_stem_no_stopwords,KNN,0.681342,0.393211,0.735729,0.402485,0.589159,0.709517,0.681342
23,text_no_lemma_stem_no_stopwords,KNN,0.680643,0.389669,0.753325,0.40057,0.587281,0.71697,0.680643
15,text_no_lemma_no_stem_no_stopwords,KNN,0.677149,0.376276,0.719072,0.392809,0.579482,0.700264,0.677149


In [81]:
xgb_tfidf = traditional_ml_tfidf[traditional_ml_tfidf['model'] == 'XGB']
xgb_tfidf.sort_values(by='macro_f1', ascending=False)

Unnamed: 0,variant,model,accuracy,macro_f1,macro_precision,macro_recall,weighted_f1,weighted_precision,weighted_recall
9,text_no_lemma_stem_with_stopwords,XGB,0.791055,0.687158,0.767445,0.64631,0.774816,0.785059,0.791055
5,text_lemma_no_stem_with_stopwords,XGB,0.78826,0.678494,0.770458,0.63584,0.76997,0.783068,0.78826
25,text_lemma_stem_with_stopwords,XGB,0.78826,0.678494,0.770458,0.63584,0.76997,0.783068,0.78826
1,text_no_lemma_no_stem_with_stopwords,XGB,0.783368,0.667776,0.763118,0.625007,0.763354,0.777473,0.783368
21,text_no_lemma_stem_no_stopwords,XGB,0.775681,0.666082,0.73927,0.628942,0.758951,0.766108,0.775681
17,text_lemma_no_stem_no_stopwords,XGB,0.77638,0.652027,0.740427,0.614012,0.755425,0.765978,0.77638
29,text_lemma_stem_no_stopwords,XGB,0.77638,0.652027,0.740427,0.614012,0.755425,0.765978,0.77638
13,text_no_lemma_no_stem_no_stopwords,XGB,0.763802,0.631598,0.73419,0.59292,0.739667,0.754626,0.763802


### 3.1.5 - Hyperparameter Optimization for the best model coming from BoW and TF-IDF

#### 1) BoW

The best variant and model that results from BoW is: **text_no_lemma_stem_with_stopwords** with **LogisticRegression**	having a **0.729893 f1 macro average score**.

In [None]:
pipe_bow = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression(class_weight='balanced', random_state=42))
])

# Define the parameter grid for GridSearchCV

param_grid = {
    'vect__ngram_range': [(1,1), (1,2), (1,3)],
    'vect__max_features': [10000, 15000, 20000],
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [200, 300, 500],
    'clf__solver': ['lbfgs', 'saga'], #'saga' is better with large vocabularies and can be faster with sparse data.
    'vect__binary': [False, True],
    'vect__min_df': [1, 2, 5],
    'vect__max_df': [0.8, 0.9, 1.0] 
}

X_train_hyperparam = train_df['text_no_lemma_stem_with_stopwords']

grid_search = GridSearchCV(pipe_bow, param_grid, cv=3, scoring='f1_macro', n_jobs=-1, verbose=2)

grid_search.fit(X_train_hyperparam, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best CV f1_macro score:", grid_search.best_score_)

Fitting 3 folds for each of 4860 candidates, totalling 14580 fits
Best parameters: {'clf__C': 1, 'clf__max_iter': 200, 'clf__solver': 'saga', 'vect__binary': True, 'vect__max_df': 0.8, 'vect__max_features': 15000, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}
Best CV f1_macro score: 0.7265723241238174




In [83]:
model_tuned_bow = grid_search.best_estimator_
# Evaluate the tuned model on the validation set
X_val_hyperparam = val_df['text_no_lemma_stem_with_stopwords']
y_val_pred = model_tuned_bow.predict(X_val_hyperparam)
print("Validation set results after hyperparameter tuning:")
print(classification_report(y_val, y_val_pred, digits=3))

#Evaluate on the test set
X_test_hyperparam = test_df['text_no_lemma_stem_with_stopwords']
y_test_pred = model_tuned_bow.predict(X_test_hyperparam)
print("Test set results after hyperparameter tuning:")
print(classification_report(y_test, y_test_pred, digits=3))


Validation set results after hyperparameter tuning:
              precision    recall  f1-score   support

           0      0.627     0.590     0.608       217
           1      0.701     0.726     0.713       288
           2      0.881     0.883     0.882       926

    accuracy                          0.807      1431
   macro avg      0.736     0.733     0.734      1431
weighted avg      0.806     0.807     0.806      1431

Test set results after hyperparameter tuning:
              precision    recall  f1-score   support

           0      0.609     0.583     0.596       216
           1      0.702     0.727     0.714       289
           2      0.875     0.874     0.874       927

    accuracy                          0.800      1432
   macro avg      0.729     0.728     0.728      1432
weighted avg      0.800     0.800     0.800      1432



#### 2) TF-IDF

In [None]:
# Define pipeline
pipe_tfidf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression(class_weight='balanced', random_state=42))
])


param_grid_tfidf = {
    'vect__ngram_range': [(1,1), (1,2), (1,3)],
    'vect__max_features': [10000, 15000, 20000],
    'vect__use_idf': [True, False],
    'vect__smooth_idf': [True, False],
    'vect__sublinear_tf': [True, False],
    'vect__min_df': [1, 2, 5],
    'vect__max_df': [0.8, 0.9, 1.0],
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__max_iter': [200, 300, 500],
    'clf__solver': ['lbfgs', 'saga'],
}

X_train_tfidf_hyperparam = train_df['text_no_lemma_stem_with_stopwords']

grid_search_tfidf = GridSearchCV(pipe_tfidf, param_grid_tfidf, cv=3, scoring='f1_macro', n_jobs=-1, verbose=2)
grid_search_tfidf.fit(X_train_hyperparam, y_train)

print("Best parameters:", grid_search_tfidf.best_params_)
print("Best CV f1_macro score:", grid_search_tfidf.best_score_)


### 3.2 - Fixed Word Embedding Encoders

### 3.2.1 - Word2Vec

In [29]:
# Same hyperparameters as in AStudy of Feature Extraction techniques for Sentiment Analysis -> to be tuned later
# Source: https://link.springer.com/chapter/10.1007/978-981-13-1501-5_41


w2v_vectors = {}
w2v_models = {}
vector_size = 100  # Set your embedding size

for column_name, train_series in combinations:
    print(f"Training Word2Vec for {column_name}...")
    
    # Tokenize the tweets (lists of tokens)
    train_sentences = [tweet.split() for tweet in train_df[column_name]]
    val_sentences = [tweet.split() for tweet in val_df[column_name]]
    test_sentences = [tweet.split() for tweet in test_df[column_name]]

    # Train Word2Vec on the train set for this variant
    w2v_model = Word2Vec(sentences=train_sentences, vector_size=vector_size, window=10, min_count=1, workers=7)
    
    # Store the model for this variant
    w2v_models[column_name] = w2v_model
    
    # Function to get sentence embeddings
    def avg_vector(tokens, model, size):
        valid = [t for t in tokens if t in model.wv]
        return np.mean(model.wv[valid], axis=0) if valid else np.zeros(size)
    
    # Transform each split into sentence vectors
    X_train_w2v = np.vstack([avg_vector(tokens, w2v_model, vector_size) for tokens in train_sentences])
    X_val_w2v   = np.vstack([avg_vector(tokens, w2v_model, vector_size) for tokens in val_sentences])
    X_test_w2v  = np.vstack([avg_vector(tokens, w2v_model, vector_size) for tokens in test_sentences])

    w2v_vectors[column_name] = {
        "train": X_train_w2v,
        "val": X_val_w2v,
        "test": X_test_w2v,
    }

Training Word2Vec for text_no_lemma_no_stem_with_stopwords...
Training Word2Vec for text_lemma_no_stem_with_stopwords...
Training Word2Vec for text_no_lemma_stem_with_stopwords...
Training Word2Vec for text_no_lemma_no_stem_no_stopwords...
Training Word2Vec for text_lemma_no_stem_no_stopwords...
Training Word2Vec for text_no_lemma_stem_no_stopwords...
Training Word2Vec for text_lemma_stem_with_stopwords...
Training Word2Vec for text_lemma_stem_no_stopwords...


In [27]:
w2v_vectors['text_no_lemma_stem_with_stopwords']['train'].shape, w2v_vectors['text_no_lemma_stem_with_stopwords']['val'].shape, w2v_vectors['text_no_lemma_stem_with_stopwords']['test'].shape

((6680, 100), (1431, 100), (1432, 100))

### 3.2.2 - Classifiers using Word2Vec

SEE: https://github.com/f-data/finSENT/blob/master/models/models.py


1) Logistic regression

In [None]:
models_w2v = {
    "LogisticRegression": LogisticRegression(max_iter=300, class_weight='balanced', random_state=42),
    # "XGB": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

In [22]:
results_log_reg_w2v = []


for col in combinations:
    column_name = col[0]
    print(f"\n=== Results for {column_name} ===")
    X_train = w2v_vectors[column_name]["train"]
    X_val = w2v_vectors[column_name]["val"]
    # (add X_test as needed)

    for name, model in models_w2v.items():
        print(f"\nTraining {name}...")
        # Fit model
        model.fit(X_train, y_train)
        # Predict
        y_pred = model.predict(X_val)

        # print(f"\n{name} - Validation set results:")
        # print(classification_report(y_val, y_pred, digits=3))

        report = classification_report(y_val, y_pred, output_dict=True)
        results_log_reg_w2v.append({
            "variant": column_name,
            "model": name,
            "accuracy": report["accuracy"],
            "macro_f1": report["macro avg"]["f1-score"],
            "macro_precision": report["macro avg"]["precision"],
            "macro_recall": report["macro avg"]["recall"],
            "weighted_f1": report["weighted avg"]["f1-score"],
            "weighted_precision": report["weighted avg"]["precision"],
            "weighted_recall": report["weighted avg"]["recall"],
        })

df_log_reg_w2v = pd.DataFrame(results_log_reg_w2v)


=== Results for text_no_lemma_no_stem_with_stopwords ===

Training LogisticRegression...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Results for text_lemma_no_stem_with_stopwords ===

Training LogisticRegression...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Results for text_no_lemma_stem_with_stopwords ===

Training LogisticRegression...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Results for text_no_lemma_no_stem_no_stopwords ===

Training LogisticRegression...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Results for text_lemma_no_stem_no_stopwords ===

Training LogisticRegression...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Results for text_no_lemma_stem_no_stopwords ===

Training LogisticRegression...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Results for text_lemma_stem_with_stopwords ===

Training LogisticRegression...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Results for text_lemma_stem_no_stopwords ===

Training LogisticRegression...

Training XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [23]:
df_log_reg_w2v.sort_values(by='macro_f1', ascending=False)

Unnamed: 0,variant,model,accuracy,macro_f1,macro_precision,macro_recall,weighted_f1,weighted_precision,weighted_recall
5,text_no_lemma_stem_with_stopwords,XGB,0.683438,0.48276,0.54398,0.472243,0.643369,0.636971,0.683438
4,text_no_lemma_stem_with_stopwords,LogisticRegression,0.580014,0.473574,0.471797,0.484054,0.598479,0.625033,0.580014
2,text_lemma_no_stem_with_stopwords,LogisticRegression,0.566737,0.467652,0.465707,0.483296,0.587973,0.623073,0.566737
12,text_lemma_stem_with_stopwords,LogisticRegression,0.563941,0.464939,0.463591,0.479842,0.585751,0.621245,0.563941
3,text_lemma_no_stem_with_stopwords,XGB,0.672956,0.46386,0.522372,0.457733,0.631418,0.62339,0.672956
1,text_no_lemma_no_stem_with_stopwords,XGB,0.668064,0.461937,0.505388,0.456848,0.628857,0.615968,0.668064
0,text_no_lemma_no_stem_with_stopwords,LogisticRegression,0.561845,0.459819,0.460126,0.471087,0.583497,0.615974,0.561845
13,text_lemma_stem_with_stopwords,XGB,0.672257,0.457794,0.519038,0.452249,0.627285,0.619146,0.672257
7,text_no_lemma_no_stem_no_stopwords,XGB,0.679944,0.451631,0.536439,0.449192,0.625759,0.624721,0.679944
11,text_no_lemma_stem_no_stopwords,XGB,0.675751,0.447985,0.518588,0.444979,0.623955,0.617442,0.675751


2) BiLSTM

In [None]:
max_len = 64
vector_size = 100

In [None]:
def tweet_to_sequence(tweet, w2v_model, vector_size, maxlen):
    tokens = tweet.split()
    seq = []
    for token in tokens:
        if token in w2v_model.wv:
            seq.append(w2v_model.wv[token])
        else:
            seq.append(np.zeros(vector_size))
    # Pad or truncate
    if len(seq) < maxlen:
        seq += [np.zeros(vector_size)] * (maxlen - len(seq))
    else:
        seq = seq[:maxlen]
    return np.array(seq)


In [43]:
def build_bilstm(units, input_length, embed_size):
    input_ = Input(shape=(input_length, embed_size))
    x = Bidirectional(LSTM(units, return_sequences=False, dropout=0.25, recurrent_dropout=0.25))(input_)
    out = Dense(3, activation='softmax')(x) # nr of classes 
    model = Model(inputs=input_, outputs=out)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    return model

In [33]:
bilstm_inputs = {}

for column_name, train_series in combinations:
    print(f"Building BiLSTM input arrays for {column_name}...")

    # Load or train Word2Vec for this variant
    w2v_model = w2v_models[column_name]

    # Prepare the sequences for each split
    X_train_bilstm = np.stack([tweet_to_sequence(tweet, w2v_model, vector_size, max_len) for tweet in train_df[column_name]])
    X_val_bilstm   = np.stack([tweet_to_sequence(tweet, w2v_model, vector_size, max_len) for tweet in val_df[column_name]])
    X_test_bilstm  = np.stack([tweet_to_sequence(tweet, w2v_model, vector_size, max_len) for tweet in test_df[column_name]])

    # Store
    bilstm_inputs[column_name] = {
        "train": X_train_bilstm,
        "val": X_val_bilstm,
        "test": X_test_bilstm,
    }

Building BiLSTM input arrays for text_no_lemma_no_stem_with_stopwords...
Building BiLSTM input arrays for text_lemma_no_stem_with_stopwords...
Building BiLSTM input arrays for text_no_lemma_stem_with_stopwords...
Building BiLSTM input arrays for text_no_lemma_no_stem_no_stopwords...
Building BiLSTM input arrays for text_lemma_no_stem_no_stopwords...
Building BiLSTM input arrays for text_no_lemma_stem_no_stopwords...
Building BiLSTM input arrays for text_lemma_stem_with_stopwords...
Building BiLSTM input arrays for text_lemma_stem_no_stopwords...


In [None]:
from keras.utils import to_categorical

In [None]:
# Choose our variant
column_name = "text_no_lemma_stem_with_stopwords"

# Get data for that variant
X_train = bilstm_inputs[column_name]["train"]
X_val   = bilstm_inputs[column_name]["val"]
X_test  = bilstm_inputs[column_name]["test"]

# Convert your y labels to categorical (one-hot)
y_train_cat = to_categorical(y_train, num_classes=3)
y_val_cat   = to_categorical(y_val, num_classes=3)
y_test_cat  = to_categorical(y_test, num_classes=3)

# Build the BiLSTM model
model = build_bilstm(
    units=64,
    input_length=X_train.shape[1],  # Should be maxlen
    embed_size=X_train.shape[2]     # Should be vector_size from w2v (e.g., 100)
)

# Train the model
model.fit(
    X_train, y_train_cat,
    validation_data=(X_val, y_val_cat),
    epochs=30,
    batch_size=64
)

# Evaluate
val_preds = np.argmax(model.predict(X_val), axis=1)
print(classification_report(y_val, val_preds, digits=3))

In [None]:
# for column_name in bilstm_inputs:
#     print(f"Training BiLSTM for {column_name}...")
#     X_train = bilstm_inputs[column_name]["train"]
#     X_val = bilstm_inputs[column_name]["val"]
#     X_test = bilstm_inputs[column_name]["test"]

#     # Assume y_train, y_val, y_test already exist as integer class labels
#     y_train_cat = to_categorical(y_train, num_classes=3)
#     y_val_cat = to_categorical(y_val, num_classes=3)
#     y_test_cat = to_categorical(y_test, num_classes=3)

#     model = build_bilstm(
#         units=64,
#         input_length=X_train.shape[1], #max_len,
#         embed_size=X_train.shape[2]   # vector size from Word2Vec, e.g. 100
#     )

#     # Train
#     model.fit(X_train, y_train_cat, validation_data=(X_val, y_val_cat), epochs=30, batch_size=64)

#     # Predict and evaluate
#     val_preds = np.argmax(model.predict(X_val), axis=1)
#     print(classification_report(y_val, val_preds, digits=3))

Training BiLSTM for text_no_lemma_no_stem_with_stopwords...
Epoch 1/30
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 94ms/step - accuracy: 0.6314 - loss: 0.9167 - val_accuracy: 0.6471 - val_loss: 0.8820
Epoch 2/30
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 90ms/step - accuracy: 0.6535 - loss: 0.8629 - val_accuracy: 0.6527 - val_loss: 0.8568
Epoch 3/30
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 93ms/step - accuracy: 0.6412 - loss: 0.8715 - val_accuracy: 0.6541 - val_loss: 0.8548
Epoch 4/30
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 108ms/step - accuracy: 0.6642 - loss: 0.8365 - val_accuracy: 0.6632 - val_loss: 0.8444
Epoch 5/30
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 125ms/step - accuracy: 0.6603 - loss: 0.8474 - val_accuracy: 0.6604 - val_loss: 0.8402
Epoch 6/30
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 117ms/step - accuracy: 0.6653 - loss: 0.

KeyboardInterrupt: 