## 0 - Imports

In [16]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import scale
from sklearn.model_selection import GridSearchCV
import re
import string
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import re
import string

from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Text extraction 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec


# Models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

import contractions
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import gensim.downloader as api

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess

import pickle

# Deep Learning libraries
from keras.models import Sequential,Model
from keras.layers import Dense, Activation, Dropout, Flatten, Input
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, LSTM, Bidirectional, Dropout, Flatten, GRU
from tensorflow.keras.optimizers import Adam


from imblearn.under_sampling import RandomUnderSampler



import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Set pd options to display all columns and rows
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 30)
pd.set_option('display.max_colwidth', None)  # Show full text without truncation


# Download required resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hugof\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hugof\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hugof\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hugof\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## 1 - EDA (missing, just copy the other notebook)

In [18]:
# Define the base directory (where the notebook is)
BASE_DIR = os.path.dirname(os.path.abspath("__file__"))

# Construct full paths to the CSV files
train_path = os.path.join(BASE_DIR, "data", "train.csv")
test_path = os.path.join(BASE_DIR, "data", "test.csv")

# Load the datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)







## 2 - Pre-Processing

In [19]:
lemmatizer = WordNetLemmatizer()

# Source: https://www.nltk.org/api/nltk.tokenize.casual.html
# Difference between TweetTokenizer and Word_Tokenize: https://stackoverflow.com/questions/61919670/how-nltk-tweettokenizer-different-from-nltk-word-tokenize
tokenizer = TweetTokenizer()

stemmer = PorterStemmer()

stop_words = set(stopwords.words('english'))

In [20]:
def clean_text_column(text,lemmatizer=None, stemmer=None, remove_stopwords=None):
    text = text.lower()

    # Replace URLs and user mentions
    text = re.sub(r"http\S+|www\.\S+", "URL", text)
    text = re.sub(r"@\w+", "USER", text)

    # Expand contractions (we use contractions library for this)
    # Contractions library Source: https://pypi.org/project/contractions/
    text = contractions.fix(text)

    # # Replace numbers with [NUM]
    # text = re.sub(r"\d+(\.\d+)?", "[NUM]", text)

    # Convert to tickers (e.g., $AAPL to [TICKER])
    text = re.sub(r"\$[a-z]{1,5}", "[TICKER]", text)

    #Remove numbers
    text = re.sub(r"\d+", "", text)

    # Normalize punctuation repetitions
    text = re.sub(r"([!?\.])\1+", r"\1", text)

    # Tokenize
    tokens = tokenizer.tokenize(text)

    # Optionally remove stopwords and punctuation

    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    else:
        tokens = [token for token in tokens if token not in string.punctuation]
    
    # Lemmatization OR stemming (not both!)
    if lemmatizer is not None and stemmer is None:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif stemmer is not None and lemmatizer is None:
        tokens = [stemmer.stem(token) for token in tokens]
    elif lemmatizer is not None and stemmer is not None:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Else, leave tokens as is

    # Source: https://www.nltk.org/api/nltk.tokenize.treebank.html 
    # TreebankWordDetokenizer from NLTK takes care of the correct spacing and formatting, 
    # we you get a well-formed sentence that looks like natural English (e.g. without TreebankWordDetokinzer: This is an example tweet ! , With: This is an example tweet!)
    return TreebankWordDetokenizer().detokenize(tokens)

In [21]:
df_train_cleaned = df_train.copy()
df_test_cleaned = df_test.copy()

### Try the different combinations of pre-processing

In [22]:
# Define the combinations to try
combinations = [
    {'lemmatizer': None, 'stemmer': None, 'remove_stopwords': False, 'name': 'no_lemma_no_stem_with_stopwords'},
    {'lemmatizer': lemmatizer, 'stemmer': None, 'remove_stopwords': False, 'name': 'lemma_no_stem_with_stopwords'},
    {'lemmatizer': None, 'stemmer': stemmer, 'remove_stopwords': False, 'name': 'no_lemma_stem_with_stopwords'},
    {'lemmatizer': None, 'stemmer': None, 'remove_stopwords': True, 'name': 'no_lemma_no_stem_no_stopwords'},
    {'lemmatizer': lemmatizer, 'stemmer': None, 'remove_stopwords': True, 'name': 'lemma_no_stem_no_stopwords'},
    {'lemmatizer': None, 'stemmer': stemmer, 'remove_stopwords': True, 'name': 'no_lemma_stem_no_stopwords'},
    {'lemmatizer': lemmatizer, 'stemmer': stemmer, 'remove_stopwords': False, 'name': 'lemma_stem_with_stopwords'},
    {'lemmatizer': lemmatizer, 'stemmer': stemmer, 'remove_stopwords': True, 'name': 'lemma_stem_no_stopwords'}
]

# Process each combination and add to the dataframe
for combo in combinations:
    column_name = f"text_{combo['name']}"
    print(f"Processing {column_name}...")
    
    # Apply the clean_text_column function with the current combination
    df_train_cleaned[column_name] = df_train_cleaned['text'].apply(
        lambda x: clean_text_column(
            x, 
            lemmatizer=combo['lemmatizer'], 
            stemmer=combo['stemmer'], 
            remove_stopwords=combo['remove_stopwords']
        )
    )

# Also apply the best combination to the test set later after evaluation
print("Processing complete")

# Display the first few rows with all the combinations
df_train_cleaned.iloc[:10, :10].head()

Processing text_no_lemma_no_stem_with_stopwords...
Processing text_lemma_no_stem_with_stopwords...
Processing text_no_lemma_stem_with_stopwords...
Processing text_no_lemma_no_stem_no_stopwords...
Processing text_lemma_no_stem_no_stopwords...
Processing text_no_lemma_stem_no_stopwords...
Processing text_lemma_stem_with_stopwords...
Processing text_lemma_stem_no_stopwords...
Processing complete


Unnamed: 0,text,label,text_no_lemma_no_stem_with_stopwords,text_lemma_no_stem_with_stopwords,text_no_lemma_stem_with_stopwords,text_no_lemma_no_stem_no_stopwords,text_lemma_no_stem_no_stopwords,text_no_lemma_stem_no_stopwords,text_lemma_stem_with_stopwords,text_lemma_stem_no_stopwords
0,$BYND - JPMorgan reels in expectations on Beyond Meat https://t.co/bd0xbFGjkT,0,TICKER jpmorgan reels in expectations on beyond meat URL,TICKER jpmorgan reel in expectation on beyond meat URL,ticker jpmorgan reel in expect on beyond meat url,TICKER jpmorgan reels expectations beyond meat URL,TICKER jpmorgan reel expectation beyond meat URL,ticker jpmorgan reel expect beyond meat url,TICKER jpmorgan reel in expectation on beyond meat URL,TICKER jpmorgan reel expectation beyond meat URL
1,$CCL $RCL - Nomura points to bookings weakness at Carnival and Royal Caribbean https://t.co/yGjpT2ReD3,0,TICKER TICKER nomura points to bookings weakness at carnival and royal caribbean URL,TICKER TICKER nomura point to booking weakness at carnival and royal caribbean URL,ticker ticker nomura point to book weak at carniv and royal caribbean url,TICKER TICKER nomura points bookings weakness carnival royal caribbean URL,TICKER TICKER nomura point booking weakness carnival royal caribbean URL,ticker ticker nomura point book weak carniv royal caribbean url,TICKER TICKER nomura point to booking weakness at carnival and royal caribbean URL,TICKER TICKER nomura point booking weakness carnival royal caribbean URL
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan on weak building outlook https://t.co/KN1g4AWFIb",0,TICKER cemex cut at credit suisse j p morgan on weak building outlook URL,TICKER cemex cut at credit suisse j p morgan on weak building outlook URL,ticker cemex cut at credit suiss j p morgan on weak build outlook url,TICKER cemex cut credit suisse j p morgan weak building outlook URL,TICKER cemex cut credit suisse j p morgan weak building outlook URL,ticker cemex cut credit suiss j p morgan weak build outlook url,TICKER cemex cut at credit suisse j p morgan on weak building outlook URL,TICKER cemex cut credit suisse j p morgan weak building outlook URL
3,$ESS: BTIG Research cuts to Neutral https://t.co/MCyfTsXc2N,0,TICKER]: btig research cuts to neutral URL,TICKER]: btig research cut to neutral URL,ticker]: btig research cut to neutral url,TICKER]: btig research cuts neutral URL,TICKER]: btig research cut neutral URL,ticker]: btig research cut neutral url,TICKER]: btig research cut to neutral URL,TICKER]: btig research cut neutral URL
4,$FNKO - Funko slides after Piper Jaffray PT cut https://t.co/z37IJmCQzB,0,TICKER funko slides after piper jaffray pt cut URL,TICKER funko slide after piper jaffray pt cut URL,ticker funko slide after piper jaffray pt cut url,TICKER funko slides piper jaffray pt cut URL,TICKER funko slide piper jaffray pt cut URL,ticker funko slide piper jaffray pt cut url,TICKER funko slide after piper jaffray pt cut URL,TICKER funko slide piper jaffray pt cut URL


In [None]:
# Using stratify to maintain the distribution of classes in the train, validation, and test sets

train_df, val_test_df = train_test_split(df_train_cleaned, test_size=0.3, stratify=df_train_cleaned['label'], random_state=42)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, stratify=val_test_df['label'], random_state=42)

In [None]:
X = df_train_cleaned.loc[:, df_train_cleaned.columns != 'label']
Y = df_train_cleaned['label']

rus = RandomUnderSampler(random_state=0, sampling_strategy="majority")
X_resampled, Y_resampled = rus.fit_resample(X, Y)


label
2    6178
1    1923
0    1442
Name: count, dtype: int64
label
1    1923
0    1442
2    1442
Name: count, dtype: int64


In [8]:
y_train = train_df['label']
y_val = val_df['label']
y_test = test_df['label']

In [13]:
y_train.value_counts()

label
2    4325
1    1346
0    1009
Name: count, dtype: int64

In [14]:
4325/(4325+1346+1009)

0.6474550898203593

## 1. sentence-transformers/all-mpnet-base-v2

In [None]:
from sentence_transformers import SentenceTransformer


models = dict(
    mpnet = SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
    distilroberta = SentenceTransformer('sentence-transformers/all-distilroberta-v1'),
    model_3 = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2'),
    )




In [None]:
datasets = {
    'train': train_df,
    'val': val_df,
    'test': test_df
}

embedding_results = {
    'train': {},
    'val': {},
    'test': {}
}



for split_name, df in datasets.items():
    values = df.loc[:, df.columns != 'label']

    for model_name, model in models.items():
        for col in values.columns:
            print(f"Encoding {col} with {model_name} for {split_name} set...")
            text_data = df[col].astype(str).tolist()
            embeddings = model.encode(text_data, batch_size=64, show_progress_bar=True)
            
            embedding_results[split_name][col] = embeddings


import pickle

with open('embedding_results_all_splits.pkl', 'wb') as f:
    pickle.dump(embedding_results, f)


Encoding text with mpnet for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_with_stopwords with mpnet for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_with_stopwords with mpnet for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_with_stopwords with mpnet for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_no_stopwords with mpnet for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_no_stopwords with mpnet for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_no_stopwords with mpnet for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_lemma_stem_with_stopwords with mpnet for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_lemma_stem_no_stopwords with mpnet for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text with distilroberta for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_with_stopwords with distilroberta for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_with_stopwords with distilroberta for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_with_stopwords with distilroberta for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_no_stopwords with distilroberta for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_no_stopwords with distilroberta for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_no_stopwords with distilroberta for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_lemma_stem_with_stopwords with distilroberta for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_lemma_stem_no_stopwords with distilroberta for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text with model_3 for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_with_stopwords with model_3 for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_with_stopwords with model_3 for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_with_stopwords with model_3 for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_no_stopwords with model_3 for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_no_stopwords with model_3 for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_no_stopwords with model_3 for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_lemma_stem_with_stopwords with model_3 for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text_lemma_stem_no_stopwords with model_3 for train set...


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding text with mpnet for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_with_stopwords with mpnet for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_with_stopwords with mpnet for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_with_stopwords with mpnet for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_no_stopwords with mpnet for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_no_stopwords with mpnet for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_no_stopwords with mpnet for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_stem_with_stopwords with mpnet for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_stem_no_stopwords with mpnet for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text with distilroberta for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_with_stopwords with distilroberta for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_with_stopwords with distilroberta for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_with_stopwords with distilroberta for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_no_stopwords with distilroberta for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_no_stopwords with distilroberta for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_no_stopwords with distilroberta for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_stem_with_stopwords with distilroberta for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_stem_no_stopwords with distilroberta for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text with model_3 for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_with_stopwords with model_3 for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_with_stopwords with model_3 for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_with_stopwords with model_3 for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_no_stopwords with model_3 for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_no_stopwords with model_3 for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_no_stopwords with model_3 for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_stem_with_stopwords with model_3 for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_stem_no_stopwords with model_3 for val set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text with mpnet for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_with_stopwords with mpnet for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_with_stopwords with mpnet for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_with_stopwords with mpnet for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_no_stopwords with mpnet for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_no_stopwords with mpnet for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_no_stopwords with mpnet for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_stem_with_stopwords with mpnet for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_stem_no_stopwords with mpnet for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text with distilroberta for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_with_stopwords with distilroberta for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_with_stopwords with distilroberta for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_with_stopwords with distilroberta for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_no_stopwords with distilroberta for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_no_stopwords with distilroberta for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_no_stopwords with distilroberta for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_stem_with_stopwords with distilroberta for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_stem_no_stopwords with distilroberta for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text with model_3 for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_with_stopwords with model_3 for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_with_stopwords with model_3 for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_with_stopwords with model_3 for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_no_stem_no_stopwords with model_3 for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_no_stem_no_stopwords with model_3 for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_no_lemma_stem_no_stopwords with model_3 for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_stem_with_stopwords with model_3 for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Encoding text_lemma_stem_no_stopwords with model_3 for test set...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

In [None]:
# Make sure LabelEncoder is ready
le = LabelEncoder()
le.fit(train_df['label'])

# Store results here
results = []

for key in embedding_results['train'].keys():
    print(f"Training model for embedding: {key}")

    # Get embeddings for train/val/test
    X_train = np.array(embedding_results['train'][key])
    X_val = np.array(embedding_results['val'][key])
    y_train = le.transform(train_df['label'])
    y_val = le.transform(val_df['label'])

    # Train logistic regression
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_val)

    # Evaluate
    acc = accuracy_score(y_val, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_val, preds, average='weighted', zero_division=0
    )

    # Save results
    col_name, model_name = key.rsplit('_', 1)
    results.append({
        'column': col_name,
        'embedding_model': model_name,
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    })

# Convert to DataFrame
metrics_df = pd.DataFrame(results)

# Optional: sort by F1-score
metrics_df = metrics_df.sort_values(by='f1_score', ascending=False)

# Save results to CSV
metrics_df.to_csv('embedding_logreg_results.csv', index=False)

print("✅ All models trained. Metrics saved to 'embedding_logreg_results.csv'")


Training model for embedding: text_mpnet
Training model for embedding: text_no_lemma_no_stem_with_stopwords_mpnet
Training model for embedding: text_lemma_no_stem_with_stopwords_mpnet
Training model for embedding: text_no_lemma_stem_with_stopwords_mpnet
Training model for embedding: text_no_lemma_no_stem_no_stopwords_mpnet
Training model for embedding: text_lemma_no_stem_no_stopwords_mpnet
Training model for embedding: text_no_lemma_stem_no_stopwords_mpnet
Training model for embedding: text_lemma_stem_with_stopwords_mpnet
Training model for embedding: text_lemma_stem_no_stopwords_mpnet
Training model for embedding: text_distilroberta
Training model for embedding: text_no_lemma_no_stem_with_stopwords_distilroberta
Training model for embedding: text_lemma_no_stem_with_stopwords_distilroberta
Training model for embedding: text_no_lemma_stem_with_stopwords_distilroberta
Training model for embedding: text_no_lemma_no_stem_no_stopwords_distilroberta
Training model for embedding: text_lemma_n

In [None]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label encoding
le = LabelEncoder()
le.fit(train_df['label'])
num_classes = len(le.classes_)

# Hyperparameters
BATCH_SIZE = 64
EPOCHS = 5
LR = 1e-3
HIDDEN_DIM = 128

# BiLSTM model class
class BiLSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BiLSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        # x: [batch_size, seq_len=1, input_dim]
        output, _ = self.lstm(x)
        out = self.fc(output[:, -1, :])  # Take output of the last time step
        return out

# Store metrics
results_bilstm = []

# Loop over each embedding variant
for key in embedding_results['train'].keys():
    print(f"Training BiLSTM on: {key}")

    # Get embeddings
    X_train = torch.tensor(embedding_results['train'][key], dtype=torch.float32)
    X_val = torch.tensor(embedding_results['val'][key], dtype=torch.float32)
    y_train = torch.tensor(le.transform(train_df['label']), dtype=torch.long)
    y_val = torch.tensor(le.transform(val_df['label']), dtype=torch.long)

    # Add fake sequence dimension: [batch_size, seq_len=1, input_dim]
    X_train = X_train.unsqueeze(1)
    X_val = X_val.unsqueeze(1)

    # Datasets and loaders
    train_ds = TensorDataset(X_train, y_train)
    val_ds = TensorDataset(X_val, y_val)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

    # Model setup
    input_dim = X_train.shape[-1]
    model = BiLSTMClassifier(input_dim, HIDDEN_DIM, num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    # Train loop
    model.train()
    for epoch in range(EPOCHS):
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

    # Evaluate
    model.eval()
    all_preds = []
    all_true = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            outputs = model(xb)
            predicted = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(predicted)
            all_true.extend(yb.numpy())

    # Metrics
    acc = accuracy_score(all_true, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_true, all_preds, average='weighted', zero_division=0
    )

    col_name, model_name = key.rsplit('_', 1)
    results_bilstm.append({
        'column': col_name,
        'embedding_model': model_name,
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    })

# Final metrics DataFrame
metrics_df_bilstm = pd.DataFrame(results_bilstm)
metrics_df_bilstm = metrics_df_bilstm.sort_values(by='f1_score', ascending=False)
metrics_df_bilstm.to_csv('embedding_bilstm_results.csv', index=False)

print("✅ All BiLSTM models trained. Results saved to 'embedding_bilstm_results.csv'")


Training BiLSTM on: text_mpnet
Training BiLSTM on: text_no_lemma_no_stem_with_stopwords_mpnet
Training BiLSTM on: text_lemma_no_stem_with_stopwords_mpnet
Training BiLSTM on: text_no_lemma_stem_with_stopwords_mpnet
Training BiLSTM on: text_no_lemma_no_stem_no_stopwords_mpnet
Training BiLSTM on: text_lemma_no_stem_no_stopwords_mpnet
Training BiLSTM on: text_no_lemma_stem_no_stopwords_mpnet
Training BiLSTM on: text_lemma_stem_with_stopwords_mpnet
Training BiLSTM on: text_lemma_stem_no_stopwords_mpnet
Training BiLSTM on: text_distilroberta
Training BiLSTM on: text_no_lemma_no_stem_with_stopwords_distilroberta
Training BiLSTM on: text_lemma_no_stem_with_stopwords_distilroberta
Training BiLSTM on: text_no_lemma_stem_with_stopwords_distilroberta
Training BiLSTM on: text_no_lemma_no_stem_no_stopwords_distilroberta
Training BiLSTM on: text_lemma_no_stem_no_stopwords_distilroberta
Training BiLSTM on: text_no_lemma_stem_no_stopwords_distilroberta
Training BiLSTM on: text_lemma_stem_with_stopwords

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label encoding
le = LabelEncoder()
le.fit(train_df['label'])
num_classes = len(le.classes_)

# Hyperparameters
BATCH_SIZE = 64
EPOCHS = 5
LR = 1e-3
HIDDEN_DIM = 128

# BiGRU model
class BiGRUClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BiGRUClassifier, self).__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        # x: [batch_size, seq_len=1, input_dim]
        output, _ = self.gru(x)
        out = self.fc(output[:, -1, :])  # Last time step
        return out

# Store results
results_bigru = []

# Loop over all embedding keys
for key in embedding_results['train'].keys():
    print(f"Training BiGRU on: {key}")

    # Prepare data
    X_train = torch.tensor(embedding_results['train'][key], dtype=torch.float32).unsqueeze(1)
    X_val = torch.tensor(embedding_results['val'][key], dtype=torch.float32).unsqueeze(1)
    y_train = torch.tensor(le.transform(train_df['label']), dtype=torch.long)
    y_val = torch.tensor(le.transform(val_df['label']), dtype=torch.long)

    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=BATCH_SIZE)

    # Model init
    input_dim = X_train.shape[-1]
    model = BiGRUClassifier(input_dim, HIDDEN_DIM, num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    # Training loop
    model.train()
    for epoch in range(EPOCHS):
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            output = model(xb)
            loss = criterion(output, yb)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    all_preds = []
    all_true = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            preds = model(xb)
            pred_labels = torch.argmax(preds, dim=1).cpu().numpy()
            all_preds.extend(pred_labels)
            all_true.extend(yb.numpy())

    acc = accuracy_score(all_true, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_true, all_preds, average='weighted', zero_division=0
    )

    col_name, model_name = key.rsplit('_', 1)
    results_bigru.append({
        'column': col_name,
        'embedding_model': model_name,
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    })

# Save results
metrics_df_bigru = pd.DataFrame(results_bigru)
metrics_df_bigru = metrics_df_bigru.sort_values(by='f1_score', ascending=False)
metrics_df_bigru.to_csv('embedding_bigru_results.csv', index=False)

print("✅ All BiGRU models trained. Results saved to 'embedding_bigru_results.csv'")


Training BiGRU on: text_mpnet
Training BiGRU on: text_no_lemma_no_stem_with_stopwords_mpnet
Training BiGRU on: text_lemma_no_stem_with_stopwords_mpnet
Training BiGRU on: text_no_lemma_stem_with_stopwords_mpnet
Training BiGRU on: text_no_lemma_no_stem_no_stopwords_mpnet
Training BiGRU on: text_lemma_no_stem_no_stopwords_mpnet
Training BiGRU on: text_no_lemma_stem_no_stopwords_mpnet
Training BiGRU on: text_lemma_stem_with_stopwords_mpnet
Training BiGRU on: text_lemma_stem_no_stopwords_mpnet
Training BiGRU on: text_distilroberta
Training BiGRU on: text_no_lemma_no_stem_with_stopwords_distilroberta
Training BiGRU on: text_lemma_no_stem_with_stopwords_distilroberta
Training BiGRU on: text_no_lemma_stem_with_stopwords_distilroberta
Training BiGRU on: text_no_lemma_no_stem_no_stopwords_distilroberta
Training BiGRU on: text_lemma_no_stem_no_stopwords_distilroberta
Training BiGRU on: text_no_lemma_stem_no_stopwords_distilroberta
Training BiGRU on: text_lemma_stem_with_stopwords_distilroberta
Tr

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label encoding
le = LabelEncoder()
le.fit(train_df['label'])
num_classes = len(le.classes_)

# Hyperparameters
BATCH_SIZE = 64
EPOCHS = 5
LR = 1e-3
HIDDEN_DIM = 128

# Attention layer
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim * 2, 1)

    def forward(self, lstm_out):
        # lstm_out: [batch_size, seq_len, hidden_dim*2]
        attn_weights = torch.softmax(self.attn(lstm_out), dim=1)  # [batch_size, seq_len, 1]
        context = torch.sum(attn_weights * lstm_out, dim=1)       # [batch_size, hidden_dim*2]
        return context

# BiLSTM + Attention classifier
class BiLSTMWithAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BiLSTMWithAttention, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        attn_output = self.attention(lstm_out)
        return self.fc(attn_output)

# Store results
results_bilstm_attn = []

# Loop over embeddings
for key in embedding_results['train'].keys():
    print(f"Training BiLSTM + Attention on: {key}")

    # Get data
    X_train = torch.tensor(embedding_results['train'][key], dtype=torch.float32).unsqueeze(1)
    X_val = torch.tensor(embedding_results['val'][key], dtype=torch.float32).unsqueeze(1)
    y_train = torch.tensor(le.transform(train_df['label']), dtype=torch.long)
    y_val = torch.tensor(le.transform(val_df['label']), dtype=torch.long)

    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=BATCH_SIZE)

    # Model setup
    input_dim = X_train.shape[-1]
    model = BiLSTMWithAttention(input_dim, HIDDEN_DIM, num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    # Training loop
    model.train()
    for epoch in range(EPOCHS):
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    all_preds = []
    all_true = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            output = model(xb)
            pred_labels = torch.argmax(output, dim=1).cpu().numpy()
            all_preds.extend(pred_labels)
            all_true.extend(yb.numpy())

    # Metrics
    acc = accuracy_score(all_true, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_true, all_preds, average='weighted', zero_division=0
    )

    col_name, model_name = key.rsplit('_', 1)
    results_bilstm_attn.append({
        'column': col_name,
        'embedding_model': model_name,
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    })

# Save results
metrics_df_bilstm_attn = pd.DataFrame(results_bilstm_attn)
metrics_df_bilstm_attn = metrics_df_bilstm_attn.sort_values(by='f1_score', ascending=False)
metrics_df_bilstm_attn.to_csv('embedding_bilstm_attention_results.csv', index=False)

print("✅ All BiLSTM + Attention models trained. Saved to 'embedding_bilstm_attention_results.csv'")


Training BiLSTM + Attention on: text_mpnet
Training BiLSTM + Attention on: text_no_lemma_no_stem_with_stopwords_mpnet
Training BiLSTM + Attention on: text_lemma_no_stem_with_stopwords_mpnet
Training BiLSTM + Attention on: text_no_lemma_stem_with_stopwords_mpnet
Training BiLSTM + Attention on: text_no_lemma_no_stem_no_stopwords_mpnet
Training BiLSTM + Attention on: text_lemma_no_stem_no_stopwords_mpnet
Training BiLSTM + Attention on: text_no_lemma_stem_no_stopwords_mpnet
Training BiLSTM + Attention on: text_lemma_stem_with_stopwords_mpnet
Training BiLSTM + Attention on: text_lemma_stem_no_stopwords_mpnet
Training BiLSTM + Attention on: text_distilroberta
Training BiLSTM + Attention on: text_no_lemma_no_stem_with_stopwords_distilroberta
Training BiLSTM + Attention on: text_lemma_no_stem_with_stopwords_distilroberta
Training BiLSTM + Attention on: text_no_lemma_stem_with_stopwords_distilroberta
Training BiLSTM + Attention on: text_no_lemma_no_stem_no_stopwords_distilroberta
Training BiLST

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label encoder (reuse)
le = LabelEncoder()
le.fit(train_df['label'])
num_classes = len(le.classes_)

# Hyperparameters
BATCH_SIZE = 64
EPOCHS = 5
LR = 1e-3
KERNEL_SIZE = 3
NUM_FILTERS = 64

# CNN model for 1D embedding vectors
class CNNClassifier(nn.Module):
    def __init__(self, input_dim, num_filters, kernel_size, output_dim):
        super(CNNClassifier, self).__init__()
        self.conv1 = nn.Conv1d(1, num_filters, kernel_size, padding=kernel_size//2)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(num_filters, output_dim)

    def forward(self, x):
        x = x.unsqueeze(1)  # add channel dim here
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x).squeeze(-1)
        x = self.fc(x)
        return x

# Store metrics
results_cnn = []

for key in embedding_results['train'].keys():
    print(f"Training CNN on: {key}")

    # Get embeddings & labels
    X_train = torch.tensor(embedding_results['train'][key], dtype=torch.float32)
    X_val = torch.tensor(embedding_results['val'][key], dtype=torch.float32)
    y_train = torch.tensor(le.transform(train_df['label']), dtype=torch.long)
    y_val = torch.tensor(le.transform(val_df['label']), dtype=torch.long)

    # Datasets and loaders
    train_ds = TensorDataset(X_train, y_train)
    val_ds = TensorDataset(X_val, y_val)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

    # Model init
    input_dim = X_train.shape[1]
    model = CNNClassifier(input_dim, NUM_FILTERS, KERNEL_SIZE, num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    # Train
    model.train()
    for epoch in range(EPOCHS):
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

    # Evaluate
    model.eval()
    all_preds = []
    all_true = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            preds = model(xb)
            pred_labels = torch.argmax(preds, dim=1).cpu().numpy()
            all_preds.extend(pred_labels)
            all_true.extend(yb.numpy())

    acc = accuracy_score(all_true, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_true, all_preds, average='weighted', zero_division=0
    )

    col_name, model_name = key.rsplit('_', 1)
    results_cnn.append({
        'column': col_name,
        'embedding_model': model_name,
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    })

# Save metrics
metrics_df_cnn = pd.DataFrame(results_cnn).sort_values(by='f1_score', ascending=False)
metrics_df_cnn.to_csv('embedding_cnn_results.csv', index=False)

print("✅ All CNN models trained. Results saved to 'embedding_cnn_results.csv'")


Training CNN on: text_mpnet
Training CNN on: text_no_lemma_no_stem_with_stopwords_mpnet
Training CNN on: text_lemma_no_stem_with_stopwords_mpnet
Training CNN on: text_no_lemma_stem_with_stopwords_mpnet
Training CNN on: text_no_lemma_no_stem_no_stopwords_mpnet
Training CNN on: text_lemma_no_stem_no_stopwords_mpnet
Training CNN on: text_no_lemma_stem_no_stopwords_mpnet
Training CNN on: text_lemma_stem_with_stopwords_mpnet
Training CNN on: text_lemma_stem_no_stopwords_mpnet
Training CNN on: text_distilroberta
Training CNN on: text_no_lemma_no_stem_with_stopwords_distilroberta
Training CNN on: text_lemma_no_stem_with_stopwords_distilroberta
Training CNN on: text_no_lemma_stem_with_stopwords_distilroberta
Training CNN on: text_no_lemma_no_stem_no_stopwords_distilroberta
Training CNN on: text_lemma_no_stem_no_stopwords_distilroberta
Training CNN on: text_no_lemma_stem_no_stopwords_distilroberta
Training CNN on: text_lemma_stem_with_stopwords_distilroberta
Training CNN on: text_lemma_stem_no_