In [24]:
################## IMPORT DATA ##################

import pandas as pd
# Read the Parquet file
file_path = "train-00000-of-00001.parquet"  # Replace with your Parquet file path
df = pd.read_parquet(file_path)

# Inspect the DataFrame
df.head(2)

Unnamed: 0,text,label
0,i feel awful about it too because it s my job ...,0
1,im alone i feel awful,0


In [25]:
################## BALANCED DF CREATION ##################

# Count samples for each label
label_counts = df['label'].value_counts()
minority_class_size = label_counts.min()

# Group by the category and sample the minority class size for each group
balanced_df = (
    df.groupby('label')
    .apply(lambda x: x.sample(n=minority_class_size, random_state=42))  # Randomly select samples
    .reset_index(drop=True)  # Reset the index
)

print("Class distribution after undersampling:")
print(balanced_df['label'].value_counts())

Class distribution after undersampling:
label
0    14972
1    14972
2    14972
3    14972
4    14972
5    14972
Name: count, dtype: int64


  df.groupby('label')


# Data Preproccesing

In [26]:
df_preprocessed = balanced_df
print(df_preprocessed.shape)
df_preprocessed.head(2)

(89832, 2)


Unnamed: 0,text,label
0,i feel sorry about you because your point of v...,0
1,i feel like he s watching quietly because he s...,0


In [None]:
####### HASN'T BEEN EXECUTED!! TO AVOID FITTING TO THE DATA #######

# # remove all the sentences that are longer than 40 words
# df_preprocessed = df_preprocessed[df_preprocessed['text'].apply(lambda x: len(x.split()) <= 40 and len(x.split()) >= 3)]
# print(df_preprocessed.shape)
# df_preprocessed.head(2)

(84900, 2)


Unnamed: 0,text,label
0,i feel sorry about you because your point of v...,0
1,i feel like he s watching quietly because he s...,0


## Tokenizing The Text

In [27]:
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer(strip_handles=False, reduce_len=True)

from tqdm import tqdm
# Enable tqdm for pandas
tqdm.pandas()

# Tokenize each text and store tokenized output in a new column
df_preprocessed['tokenized_text'] = df_preprocessed['text'].progress_apply(lambda x: tokenizer.tokenize(x))
df_preprocessed.head(2)

100%|██████████| 89832/89832 [00:07<00:00, 11970.51it/s]


Unnamed: 0,text,label,tokenized_text
0,i feel sorry about you because your point of v...,0,"[i, feel, sorry, about, you, because, your, po..."
1,i feel like he s watching quietly because he s...,0,"[i, feel, like, he, s, watching, quietly, beca..."


## Using The Embedding Model

In [6]:
from gensim.models.fasttext import load_facebook_model

fasttext_path = "crawl-300d-2M-subword/crawl-300d-2M-subword.bin"
fasttext = load_facebook_model(fasttext_path)

# Check an example word vector
print(fasttext.wv['happy'])  # This should now work without errors

[ 7.15019275e-03  1.05880331e-02  5.85333593e-02 -1.80088747e-02
  2.17381623e-02 -6.36548251e-02  4.79345582e-02  5.82478940e-04
 -2.71546952e-02  4.41303067e-02 -2.70192716e-02  6.18359353e-03
 -1.98998721e-03  3.27444300e-02  4.41248150e-04  3.95078957e-02
 -3.66007313e-02 -2.67524342e-03  5.90687105e-03  2.27801464e-02
 -3.65645699e-02 -4.19695750e-02 -7.16437120e-03 -4.24787998e-02
  4.29837480e-02  2.15609614e-02  2.45211683e-02  1.77936815e-02
  1.41702415e-02 -2.20726691e-02  3.10446555e-03 -7.80420452e-02
  1.17998419e-03 -9.21797939e-03  6.47169817e-03 -2.28024814e-02
  4.87457886e-02 -4.71891044e-03 -2.68589742e-02  2.32070964e-02
 -1.11313999e-01 -4.35941219e-02 -2.73997393e-02  9.44887754e-03
 -3.80694109e-04  7.03306049e-02  2.92210635e-02  2.60734255e-03
 -1.09368563e-01  3.23175937e-02  1.05939796e-02 -1.16323661e-02
  3.07511836e-02  2.60094590e-02 -8.07798654e-02  1.67720411e-02
  3.48081253e-02  2.77459081e-02 -1.08297467e-01 -5.31493500e-03
  1.44641800e-02 -4.21016

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import numpy as np

# it's assumed that the FastText model is already loaded and stored in the `fasttext` variable
fasttext

# data
X = df_preprocessed['tokenized_text'].tolist()
y = df_preprocessed['label'].tolist()

# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 1: Preprocess data
class TextDataset(Dataset):
    def __init__(self, X, y, fasttext):
        # Use fasttext.wv for word lookup
        self.X = [torch.tensor([fasttext.wv[token] if token in fasttext.wv else np.zeros(fasttext.vector_size)
                                for token in seq]) for seq in X]
        self.y = torch.tensor(y)
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TextDataset(X_train, y_train, fasttext)
val_dataset = TextDataset(X_val, y_val, fasttext)
test_dataset = TextDataset(X_test, y_test, fasttext)

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import signal
import sys
import os

# Ensure directories exist
os.makedirs('models/LSTM/best_val', exist_ok=True)
os.makedirs('models/LSTM/500_batch', exist_ok=True)
os.makedirs('models/LSTM/full_epochs', exist_ok=True)

# Step 2: Pad sequences
def collate_fn(batch):
    X_batch, y_batch = zip(*batch)
    lengths = torch.tensor([len(x) for x in X_batch])  # Store original sequence lengths
    X_batch = pad_sequence(X_batch, batch_first=True)
    y_batch = torch.tensor(y_batch)
    return X_batch, y_batch, lengths

train_dataloader = DataLoader(train_dataset, batch_size=40, collate_fn=collate_fn, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=40, collate_fn=collate_fn, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=40, collate_fn=collate_fn, shuffle=True)

# Step 3: Define the LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x, lengths):
        packed_x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_out, _ = self.lstm(packed_x)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        out = torch.mean(lstm_out, dim=1)  # Mean pooling across time steps
        out = self.fc(out)  # Use the final hidden state
        return out

# Hyperparameters
input_size = 300  # Size of FastText embedding
hidden_size = 256
num_classes = 6
num_epochs = 30
learning_rate = 0.01

model = LSTMClassifier(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Step 4: Graceful stopping handler
def signal_handler(sig, frame):
    print("\nTraining interrupted. Exiting gracefully and saving model.")
    torch.save(model.state_dict(), 'model_checkpoint.pth')
    sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)

# Accuracy function
def calculate_accuracy(model, dataloader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for X_batch, y_batch, lengths in dataloader:
            outputs = model(X_batch, lengths)
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()
    model.train()
    return correct / total

best_val_accuracy = 0
# Step 5: Train the model
for epoch in range(num_epochs):
    total_epoch_loss = 0
    num_batches = len(train_dataloader)  # Total number of batches in the current epoch
    for batch_idx, (X_batch, y_batch, lengths) in enumerate(train_dataloader, start=1):
        outputs = model(X_batch, lengths)
        loss = criterion(outputs, y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        
        optimizer.step()
        
        total_epoch_loss += loss.item()
        if batch_idx % 10 == 0:
            # Print loss and accuracies
            train_accuracy = calculate_accuracy(model, train_dataloader)
            val_accuracy = calculate_accuracy(model, val_dataloader)
        
            if val_accuracy >= best_val_accuracy+0.01 and val_accuracy >= 0.9:
                best_val_accuracy = val_accuracy
                torch.save(model.state_dict(), f'models/LSTM/best_val/model_checkpoint_val_{int(val_accuracy*100)}.pth')
                
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}/{num_batches}] Batch Loss: {loss.item():.4f}, Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}")
        if batch_idx % 500 == 499:
            # Save the model every 500 batches
            torch.save(model.state_dict(), f'models/LSTM/500_batch/model_checkpoint_epoch_{epoch}_batch_{batch_idx}.pth')
    print(f"Epoch [{epoch+1}/{num_epochs}], Epoch Loss: {total_epoch_loss:.4f}")
    # save the model with the epoch number, in models folder
    torch.save(model.state_dict(), f'models/LSTM/full_epochs/model_checkpoint_epoch_{epoch}.pth')
print("Training completed.")


Epoch [1/30], Batch [10/1438] Batch Loss: 1.8047, Training Accuracy: 0.1703, Validation Accuracy: 0.1682
Epoch [1/30], Batch [20/1438] Batch Loss: 1.8137, Training Accuracy: 0.1674, Validation Accuracy: 0.1702
Epoch [1/30], Batch [30/1438] Batch Loss: 1.7844, Training Accuracy: 0.1828, Validation Accuracy: 0.1809
Epoch [1/30], Batch [40/1438] Batch Loss: 1.7838, Training Accuracy: 0.2020, Validation Accuracy: 0.1975
Epoch [1/30], Batch [50/1438] Batch Loss: 1.7922, Training Accuracy: 0.1945, Validation Accuracy: 0.1984
Epoch [1/30], Batch [60/1438] Batch Loss: 1.7956, Training Accuracy: 0.1756, Validation Accuracy: 0.1737
Epoch [1/30], Batch [70/1438] Batch Loss: 1.7993, Training Accuracy: 0.1903, Validation Accuracy: 0.1926
Epoch [1/30], Batch [80/1438] Batch Loss: 1.7860, Training Accuracy: 0.1994, Validation Accuracy: 0.1990
Epoch [1/30], Batch [90/1438] Batch Loss: 1.7954, Training Accuracy: 0.2570, Validation Accuracy: 0.2596
Epoch [1/30], Batch [100/1438] Batch Loss: 1.8084, Trai

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [55]:
df_preprocessed["text"].apply(lambda x: len(x.split())).describe()

count    89832.000000
mean        19.438986
std         11.085368
min          1.000000
25%         11.000000
50%         17.000000
75%         26.000000
max        100.000000
Name: text, dtype: float64

In [56]:
# Check the percentage of tokens in the dataset that are missing in the FastText vocabulary
missing_tokens = 0
total_tokens = 0

for seq in X_train:
    for token in seq:
        total_tokens += 1
        if token not in fasttext.wv:
            missing_tokens += 1

missing_percentage = (missing_tokens / total_tokens) * 100
print(f"Percentage of missing tokens: {missing_percentage:.2f}%")

Percentage of missing tokens: 0.00%
