In [1]:
################## IMPORT DATA ##################

import pandas as pd
# Read the Parquet file
file_path = "train-00000-of-00001.parquet"  # Replace with your Parquet file path
df = pd.read_parquet(file_path)

# Inspect the DataFrame
df.head(2)

Unnamed: 0,text,label
0,i feel awful about it too because it s my job ...,0
1,im alone i feel awful,0


In [2]:
################## BALANCED DF CREATION ##################

# Count samples for each label
label_counts = df['label'].value_counts()
minority_class_size = label_counts.min()

# Group by the category and sample the minority class size for each group
balanced_df = (
    df.groupby('label')
    .apply(lambda x: x.sample(n=minority_class_size, random_state=42))  # Randomly select samples
    .reset_index(drop=True)  # Reset the index
)

print("Class distribution after undersampling:")
print(balanced_df['label'].value_counts())

Class distribution after undersampling:
label
0    14972
1    14972
2    14972
3    14972
4    14972
5    14972
Name: count, dtype: int64


  df.groupby('label')


# Data Preproccesing

In [3]:
df_preprocessed = balanced_df
print(df_preprocessed.shape)
df_preprocessed.head(2)

(89832, 2)


Unnamed: 0,text,label
0,i feel sorry about you because your point of v...,0
1,i feel like he s watching quietly because he s...,0


In [4]:
# remove all the sentences that are longer than 40 words
df_preprocessed = df_preprocessed[df_preprocessed['text'].apply(lambda x: len(x.split()) <= 40 and len(x.split()) >= 3)]
print(df_preprocessed.shape)
df_preprocessed.head(2)

(84900, 2)


Unnamed: 0,text,label
0,i feel sorry about you because your point of v...,0
1,i feel like he s watching quietly because he s...,0


## Tokenizing The Text

In [5]:
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer(strip_handles=False, reduce_len=True)

from tqdm import tqdm
# Enable tqdm for pandas
tqdm.pandas()

# Tokenize each text and store tokenized output in a new column
df_preprocessed['tokenized_text'] = df_preprocessed['text'].progress_apply(lambda x: tokenizer.tokenize(x))
df_preprocessed.head(2)

100%|██████████| 84900/84900 [00:07<00:00, 11281.01it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_preprocessed['tokenized_text'] = df_preprocessed['text'].progress_apply(lambda x: tokenizer.tokenize(x))


Unnamed: 0,text,label,tokenized_text
0,i feel sorry about you because your point of v...,0,"[i, feel, sorry, about, you, because, your, po..."
1,i feel like he s watching quietly because he s...,0,"[i, feel, like, he, s, watching, quietly, beca..."


## Using The Embedding Model

In [6]:
from gensim.models.fasttext import load_facebook_model

fasttext_path = "crawl-300d-2M-subword/crawl-300d-2M-subword.bin"
fasttext = load_facebook_model(fasttext_path)

# Check an example word vector
print(fasttext.wv['happy'])  # This should now work without errors

[ 7.15019275e-03  1.05880331e-02  5.85333593e-02 -1.80088747e-02
  2.17381623e-02 -6.36548251e-02  4.79345582e-02  5.82478940e-04
 -2.71546952e-02  4.41303067e-02 -2.70192716e-02  6.18359353e-03
 -1.98998721e-03  3.27444300e-02  4.41248150e-04  3.95078957e-02
 -3.66007313e-02 -2.67524342e-03  5.90687105e-03  2.27801464e-02
 -3.65645699e-02 -4.19695750e-02 -7.16437120e-03 -4.24787998e-02
  4.29837480e-02  2.15609614e-02  2.45211683e-02  1.77936815e-02
  1.41702415e-02 -2.20726691e-02  3.10446555e-03 -7.80420452e-02
  1.17998419e-03 -9.21797939e-03  6.47169817e-03 -2.28024814e-02
  4.87457886e-02 -4.71891044e-03 -2.68589742e-02  2.32070964e-02
 -1.11313999e-01 -4.35941219e-02 -2.73997393e-02  9.44887754e-03
 -3.80694109e-04  7.03306049e-02  2.92210635e-02  2.60734255e-03
 -1.09368563e-01  3.23175937e-02  1.05939796e-02 -1.16323661e-02
  3.07511836e-02  2.60094590e-02 -8.07798654e-02  1.67720411e-02
  3.48081253e-02  2.77459081e-02 -1.08297467e-01 -5.31493500e-03
  1.44641800e-02 -4.21016

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import numpy as np

# it's assumed that the FastText model is already loaded and stored in the `fasttext` variable
fasttext

# data
X = df_preprocessed['tokenized_text'].tolist()
y = df_preprocessed['label'].tolist()

# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 1: Preprocess data
class TextDataset(Dataset):
    def __init__(self, X, y, fasttext):
        # Use fasttext.wv for word lookup
        self.X = [torch.tensor([fasttext.wv[token] if token in fasttext.wv else np.zeros(fasttext.vector_size)
                                for token in seq]) for seq in X]
        self.y = torch.tensor(y)
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = TextDataset(X_train, y_train, fasttext)

  self.X = [torch.tensor([fasttext.wv[token] if token in fasttext.wv else np.zeros(fasttext.vector_size)


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import signal
import sys

# Step 2: Pad sequences
def collate_fn(batch):
    X_batch, y_batch = zip(*batch)
    X_batch = pad_sequence(X_batch, batch_first=True)
    y_batch = torch.tensor(y_batch)
    return X_batch, y_batch

dataloader = DataLoader(dataset, batch_size=40, collate_fn=collate_fn, shuffle=True, num_workers=0)

# Step 3: Define the LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, 
                            batch_first=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_size, 100)
        self.fc2 = nn.Linear(100, num_classes)
    
    def forward(self, x):
        _, (hidden, cell) = self.lstm(x)
        out = hidden[-1]  # the last layer's hidden state
        out = self.fc1(out) 
        out = self.fc2(out)
        return out

# Hyperparameters
input_size = 300  # Size of FastText embedding
hidden_size = 128  # Increased hidden size
num_layers = 2  # Increased number of layers
num_classes = 6
num_epochs = 20  # Increased number of epochs
learning_rate = 0.02  # Reduced learning rate

model = LSTMClassifier(input_size, hidden_size, num_layers, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Step 4: Graceful stopping handler
def signal_handler(sig, frame):
    print("\nTraining interrupted. Exiting gracefully.")
    sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)

# Accuracy function
def calculate_accuracy(model, dataloader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()
    model.train()
    return correct / total

# Step 5: Train the model
for epoch in range(num_epochs):
    total_epoch_loss = 0
    num_batches = len(dataloader)  # Total number of batches in the current epoch
    for batch_idx, (X_batch, y_batch) in enumerate(dataloader, start=1):  # Start batch_idx from 1
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_epoch_loss += loss.item()
        
        train_accuracy = calculate_accuracy(model, dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}/{num_batches}], Batch Loss: {loss.item():.4f}, Training Accuracy: {train_accuracy:.4f}")
        sys.stdout.flush()  # Force immediate printing
    print(f"Epoch [{epoch+1}/{num_epochs}], Epoch Loss: {total_epoch_loss:.4f}")



print("Training completed.")

Epoch [1/20], Batch [1/1359], Batch Loss: 1.7913, Training Accuracy: 0.1680
Epoch [1/20], Batch [2/1359], Batch Loss: 2.1665, Training Accuracy: 0.1683
Epoch [1/20], Batch [3/1359], Batch Loss: 1.7913, Training Accuracy: 0.1670
Epoch [1/20], Batch [4/1359], Batch Loss: 1.7930, Training Accuracy: 0.1670
Epoch [1/20], Batch [5/1359], Batch Loss: 1.8293, Training Accuracy: 0.1665
Epoch [1/20], Batch [6/1359], Batch Loss: 1.8041, Training Accuracy: 0.1696
Epoch [1/20], Batch [7/1359], Batch Loss: 1.8480, Training Accuracy: 0.1674
Epoch [1/20], Batch [8/1359], Batch Loss: 6.1032, Training Accuracy: 0.1671
Epoch [1/20], Batch [9/1359], Batch Loss: 1.8043, Training Accuracy: 0.1669
Epoch [1/20], Batch [10/1359], Batch Loss: 1.7876, Training Accuracy: 0.1652
Epoch [1/20], Batch [11/1359], Batch Loss: 1.9283, Training Accuracy: 0.1672
Epoch [1/20], Batch [12/1359], Batch Loss: 1.7687, Training Accuracy: 0.1643
Epoch [1/20], Batch [13/1359], Batch Loss: 2.1693, Training Accuracy: 0.1682
Epoch [1

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import signal
import sys

# Step 2: Pad sequences
def collate_fn(batch):
    X_batch, y_batch = zip(*batch)
    X_batch = pad_sequence(X_batch, batch_first=True)
    y_batch = torch.tensor(y_batch)
    return X_batch, y_batch

dataloader = DataLoader(dataset, batch_size=20, collate_fn=collate_fn)

# Step 3: Define the LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = torch.mean(lstm_out, dim=1)  # Mean pooling across time steps
        out = self.fc(out)  # Use the final hidden state
        return out

# Hyperparameters
input_size = 300  # Size of FastText embedding
hidden_size = 256
num_classes = 6
num_epochs = 5
learning_rate = 0.01

model = LSTMClassifier(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Step 4: Graceful stopping handler
def signal_handler(sig, frame):
    print("\nTraining interrupted. Exiting gracefully.")
    sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)

# Accuracy function
def calculate_accuracy(model, dataloader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()
    model.train()
    return correct / total

# Step 5: Train the model
for epoch in range(num_epochs):
    total_epoch_loss = 0
    for X_batch, y_batch in dataloader:
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        
         # Debugging: Print gradient norms
        for p in model.parameters():
            if p.grad is not None:
                print(f"Gradient norm for parameter {p.shape}: {p.grad.norm().item()}")
        
        optimizer.step()
        
        total_epoch_loss += loss.item()
    
        # Print loss and accuracies
        train_accuracy = calculate_accuracy(model, dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_epoch_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

print("Training completed.")


Gradient norm for parameter torch.Size([1024, 300]): 0.01899331621825695
Gradient norm for parameter torch.Size([1024, 256]): 0.01701652817428112
Gradient norm for parameter torch.Size([1024]): 0.041358835995197296
Gradient norm for parameter torch.Size([1024]): 0.041358835995197296
Gradient norm for parameter torch.Size([6, 256]): 0.062035225331783295
Gradient norm for parameter torch.Size([6]): 0.14706331491470337
Epoch [1/5], Loss: 1.7872, Training Accuracy: 0.1675
Gradient norm for parameter torch.Size([1024, 300]): 0.036493152379989624
Gradient norm for parameter torch.Size([1024, 256]): 0.12676629424095154
Gradient norm for parameter torch.Size([1024]): 0.14075160026550293
Gradient norm for parameter torch.Size([1024]): 0.14075160026550293
Gradient norm for parameter torch.Size([6, 256]): 0.19672709703445435
Gradient norm for parameter torch.Size([6]): 0.21300971508026123
Epoch [1/5], Loss: 3.4322, Training Accuracy: 0.1675
Gradient norm for parameter torch.Size([1024, 300]): 0.1

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [55]:
df_preprocessed["text"].apply(lambda x: len(x.split())).describe()

count    89832.000000
mean        19.438986
std         11.085368
min          1.000000
25%         11.000000
50%         17.000000
75%         26.000000
max        100.000000
Name: text, dtype: float64

In [56]:
# Check the percentage of tokens in the dataset that are missing in the FastText vocabulary
missing_tokens = 0
total_tokens = 0

for seq in X_train:
    for token in seq:
        total_tokens += 1
        if token not in fasttext.wv:
            missing_tokens += 1

missing_percentage = (missing_tokens / total_tokens) * 100
print(f"Percentage of missing tokens: {missing_percentage:.2f}%")


Percentage of missing tokens: 0.00%
