In [1]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
import numpy as np           # Numerical arrays & math
import pandas as pd          # Tabular data (DataFrames)
import random
import re
import os
import html 
import time
import torch.nn as nn
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import random_split
import torch.optim as optim
from torchtext.vocab import GloVe
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from collections import Counter
import torch
import sys
sys.path.append("../src")
import config

print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("CPU cores (logical):", os.cpu_count())

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False



CUDA available: True
GPU name: NVIDIA A40
CPU cores (logical): 96


The function converts words into numerical embeddings and supports random - embedding, Pre-Traind - GloVe, or Word2Vec (w2v).

In [2]:
def build_embedding_matrix(vocab, itos, vocab_size=0, embedding_dim=300, embedding_type="glove"):
    """
    Build an embedding matrix.

    Args:
        vocab: vocabulary object
        itos : index to string
        vocab_size (int): size of the vocabulary
        embedding_dim (int): embedding vector size
        embedding_type (str): "glove", "w2v", or "random"

    Returns:
        torch.Tensor: embedding matrix of shape (vocab_size, embedding_dim)
    """
    embedding_matrix = torch.randn(vocab_size, embedding_dim) * 0.6
    
    if embedding_type == "glove":
        glove = GloVe(name="6B", dim=300, cache="./glove_cache")
        for i, word in enumerate(itos):
            if word in glove.stoi:
                embedding_matrix[i] = glove[word]

    elif embedding_type == "w2v":
        w2v_model = Word2Vec.load(str(config.W2V_MODEL_PATH))
        for i, word in enumerate(itos):
            if word in w2v_model.wv.key_to_index:
                embedding_matrix[i] = torch.tensor(w2v_model.wv[word], dtype=torch.float32)
                
    elif embedding_type == "random":
        pass  # already random

    return embedding_matrix

In [3]:
class LSTMClassifier(nn.Module):
    """
    LSTM-based text classification model.
    Architecture:
     Embedding -> Embedding Dropout -> LSTM -> LayerNorm -> Dropout -> Fully Connected (Linear) -> Output
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes,
                 pad_idx, embedding_matrix, num_layers=1, bidirectional=True, dropout=0.2, embedding_dropout=0.2, freeze_embeddings=True):
        """
        Initialize the LSTM model.
        Args:
            vocab_size (int): Size of the vocabulary
            embed_dim (int): Dimension of word embeddings
            hidden_dim (int): LSTM hidden size
            num_classes (int): Number of output classes
            pad_idx (int): Index of the <pad> token
            embedding_matrix (torch.Tensor): Pre-trained embeddings
            num_layers (int): Number of LSTM layers
            bidirectional (bool): Use bidirectional LSTM
            dropout (float): Dropout probability
            embedding_dropout (float): Embedding dropout probability
            freeze_embeddings (bool): Freeze embedding weights 
        """
        super().__init__()

        # Embedding layer with padding_idx
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=freeze_embeddings, padding_idx=pad_idx)
        
             
         # Embedding dropout
        self.embedding_dropout = nn.Dropout(embedding_dropout)

        # LSTM layer
        self.lstm = nn.LSTM(input_size=embed_dim,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            bidirectional=bidirectional,
                            dropout=dropout if num_layers > 1 else 0)

        # Fully connected output layer
        lstm_out_dim = hidden_dim * 2 if bidirectional else hidden_dim
        
        # LayerNorm on the hidden states
        self.layer_norm = nn.LayerNorm(lstm_out_dim)

        self.fc = nn.Linear(lstm_out_dim, num_classes)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Forward pass of the model.
        Args:
            x (torch.LongTensor): Input tensor of shape [batch_size, seq_len]
        Returns:
            logits (torch.FloatTensor): Output tensor of shape [batch_size, num_classes]
        """
        # x -> [batch_size, seq_len] -> embedding -> [batch_size, seq_len, embed_dim]
        embedded = self.embedding(x)
        embedded = self.embedding_dropout(embedded) 

        # LSTM forward
        lstm_out, (h_n, c_n) = self.lstm(embedded)

        # Max pooling
        pooled, _ = torch.max(lstm_out, dim=1)
        
        # Average Pooling 
        # pooled = torch.mean(lstm_out, dim=1)  

        last_hidden = pooled   
        last_hidden = self.layer_norm(last_hidden)

        # Apply dropout
        out = self.dropout(last_hidden)

        # Fully connected layer -> logits
        logits = self.fc(out)
        return logits


In [4]:
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """
    Train the model for one epoch.
    Args:
        model (nn.Module): The LSTM model
        dataloader (DataLoader): Training data loader
        optimizer (torch.optim.Optimizer): Optimizer
        criterion (nn.Module): Loss function
        device (torch.device): CPU or GPU
    Returns:
        avg_loss (float): Average loss over the epoch
        avg_acc (float): Average accuracy over the epoch
    """
    model.train()  # Set model to training mode
    total_loss = 0
    all_preds = []
    all_labels = []

    for X_batch, y_batch in dataloader:
        
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()          # Clear previous gradients
        logits = model(X_batch)        # Forward pass
        loss = criterion(logits, y_batch)
        loss.backward()                # Backpropagation
        optimizer.step()               # Update weights

        total_loss += loss.item() * X_batch.size(0)  # Sum loss over batch
        preds = logits.argmax(dim=1)                # Predicted classes
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(y_batch.cpu().tolist())

    avg_loss = total_loss / len(dataloader.dataset)
    avg_acc = accuracy_score(all_labels, all_preds)
    return avg_loss, avg_acc


In [5]:
def evaluate_func(model, dataloader, criterion, device):
    """
    Evaluate the model on validation or test set.
    Args:
        model (nn.Module): The LSTM model
        dataloader (DataLoader): Validation or test loader
        criterion (nn.Module): Loss function
        device (torch.device): CPU or GPU
    Returns:
        avg_loss (float): Average loss
        avg_acc (float): Average accuracy
    """
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in dataloader:

            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(X_batch)
            loss = criterion(logits, y_batch)

            total_loss += loss.item() * X_batch.size(0)
            
            preds = logits.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(y_batch.cpu().tolist())

    avg_loss = total_loss / len(dataloader.dataset)
    avg_acc = accuracy_score(all_labels, all_preds)
    
    
    # Compute precision and recall for class 1
    precision_1 = precision_score(all_labels, all_preds, pos_label=1)
    recall_1 = recall_score(all_labels, all_preds, pos_label=1)
    
    return avg_loss, avg_acc, precision_1, recall_1

In [6]:
def prepare_texts_and_labels(df):
    texts = df['text'].tolist()
    labels = (df['label'] == 1).astype(int).tolist()
    labels_tensor = torch.tensor(labels, dtype=torch.long)
    return texts, labels, labels_tensor

def word_tokenizer(texts, vocab_size):

    def tokenizer(text):
        tokens = word_tokenize(text)
        return tokens

    token_counts = Counter()

    for text in texts:
        tokens = tokenizer(text)
        token_counts.update(tokens)
         
    most_common = token_counts.most_common(vocab_size - 2)
    vocab = {
        '<pad>': 0,
        '<unk>': 1
    }

    for idx, (token, _) in enumerate(most_common, start=2):
        vocab[token] = idx

    itos = {idx: token for token, idx in vocab.items()}

    return vocab, tokenizer, itos
  
def text_to_indices(text, vocab, tokenizer):
    indices = [vocab.get(token, vocab['<unk>']) for token in tokenizer(text)]  
    return torch.tensor(indices, dtype=torch.long)

def pad_sequence_with_maxlen(sequences, max_len, padding_value=0, dtype=torch.long, batch_first=True):
    """
    Pads or truncates a list of 1D tensors to a fixed maximum length.

    Args:
        sequences (list of torch.Tensor): List of 1D tensors (variable-length sequences).
        max_len (int): Desired maximum sequence length.
        padding_value (int, optional): Value used for padding shorter sequences. Default is 0.
        dtype (torch.dtype, optional): Data type of the output tensor. Default is torch.long.
        batch_first (bool, optional): If True, output shape is [batch_size, max_len].
                                      If False, output shape is [max_len, batch_size].
                                      Default is True.

    Returns:
        torch.Tensor: A tensor where all sequences are either truncated or padded to max_len.
    """
    processed = []
    for seq in sequences:
        seq = seq[:max_len]  # truncate if longer than max_len
        if len(seq) < max_len:
            pad_size = max_len - len(seq)
            seq = torch.cat([seq, torch.full((pad_size,), padding_value, dtype=dtype)])
        processed.append(seq)

    result = torch.stack(processed)

    if not batch_first:
        result = result.transpose(0, 1)  # shape: [max_len, batch_size]

    return result

In [7]:
def prepare_data_pipeline(df_train, df_test, vocab_size, max_len, batch):
    """
    Prepares train/val/test loaders from a DataFrame with tokenized_clean_text.

    Args:
        df (pd.DataFrame): Must contain 'tokenized_clean_text' and 'label'
        vocab_size (int): Max vocabulary size
        max_len (int): Max sequence length for padding
        batch (int): Batch size

    Returns:
        train_loader, val_loader, test_loader, vocab, tokenizer, itos
    """
    # --- Split test data to val and test---
    df_val, df_test = train_test_split(
       df_test, test_size=0.5, random_state=42, stratify=df_test['label'])
    
    
    x_train, x_val, x_test = df_train['text'], df_val['text'], df_test['text']
    y_train, y_val, y_test = df_train['label'], df_val['label'], df_test['label']
  

    # --- Convert to list of lists ---
    train_texts = list(x_train)
    val_texts   = list(x_val)
    test_texts  = list(x_test)

    # --- Build vocabulary ---
    counter = Counter(word for tokens in train_texts for word in tokens)
    most_common = counter.most_common(vocab_size-2)  # reserve <pad> & <unk>
    itos = ["<pad>", "<unk>"] + [w for w, _ in most_common]
    stoi = {w: i for i, w in enumerate(itos)}
    vocab = stoi

    # --- Text to indices (vectorized) ---
    def tokens_to_indices(tokens):
        return [vocab.get(w, vocab["<unk>"]) for w in tokens]

    train_sequences = [tokens_to_indices(tokens) for tokens in train_texts]
    val_sequences   = [tokens_to_indices(tokens) for tokens in val_texts]
    test_sequences  = [tokens_to_indices(tokens) for tokens in test_texts]

    # --- Padding using PyTorch ---
    PAD_INDEX = vocab["<pad>"]

    def pad_sequences(sequences, max_len):
        return torch.tensor([
            seq[:max_len] + [PAD_INDEX]*(max_len - len(seq)) if len(seq)<max_len else seq[:max_len]
            for seq in sequences
        ], dtype=torch.long)

    train_sequences = pad_sequences(train_sequences, max_len)
    val_sequences   = pad_sequences(val_sequences, max_len)
    test_sequences  = pad_sequences(test_sequences, max_len)

    # --- Labels to tensor ---
    y_train_tensor = torch.tensor(list(y_train), dtype=torch.long)
    y_val_tensor   = torch.tensor(list(y_val), dtype=torch.long)
    y_test_tensor  = torch.tensor(list(y_test), dtype=torch.long)

    # --- Create DataLoaders ---
    from torch.utils.data import TensorDataset, DataLoader

    train_loader = DataLoader(TensorDataset(train_sequences, y_train_tensor), batch_size=batch, shuffle=True)
    val_loader   = DataLoader(TensorDataset(val_sequences, y_val_tensor), batch_size=batch)
    test_loader  = DataLoader(TensorDataset(test_sequences, y_test_tensor), batch_size=batch)

    print("Data pipeline preparation completed.")

    return train_loader, val_loader, test_loader, vocab, stoi, itos


In [8]:
def run_lstm_pipeline(
    df_train,
    df_test,
    vocab_size=5000,
    max_len=64,
    embedding_type='glove',  
    embedding_dim=100,       
    batch_size=64,
    num_layers=1,
    hidden_dim=128,
    dropout=0.2,
    embedding_dropout=0.1,
    freeze_embeddings=True
):

    os.makedirs("models", exist_ok=True)
    save_path = os.path.join("models", f"best_lstm_{embedding_type}_model.pt")
    
    # Loaders
    train_loader, val_loader, test_loader, vocab, tokenizer, itos = prepare_data_pipeline(
        df_train = df_train,
        df_test = df_test,
        vocab_size = vocab_size,
        max_len = max_len,
        batch = batch_size)
    
    # Embedding
    embedding_matrix = build_embedding_matrix(
    vocab=vocab,
    itos=itos,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    embedding_type=embedding_type)
    
    # Create the model
    model = LSTMClassifier(
        vocab_size=vocab_size,
        embed_dim=embedding_dim,
        hidden_dim=hidden_dim,
        num_classes=2,
        pad_idx=vocab["<pad>"],
        num_layers=num_layers,
        bidirectional=True,
        dropout=dropout,
        embedding_dropout = embedding_dropout,
        embedding_matrix=embedding_matrix,
        freeze_embeddings = freeze_embeddings
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()  
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    #scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
    
    # Training and Evaluation Loops
    EPOCHS = 15
    best_val_acc = 0.0
    patience, counter = 5, 0 

    for epoch in range(EPOCHS):
        start_time = time.time()

        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
        val_loss, val_acc, val_prec, val_rec = evaluate_func(model, val_loader, criterion, device)

        epoch_time = time.time() - start_time

        if val_acc > best_val_acc:  # improvement
            best_val_acc = val_acc
            counter = 0
            torch.save(model.state_dict(), save_path)
        else:                       # no improvement
            counter += 1
            if counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

        print(f"Epoch {epoch+1}/{EPOCHS} | "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f} | "
              f"Val Precision: {val_prec:.4f}, Val Recall: {val_rec:.4f} | "
              f"Time: {epoch_time:.2f}s")
        #scheduler.step()

    # Load best model and evaluate on test set
    model.load_state_dict(torch.load (save_path))
    test_loss, test_acc, test_prec, test_rec = evaluate_func(model, test_loader, criterion, device)
    print("\nbest lstm with " + embedding_type + " embedding result on test:")
    print({test_acc:.4})
    

In [9]:
# Hyperparameters for model
NUM_LAYERS = 2
HIDDEN_DIM = 256 
DROPOUT = 0.3

Train & Evaluate

IMDB

Random Embedding

In [10]:
# Data and Embeddings Parameters
df_train = pd.read_csv(config.IMDB_TRAIN_PATH)
df_test = pd.read_csv(config.IMDB_TEST_PATH)
vocab_size = 8000
max_len = 256
batch_size = 32
embedding_type = 'random' # 'glove', 'w2v' or 'random'
embedding_dim = 256  
embedding_dropout = 0.1
freeze_embedding = False

In [11]:
run_lstm_pipeline(
    df_train=df_train,
    df_test=df_test,
    vocab_size=vocab_size,
    max_len=max_len,
    embedding_type=embedding_type,
    embedding_dim=embedding_dim,
    batch_size=batch_size,
    num_layers=NUM_LAYERS,
    hidden_dim=HIDDEN_DIM,
    dropout=DROPOUT,
    embedding_dropout=embedding_dropout,
    freeze_embeddings=freeze_embedding
)

Data pipeline preparation completed.
Epoch 1/15 | Train Loss: 0.6361, Train Acc: 0.6246 | Val Loss: 0.5502, Val Acc: 0.7060 | Val Precision: 0.6709, Val Recall: 0.8088 | Time: 35.11s
Epoch 2/15 | Train Loss: 0.5467, Train Acc: 0.7131 | Val Loss: 0.5403, Val Acc: 0.7230 | Val Precision: 0.6705, Val Recall: 0.8768 | Time: 33.44s
Epoch 3/15 | Train Loss: 0.5105, Train Acc: 0.7399 | Val Loss: 0.5229, Val Acc: 0.7348 | Val Precision: 0.8051, Val Recall: 0.6196 | Time: 33.37s
Epoch 4/15 | Train Loss: 0.4793, Train Acc: 0.7641 | Val Loss: 0.4923, Val Acc: 0.7578 | Val Precision: 0.7181, Val Recall: 0.8488 | Time: 33.35s
Epoch 5/15 | Train Loss: 0.4500, Train Acc: 0.7832 | Val Loss: 0.4696, Val Acc: 0.7698 | Val Precision: 0.7501, Val Recall: 0.8092 | Time: 33.35s
Epoch 6/15 | Train Loss: 0.4267, Train Acc: 0.7963 | Val Loss: 0.5055, Val Acc: 0.7580 | Val Precision: 0.7163, Val Recall: 0.8544 | Time: 33.39s
Epoch 7/15 | Train Loss: 0.4062, Train Acc: 0.8106 | Val Loss: 0.5089, Val Acc: 0.7588 

GLOVE Embedding

In [12]:
embedding_type = 'glove'
embedding_dim = 300  
embedding_dropout = 0.1
freeze_embedding = True

run_lstm_pipeline(
    df_train=df_train,
    df_test=df_test,
    vocab_size=vocab_size,
    max_len=max_len,
    embedding_type=embedding_type,
    embedding_dim=embedding_dim,
    batch_size=batch_size,
    num_layers=NUM_LAYERS,
    hidden_dim=HIDDEN_DIM,
    dropout=DROPOUT,
    embedding_dropout=embedding_dropout,
    freeze_embeddings=freeze_embedding
)

Data pipeline preparation completed.
Epoch 1/15 | Train Loss: 0.7174, Train Acc: 0.5041 | Val Loss: 0.6936, Val Acc: 0.5110 | Val Precision: 0.5064, Val Recall: 0.8740 | Time: 32.96s
Epoch 2/15 | Train Loss: 0.7017, Train Acc: 0.5101 | Val Loss: 0.6874, Val Acc: 0.5524 | Val Precision: 0.5626, Val Recall: 0.4708 | Time: 32.84s
Epoch 3/15 | Train Loss: 0.6904, Train Acc: 0.5413 | Val Loss: 0.6844, Val Acc: 0.5430 | Val Precision: 0.5259, Val Recall: 0.8728 | Time: 32.86s
Epoch 4/15 | Train Loss: 0.6674, Train Acc: 0.5887 | Val Loss: 0.6331, Val Acc: 0.6406 | Val Precision: 0.6231, Val Recall: 0.7116 | Time: 32.88s
Epoch 5/15 | Train Loss: 0.5962, Train Acc: 0.6686 | Val Loss: 0.5617, Val Acc: 0.6924 | Val Precision: 0.6610, Val Recall: 0.7900 | Time: 32.85s
Epoch 6/15 | Train Loss: 0.5422, Train Acc: 0.7152 | Val Loss: 0.5307, Val Acc: 0.7234 | Val Precision: 0.7606, Val Recall: 0.6520 | Time: 32.82s
Epoch 7/15 | Train Loss: 0.5084, Train Acc: 0.7414 | Val Loss: 0.5025, Val Acc: 0.7450 

W2V Embedding

In [13]:
embedding_type = 'w2v'
embedding_dim = 256  
embedding_dropout = 0.1
freeze_embedding = False

run_lstm_pipeline(
    df_train=df_train,
    df_test=df_test,
    vocab_size=vocab_size,
    max_len=max_len,
    embedding_type=embedding_type,
    embedding_dim=embedding_dim,
    batch_size=batch_size,
    num_layers=NUM_LAYERS,
    hidden_dim=HIDDEN_DIM,
    dropout=DROPOUT,
    embedding_dropout=embedding_dropout,
    freeze_embeddings=freeze_embedding
)

Data pipeline preparation completed.
Epoch 1/15 | Train Loss: 0.6822, Train Acc: 0.5571 | Val Loss: 0.6106, Val Acc: 0.6448 | Val Precision: 0.6381, Val Recall: 0.6692 | Time: 33.33s
Epoch 2/15 | Train Loss: 0.5817, Train Acc: 0.6781 | Val Loss: 0.5541, Val Acc: 0.6974 | Val Precision: 0.7506, Val Recall: 0.5912 | Time: 33.35s
Epoch 3/15 | Train Loss: 0.5393, Train Acc: 0.7141 | Val Loss: 0.5310, Val Acc: 0.7254 | Val Precision: 0.7605, Val Recall: 0.6580 | Time: 33.35s
Epoch 4/15 | Train Loss: 0.5068, Train Acc: 0.7422 | Val Loss: 0.5031, Val Acc: 0.7532 | Val Precision: 0.7390, Val Recall: 0.7828 | Time: 33.36s
Epoch 5/15 | Train Loss: 0.4793, Train Acc: 0.7621 | Val Loss: 0.5288, Val Acc: 0.7216 | Val Precision: 0.8354, Val Recall: 0.5520 | Time: 33.35s
Epoch 6/15 | Train Loss: 0.4556, Train Acc: 0.7775 | Val Loss: 0.4744, Val Acc: 0.7676 | Val Precision: 0.7374, Val Recall: 0.8312 | Time: 33.39s
Epoch 7/15 | Train Loss: 0.4288, Train Acc: 0.7961 | Val Loss: 0.4702, Val Acc: 0.7710 

Rotten Tomatoes 

Random Embedding

In [14]:
# Data and Embeddings Parameters
df_train = pd.read_csv(config.RT_TRAIN_PATH)
df_test = pd.read_csv(config.RT_TEST_PATH)
vocab_size = 5000
max_len = 128
batch_size = 32
embedding_type = 'random' # 'glove', 'w2v' or 'random'
embedding_dim = 256  
embedding_dropout = 0.1
freeze_embedding = False

run_lstm_pipeline(
    df_train=df_train,
    df_test=df_test,
    vocab_size=vocab_size,
    max_len=max_len,
    embedding_type=embedding_type,
    embedding_dim=embedding_dim,
    batch_size=batch_size,
    num_layers=NUM_LAYERS,
    hidden_dim=HIDDEN_DIM,
    dropout=DROPOUT,
    embedding_dropout=embedding_dropout,
    freeze_embeddings=freeze_embedding
)

Data pipeline preparation completed.
Epoch 1/15 | Train Loss: 0.7384, Train Acc: 0.5199 | Val Loss: 0.7627, Val Acc: 0.5000 | Val Precision: 0.5000, Val Recall: 1.0000 | Time: 3.88s
Epoch 2/15 | Train Loss: 0.6848, Train Acc: 0.5635 | Val Loss: 0.6676, Val Acc: 0.5807 | Val Precision: 0.7108, Val Recall: 0.2720 | Time: 3.86s
Epoch 3/15 | Train Loss: 0.6582, Train Acc: 0.6104 | Val Loss: 0.6115, Val Acc: 0.6614 | Val Precision: 0.6599, Val Recall: 0.6660 | Time: 3.86s
Epoch 4/15 | Train Loss: 0.6217, Train Acc: 0.6590 | Val Loss: 0.5929, Val Acc: 0.6923 | Val Precision: 0.6881, Val Recall: 0.7036 | Time: 3.86s
Epoch 5/15 | Train Loss: 0.5768, Train Acc: 0.6936 | Val Loss: 0.6460, Val Acc: 0.6454 | Val Precision: 0.8112, Val Recall: 0.3790 | Time: 3.87s
Epoch 6/15 | Train Loss: 0.5261, Train Acc: 0.7306 | Val Loss: 0.5642, Val Acc: 0.7336 | Val Precision: 0.7666, Val Recall: 0.6717 | Time: 3.87s
Epoch 7/15 | Train Loss: 0.4848, Train Acc: 0.7633 | Val Loss: 0.5694, Val Acc: 0.7148 | Val 

GLOVE Embedding

In [15]:
embedding_type = 'glove'
embedding_dim = 300  
embedding_dropout = 0.1
freeze_embedding = True

run_lstm_pipeline(
    df_train=df_train,
    df_test=df_test,
    vocab_size=vocab_size,
    max_len=max_len,
    embedding_type=embedding_type,
    embedding_dim=embedding_dim,
    batch_size=batch_size,
    num_layers=NUM_LAYERS,
    hidden_dim=HIDDEN_DIM,
    dropout=DROPOUT,
    embedding_dropout=embedding_dropout,
    freeze_embeddings=freeze_embedding
)

Data pipeline preparation completed.
Epoch 1/15 | Train Loss: 0.7372, Train Acc: 0.5165 | Val Loss: 0.7047, Val Acc: 0.5019 | Val Precision: 0.5009, Val Recall: 1.0000 | Time: 3.77s
Epoch 2/15 | Train Loss: 0.6986, Train Acc: 0.5295 | Val Loss: 0.6707, Val Acc: 0.6004 | Val Precision: 0.5918, Val Recall: 0.6473 | Time: 3.78s
Epoch 3/15 | Train Loss: 0.6893, Train Acc: 0.5498 | Val Loss: 0.6572, Val Acc: 0.6144 | Val Precision: 0.6282, Val Recall: 0.5610 | Time: 3.78s
Epoch 4/15 | Train Loss: 0.6829, Train Acc: 0.5698 | Val Loss: 0.6460, Val Acc: 0.6341 | Val Precision: 0.6312, Val Recall: 0.6454 | Time: 3.78s
Epoch 5/15 | Train Loss: 0.6589, Train Acc: 0.6129 | Val Loss: 0.6313, Val Acc: 0.6538 | Val Precision: 0.6323, Val Recall: 0.7355 | Time: 3.77s
Epoch 6/15 | Train Loss: 0.6402, Train Acc: 0.6310 | Val Loss: 0.6460, Val Acc: 0.6295 | Val Precision: 0.7518, Val Recall: 0.3865 | Time: 3.77s
Epoch 7/15 | Train Loss: 0.6007, Train Acc: 0.6772 | Val Loss: 0.6017, Val Acc: 0.6698 | Val 

W2V Embedding

In [16]:
embedding_type = 'w2v'
embedding_dim = 256  
embedding_dropout = 0.1
freeze_embedding = False

run_lstm_pipeline(
    df_train=df_train,
    df_test=df_test,
    vocab_size=vocab_size,
    max_len=max_len,
    embedding_type=embedding_type,
    embedding_dim=embedding_dim,
    batch_size=batch_size,
    num_layers=NUM_LAYERS,
    hidden_dim=HIDDEN_DIM,
    dropout=DROPOUT,
    embedding_dropout=embedding_dropout,
    freeze_embeddings=freeze_embedding
)

Data pipeline preparation completed.
Epoch 1/15 | Train Loss: 0.7558, Train Acc: 0.4996 | Val Loss: 0.7077, Val Acc: 0.4775 | Val Precision: 0.4831, Val Recall: 0.6435 | Time: 3.87s
Epoch 2/15 | Train Loss: 0.7053, Train Acc: 0.5140 | Val Loss: 0.6829, Val Acc: 0.5685 | Val Precision: 0.5720, Val Recall: 0.5441 | Time: 3.86s
Epoch 3/15 | Train Loss: 0.7094, Train Acc: 0.5220 | Val Loss: 0.6985, Val Acc: 0.4850 | Val Precision: 0.4906, Val Recall: 0.7842 | Time: 3.86s
Epoch 4/15 | Train Loss: 0.6966, Train Acc: 0.5321 | Val Loss: 0.6960, Val Acc: 0.5347 | Val Precision: 0.6637, Val Recall: 0.1407 | Time: 3.86s
Epoch 5/15 | Train Loss: 0.6914, Train Acc: 0.5547 | Val Loss: 0.6717, Val Acc: 0.5807 | Val Precision: 0.5530, Val Recall: 0.8424 | Time: 3.85s
Epoch 6/15 | Train Loss: 0.6834, Train Acc: 0.5641 | Val Loss: 0.6723, Val Acc: 0.5769 | Val Precision: 0.5610, Val Recall: 0.7073 | Time: 3.87s
Epoch 7/15 | Train Loss: 0.6737, Train Acc: 0.5800 | Val Loss: 0.6487, Val Acc: 0.6191 | Val 