In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Define the file path from your Kaggle input directory
# Make sure the path matches what you see in the "Input" section of your notebook
file_path = '/home/ununtu1-5090/Documents/Abdullah/taqi/emotion-emotion_69k.csv'

# 2. Load the dataset into a pandas DataFrame
print("Loading the dataset...")
df = pd.read_csv(file_path)
print(f"Successfully loaded {len(df)} total dialogues.")

# 3. Clean up the data by parsing the customer utterance
# The 'empathetic_dialogues' column contains both customer and agent parts.
# We need to extract just the customer's line.
def extract_customer_utterance(dialogue):
    # Find the text after "Customer :" and before the final "Agent :"
    try:
        # The prompt is everything after the first newline
        return dialogue.split('\n')[1].replace('Customer :', '').strip()
    except IndexError:
        # Handle cases where the format might be different or empty
        return ""

print("Extracting customer utterances...")
df['customer_utterance'] = df['empathetic_dialogues'].apply(extract_customer_utterance)

# Let's rename the 'labels' column to be more intuitive
df.rename(columns={'labels': 'agent_reply'}, inplace=True)

# Display a sample to verify
print("\nSample of processed data:")
print(df[['Situation', 'emotion', 'customer_utterance', 'agent_reply']].head())


# 4. Split the data into 80% train, 10% validation, and 10% test 
# First, split into 80% training and 20% temporary (for validation + test)
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,      # 20% for temp_df
    random_state=42     # A fixed random state ensures the split is the same every time
)

# Now, split the temporary set into two equal halves (10% validation, 10% test)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,      # 50% of the 20% is 10% of the total
    random_state=42
)

# 5. Verify the size of each split
print("\n--- Dataset Split ---")
print(f"Training set size:   {len(train_df)} ({len(train_df)/len(df):.0%})")
print(f"Validation set size: {len(val_df)} ({len(val_df)/len(df):.0%})")
print(f"Test set size:       {len(test_df)} ({len(test_df)/len(df):.0%})")

Loading the dataset...
Successfully loaded 64636 total dialogues.
Extracting customer utterances...

Sample of processed data:
                                           Situation      emotion  \
0  I remember going to the fireworks with my best...  sentimental   
1  I remember going to the fireworks with my best...  sentimental   
2  I remember going to the fireworks with my best...  sentimental   
3  I remember going to the fireworks with my best...  sentimental   
4  I remember going to the fireworks with my best...  sentimental   

  customer_utterance                                        agent_reply  
0            Agent :  Was this a friend you were in love with, or ju...  
1            Agent :                                Where has she gone?  
2            Agent :  Oh was this something that happened because of...  
3            Agent :                This was a best friend. I miss her.  
4            Agent :                                 We no longer talk.  

--- Dataset S

In [8]:
import re
import string

def normalize_text(text):
    """
    Performs text normalization:
    1. Lowercases the text.
    2. Adds space around punctuation.
    3. Removes extra whitespace.
    """
    text = text.lower()
    # Add a space between words and punctuation marks.
    text = re.sub(f"([{string.punctuation}])", r" \1 ", text)
    # Collapse multiple spaces into a single space and strip leading/trailing whitespace.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# The columns we need to clean
text_columns = ['Situation', 'customer_utterance', 'agent_reply']

print("Normalizing text in all data splits...")

# Apply the normalization function to each dataframe
for df in [train_df, val_df, test_df]:
    for col in text_columns:
        df[col] = df[col].apply(normalize_text)

print("Normalization complete.")

# Display a sample from the training set to see the result
print("\nSample of normalized training data:")
print(train_df[['Situation', 'customer_utterance', 'agent_reply']].head())

Normalizing text in all data splits...
Normalization complete.

Sample of normalized training data:
                                               Situation customer_utterance  \
41481  i had to go buy legos for my nephew the other ...            agent :   
52816                   i have a hot date this weekend !            agent :   
31326               i still believe he can make me proud            agent :   
16889  i didn ' t ask the girl i like to the prom and...            agent :   
18465  i was impressed at this workshop i went to ove...            agent :   

                                             agent_reply  
41481  no just this feeling overcame me that my kids ...  
52816      i am taking to a movie and the olive garden .  
31326                oh nice . what type of presentation  
16889  if you could do it all over again , how would ...  
18465                                 ok cool how was it  


In [9]:
# # 1. Install the tokenizers library (if needed)
!pip install tokenizers -q

import os
from tokenizers import BertWordPieceTokenizer

# Create a directory to save the tokenizer
tokenizer_dir = "./tokenizer"
if not os.path.exists(tokenizer_dir):
    os.makedirs(tokenizer_dir)

# 2. Prepare an iterator from our training data
text_iterator = (
    row
    for row in train_df['Situation']
    + train_df['customer_utterance']
    + train_df['agent_reply']
)

# 3. Initialize and train the tokenizer
custom_tokenizer = BertWordPieceTokenizer(
    clean_text=False,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False
)

print("Training the tokenizer...")
# --- FIX IS HERE ---
# Add the standard BERT special tokens ([CLS], [SEP]) along with the ones we need.
custom_tokenizer.train_from_iterator(
    text_iterator,
    vocab_size=10000,
    min_frequency=2,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[BOS]", "[EOS]"] # Added [CLS] and [SEP]
)

# 4. Save the tokenizer files
custom_tokenizer.save_model(tokenizer_dir)
print(f"Tokenizer saved to {tokenizer_dir}")

# 5. Load the tokenizer and test it
# This part will now work without errors
tokenizer_path = os.path.join(tokenizer_dir, "vocab.txt")
tokenizer = BertWordPieceTokenizer(tokenizer_path, lowercase=False)

# Test with a sample sentence
sample_text = "i remember going to the fireworks ."
encoded = tokenizer.encode(sample_text)

print("\n--- Tokenizer Test ---")
print(f"Original: {sample_text}")
print(f"Tokens: {encoded.tokens}")
print(f"IDs: {encoded.ids}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Training the tokenizer...



Tokenizer saved to ./tokenizer

--- Tokenizer Test ---
Original: i remember going to the fireworks .
Tokens: ['[CLS]', 'i', 'remember', 'going', 'to', 'the', 'fireworks', '.', '[SEP]']
IDs: [2, 51, 750, 274, 121, 124, 6036, 19, 3]


In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tokenizers import BertWordPieceTokenizer

# 1. Load the trained tokenizer from the saved file
tokenizer_path = "./tokenizer/vocab.txt"
tokenizer = BertWordPieceTokenizer(tokenizer_path, lowercase=False)

# Define special token IDs
pad_token_id = tokenizer.token_to_id('[PAD]')
bos_token_id = tokenizer.token_to_id('[BOS]')
eos_token_id = tokenizer.token_to_id('[EOS]')

# 2. Create the custom PyTorch Dataset
class EmpatheticDialoguesDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        # Format the input and target strings
        input_text = f"Emotion: {row['emotion']} | Situation: {row['Situation']} | Customer: {row['customer_utterance']} Agent:"
        target_text = row['agent_reply']

        # Tokenize the texts
        input_encoding = self.tokenizer.encode(input_text)
        target_encoding = self.tokenizer.encode(target_text)

        # Prepare model inputs
        encoder_input = [bos_token_id] + input_encoding.ids + [eos_token_id]
        decoder_input = [bos_token_id] + target_encoding.ids
        label = target_encoding.ids + [eos_token_id]

        # Truncate if necessary
        encoder_input = encoder_input[:self.max_len]
        decoder_input = decoder_input[:self.max_len-1]
        label = label[:self.max_len-1]

        return {
            "encoder_input": torch.tensor(encoder_input, dtype=torch.long),
            "decoder_input": torch.tensor(decoder_input, dtype=torch.long),
            "label": torch.tensor(label, dtype=torch.long)
        }

# 3. Create a collate function for padding
def collate_fn(batch):
    encoder_inputs = [item['encoder_input'] for item in batch]
    decoder_inputs = [item['decoder_input'] for item in batch]
    labels = [item['label'] for item in batch]

    encoder_inputs_padded = pad_sequence(encoder_inputs, batch_first=True, padding_value=pad_token_id)
    decoder_inputs_padded = pad_sequence(decoder_inputs, batch_first=True, padding_value=pad_token_id)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=pad_token_id)

    return {
        "encoder_input": encoder_inputs_padded,
        "decoder_input": decoder_inputs_padded,
        "label": labels_padded
    }

# 4. Instantiate Datasets and DataLoaders
print("Creating Datasets and DataLoaders...")
train_dataset = EmpatheticDialoguesDataset(train_df, tokenizer)
val_dataset = EmpatheticDialoguesDataset(val_df, tokenizer)

BATCH_SIZE = 32

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print("DataLoaders are ready.")

# 5. Test the DataLoader
print("\n--- Testing one batch from the train_dataloader ---")
sample_batch = next(iter(train_dataloader))
print("Encoder input shape:", sample_batch['encoder_input'].shape)
print("Decoder input shape:", sample_batch['decoder_input'].shape)
print("Label shape:", sample_batch['label'].shape)

Creating Datasets and DataLoaders...


NameError: name 'train_df' is not defined

In [2]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000, dropout=0.1):
        """
        Initializes the PositionalEncoding module.

        Args:
            d_model (int): The embedding dimension of the model.
            max_len (int): The maximum possible length of a sequence.
            dropout (float): The dropout rate.
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Create a positional encoding matrix of shape (max_len, d_model)
        pe = torch.zeros(max_len, d_model)

        # Create a position tensor of shape (max_len, 1)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # Calculate the division term for the sine and cosine functions
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Apply sine to even indices in the array; 2i
        pe[:, 0::2] = torch.sin(position * div_term)
        # Apply cosine to odd indices in the array; 2i+1
        pe[:, 1::2] = torch.cos(position * div_term)

        # Add a batch dimension to the positional encoding matrix so it can be added to the input embeddings
        pe = pe.unsqueeze(0) # Shape: (1, max_len, d_model)

        # Register 'pe' as a buffer. Buffers are part of the model's state,
        # but they are not considered parameters to be trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Adds positional encoding to the input tensor.

        Args:
            x (torch.Tensor): The input tensor (embeddings) of shape (batch_size, seq_len, d_model).

        Returns:
            torch.Tensor: The output tensor with positional information added.
        """
        # Add the positional encoding to the input embeddings.
        # x.size(1) is the sequence length of the current batch.
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

print("PositionalEncoding class defined.")

PositionalEncoding class defined.


In [3]:
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, h, dropout=0.1):
        """
        Initializes the MultiHeadAttention module.

        Args:
            d_model (int): The embedding dimension of the model.
            h (int): The number of attention heads.
            dropout (float): The dropout rate.
        """
        super(MultiHeadAttention, self).__init__()
        assert d_model % h == 0, "d_model must be divisible by h"

        self.d_model = d_model
        self.h = h
        self.d_k = d_model // h # Dimension of keys/queries/values per head

        # Linear layers for Query, Key, Value, and the final output
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        """
        Performs the forward pass for multi-head attention.

        Args:
            query (torch.Tensor): Query tensor, shape (batch_size, seq_len_q, d_model)
            key (torch.Tensor): Key tensor, shape (batch_size, seq_len_k, d_model)
            value (torch.Tensor): Value tensor, shape (batch_size, seq_len_v, d_model)
            mask (torch.Tensor, optional): Mask to prevent attention to certain positions.

        Returns:
            torch.Tensor: The output of the multi-head attention, shape (batch_size, seq_len_q, d_model)
        """
        batch_size = query.size(0)

        # 1. Linearly project and split into h heads
        q = self.w_q(query).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        k = self.w_k(key).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        v = self.w_v(value).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        # Shape of q, k, v is now (batch_size, h, seq_len, d_k)

        # 2. Calculate attention scores
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)

        # 3. Apply mask (if provided)
        if mask is not None:
            # The mask needs to be broadcastable to the scores shape
            scores = scores.masked_fill(mask == 0, -1e9)

        # 4. Apply softmax to get attention weights
        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)

        # 5. Multiply weights by values
        context = torch.matmul(attention_weights, v) # (batch_size, h, seq_len_q, d_k)

        # 6. Concatenate heads and apply final linear layer
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.w_o(context)

        return output

print("MultiHeadAttention class defined.")

MultiHeadAttention class defined.


In [4]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        """
        Initializes the PositionwiseFeedForward module.

        Args:
            d_model (int): The embedding dimension of the model.
            d_ff (int): The dimension of the inner feed-forward layer.
                        A common value is d_model * 4.
            dropout (float): The dropout rate.
        """
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """
        Performs the forward pass.

        Args:
            x (torch.Tensor): The input tensor, shape (batch_size, seq_len, d_model)

        Returns:
            torch.Tensor: The output tensor, shape (batch_size, seq_len, d_model)
        """
        x = F.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x

print("PositionwiseFeedForward class defined.")

PositionwiseFeedForward class defined.


In [5]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, h, d_ff, dropout):
        """
        Initializes a single EncoderLayer.
        """
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, h, dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """
        Forward pass for the EncoderLayer.
        src: input tensor (batch_size, src_len, d_model)
        src_mask: mask for the source sequence
        """
        # Sub-layer 1: Multi-head attention
        attn_output = self.self_attn(src, src, src, src_mask)
        # Residual connection and layer normalization
        src = self.norm1(src + self.dropout(attn_output))

        # Sub-layer 2: Position-wise feed-forward network
        ff_output = self.feed_forward(src)
        # Residual connection and layer normalization
        src = self.norm2(src + self.dropout(ff_output))

        return src

class DecoderLayer(nn.Module):
    def __init__(self, d_model, h, d_ff, dropout):
        """
        Initializes a single DecoderLayer.
        """
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, h, dropout)
        self.cross_attn = MultiHeadAttention(d_model, h, dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt, encoder_output, tgt_mask, src_mask):
        """
        Forward pass for the DecoderLayer.
        tgt: target tensor (batch_size, tgt_len, d_model)
        encoder_output: output from the encoder (batch_size, src_len, d_model)
        tgt_mask: mask for the target sequence (causal mask)
        src_mask: mask for the source sequence
        """
        # Sub-layer 1: Masked multi-head self-attention
        attn_output = self.self_attn(tgt, tgt, tgt, tgt_mask)
        tgt = self.norm1(tgt + self.dropout(attn_output))

        # Sub-layer 2: Multi-head cross-attention
        # Query comes from the decoder, Key and Value from the encoder
        cross_attn_output = self.cross_attn(tgt, encoder_output, encoder_output, src_mask)
        tgt = self.norm2(tgt + self.dropout(cross_attn_output))

        # Sub-layer 3: Position-wise feed-forward network
        ff_output = self.feed_forward(tgt)
        tgt = self.norm3(tgt + self.dropout(ff_output))

        return tgt

print("EncoderLayer and DecoderLayer classes defined.")

EncoderLayer and DecoderLayer classes defined.


In [6]:
# First, let's get our vocabulary size from the tokenizer, which is needed for the model
vocab_size = tokenizer.get_vocab_size()

class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, d_ff, dropout):
        super(Transformer, self).__init__()
        self.d_model = d_model

        # Embedding layers for source and target
        self.src_embedding = nn.Embedding(vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout=dropout)

        # Encoder and Decoder stacks
        self.encoder_layers = nn.ModuleList(
            [EncoderLayer(d_model, nhead, d_ff, dropout) for _ in range(num_encoder_layers)]
        )
        self.decoder_layers = nn.ModuleList(
            [DecoderLayer(d_model, nhead, d_ff, dropout) for _ in range(num_decoder_layers)]
        )

        # Final linear layer to project to vocab size
        self.fc_out = nn.Linear(d_model, vocab_size)

    def generate_square_subsequent_mask(self, sz):
        """Generates a square causal mask for the decoder."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def create_padding_mask(self, seq, pad_token_id):
        """Creates a boolean mask for padding tokens."""
        # seq shape: (batch_size, seq_len)
        # mask shape: (batch_size, 1, 1, seq_len) to be broadcastable
        return (seq == pad_token_id).unsqueeze(1).unsqueeze(2)

    def forward(self, src, tgt, pad_token_id):
        # src shape: (batch_size, src_len)
        # tgt shape: (batch_size, tgt_len)

        # 1. Create masks
        src_padding_mask = self.create_padding_mask(src, pad_token_id)
        tgt_padding_mask = self.create_padding_mask(tgt, pad_token_id)

        # The causal mask needs to be combined with the padding mask for the decoder's self-attention
        tgt_len = tgt.size(1)
        device = tgt.device
        tgt_causal_mask = self.generate_square_subsequent_mask(tgt_len).to(device)

        # The combined decoder mask prevents both looking ahead and attending to padding tokens
        # We take the maximum of the two masks (since one is boolean and one is float)
        # Or more simply, we can just add them since padding is 0 and causal is -inf
        combined_tgt_mask = tgt_padding_mask | (tgt_causal_mask == float('-inf'))


        # 2. Process inputs with embeddings and positional encoding
        src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))
        tgt_emb = self.pos_encoder(self.tgt_embedding(tgt) * math.sqrt(self.d_model))

        # 3. Pass through the encoder stack
        encoder_output = src_emb
        for encoder_layer in self.encoder_layers:
            encoder_output = encoder_layer(encoder_output, src_padding_mask)

        # 4. Pass through the decoder stack
        decoder_output = tgt_emb
        for decoder_layer in self.decoder_layers:
            decoder_output = decoder_layer(decoder_output, encoder_output, combined_tgt_mask, src_padding_mask)

        # 5. Final output layer
        output = self.fc_out(decoder_output)
        return output

# Now, let's instantiate the model with the suggested hyperparameters from the PDF
# Now, let's instantiate the model with the suggested hyperparameters from the PDF
D_MODEL = 512        # Embedding dimension
NHEAD = 2            # Number of heads
NUM_ENCODER_LAYERS = 2
NUM_DECODER_LAYERS = 2
D_FF = D_MODEL * 4   # Inner feed-forward dimension, a common choice
DROPOUT = 0.1

model = Transformer(
    vocab_size=vocab_size,
    d_model=D_MODEL,
    nhead=NHEAD,
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    d_ff=D_FF,
    dropout=DROPOUT
)

print("Transformer model defined and instantiated successfully!")
# Print the total number of trainable parameters
print(f"Model Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

Transformer model defined and instantiated successfully!
Model Parameters: 30,082,832


In [7]:
# 1. Set the device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the selected device
model.to(device)

# 2. Define the optimizer as specified in the project PDF
# [cite_start]Adam optimizer with betas=(0.9, 0.98) and a learning rate of 1e-4. [cite: 44, 45]
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)

# 3. Define the loss function
# We use CrossEntropyLoss, which is standard for classification tasks like predicting the next token.
# We must ignore the padding token's index so it doesn't contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)

print("Optimizer and loss function are ready.")

Using device: cpu
Optimizer and loss function are ready.


In [None]:
import time
from tqdm import tqdm

def train_epoch(model, dataloader, optimizer, criterion, device, pad_token_id):
    """
    Performs one full training epoch.
    """
    model.train() # Set the model to training mode
    total_loss = 0
    
    # Use tqdm for a progress bar
    for batch in tqdm(dataloader, desc="Training"):
        # Move batch to the correct device
        src = batch['encoder_input'].to(device)
        tgt = batch['decoder_input'].to(device)
        labels = batch['label'].to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass - teacher forcing is used here
        output = model(src, tgt, pad_token_id)
        
        # Reshape for the loss function
        # Output: (batch_size, tgt_len, vocab_size) -> (batch_size * tgt_len, vocab_size)
        # Labels: (batch_size, tgt_len) -> (batch_size * tgt_len)
        output_flat = output.view(-1, output.shape[-1])
        labels_flat = labels.view(-1)

        # Calculate the loss
        loss = criterion(output_flat, labels_flat)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

# def evaluate(model, dataloader, criterion, device, pad_token_id):
#     """
#     Evaluates the model on the validation set.
#     """
#     model.eval() # Set the model to evaluation mode
#     total_loss = 0

#     with torch.no_grad(): # No need to calculate gradients during evaluation
#         for batch in tqdm(dataloader, desc="Validating"):
#             # Move batch to the correct device
#             src = batch['encoder_input'].to(device)
#             tgt = batch['decoder_input'].to(device)
#             labels = batch['label'].to(device)

#             # Forward pass
#             output = model(src, tgt, pad_token_id)
            
#             # Reshape for the loss function
#             output_flat = output.view(-1, output.shape[-1])
#             labels_flat = labels.view(-1)
            
#             # Calculate the loss
#             loss = criterion(output_flat, labels_flat)
#             total_loss += loss.item()

#     return total_loss / len(dataloader)

# # --- Main Training Loop ---
# NUM_EPOCHS = 5 # Start with a few epochs to see how it goes
# best_val_loss = float('inf')

# print("Starting training...")

# for epoch in range(1, NUM_EPOCHS + 1):
#     start_time = time.time()
    
#     train_loss = train_epoch(model, train_dataloader, optimizer, criterion, device, pad_token_id)
#     val_loss = evaluate(model, val_dataloader, criterion, device, pad_token_id)
    
#     end_time = time.time()
#     epoch_mins, epoch_secs = divmod(end_time - start_time, 60)
    
#     print(f"\nEpoch: {epoch:02} | Time: {epoch_mins:.0f}m {epoch_secs:.0f}s")
#     print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")
#     print(f"\t Val. Loss: {val_loss:.3f} |  Val. PPL: {math.exp(val_loss):7.3f}")

#     # Save the best model
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         torch.save(model.state_dict(), 'best_model.pt')
#         print("\t-> Saved best model (based on validation loss)")

# print("\nTraining complete.")

Starting training...


Training: 100%|██████████| 1616/1616 [00:23<00:00, 68.26it/s]
Validating: 100%|██████████| 202/202 [00:01<00:00, 143.13it/s]



Epoch: 01 | Time: 0m 25s
	Train Loss: 3.017 | Train PPL:  20.426
	 Val. Loss: 2.026 |  Val. PPL:   7.585
	-> Saved best model (based on validation loss)


Training: 100%|██████████| 1616/1616 [00:23<00:00, 69.11it/s]
Validating: 100%|██████████| 202/202 [00:01<00:00, 146.47it/s]



Epoch: 02 | Time: 0m 25s
	Train Loss: 1.927 | Train PPL:   6.867
	 Val. Loss: 1.583 |  Val. PPL:   4.871
	-> Saved best model (based on validation loss)


Training: 100%|██████████| 1616/1616 [00:23<00:00, 69.06it/s]
Validating: 100%|██████████| 202/202 [00:01<00:00, 143.53it/s]



Epoch: 03 | Time: 0m 25s
	Train Loss: 1.590 | Train PPL:   4.905
	 Val. Loss: 1.362 |  Val. PPL:   3.904
	-> Saved best model (based on validation loss)


Training: 100%|██████████| 1616/1616 [00:23<00:00, 68.39it/s]
Validating: 100%|██████████| 202/202 [00:01<00:00, 142.45it/s]



Epoch: 04 | Time: 0m 25s
	Train Loss: 1.390 | Train PPL:   4.016
	 Val. Loss: 1.222 |  Val. PPL:   3.393
	-> Saved best model (based on validation loss)


Training: 100%|██████████| 1616/1616 [00:23<00:00, 68.83it/s]
Validating: 100%|██████████| 202/202 [00:01<00:00, 144.57it/s]



Epoch: 05 | Time: 0m 25s
	Train Loss: 1.255 | Train PPL:   3.508
	 Val. Loss: 1.115 |  Val. PPL:   3.050
	-> Saved best model (based on validation loss)

Training complete.


In [8]:
def greedy_decode(model, src, src_mask, max_len, start_symbol, device):
    src = src.to(device)
    src_mask = src_mask.to(device)

    with torch.no_grad():
        encoder_output = model.src_embedding(src) * math.sqrt(D_MODEL)
        encoder_output = model.pos_encoder(encoder_output)
        for layer in model.encoder_layers:
            encoder_output = layer(encoder_output, src_mask)

    # Start with the [BOS] token
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)

    for i in range(max_len - 1):
        with torch.no_grad():
            tgt_mask = (model.generate_square_subsequent_mask(ys.size(1))
                        .type(torch.bool)).to(device)

            tgt_emb = model.tgt_embedding(ys) * math.sqrt(D_MODEL)
            tgt_emb = model.pos_encoder(tgt_emb)

            decoder_output = tgt_emb
            for layer in model.decoder_layers:
                decoder_output = layer(decoder_output, encoder_output, tgt_mask, None) # No src_mask needed in cross-attn here

            prob = model.fc_out(decoder_output[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
        if next_word == eos_token_id:
            break
    return ys

In [9]:
!pip install sacrebleu rouge-score -q
print("Installed sacrebleu and rouge-score.")

Installed sacrebleu and rouge-score.


  DEPRECATION: Building 'rouge-score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge-score'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [20]:
import sacrebleu
from rouge_score import rouge_scorer

def evaluate(model, dataloader, criterion, tokenizer, device, pad_token_id, bos_token_id, eos_token_id):
    model.eval()
    total_loss = 0
    
    all_predictions = []
    all_references = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating"):
            src = batch['encoder_input'].to(device)
            tgt = batch['decoder_input'].to(device)
            labels = batch['label'].to(device)

            # --- Loss Calculation (same as before) ---
            output = model(src, tgt, pad_token_id)
            output_flat = output.view(-1, output.shape[-1])
            labels_flat = labels.view(-1)
            loss = criterion(output_flat, labels_flat)
            total_loss += loss.item()

            # --- Metric Calculation ---
            # Generate predictions for each item in the batch
            for i in range(src.size(0)):
                input_seq = src[i:i+1] # Shape: (1, seq_len)
                label_seq = labels[i]  # Shape: (seq_len)

                src_padding_mask = model.create_padding_mask(input_seq, pad_token_id)
                
                # Generate a prediction using greedy decoding
                prediction_ids = greedy_decode(model, input_seq, src_padding_mask, max_len=50, start_symbol=bos_token_id, device=device)
                
                # Decode prediction and label back to text
                pred_text = tokenizer.decode(prediction_ids.squeeze(0).tolist(), skip_special_tokens=True)
                ref_text = tokenizer.decode(label_seq.tolist(), skip_special_tokens=True)
                
                all_predictions.append(pred_text)
                all_references.append([ref_text]) # sacrebleu expects a list of references

    # Calculate metrics over the entire validation set
    bleu = sacrebleu.corpus_bleu(all_predictions, all_references)
    chrf = sacrebleu.corpus_chrf(all_predictions, all_references)
    
    # ROUGE-L
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l_sum = 0
    for pred, ref in zip(all_predictions, all_references):
        scores = scorer.score(ref[0], pred)
        rouge_l_sum += scores['rougeL'].fmeasure
    rouge_l_avg = rouge_l_sum / len(all_predictions)

    return total_loss / len(dataloader), bleu.score, chrf.score, rouge_l_avg * 100


# # --- New Main Training Loop ---
# NUM_EPOCHS = 10 # Let's train for a bit longer
# best_bleu_score = -1.0

# print("\nStarting training with full metric evaluation...")

# for epoch in range(1, NUM_EPOCHS + 1):
#     start_time = time.time()
    
#     train_loss = train_epoch(model, train_dataloader, optimizer, criterion, device, pad_token_id)
#     val_loss, bleu, chrf, rouge_l = evaluate(model, val_dataloader, criterion, tokenizer, device, pad_token_id, bos_token_id, eos_token_id)
    
#     end_time = time.time()
#     epoch_mins, epoch_secs = divmod(end_time - start_time, 60)
    
#     print(f"\nEpoch: {epoch:02} | Time: {epoch_mins:.0f}m {epoch_secs:.0f}s")
#     print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")
#     print(f"\t Val. Loss: {val_loss:.3f} |  Val. PPL: {math.exp(val_loss):7.3f}")
#     print(f"\t Val. BLEU: {bleu:.2f} | ROUGE-L: {rouge_l:.2f} | chrF: {chrf:.2f}")

#     # Save the best model based on BLEU score
#     if bleu > best_bleu_score:
#         best_bleu_score = bleu
#         torch.save(model.state_dict(), 'best_model_bleu.pt')
#         print(f"\t-> New best model saved with BLEU score: {bleu:.2f}")

# print("\nTraining complete.")

In [11]:
import torch.nn.functional as F

def beam_search_decode(model, src, max_len, start_symbol, device, beam_size=3):
    """
    Performs beam search decoding.
    """
    # Move input to device and create masks
    src = src.to(device)
    src_padding_mask = model.create_padding_mask(src, pad_token_id)

    # --- Step 1: Encode the input sequence ---
    with torch.no_grad():
        encoder_output = model.src_embedding(src) * math.sqrt(D_MODEL)
        encoder_output = model.pos_encoder(encoder_output)
        for layer in model.encoder_layers:
            encoder_output = layer(encoder_output, src_padding_mask)

    # --- Step 2: Start the beam search ---
    # Start with the [BOS] token. Beams are stored as (sequence, score).
    # Scores are log probabilities, so they are negative and we want to maximize them.
    beams = [(torch.tensor([start_symbol], device=device).unsqueeze(0), 0.0)]

    # --- Step 3: Loop until max_len or all beams end with [EOS] ---
    for _ in range(max_len - 1):
        new_beams = []
        all_candidates_ended = True

        for seq, score in beams:
            # If the last token is [EOS], this sequence is finished.
            if seq[0, -1].item() == eos_token_id:
                new_beams.append((seq, score))
                continue
            
            all_candidates_ended = False

            # --- Get predictions for the next word ---
            with torch.no_grad():
                tgt_mask = model.generate_square_subsequent_mask(seq.size(1)).to(device)
                tgt_emb = model.tgt_embedding(seq) * math.sqrt(D_MODEL)
                tgt_emb = model.pos_encoder(tgt_emb)

                decoder_output = tgt_emb
                for layer in model.decoder_layers:
                    decoder_output = layer(decoder_output, encoder_output, tgt_mask, None)

                # Get the log probabilities for the next word
                logits = model.fc_out(decoder_output[:, -1])
                log_probs = F.log_softmax(logits, dim=-1)

            # --- Find the top k words for the current beam ---
            top_log_probs, top_indices = torch.topk(log_probs, beam_size, dim=1)

            # --- Create new candidate sequences ---
            for i in range(beam_size):
                next_word_idx = top_indices[0, i].item()
                next_word_log_prob = top_log_probs[0, i].item()

                new_seq = torch.cat([seq, torch.tensor([[next_word_idx]], device=device)], dim=1)
                new_score = score + next_word_log_prob
                new_beams.append((new_seq, new_score))

        if all_candidates_ended:
            break
            
        # --- Step 4: Prune the beams ---
        # Sort all new candidates by their score and keep only the top k
        beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_size]

    # Return the sequence from the best-scoring beam
    best_seq, _ = beams[0]
    return best_seq

In [22]:
# from torch.optim.lr_scheduler import StepLR

# --- Re-initialize Model and Optimizer ---
# We re-initialize the model to train it from scratch with the new strategy
model = Transformer(
    vocab_size=vocab_size, d_model=D_MODEL, nhead=NHEAD,
    num_encoder_layers=NUM_ENCODER_LAYERS, num_decoder_layers=NUM_DECODER_LAYERS,
    d_ff=D_FF, dropout=DROPOUT
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)

# --- New Main Training Loop ---
NUM_EPOCHS = 15  # Train for more epochs (Method 2)
best_bleu_score = -1.0

print(f"\nStarting training with a larger model (d_model={D_MODEL})...")
print(f"Model Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

print("\nStarting training with more epochs and LR scheduler...")

for epoch in range(1, NUM_EPOCHS + 1):
    start_time = time.time()
    
    train_loss = train_epoch(model, train_dataloader, optimizer, criterion, device, pad_token_id)
    val_loss, bleu, chrf, rouge_l = evaluate(model, val_dataloader, criterion, tokenizer, device, pad_token_id, bos_token_id, eos_token_id)
    
    # --- Step the scheduler ---
    # scheduler.step()

    end_time = time.time()
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)
    
    print(f"\nEpoch: {epoch:02} | Time: {epoch_mins:.0f}m {epoch_secs:.0f}s")
    print(f"\tCurrent LR: {optimizer.param_groups[0]['lr']:.6f}") # Display current learning rate
    print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")
    print(f"\t Val. Loss: {val_loss:.3f} |  Val. PPL: {math.exp(val_loss):7.3f}")
    print(f"\t Val. BLEU: {bleu:.2f} | ROUGE-L: {rouge_l:.2f} | chrF: {chrf:.2f}")

    # Save the best model based on BLEU score
    if bleu > best_bleu_score:
        best_bleu_score = bleu
        # Let's give this improved model a new name
        torch.save(model.state_dict(), 'best_model_improved.pt')
        print(f"\t-> New best model saved with BLEU score: {bleu:.2f}")

print("\nImproved training complete.")


Starting training with a larger model (d_model=512)...
Model Parameters: 30,082,832

Starting training with more epochs and LR scheduler...


Training: 100%|██████████| 1616/1616 [00:23<00:00, 67.93it/s]
Validating: 100%|██████████| 202/202 [08:06<00:00,  2.41s/it]



Epoch: 01 | Time: 8m 32s
	Current LR: 0.000100
	Train Loss: 3.032 | Train PPL:  20.732
	 Val. Loss: 2.021 |  Val. PPL:   7.548
	 Val. BLEU: 2.02 | ROUGE-L: 1.25 | chrF: 9.03
	-> New best model saved with BLEU score: 2.02


Training: 100%|██████████| 1616/1616 [00:23<00:00, 68.55it/s]
Validating: 100%|██████████| 202/202 [08:09<00:00,  2.43s/it]



Epoch: 02 | Time: 8m 35s
	Current LR: 0.000100
	Train Loss: 1.935 | Train PPL:   6.922
	 Val. Loss: 1.564 |  Val. PPL:   4.776
	 Val. BLEU: 2.02 | ROUGE-L: 1.25 | chrF: 9.03


Training: 100%|██████████| 1616/1616 [00:23<00:00, 67.97it/s]
Validating: 100%|██████████| 202/202 [07:39<00:00,  2.28s/it]



Epoch: 03 | Time: 8m 5s
	Current LR: 0.000100
	Train Loss: 1.601 | Train PPL:   4.960
	 Val. Loss: 1.349 |  Val. PPL:   3.854
	 Val. BLEU: 2.02 | ROUGE-L: 1.25 | chrF: 9.03


Training: 100%|██████████| 1616/1616 [00:23<00:00, 68.34it/s]
Validating: 100%|██████████| 202/202 [07:42<00:00,  2.29s/it]



Epoch: 04 | Time: 8m 7s
	Current LR: 0.000100
	Train Loss: 1.400 | Train PPL:   4.054
	 Val. Loss: 1.217 |  Val. PPL:   3.376
	 Val. BLEU: 2.02 | ROUGE-L: 1.25 | chrF: 9.03


Training: 100%|██████████| 1616/1616 [00:23<00:00, 68.59it/s]
Validating: 100%|██████████| 202/202 [08:25<00:00,  2.50s/it]



Epoch: 05 | Time: 8m 50s
	Current LR: 0.000100
	Train Loss: 1.262 | Train PPL:   3.532
	 Val. Loss: 1.125 |  Val. PPL:   3.079
	 Val. BLEU: 2.02 | ROUGE-L: 1.25 | chrF: 9.03


Training: 100%|██████████| 1616/1616 [00:23<00:00, 68.73it/s]
Validating: 100%|██████████| 202/202 [08:47<00:00,  2.61s/it]



Epoch: 06 | Time: 9m 13s
	Current LR: 0.000100
	Train Loss: 1.161 | Train PPL:   3.194
	 Val. Loss: 1.040 |  Val. PPL:   2.829
	 Val. BLEU: 1.27 | ROUGE-L: 2.08 | chrF: 9.00


Training: 100%|██████████| 1616/1616 [00:23<00:00, 68.16it/s]
Validating: 100%|██████████| 202/202 [07:38<00:00,  2.27s/it]



Epoch: 07 | Time: 8m 3s
	Current LR: 0.000100
	Train Loss: 1.077 | Train PPL:   2.934
	 Val. Loss: 1.002 |  Val. PPL:   2.724
	 Val. BLEU: 13.54 | ROUGE-L: 4.75 | chrF: 12.40
	-> New best model saved with BLEU score: 13.54


Training: 100%|██████████| 1616/1616 [00:23<00:00, 68.29it/s]
Validating: 100%|██████████| 202/202 [07:56<00:00,  2.36s/it]



Epoch: 08 | Time: 8m 21s
	Current LR: 0.000100
	Train Loss: 1.015 | Train PPL:   2.759
	 Val. Loss: 0.951 |  Val. PPL:   2.588
	 Val. BLEU: 13.54 | ROUGE-L: 4.75 | chrF: 12.40


Training: 100%|██████████| 1616/1616 [00:22<00:00, 70.51it/s]
Validating: 100%|██████████| 202/202 [08:30<00:00,  2.53s/it]



Epoch: 09 | Time: 8m 54s
	Current LR: 0.000100
	Train Loss: 0.958 | Train PPL:   2.608
	 Val. Loss: 0.920 |  Val. PPL:   2.510
	 Val. BLEU: 13.54 | ROUGE-L: 4.75 | chrF: 12.40


Training: 100%|██████████| 1616/1616 [00:23<00:00, 70.18it/s]
Validating: 100%|██████████| 202/202 [08:43<00:00,  2.59s/it]



Epoch: 10 | Time: 9m 9s
	Current LR: 0.000100
	Train Loss: 0.913 | Train PPL:   2.493
	 Val. Loss: 0.892 |  Val. PPL:   2.440
	 Val. BLEU: 1.80 | ROUGE-L: 1.10 | chrF: 12.60


Training: 100%|██████████| 1616/1616 [00:23<00:00, 70.05it/s]
Validating: 100%|██████████| 202/202 [08:35<00:00,  2.55s/it]



Epoch: 11 | Time: 9m 2s
	Current LR: 0.000100
	Train Loss: 0.876 | Train PPL:   2.401
	 Val. Loss: 0.870 |  Val. PPL:   2.387
	 Val. BLEU: 1.80 | ROUGE-L: 1.10 | chrF: 12.60


Training: 100%|██████████| 1616/1616 [00:22<00:00, 70.39it/s]
Validating: 100%|██████████| 202/202 [08:03<00:00,  2.39s/it]



Epoch: 12 | Time: 8m 29s
	Current LR: 0.000100
	Train Loss: 0.840 | Train PPL:   2.316
	 Val. Loss: 0.842 |  Val. PPL:   2.322
	 Val. BLEU: 1.80 | ROUGE-L: 1.10 | chrF: 12.60


Training: 100%|██████████| 1616/1616 [00:23<00:00, 69.80it/s]
Validating: 100%|██████████| 202/202 [07:14<00:00,  2.15s/it]



Epoch: 13 | Time: 7m 39s
	Current LR: 0.000100
	Train Loss: 0.813 | Train PPL:   2.255
	 Val. Loss: 0.839 |  Val. PPL:   2.314
	 Val. BLEU: 1.80 | ROUGE-L: 1.26 | chrF: 11.33


Training: 100%|██████████| 1616/1616 [00:22<00:00, 70.31it/s]
Validating: 100%|██████████| 202/202 [08:26<00:00,  2.51s/it]



Epoch: 14 | Time: 8m 52s
	Current LR: 0.000100
	Train Loss: 0.778 | Train PPL:   2.178
	 Val. Loss: 0.816 |  Val. PPL:   2.262
	 Val. BLEU: 2.66 | ROUGE-L: 1.42 | chrF: 13.47


Training: 100%|██████████| 1616/1616 [00:22<00:00, 70.45it/s]
Validating: 100%|██████████| 202/202 [07:24<00:00,  2.20s/it]



Epoch: 15 | Time: 7m 50s
	Current LR: 0.000100
	Train Loss: 0.757 | Train PPL:   2.132
	 Val. Loss: 0.814 |  Val. PPL:   2.256
	 Val. BLEU: 2.57 | ROUGE-L: 1.42 | chrF: 13.15

Improved training complete.
