In [1]:
# Quick sanity check: evaluates to True when PyTorch can see a CUDA device.
import torch
torch.cuda.is_available()

True

In [2]:
import json
import os
import torch.nn as nn
import torch
import torch.nn.functional as F
import math,copy,re
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from torch.utils.data import Dataset, DataLoader
import swifter
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from sacrebleu.metrics import BLEU
from torch.cuda.amp import autocast, GradScaler
from tqdm.notebook import tqdm
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation notices from torch/pandas; consider narrowing the filter.
warnings.simplefilter("ignore")
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [3]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    The table is computed once for ``max_seq_len`` positions and registered as
    a buffer named ``pe`` with shape ``(1, max_seq_len, d_model)``. Being a
    buffer, it is saved in ``state_dict()`` and follows the module across
    ``.to(device)`` calls, but it is not a trainable parameter.

    Even feature index ``i`` holds ``sin(pos / 10000 ** (i / d_model))`` and
    the following odd index holds the cosine of the same angle.

    Args:
        d_model: size of the feature (last) dimension of the embeddings.
        max_seq_len: number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_len):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model

        # Angles are computed in float64 (like plain Python floats) and only
        # rounded to the table's dtype when they are stored.
        positions = torch.arange(max_seq_len, dtype=torch.float64).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float64)
        angles = positions / torch.pow(10000.0, even_dims / d_model)

        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles).to(pe.dtype)
        # For an odd d_model the last even index has no cosine partner.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2]).to(pe.dtype)
        self.register_buffer('pe', pe.unsqueeze(0))  # buffer: not trained

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.
        """
        seq_len = x.size(1)
        # Slice the buffer directly: it already lives on the module's device,
        # and re-wrapping it in the legacy ``torch.Tensor(...)`` constructor is
        # redundant (and is not meant to be used with non-CPU tensors).
        return x + self.pe[:, :seq_len]

In [4]:
#d_model = 512, num_heads = 8
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Each of the four projections is a single ``(d_model, d_model)`` linear
    layer; the projected features are then cut into ``num_heads`` slices of
    width ``d_k = d_model // num_heads``, one slice per head.

    Args:
        d_model: feature size of the inputs and of the output.
        num_heads: number of heads; must divide ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)  # query projection
        self.W_k = nn.Linear(d_model, d_model)  # key projection
        self.W_v = nn.Linear(d_model, d_model)  # value projection
        self.W_o = nn.Linear(d_model, d_model)  # output projection

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend over ``V`` with weights ``softmax(Q K^T / sqrt(d_k))``.

        Q, K and V are expected per head, i.e. shaped
        ``[batch_size, num_heads, seq_length, d_k]``. Where ``mask == 0`` the
        score is replaced by ``-1e4`` before the softmax so that position
        receives (almost) no weight.
        """
        scale = math.sqrt(self.d_k)
        scores = (Q @ K.transpose(-2, -1)) / scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
        weights = scores.softmax(dim=-1)
        return weights @ V

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, num_heads, seq, d_k)``."""
        batch_size, seq_length, _ = x.size()
        per_head = x.view(batch_size, seq_length, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: back to ``(batch, seq, d_model)``."""
        batch_size, _, seq_length, _ = x.size()
        merged = x.transpose(1, 2).contiguous()
        return merged.view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, merge the heads and project out."""
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(queries, keys, values, mask)
        return self.W_o(self.combine_heads(context))

In [5]:
#two linear transformations and a ReLU activation
#d_ff = 2048
class PositionWiseFeedForward(nn.Module):
    """Two linear layers with a ReLU in between: d_model -> d_ff -> d_model.

    Args:
        d_model: input and output feature size.
        d_ff: width of the hidden layer.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)  # expand to the hidden width
        self.fc2 = nn.Linear(d_ff, d_model)  # project back to the model width
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply ``fc2(relu(fc1(x)))`` along the last dimension of ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [6]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual connection and layer normalisation (post-norm).

    Args:
        d_model: feature size of the layer's input and output.
        num_heads: number of attention heads.
        d_ff: hidden width of the feed-forward sub-layer.
        dropout: dropout probability applied to each sub-layer's output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run ``x`` through both sub-layers; ``mask`` is passed to self-attention."""
        # Sub-layer 1: self-attention, with queries, keys and values all taken from x.
        attended = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward.
        transformed = self.feed_forward(x)
        return self.norm2(x + self.dropout(transformed))

In [7]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, cross-attention over the
    encoder output, then feed-forward. Each sub-layer is followed by dropout,
    a residual connection and layer normalisation (post-norm).

    Args:
        d_model: feature size of the layer's input and output.
        num_heads: number of attention heads in both attention sub-layers.
        d_ff: hidden width of the feed-forward sub-layer.
        dropout: dropout probability applied to each sub-layer's output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Decode ``x`` against ``enc_output``.

        ``tgt_mask`` restricts self-attention over ``x``; ``src_mask``
        restricts cross-attention over ``enc_output``.
        """
        # Sub-layer 1: self-attention over the target sequence.
        self_attended = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attended))

        # Sub-layer 2: queries come from the decoder, keys/values from the encoder.
        cross_attended = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_attended))

        # Sub-layer 3: position-wise feed-forward.
        transformed = self.feed_forward(x)
        return self.norm3(x + self.dropout(transformed))

In [8]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    Token id 0 is treated as padding when building the attention masks.

    Args:
        src_vocab_size: number of rows in the source embedding table.
        tgt_vocab_size: number of rows in the target embedding table, and the
            size of the output layer.
        d_model: embedding / hidden feature size.
        num_heads: attention heads per layer.
        num_layers: number of encoder layers and of decoder layers.
        d_ff: hidden width of each feed-forward sub-layer.
        max_seq_length: number of positions the positional encoding covers.
        dropout: dropout probability used throughout.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)  # source token embedding
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)  # target token embedding
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)  # projects decoder states to vocabulary scores
        self.dropout = nn.Dropout(dropout)  # applied after the positional encoding

    def generate_mask(self, src, tgt):
        """Build boolean attention masks from batches of token ids.

        Args:
            src: ``(batch, src_len)`` source token ids.
            tgt: ``(batch, tgt_len)`` target token ids.

        Returns:
            ``(src_mask, tgt_mask)`` where ``src_mask`` has shape
            ``(batch, 1, 1, src_len)`` and is False at padded source
            positions, and ``tgt_mask`` has shape
            ``(batch, 1, tgt_len, tgt_len)`` and is True only where the query
            position is not padding and the key position is not in the future.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_pad_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        # Lower-triangular "no peeking ahead" mask, created on the same device
        # as the inputs instead of relying on a module-level ``device`` global.
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length, device=tgt.device), diagonal=1)).bool()
        return src_mask, tgt_pad_mask & nopeak_mask

    def forward(self, src, tgt):
        """Return unnormalised vocabulary scores (logits) for every target position.

        The result has shape ``(batch, tgt_len, tgt_vocab_size)``. No softmax
        is applied here: ``nn.CrossEntropyLoss`` applies log-softmax itself, so
        feeding it probabilities would normalise twice and flatten the
        gradients. ``argmax`` over the last dimension is unaffected; call
        ``torch.softmax(output, dim=-1)`` when probabilities are needed.
        """
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc(dec_output)

In [9]:
def create_tokenizer(file_path, lang, vocab_size=20000):
    """Train a WordPiece tokenizer on one column of a CSV file.

    Args:
        file_path: path of the CSV file to read.
        lang: name of the column holding the sentences to train on.
        vocab_size: vocabulary size requested from the trainer.

    Returns:
        The trained ``Tokenizer``. The special tokens are registered in the
        order ``[PAD]``, ``[UNK]``, ``[BOS]``, ``[EOS]``.
    """
    def iter_sentence_batches(rows_per_chunk=1000):
        # Stream the CSV in chunks so the whole file is never held in memory.
        reader = pd.read_csv(file_path, chunksize=rows_per_chunk, on_bad_lines="skip", encoding="utf-8", lineterminator='\n')
        for frame in reader:
            # Rows with a missing value in any column are dropped.
            yield frame.dropna()[lang].tolist()

    wordpiece_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    wordpiece_tokenizer.pre_tokenizer = Whitespace()

    wordpiece_trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]"],
    )

    wordpiece_tokenizer.train_from_iterator(iter_sentence_batches(), trainer=wordpiece_trainer)
    return wordpiece_tokenizer

In [10]:
def _load_or_train_tokenizer(tokenizer_path, corpus_path, lang, role):
    """Load the tokenizer saved at `tokenizer_path`, or train one on column `lang` and save it there."""
    if os.path.exists(tokenizer_path):
        print(f"Loading existing {role} tokenizer from {tokenizer_path}")
        return Tokenizer.from_file(tokenizer_path)

    print(f"Creating new {role} tokenizer and saving to {tokenizer_path}")
    tokenizer = create_tokenizer(corpus_path, lang)
    tokenizer.save(tokenizer_path)
    return tokenizer


def get_or_create_tokenizers(filepath, base_path="tokenizers"):
    """Return the (source, target) tokenizers, training and caching any that are missing.

    The source tokenizer is trained on the ``"de"`` column and cached as
    ``src_tokenizer.json``; the target tokenizer is trained on the ``"en"``
    column and cached as ``tgt_tokenizer.json``. A cached file is loaded as-is
    and is never checked against ``filepath``, so delete it to force retraining.

    Args:
        filepath: CSV file used for training when a tokenizer is not cached.
        base_path: directory holding the cached tokenizer files; created if
            it does not exist.

    Returns:
        Tuple ``(src_tokenizer, tgt_tokenizer)``.
    """
    os.makedirs(base_path, exist_ok=True)

    src_tokenizer = _load_or_train_tokenizer(
        os.path.join(base_path, "src_tokenizer.json"), filepath, "de", "source"
    )
    tgt_tokenizer = _load_or_train_tokenizer(
        os.path.join(base_path, "tgt_tokenizer.json"), filepath, "en", "target"
    )

    return src_tokenizer, tgt_tokenizer

In [11]:
# Load the cached tokenizers from ./tokenizers, or train them on the training CSV if missing.
src_tokenizer, tgt_tokenizer = get_or_create_tokenizers("wmt14_translate_de-en_train.csv")

Loading existing source tokenizer from tokenizers\src_tokenizer.json
Loading existing target tokenizer from tokenizers\tgt_tokenizer.json


In [12]:
# Read the train/test CSVs in full, skipping malformed lines and dropping
# any row that has a missing value.
train_data = pd.read_csv("wmt14_translate_de-en_train.csv", on_bad_lines="skip", encoding="utf-8", lineterminator='\n').dropna()
test_data = pd.read_csv("wmt14_translate_de-en_test.csv", on_bad_lines="skip", encoding="utf-8", lineterminator='\n').dropna()

In [13]:
def pad_sequence(tokens, tokenizer, max_length=100):
    """Return `tokens` as a list of exactly `max_length` ids.

    Shorter lists are right-padded with the ``[PAD]`` id. Lists of
    ``max_length`` or more ids are cut to ``max_length - 1`` ids and the
    ``[EOS]`` id is appended, so the result always ends the sentence.
    """
    pad_id = tokenizer.token_to_id("[PAD]")
    shortfall = max_length - len(tokens)
    if shortfall > 0:
        return tokens + [pad_id] * shortfall
    return tokens[:max_length - 1] + [tokenizer.token_to_id("[EOS]")]

def tokenize_line(line, tokenizer):
    """Encode `line` into a tensor of token ids of fixed length.

    The encoded ids are wrapped in ``[BOS]`` ... ``[EOS]`` and then passed
    through ``pad_sequence`` with its default ``max_length``.
    """
    body_ids = tokenizer.encode(line).ids

    # Frame the sentence with the begin/end markers before padding.
    framed_ids = [tokenizer.token_to_id("[BOS]"), *body_ids, tokenizer.token_to_id("[EOS]")]

    return torch.tensor(pad_sequence(framed_ids, tokenizer))

In [14]:
# Replace each text column with its tokenized tensors: German ('de') uses the
# source tokenizer, English ('en') the target tokenizer. Each result is written
# back to the column it came from; assigning the English result to 'de' would
# overwrite the German test sentences and leave 'en' as raw text.
# NOTE: this mutates the frames in place, so the cell cannot be re-run without
# reloading the CSVs first.
test_data['de'] = test_data['de'].swifter.apply(tokenize_line, tokenizer=src_tokenizer)
test_data['en'] = test_data['en'].swifter.apply(tokenize_line, tokenizer=tgt_tokenizer)
train_data['de'] = train_data['de'].swifter.apply(tokenize_line, tokenizer=src_tokenizer)
train_data['en'] = train_data['en'].swifter.apply(tokenize_line, tokenizer=tgt_tokenizer)

Pandas Apply:   0%|          | 0/3003 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/3003 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/4508785 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Spot-check: show the 'de' entry of the second training row and the row count.
print(train_data.iloc[1]['de'])
print(len(train_data))

In [None]:
class TranslationDataset(Dataset):
    """Dataset view over a frame with a 'de' (source) and an 'en' (target) column.

    Args:
        data: object supporting ``len()`` and positional ``.iloc[idx]`` row
            access (a pandas DataFrame in this notebook).
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair stored at row position `idx`."""
        row = self.data.iloc[idx]
        source, target = row['de'], row['en']
        return source, target

In [None]:
def decode_tokens_to_sentences(token_ids, tokenizer):
    """Convert sequences of token ids back to sentences.

    Every sequence is cut at its first ``[EOS]`` (whatever follows the end
    marker is not part of the sentence), then ``[PAD]`` and ``[BOS]`` ids are
    removed before decoding.

    The special-token ids are looked up in `tokenizer` rather than hard-coded.
    With the special-token order used in this notebook (``[PAD]``, ``[UNK]``,
    ``[BOS]``, ``[EOS]``) id 2 is ``[BOS]`` and ``[EOS]`` is 3, so filtering
    on a literal 2 would never remove the end marker.

    Args:
        token_ids: iterable of 1-D id sequences supporting ``.tolist()``
            (e.g. the rows of a 2-D tensor).
        tokenizer: object providing ``token_to_id(token)`` and ``decode(ids)``.

    Returns:
        List with one decoded string per input sequence.
    """
    def special_id(token, default):
        # Fall back to the notebook's conventional id if the token is unknown.
        token_id = tokenizer.token_to_id(token)
        return default if token_id is None else token_id

    pad_id = special_id("[PAD]", 0)
    bos_id = special_id("[BOS]", 2)
    eos_id = special_id("[EOS]", 3)

    sentences = []
    for seq in token_ids:
        ids = seq.tolist()
        if eos_id in ids:
            ids = ids[:ids.index(eos_id)]  # keep only what precedes the first EOS
        ids = [token_id for token_id in ids if token_id != pad_id and token_id != bos_id]
        sentences.append(tokenizer.decode(ids))
    return sentences

def calculate_bleu(predictions, targets, tokenizer):
    """Corpus-level BLEU of a batch of predicted id sequences against its targets.

    Both batches are decoded to sentences with `tokenizer`; the targets are
    used as the single reference set. Returns the score as a float.
    """
    hypotheses = decode_tokens_to_sentences(predictions, tokenizer)
    references = decode_tokens_to_sentences(targets, tokenizer)

    # corpus_score takes a list of reference sets, hence the extra list.
    result = BLEU().corpus_score(hypotheses, [references])
    return result.score

In [None]:
def train_transformer(model, train_loader, criterion, optimizer, num_epochs, device,
                      patience=5, checkpoint_dir='checkpoints', warmup_steps=4000, max_lr=0.01):
    """Train `model` with teacher forcing, per-step LR scheduling, checkpoints and early stopping.

    Args:
        model: module called as ``model(src, decoder_input)`` returning scores
            of shape ``(batch, tgt_len, vocab)``.
        train_loader: sized iterable yielding ``(src, tgt)`` batches of token ids.
        criterion: loss applied to the flattened scores and targets. Its
            ``ignore_index`` attribute (0 if absent) marks padding in the
            accuracy statistics.
        optimizer: optimizer over the model parameters.
        num_epochs: maximum number of epochs.
        device: device the batches are moved to. Mixed precision is enabled
            only when this is a CUDA device.
        patience: number of consecutive epochs without a new best average
            loss after which training stops.
        checkpoint_dir: directory for ``checkpoint_epoch_{epoch}.pt`` and
            ``best_model.pt``; created if missing.
        warmup_steps: length, in optimizer steps, of the first cosine cycle.
        max_lr: accepted for backward compatibility; currently unused.

    Returns:
        ``(model, training_history)`` where the history holds one dict per
        finished epoch with keys ``epoch``, ``avg_loss``, ``bleu``,
        ``accuracy`` and ``learning_rate``.

    Raises:
        ValueError: if `train_loader` yields no batches.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader is empty")

    model.train()
    best_loss = float('inf')
    patience_counter = 0
    training_history = []

    # Loss scaling / autocast only make sense on CUDA; on CPU both are disabled.
    use_amp = torch.device(device).type == 'cuda'
    scaler = GradScaler(enabled=use_amp)

    # Targets equal to this index are padding and are excluded from accuracy.
    pad_index = getattr(criterion, 'ignore_index', 0)

    # Stepped once per batch.
    # NOTE(review): the cosine cycles decay towards eta_min=1e-4, so an
    # optimizer whose base lr is already 1e-4 keeps a constant rate, and
    # `max_lr` is not consulted - confirm this is the intended schedule.
    scheduler = CosineAnnealingWarmRestarts(
        optimizer,
        T_0=warmup_steps,  # first restart cycle length
        T_mult=2,  # each following cycle is twice as long
        eta_min=1e-4  # minimum learning rate
    )
    current_lr = optimizer.param_groups[0]['lr']

    os.makedirs(checkpoint_dir, exist_ok=True)

    def save_checkpoint(model, optimizer, scheduler, scaler, epoch, loss, checkpoint_dir, is_best=False):
        """Write the full training state for `epoch`; also copy it to best_model.pt when `is_best`."""
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'scaler_state_dict': scaler.state_dict(),
            'loss': loss,
        }
        path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pt')
        torch.save(checkpoint, path)

        if is_best:
            best_path = os.path.join(checkpoint_dir, 'best_model.pt')
            torch.save(checkpoint, best_path)
            print(f"Saved best model with loss: {loss:.4f}")

    for epoch in range(num_epochs):
        epoch_stats = {
            'total_loss': 0,
            'num_correct_predictions': 0,
            'total_tokens': 0,
            'batch_losses': [],
            'bleu_scores': []
        }

        for batch_idx, (src, tgt) in enumerate(train_loader):
            src = src.to(device)
            tgt = tgt.to(device)

            # Teacher forcing: the decoder sees tokens [0, n-1) and must
            # predict tokens [1, n).
            decoder_input = tgt[:, :-1]
            target = tgt[:, 1:]

            optimizer.zero_grad()

            with autocast(enabled=use_amp):
                output = model(src, decoder_input)
                output_flat = output.contiguous().view(-1, output.size(-1))
                target_flat = target.contiguous().view(-1)
                loss = criterion(output_flat, target_flat)

            # Unscale before clipping so the norm is measured on true gradients.
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()

            scheduler.step()
            current_lr = scheduler.get_last_lr()[0]

            with torch.no_grad():
                predictions = output.argmax(dim=-1)

                # Token accuracy over non-padding target positions only.
                non_pad = target != pad_index
                num_correct = ((predictions == target) & non_pad).sum().item()
                num_tokens = non_pad.sum().item()

                # Decoded on the CPU for every batch, which is comparatively
                # slow; uses the notebook-level `tgt_tokenizer`.
                bleu_score = calculate_bleu(predictions.cpu(), target.cpu(), tgt_tokenizer)

                batch_loss = loss.item()
                epoch_stats['total_loss'] += batch_loss
                epoch_stats['batch_losses'].append(batch_loss)
                epoch_stats['num_correct_predictions'] += num_correct
                epoch_stats['total_tokens'] += num_tokens
                epoch_stats['bleu_scores'].append(bleu_score)

            if batch_idx % 100 == 0:
                avg_loss = epoch_stats['total_loss'] / (batch_idx + 1)

                print(f'Epoch: {epoch+1}/{num_epochs} | Batch: {batch_idx}/{num_batches} | '
                      f'Loss: {batch_loss:.4f} | Avg Loss: {avg_loss:.4f} | '
                      f'LR: {current_lr:.6f}')

        avg_epoch_loss = epoch_stats['total_loss'] / num_batches
        avg_epoch_bleu = np.mean(epoch_stats['bleu_scores'])
        epoch_accuracy = epoch_stats['num_correct_predictions'] / max(epoch_stats['total_tokens'], 1)

        training_history.append({
            'epoch': epoch + 1,
            'avg_loss': avg_epoch_loss,
            'bleu': avg_epoch_bleu,
            'accuracy': epoch_accuracy,
            'learning_rate': current_lr
        })

        print(f'\nEpoch {epoch+1} Statistics:')
        print(f'Average Loss: {avg_epoch_loss:.4f}')
        print(f'BLEU Score: {avg_epoch_bleu:.2f}')
        print(f'Learning Rate: {current_lr:.6f}')

        # One checkpoint per epoch (flagged when it is the best so far), written
        # before the early-stopping check so the final epoch is saved as well.
        is_best = avg_epoch_loss < best_loss
        save_checkpoint(model, optimizer, scheduler, scaler, epoch,
                        avg_epoch_loss, checkpoint_dir, is_best=is_best)

        if is_best:
            best_loss = avg_epoch_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'\nEarly stopping triggered after {epoch+1} epochs')
                break

    return model, training_history

In [None]:
# Model hyperparameters. The vocabulary sizes match the vocab_size requested
# when the tokenizers are trained, and max_seq_length matches the padded
# sentence length produced by pad_sequence.
src_vocab_size = 20000
tgt_vocab_size = 20000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)

# ignore_index=0 keeps [PAD] targets out of the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

num_epochs = 10

train_dataset = TranslationDataset(train_data)
# Shuffle so every epoch sees the sentence pairs in a new order rather than in
# the order they appear in the CSV.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, pin_memory=True, num_workers=0)

# Keep the returned history instead of letting the (model, history) tuple be
# echoed as the cell output and lost.
transformer, training_history = train_transformer(transformer, train_dataloader, criterion, optimizer, num_epochs, device, patience=2, checkpoint_dir='checkpoints')