In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math


In [2]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import spacy

In [3]:
# Load the spaCy English and German pipelines. Only their `.tokenizer`
# attribute is used by the tokenize_* helpers defined below.
spacy_en = spacy.load('en_core_web_sm')
spacy_de = spacy.load('de_core_news_sm')

def tokenize_en(text):
    """Split an English string into a list of token strings via spaCy's tokenizer."""
    pieces = []
    for tok in spacy_en.tokenizer(text):
        pieces.append(tok.text)
    return pieces

def tokenize_de(text):
    """Split a German string into a list of token strings via spaCy's tokenizer."""
    pieces = []
    for tok in spacy_de.tokenizer(text):
        pieces.append(tok.text)
    return pieces


In [4]:
# Read the training pairs with the Python parser, skipping rows it cannot parse.
train_data = pd.read_csv(
    'wmt14_translate_de-en_train.csv',
    engine='python',
    on_bad_lines='skip',
)

In [5]:

# Load the validation and test splits and, like the training split, drop rows
# with a missing sentence: pandas reads an empty cell as NaN (a float), which
# is not a string the tokenizers can process. Only the two columns used by the
# notebook ('en', 'de') are checked, and the index is rebuilt so positional
# access stays contiguous.
val_data = (
    pd.read_csv('wmt14_translate_de-en_validation.csv')
    .dropna(subset=['en', 'de'])
    .reset_index(drop=True)
)
test_data = (
    pd.read_csv('wmt14_translate_de-en_test.csv')
    .dropna(subset=['en', 'de'])
    .reset_index(drop=True)
)

In [6]:
# Discard every training row that has at least one missing value.
train_data = train_data.dropna(axis=0, how='any')

In [7]:
# Down-sample to at most one million sentence pairs (fixed seed for
# reproducibility). The size is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds the population without
# replacement.
TRAIN_SAMPLE_SIZE = 1000000
train_data = train_data.sample(
    n=min(TRAIN_SAMPLE_SIZE, len(train_data)),
    random_state=42,
).reset_index(drop=True)

In [8]:
from collections import Counter

# Special tokens. Every Vocab reserves indices 0-3 for them, in the order
# PAD, SOS, EOS, UNK (see Vocab._build_vocab).
SOS_TOKEN = "<sos>"
EOS_TOKEN = "<eos>"
PAD_TOKEN = "<pad>"
UNK_TOKEN = "<unk>"

class Vocab:
    """Bidirectional token <-> index mapping built from a stream of tokens.

    Attributes:
        stoi: dict mapping token string -> integer index.
        itos: dict mapping integer index -> token string.

    Args:
        tokens: iterable of token strings (the whole corpus, flattened).
        min_freq: minimum number of occurrences a token needs to be kept.
    """

    def __init__(self, tokens, min_freq=2):
        self.stoi = {}
        self.itos = {}
        self._build_vocab(tokens, min_freq)

    def __len__(self):
        """Return the number of entries (special tokens included)."""
        return len(self.itos)

    def _build_vocab(self, tokens, min_freq):
        """Fill `stoi`/`itos` with the special tokens followed by frequent tokens."""
        counter = Counter(tokens)
        specials = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
        reserved = set(specials)
        # Skip corpus tokens that collide with a special token; otherwise the
        # token would be listed twice, `stoi` would point at the later index
        # and `itos` would hold more entries than `stoi`.
        vocab = specials + [
            token
            for token, count in counter.items()
            if count >= min_freq and token not in reserved
        ]
        for idx, token in enumerate(vocab):
            self.stoi[token] = idx
            self.itos[idx] = token

# Build the English and German vocabularies from the training sentences.
def _collect_tokens(sentences, tokenizer):
    """Tokenize every sentence and return all tokens as one flat list."""
    collected = []
    for sentence in sentences:
        collected.extend(tokenizer(sentence))
    return collected

en_tokens = _collect_tokens(train_data['en'], tokenize_en)
de_tokens = _collect_tokens(train_data['de'], tokenize_de)

en_vocab = Vocab(en_tokens)
de_vocab = Vocab(de_tokens)

In [9]:

class TranslationDataset(Dataset):
    """Map-style dataset yielding (English, German) index tensors.

    Args:
        df: DataFrame with string columns 'en' and 'de'.
        en_vocab: vocabulary used to numericalize the English side.
        de_vocab: vocabulary used to numericalize the German side.
    """

    def __init__(self, df, en_vocab, de_vocab):
        self.df = df
        self.en_vocab = en_vocab
        self.de_vocab = de_vocab

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.df)

    @staticmethod
    def _numericalize(tokens, vocab):
        """Wrap `tokens` in SOS/EOS and map them to indices (UNK for unseen tokens)."""
        unk_idx = vocab.stoi[UNK_TOKEN]
        return [
            vocab.stoi.get(token, unk_idx)
            for token in [SOS_TOKEN, *tokens, EOS_TOKEN]
        ]

    def __getitem__(self, idx):
        """Return a pair of 1-D LongTensors (en_indices, de_indices) for row `idx`."""
        row = self.df.iloc[idx]  # one positional lookup instead of two

        # Use the vocabularies given to the constructor, not the module-level
        # globals that happen to share their names.
        en_indices = self._numericalize(tokenize_en(row['en']), self.en_vocab)
        de_indices = self._numericalize(tokenize_de(row['de']), self.de_vocab)

        return torch.tensor(en_indices, dtype=torch.long), \
               torch.tensor(de_indices, dtype=torch.long)

In [10]:
def collate_fn(batch):
    """Pad a list of (en, de) index tensors into two batch-first tensors.

    Args:
        batch: list of (en_tensor, de_tensor) pairs of 1-D LongTensors.

    Returns:
        (en_padded, de_padded) with shapes [batch, max_en_len] and
        [batch, max_de_len], padded with each vocabulary's PAD index.
    """
    en_batch, de_batch = zip(*batch)
    # batch_first=True is required: pad_sequence defaults to [seq_len, batch],
    # but the attention modules and the training/evaluation loops index the
    # tensors as [batch, seq_len].
    en_padded = torch.nn.utils.rnn.pad_sequence(
        en_batch, batch_first=True, padding_value=en_vocab.stoi[PAD_TOKEN]
    )
    de_padded = torch.nn.utils.rnn.pad_sequence(
        de_batch, batch_first=True, padding_value=de_vocab.stoi[PAD_TOKEN]
    )
    return en_padded, de_padded

# Wrap each split in a TranslationDataset that shares the two vocabularies.
train_dataset = TranslationDataset(df=train_data, en_vocab=en_vocab, de_vocab=de_vocab)
val_dataset = TranslationDataset(df=val_data, en_vocab=en_vocab, de_vocab=de_vocab)
test_dataset = TranslationDataset(df=test_data, en_vocab=en_vocab, de_vocab=de_vocab)

BATCH_SIZE = 16

# Create DataLoaders: only the training loader shuffles; all three pad
# variable-length sentences through collate_fn.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [11]:
class InputEmbeddings(nn.Module):
    """Token-embedding lookup whose output is multiplied by sqrt(d_model)."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Embed the index tensor `x` and scale the vectors by sqrt(d_model)."""
        vectors = self.embed(x)
        return vectors * math.sqrt(self.d_model)

In [12]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch-first input.

    Args:
        d_model: size of the last (feature) dimension; may be even or odd.
        max_seq_length: number of positions precomputed in the `pe` buffer.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        angles = position * div_term  # [max_seq_length, ceil(d_model / 2)]

        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # the cosine half is trimmed to fit; for even d_model this is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Buffer (not a parameter): follows the module across devices but is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))  # [1, max_seq_length, d_model]

    def forward(self, x):
        """Return `x` plus the encodings for its first x.size(1) positions.

        `x` is indexed as [batch, seq_len, d_model]; seq_len must not exceed
        the `max_seq_length` given at construction.
        """
        return x + self.pe[:, :x.size(1)]


In [13]:

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention over batch-first inputs.

    Args:
        d_model: feature size of queries, keys, values and of the output.
        num_heads: number of attention heads; must divide `d_model` evenly.

    Raises:
        ValueError: if `d_model` is not divisible by `num_heads`.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Fail early with a clear message; otherwise the mismatch only shows up
        # later as a shape error inside split_heads' view().
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.head_dim = d_model // num_heads
        self.query_linear = nn.Linear(d_model, d_model, bias=False)
        self.key_linear = nn.Linear(d_model, d_model, bias=False)
        self.value_linear = nn.Linear(d_model, d_model, bias=False)
        self.output_linear = nn.Linear(d_model, d_model)

    def split_heads(self, x):
        """Reshape [batch, seq_len, d_model] -> [batch, num_heads, seq_len, head_dim]."""
        batch_size, seq_len, _ = x.size()
        x = x.view(batch_size, seq_len, self.num_heads, self.head_dim)
        return x.transpose(1, 2)

    def compute_attention(self, query, key, value, mask=None):
        """Apply scaled dot-product attention per head.

        Args:
            query, key, value: tensors of shape [batch, num_heads, len, head_dim].
            mask: optional tensor where 0/False marks positions to hide. A 2-D
                mask [batch, key_len] is treated as a key-padding mask, a 3-D
                mask [batch, query_len (or 1), key_len] is broadcast over
                heads, and any other rank is used as given.

        Returns:
            Tensor of shape [batch, num_heads, query_len, head_dim].
        """
        # scores: [batch, num_heads, query_len, key_len]
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:
            if mask.dim() == 2:
                mask = mask.unsqueeze(1).unsqueeze(2)  # [batch, 1, 1, key_len]
            elif mask.dim() == 3:
                mask = mask.unsqueeze(1)  # [batch, 1, query_len, key_len]
            # A large negative score drives the softmax weight of hidden positions to ~0.
            scores = scores.masked_fill(mask == 0, -1e9)

        attention = F.softmax(scores, dim=-1)
        return torch.matmul(attention, value)

    def combine_heads(self, x):
        """Reshape [batch, num_heads, seq_len, head_dim] -> [batch, seq_len, d_model]."""
        batch_size, num_heads, seq_len, head_dim = x.size()
        # transpose() yields a non-contiguous tensor; view() needs contiguous memory.
        x = x.transpose(1, 2).contiguous()
        return x.view(batch_size, seq_len, self.d_model)

    def forward(self, query, key, value, mask=None):
        """Project the inputs, attend per head, merge heads and project the output.

        `query`, `key`, `value` are [batch, seq_len, d_model]; the result has
        the query's batch and sequence dimensions.
        """
        query = self.split_heads(self.query_linear(query))
        key   = self.split_heads(self.key_linear(key))
        value = self.split_heads(self.value_linear(value))
        attention = self.compute_attention(query, key, value, mask)
        output = self.combine_heads(attention)
        return self.output_linear(output)


In [14]:
class FeedForwardSubLayer(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Linear."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Expand to d_ff, apply ReLU, then project back to d_model."""
        hidden = self.linear1(x)
        activated = self.relu(hidden)
        return self.linear2(activated)
    

In [15]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each followed by
    dropout, a residual connection and layer normalization."""

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForwardSubLayer(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, src_mask):
        """Run `x` through the attention and feed-forward sub-layers."""
        # Sub-layer 1: self-attention over the source sequence.
        attended = self.multi_head_attention(query=x, key=x, value=x, mask=src_mask)
        x = self.norm1(x + self.dropout(attended))
        # Sub-layer 2: position-wise feed-forward.
        transformed = self.feed_forward(x)
        return self.norm2(x + self.dropout(transformed))

In [16]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention over the encoder
    output, and feed-forward, each followed by dropout, a residual connection
    and layer normalization."""

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.ff_sublayer = FeedForwardSubLayer(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.Dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, tgt_mask, cross_mask):
        """Run `x` through the three decoder sub-layers."""
        # Sub-layer 1: self-attention over the target sequence.
        attended_self = self.self_attn(query=x, key=x, value=x, mask=tgt_mask)
        x = self.norm1(x + self.Dropout(attended_self))
        # Sub-layer 2: queries from the decoder, keys/values from the encoder.
        attended_src = self.cross_attn(
            query=x, key=enc_output, value=enc_output, mask=cross_mask
        )
        x = self.norm2(x + self.Dropout(attended_src))
        # Sub-layer 3: position-wise feed-forward.
        transformed = self.ff_sublayer(x)
        return self.norm3(x + self.Dropout(transformed))

In [17]:
class TransformerEncoder(nn.Module):
    """Embedding + positional encoding followed by a stack of EncoderLayer blocks."""

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_seq_length):
        super().__init__()
        self.embedding = InputEmbeddings(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        stack = []
        for _ in range(num_layers):
            stack.append(EncoderLayer(d_model, num_heads, d_ff, dropout))
        self.layers = nn.ModuleList(stack)

    def forward(self, x, src_mask):
        """Encode the source index tensor `x` under `src_mask`."""
        hidden = self.positional_encoding(self.embedding(x))
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, src_mask)
        return hidden

In [18]:
class TransformerDecoder(nn.Module):
    """Embedding + positional encoding, a stack of DecoderLayer blocks and a
    final linear projection to vocabulary-sized scores."""

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_seq_length):
        super().__init__()
        self.embedding = InputEmbeddings(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        stack = []
        for _ in range(num_layers):
            stack.append(DecoderLayer(d_model, num_heads, d_ff, dropout))
        self.layers = nn.ModuleList(stack)
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x, enc_output, tgt_mask, cross_mask):
        """Decode target indices `x` against `enc_output`; returns unnormalized scores."""
        hidden = self.positional_encoding(self.embedding(x))
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, enc_output, tgt_mask, cross_mask)
        return self.linear(hidden)

In [19]:
class Transformer(nn.Module):
    """Encoder-decoder model that returns log-probabilities over the target vocabulary."""

    def __init__(self, vocab_size_en, vocab_size_de, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder = TransformerEncoder(
            vocab_size_en, d_model, num_layers, num_heads, d_ff, dropout, max_seq_length
        )
        self.decoder = TransformerDecoder(
            vocab_size_de, d_model, num_layers, num_heads, d_ff, dropout, max_seq_length
        )

    def forward(self, src, tgt, src_mask, tgt_mask, cross_mask):
        """Encode `src`, decode `tgt` against it, and log-softmax over the last dim."""
        memory = self.encoder(src, src_mask)
        scores = self.decoder(tgt, memory, tgt_mask, cross_mask)
        return scores.log_softmax(dim=-1)
      

In [20]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Instantiate the translation model and move it to the chosen device.
model = Transformer(
    vocab_size_en=len(en_vocab.stoi),
    vocab_size_de=len(de_vocab.stoi),
    d_model=512,
    num_heads=8,
    num_layers=6,
    d_ff=2048,
    max_seq_length=256,
    dropout=0.1,
)
model = model.to(device)


In [21]:
# Module-level containers for per-epoch training / validation losses.
# NOTE(review): train_and_evaluate() creates its own local lists with these
# same names, so these two module-level lists are not filled by it.
loss_history = []
val_loss_history = []

In [None]:
import os
# Ask PyTorch's CUDA allocator to use expandable segments (the setting PyTorch's
# own out-of-memory message recommends against fragmentation).
# NOTE(review): presumably this must be set before the first CUDA allocation to
# have any effect, and an earlier cell already moves the model to `device` —
# confirm, and consider moving this to the very first cell.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

def _teacher_forcing_loss(model, criterion, src, tgt, src_pad_idx, tgt_pad_idx):
    """Run one teacher-forced forward pass and return the scalar loss tensor.

    `src` and `tgt` are indexed batch-first: [batch, src_len] / [batch, tgt_len].
    """
    # Hide padded source positions from encoder self-attention and cross-attention.
    src_mask = (src != src_pad_idx).unsqueeze(1)  # [batch, 1, src_len]

    # Teacher forcing: feed tokens 0..T-2, predict tokens 1..T-1.
    decoder_input = tgt[:, :-1]
    target = tgt[:, 1:]

    # Decoder self-attention mask = padding mask AND causal (lower-triangular)
    # mask. Without the causal part position t could attend to the very token
    # it is asked to predict.
    steps = decoder_input.size(1)
    pad_mask = (decoder_input != tgt_pad_idx).unsqueeze(1)  # [batch, 1, T]
    causal = torch.ones(steps, steps, device=tgt.device).tril().bool().unsqueeze(0)  # [1, T, T]
    tgt_mask = pad_mask & causal  # [batch, T, T]

    output = model(src, decoder_input, src_mask, tgt_mask, src_mask)
    # reshape (not view) because the sliced target is not contiguous.
    return criterion(output.reshape(-1, output.shape[-1]), target.reshape(-1))


def train_and_evaluate(model, train_loader, val_loader, device, num_epochs=10):
    """
    Train a PyTorch model and display train and validation loss at each epoch.

    Assumes the model's forward signature is
      forward(src, tgt, src_mask, tgt_mask, cross_mask)
    and that the loaders yield batch-first (src, tgt) index tensors.

    Args:
        model: the sequence-to-sequence model to optimize.
        train_loader: iterable of (src, tgt) training batches.
        val_loader: iterable of (src, tgt) validation batches.
        device: device the model and batches are moved to.
        num_epochs: number of passes over `train_loader`.

    Returns:
        (loss_history, val_loss_history): two lists with the mean training and
        validation loss of every epoch.
    """
    # Move the model to the specified device
    model = model.to(device)

    src_pad_idx = en_vocab.stoi[PAD_TOKEN]
    tgt_pad_idx = de_vocab.stoi[PAD_TOKEN]

    # CrossEntropyLoss applies log_softmax internally; applying it to outputs
    # that are already log-probabilities leaves the loss value unchanged.
    criterion = nn.CrossEntropyLoss(ignore_index=tgt_pad_idx)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

    loss_history = []
    val_loss_history = []

    for epoch in range(num_epochs):
        # ---- training phase ----
        model.train()
        running_loss = 0.0
        train_batches = 0
        skipped = 0

        for src, tgt in train_loader:
            # Skip batches whose source and target batch sizes disagree.
            if src.size(0) != tgt.size(0):
                skipped += 1
                continue

            src, tgt = src.to(device), tgt.to(device)
            loss = _teacher_forcing_loss(model, criterion, src, tgt, src_pad_idx, tgt_pad_idx)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            train_batches += 1

        # ---- evaluation phase ----
        model.eval()
        val_running_loss = 0.0
        val_batches = 0

        with torch.no_grad():
            for src, tgt in val_loader:
                if src.size(0) != tgt.size(0):
                    skipped += 1
                    continue

                src, tgt = src.to(device), tgt.to(device)
                loss = _teacher_forcing_loss(model, criterion, src, tgt, src_pad_idx, tgt_pad_idx)
                val_running_loss += loss.item()
                val_batches += 1

        # Average over the batches actually processed (skipped ones add no
        # loss); max(..., 1) guards against an empty or fully skipped loader.
        train_loss = running_loss / max(train_batches, 1)
        val_loss = val_running_loss / max(val_batches, 1)
        loss_history.append(train_loss)
        val_loss_history.append(val_loss)

        print(f"Epoch [{epoch + 1}/{num_epochs}] - Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")
        if skipped:
            print(f"Warning: skipped {skipped} batch(es) whose src/tgt batch sizes differ")

    return loss_history, val_loss_history


In [23]:
# Launch training: ten epochs, with a validation pass after each one.
train_and_evaluate(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    num_epochs=10,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.08 GiB. GPU 0 has a total capacity of 12.00 GiB of which 0 bytes is free. Of the allocated memory 11.87 GiB is allocated by PyTorch, and 5.57 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction


def calculate_bleu(model, val_loader, device, max_len=50):
    """Greedy-decode every batch of `val_loader` and return the corpus BLEU-4 score.

    Args:
        model: model with forward(src, tgt, src_mask, tgt_mask, cross_mask)
            returning per-position scores over the target vocabulary.
        val_loader: iterable of batch-first (src, tgt) index tensors.
        device: device on which decoding runs.
        max_len: maximum number of tokens generated per sentence.

    Returns:
        Corpus-level BLEU (uniform 1- to 4-gram weights, NLTK smoothing method1).
        References are rebuilt from the numericalized targets, so tokens that
        the dataset mapped to the UNK index appear as the UNK token string.
    """
    model.eval()
    hypotheses = []
    references = []

    sos_token = de_vocab.stoi[SOS_TOKEN]
    eos_token = de_vocab.stoi[EOS_TOKEN]
    pad_token = de_vocab.stoi[PAD_TOKEN]
    src_pad_token = en_vocab.stoi[PAD_TOKEN]

    def ids_to_tokens(ids):
        """Cut `ids` at the first EOS, drop PAD ids, and map the rest to strings."""
        # Truncate before filtering so the EOS position still lines up with the list.
        if eos_token in ids:
            ids = ids[:ids.index(eos_token)]
        return [de_vocab.itos[idx] for idx in ids if idx != pad_token]

    # Inference only: without no_grad every decoding step would build an autograd graph.
    with torch.no_grad():
        for src, tgt in val_loader:
            # Skip batches whose source and target batch sizes disagree.
            if src.size(0) != tgt.size(0):
                continue

            src = src.to(device)
            batch_size = src.size(0)

            src_mask = (src != src_pad_token).unsqueeze(1)  # [batch, 1, src_len]
            decoder_input = torch.full((batch_size, 1), sos_token,
                                       dtype=torch.long, device=device)
            # Tracks which sentences have already produced EOS.
            finished = torch.zeros(batch_size, 1, dtype=torch.bool, device=device)

            for _ in range(max_len):
                steps = decoder_input.size(1)
                # Padding mask AND causal mask: each position sees only itself
                # and earlier positions.
                causal = torch.ones(steps, steps, device=device).tril().bool().unsqueeze(0)
                tgt_mask = (decoder_input != pad_token).unsqueeze(1) & causal  # [batch, T, T]

                output = model(src, decoder_input, src_mask, tgt_mask, src_mask)
                next_token = output[:, -1].argmax(dim=-1, keepdim=True)
                # Sentences that already ended are only padded from here on.
                next_token = next_token.masked_fill(finished, pad_token)
                decoder_input = torch.cat([decoder_input, next_token], dim=-1)

                finished = finished | (next_token == eos_token)
                if finished.all():
                    break

            generated = decoder_input[:, 1:].cpu().tolist()  # drop SOS
            targets = tgt[:, 1:].cpu().tolist()              # drop SOS
            for hyp_ids, ref_ids in zip(generated, targets):
                hypotheses.append(ids_to_tokens(hyp_ids))
                references.append([ids_to_tokens(ref_ids)])  # corpus_bleu expects a list of references

    bleu_score = corpus_bleu(references, hypotheses,
                            weights=(0.25, 0.25, 0.25, 0.25),
                            smoothing_function=SmoothingFunction().method1)
    return bleu_score

# Usage example after training: score the model on the test loader.
bleu_score = calculate_bleu(model=model, val_loader=test_loader, device=device)
print(f"BLEU-4 Score: {bleu_score:.4f}")

In [None]:
def translate_sentence(model, sentence, en_vocab, de_vocab, device, max_length=50):
    """
    Translate a single sentence using the trained model (greedy decoding).

    Args:
        model: Trained transformer model
        sentence: Input sentence string (source language)
        en_vocab: Source vocabulary (English)
        de_vocab: Target vocabulary (German)
        device: CUDA/CPU device
        max_length: Maximum generation length

    Returns:
        Translated sentence string (target language), tokens joined by spaces
    """
    # Tokenize with the same tokenizer used to build the vocabulary and the
    # training data (tokenize_en, case preserved). A lower-cased whitespace
    # split would produce tokens the vocabulary was never built from.
    tokens = [SOS_TOKEN] + tokenize_en(sentence) + [EOS_TOKEN]

    # Convert to indices using the source vocab; unseen tokens map to UNK.
    unk_idx = en_vocab.stoi[UNK_TOKEN]
    src_indices = [en_vocab.stoi.get(token, unk_idx) for token in tokens]

    # Shape [1, src_len]: a batch containing the single sentence.
    src = torch.tensor([src_indices], dtype=torch.long, device=device)
    src_mask = (src != en_vocab.stoi[PAD_TOKEN]).unsqueeze(1)  # [1, 1, src_len]

    sos_idx = de_vocab.stoi[SOS_TOKEN]
    eos_idx = de_vocab.stoi[EOS_TOKEN]
    pad_idx = de_vocab.stoi[PAD_TOKEN]

    generated = []  # target indices produced so far (SOS/EOS excluded)

    # Autoregressive generation
    model.eval()
    with torch.no_grad():
        decoder_input = torch.tensor([[sos_idx]], dtype=torch.long, device=device)

        for _ in range(max_length):
            steps = decoder_input.size(1)
            # Padding mask AND causal mask: each position sees only itself and
            # earlier positions.
            causal = torch.ones(steps, steps, device=device).tril().bool().unsqueeze(0)
            tgt_mask = (decoder_input != pad_idx).unsqueeze(1) & causal  # [1, T, T]

            output = model(src, decoder_input, src_mask, tgt_mask, src_mask)

            # Most likely token at the last decoded position.
            next_idx = output[0, -1].argmax().item()

            # Stop if EOS is generated
            if next_idx == eos_idx:
                break

            generated.append(next_idx)
            next_token = torch.tensor([[next_idx]], dtype=torch.long, device=device)
            decoder_input = torch.cat([decoder_input, next_token], dim=-1)

    return ' '.join(de_vocab.itos[idx] for idx in generated)

In [None]:
# Try the model on a short English sentence.
translate_sentence(
    model=model,
    sentence="I love you",
    en_vocab=en_vocab,
    de_vocab=de_vocab,
    device=device,
)