In [8]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import MarianTokenizer
import math

# Hyperparameters
SRC_LANG = 'en'
TGT_LANG = 'de'
BATCH_SIZE = 32
MAX_LEN = 128
SRC_VOCAB_SIZE = tokenizer.vocab_size  # assuming same tokenizer
TGT_VOCAB_SIZE = tokenizer.vocab_size

EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 32
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
DROPOUT = 0.1

NUM_EPOCHS = 10
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load Dataset
dataset = load_dataset("wmt16", "de-en")
train_data = dataset['train'].select(range(10000))  # Subset for speed
val_data = dataset['validation']

# 2. Tokenizer
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")

def tokenize(example):
    inputs = tokenizer(example['translation'][SRC_LANG],
                       padding='max_length', truncation=True,
                       max_length=MAX_LEN)
    targets = tokenizer(example['translation'][TGT_LANG],
                        padding='max_length', truncation=True,
                        max_length=MAX_LEN)
    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': targets['input_ids']
    }

# Tokenize dataset
train_data = train_data.map(tokenize, remove_columns=["translation"])
val_data = val_data.map(tokenize, remove_columns=["translation"])

# 3. Data Collator (Fix: convert lists to tensors)
def collate_fn(batch):
    input_ids = torch.tensor([item['input_ids'] for item in batch], dtype=torch.long)
    attention_mask = torch.tensor([item['attention_mask'] for item in batch], dtype=torch.long)
    labels = torch.tensor([item['labels'] for item in batch], dtype=torch.long)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# 4. DataLoader
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# 5. Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size, dropout=0.1, maxlen=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(maxlen, emb_size)
        position = torch.arange(0, maxlen, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_size, 2).float() * (-math.log(10000.0) / emb_size))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].to(x.device)
        return self.dropout(x)

# 6. Transformer Model
class Seq2SeqTransformer(nn.Module):
    def __init__(
        self, num_encoder_layers, num_decoder_layers, emb_size, 
        nhead, src_vocab_size, tgt_vocab_size, dim_feedforward=512, dropout=0.1
    ):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = nn.Transformer(
            d_model=emb_size, nhead=nhead, 
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers, 
            dim_feedforward=dim_feedforward, 
            dropout=dropout, batch_first=True
        )
        self.src_tok_emb = nn.Embedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, emb_size)
        self.pos_encoder = nn.Sequential(
            PositionalEncoding(emb_size, dropout)
        )
        self.pos_decoder = nn.Sequential(
            PositionalEncoding(emb_size, dropout)
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)

    def forward(self, src, tgt, src_mask, tgt_mask,
                src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        src_emb = self.pos_encoder(self.src_tok_emb(src))
        tgt_emb = self.pos_decoder(self.tgt_tok_emb(tgt))
        outs = self.transformer(
            src_emb, tgt_emb, src_mask, tgt_mask, 
            None, src_padding_mask, tgt_padding_mask, memory_key_padding_mask
        )
        return self.generator(outs)

# 7. Initialize model
VOCAB_SIZE = tokenizer.vocab_size
model = Seq2SeqTransformer(
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    emb_size=EMB_SIZE,
    nhead=NHEAD,
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    dim_feedforward=FFN_HID_DIM,
    dropout=DROPOUT
).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# 8. Mask generation
def generate_square_subsequent_mask(sz):
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

# 9. Padding mask generation function
def create_padding_mask(input_ids, pad_token_id):
    return (input_ids == pad_token_id)

# 10. Training loop
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0

    for batch in train_loader:
        src = batch['input_ids'].to(DEVICE)
        tgt = batch['labels'].to(DEVICE)

        tgt_input = tgt[:, :-1]
        tgt_out = tgt[:, 1:]

        tgt_mask = generate_square_subsequent_mask(tgt_input.size(1)).to(DEVICE)

        # Generate padding masks for both source and target sequences
        src_padding_mask = create_padding_mask(src, tokenizer.pad_token_id).to(DEVICE)
        tgt_padding_mask = create_padding_mask(tgt_input, tokenizer.pad_token_id).to(DEVICE)
        memory_key_padding_mask = create_padding_mask(src, tokenizer.pad_token_id).to(DEVICE)

        logits = model(src, tgt_input, src_mask=None, tgt_mask=tgt_mask,
                       src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask,
                       memory_key_padding_mask=memory_key_padding_mask)

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {avg_loss:.4f}")




Epoch 1/10, Loss: 6.3220
Epoch 2/10, Loss: 4.5985
Epoch 3/10, Loss: 3.9972
Epoch 4/10, Loss: 3.6712
Epoch 5/10, Loss: 3.4506
Epoch 6/10, Loss: 3.2839
Epoch 7/10, Loss: 3.1484
Epoch 8/10, Loss: 3.0289
Epoch 9/10, Loss: 2.9231
Epoch 10/10, Loss: 2.8235


In [2]:
pip install sacrebleu


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.1.1 sacrebleu-2.5.1
Note: you may need to restart the kernel to use updated packages.


In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sacrebleu

# Define translation function
def translate(model, tokenizer, src_sentence, device, max_len=128):
    model.eval()
    
    # Tokenize and prepare the source sentence
    src_input = tokenizer(src_sentence, return_tensors='pt', padding=True, truncation=True, max_length=max_len)
    src_input_ids = src_input['input_ids'].to(device)
    src_attention_mask = src_input['attention_mask'].to(device)
    
    # Generate translation using model's built-in method
    generated_ids = model.generate(input_ids=src_input_ids, attention_mask=src_attention_mask, max_length=max_len)
    
    # Decode the output tokens into a sentence
    output_sentence = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return output_sentence

# Define BLEU score computation function
def compute_bleu(model, tokenizer, dataset, num_samples=100):
    references = []
    hypotheses = []
    
    # Loop through dataset to compute BLEU score
    for i in range(num_samples):
        src = dataset[i]['translation']['en']
        ref = dataset[i]['translation']['de']
        
        pred = translate(model, tokenizer, src, device)

        references.append([ref])  # sacrebleu expects a list of references
        hypotheses.append(pred)

    # Compute BLEU score
    bleu = sacrebleu.corpus_bleu(hypotheses, references)
    print(f"\nFinal BLEU score: {bleu.score:.2f}")
    return bleu.score

# Example usage
if __name__ == "__main__":
    # Define device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Load the tokenizer and model
    model_name = 't5-small'  # Example: using T5 model; replace with your model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Assuming CUDA (GPU) is available
    
    # Example dataset (replace with your actual dataset)
    dataset = [
        {'translation': {'en': 'Hello, how are you?', 'de': 'Hallo, wie geht es dir?'}},
        {'translation': {'en': 'What is your name?', 'de': 'Wie heißt du?'}}
        # Add more samples as needed
    ]
    
    # Compute BLEU score on test set
    bleu_score = compute_bleu(model, tokenizer, dataset, num_samples=2)



Final BLEU score: 8.17


In [18]:
import torch

def translate_english_to_french(model, tokenizer, sentence: str, device='cuda' if torch.cuda.is_available() else 'cpu', max_length=50):
    model.eval()  # Set model to evaluation mode
    
    # Prepare input text with task prefix if needed (like T5 usually requires)
    input_text = "translate English to French: " + sentence

    # Tokenize input
    inputs = tokenizer.encode(input_text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Generate output ids
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )

    # Decode and return
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text


In [20]:
# Sample sentence
english_sentence = "My name is sarvagya"

# Translate
french_translation = translate_english_to_french(model, tokenizer, english_sentence)
print(f"French Translation: {french_translation}")


French Translation: Mon nom est sarvagya
