In [1]:
from transformers import pipeline

# Load the pre-trained MarianMT model for EN-DE translation, as suggested by the project plan.
# The "pipeline" function from Hugging Face handles the model and tokenizer loading for us.
print("Loading the pre-trained MarianMT model...")
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de", device=0) # device=0 uses the first GPU

# Let's do a quick test to see the model in action
test_sentence = "A man is sitting on a bench."
translated_output = translator(test_sentence)

print("Model loaded successfully!")
print(f"\n--- Test Translation ---")
print(f"Original English: {test_sentence}")
print(f"Model's German Translation: {translated_output[0]['translation_text']}")
print("------------------------")

Loading the pre-trained MarianMT model...


source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


Model loaded successfully!

--- Test Translation ---
Original English: A man is sitting on a bench.
Model's German Translation: Ein Mann sitzt auf einer Bank.
------------------------


In [2]:
from datasets import load_dataset

print("Loading the Multi30k test data...")
# Load the dataset
dataset = load_dataset("bentrevett/multi30k")

# Extract the 'test' split
test_data = dataset['test']

# Create a list of English source sentences
source_sentences = [example['en'] for example in test_data]

# Create a list of German reference translations
reference_translations = [example['de'] for example in test_data]

print(f"Loaded {len(source_sentences)} sentences from the test set.")

# Let's inspect the first two examples to make sure they're correct
print("\n--- Sample Data ---")
print(f"Example 1 Source (EN): {source_sentences[0]}")
print(f"Example 1 Reference (DE): {reference_translations[0]}")
print(f"\nExample 2 Source (EN): {source_sentences[1]}")
print(f"Example 2 Reference (DE): {reference_translations[1]}")
print("-------------------")

Loading the Multi30k test data...
Loaded 1000 sentences from the test set.

--- Sample Data ---
Example 1 Source (EN): A man in an orange hat starring at something.
Example 1 Reference (DE): Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.

Example 2 Source (EN): A Boston Terrier is running on lush green grass in front of a white fence.
Example 2 Reference (DE): Ein Boston Terrier läuft über saftig-grünes Gras vor einem weißen Zaun.
-------------------


In [3]:
from tqdm import tqdm

print("Generating translations for the 1,000 test sentences...")
print("This may take a minute...")

# The 'translator' pipeline can process a list of sentences directly.
# It will automatically use the GPU and batch the data for efficiency.
# We'll specify a batch size for good performance.
model_outputs = translator(source_sentences, batch_size=32)

# The output is a list of dictionaries. We need to extract the actual translated text.
model_translations = [output['translation_text'] for output in model_outputs]

print("\nTranslation generation complete.")
print(f"Successfully generated {len(model_translations)} translations.")

# Let's look at the first 3 examples to see how the model did
print("\n--- Sample Translations Comparison ---")
for i in range(3):
    print(f"Example {i+1}:")
    print(f"  Source (EN):      {source_sentences[i]}")
    print(f"  Reference (DE):   {reference_translations[i]}")
    print(f"  Model Output (DE): {model_translations[i]}")
    print("-" * 20)

Generating translations for the 1,000 test sentences...
This may take a minute...

Translation generation complete.
Successfully generated 1000 translations.

--- Sample Translations Comparison ---
Example 1:
  Source (EN):      A man in an orange hat starring at something.
  Reference (DE):   Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.
  Model Output (DE): Ein Mann in einem orangenen Hut, der mit etwas zu tun hat.
--------------------
Example 2:
  Source (EN):      A Boston Terrier is running on lush green grass in front of a white fence.
  Reference (DE):   Ein Boston Terrier läuft über saftig-grünes Gras vor einem weißen Zaun.
  Model Output (DE): Ein Boston Terrier läuft auf üppig grünem Gras vor einem weißen Zaun.
--------------------
Example 3:
  Source (EN):      A girl in karate uniform breaking a stick with a front kick.
  Reference (DE):   Ein Mädchen in einem Karateanzug bricht ein Brett mit einem Tritt.
  Model Output (DE): Ein Mädchen in Karate-Uniform brich

In [4]:
import sacrebleu

# The list of translations generated by the pre-trained model
hypotheses = model_translations

# The list of correct translations from the dataset.
# sacrebleu expects references to be in a list of lists, as there can be
# multiple correct translations for a single source. We only have one.
references = [reference_translations]

# Calculate the BLEU score
bleu = sacrebleu.corpus_bleu(hypotheses, references)

print("--- BLEU Score Sanity Check ---")
print(f"The BLEU score is: {bleu.score:.2f}")
print("---------------------------------")

--- BLEU Score Sanity Check ---
The BLEU score is: 36.25
---------------------------------


EVALUATE OUR TRAINED MODEL NOW

In [5]:
import torch
import torch.nn as nn
import math
from tokenizers import Tokenizer

# --- 1. Load Tokenizers ---
print("Loading tokenizers...")
tokenizer_en = Tokenizer.from_file("tokenizer_en.json")
tokenizer_de = Tokenizer.from_file("tokenizer_de.json")
print("Tokenizers loaded successfully.")

# --- 2. Re-define the Model Architecture ---
# We need to define the model's structure again so we can load the weights into it.
# This code is the same as from your training script.

class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

class Seq2SeqTransformer(nn.Module):
    def __init__(self, num_encoder_layers: int, num_decoder_layers: int, emb_size: int, nhead: int,
                 src_vocab_size: int, tgt_vocab_size: int, dim_feedforward: int = 512, dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = nn.Transformer(d_model=emb_size, nhead=nhead, num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward,
                                          dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src, trg, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src, src_mask):
        return self.transformer.encoder(self.positional_encoding(self.src_tok_emb(src)), src_mask)

    def decode(self, tgt, memory, tgt_mask):
        return self.transformer.decoder(self.positional_encoding(self.tgt_tok_emb(tgt)), memory, tgt_mask)

print("Model architecture defined successfully.")


# --- 3. Define Hyperparameters ---
# These must be identical to the parameters used during training.
# [cite_start]The model size was kept small as suggested by the project plan for the baseline model[cite: 4].
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SRC_VOCAB_SIZE = 10000
TGT_VOCAB_SIZE = 10000
EMB_SIZE = 256
NHEAD = 8
FFN_HID_DIM = 512
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
print("Hyperparameters defined.")

Loading tokenizers...
Tokenizers loaded successfully.
Model architecture defined successfully.
Hyperparameters defined.


In [6]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Instantiate the model with the defined hyperparameters
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

# Load the saved model weights
print("Loading saved model weights from 'best_model.pth'...")
transformer.load_state_dict(torch.load('best_model.pth'))

# Move the model to the GPU
transformer = transformer.to(DEVICE)

# Set the model to evaluation mode
# This is important as it disables layers like dropout during inference
transformer.eval()

print("Model is loaded and ready for inference.")

Using device: cuda
Loading saved model weights from 'best_model.pth'...




Model is loaded and ready for inference.


In [7]:
# --- Helper function for mask creation ---
def generate_square_subsequent_mask(sz, device):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

# --- The main translation function ---
def translate_sentence(model, src_sentence: str, device, max_len=50):
    model.eval()

    # Define special symbols and their indices
    UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3

    # Tokenize the source sentence and add SOS/EOS tokens
    src_tokens = [SOS_IDX] + tokenizer_en.encode(src_sentence).ids + [EOS_IDX]
    src_tensor = torch.LongTensor(src_tokens).unsqueeze(1).to(device)
    src_len = src_tensor.shape[0]

    # Create the source mask
    src_mask = torch.zeros((src_len, src_len), device=device).type(torch.bool)
    
    # Get the encoded sentence (memory) from the encoder
    with torch.no_grad():
        memory = model.encode(src_tensor, src_mask)
    
    # Start the decoding with the SOS token
    ys = torch.ones(1, 1).fill_(SOS_IDX).type(torch.long).to(device)
    
    for i in range(max_len - 1):
        with torch.no_grad():
            tgt_len = ys.shape[0]
            tgt_mask = (generate_square_subsequent_mask(tgt_len, device).type(torch.bool)).to(device)
            
            # Decode the next token
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            
            # Get the probability distribution and find the most likely next token
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

        # Append the new token to our sequence
        ys = torch.cat([ys, torch.ones(1, 1).type_as(src_tensor.data).fill_(next_word)], dim=0)
        
        # If the model predicts the EOS token, we stop generating
        if next_word == EOS_IDX:
            break
            
    # Convert the generated token IDs back to a string
    # We remove the SOS token at the beginning
    tgt_tokens = ys.flatten().tolist()
    return tokenizer_de.decode(tgt_tokens[1:])


# --- Let's test our function on an example ---
# (Using the same test data lists from Part B)
test_sentence_index = 0
source_sentence = source_sentences[test_sentence_index]
reference_translation = reference_translations[test_sentence_index]

# Generate the translation using your model
model_translation = translate_sentence(transformer, source_sentence, DEVICE)

print("--- Inference Test on Your Model ---")
print(f"Source (EN):      {source_sentence}")
print(f"Reference (DE):   {reference_translation}")
print(f"Your Model (DE):  {model_translation}")
print("------------------------------------")

--- Inference Test on Your Model ---
Source (EN):      A man in an orange hat starring at something.
Reference (DE):   Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.
Your Model (DE):  Ein Mann mit einem orangefarbenen Hut arbeitet an etwas .
------------------------------------


In [8]:
from tqdm import tqdm

print("Generating translations for the entire test set using your trained model...")
print("This will take a few minutes...")

my_model_translations = []
# Loop through each source sentence and translate it
for sentence in tqdm(source_sentences):
    translation = translate_sentence(transformer, sentence, DEVICE)
    my_model_translations.append(translation)

print("\nTranslation generation complete.")

# --- Let's inspect the first few results from your model ---
print("\n--- Sample Translations Comparison (Your Model) ---")
for i in range(3):
    print(f"Example {i+1}:")
    print(f"  Source (EN):         {source_sentences[i]}")
    print(f"  Reference (DE):      {reference_translations[i]}")
    print(f"  Your Model's Output: {my_model_translations[i]}")
    print("-" * 20)

Generating translations for the entire test set using your trained model...
This will take a few minutes...


100%|██████████| 1000/1000 [00:42<00:00, 23.60it/s]


Translation generation complete.

--- Sample Translations Comparison (Your Model) ---
Example 1:
  Source (EN):         A man in an orange hat starring at something.
  Reference (DE):      Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.
  Your Model's Output: Ein Mann mit einem orangefarbenen Hut arbeitet an etwas .
--------------------
Example 2:
  Source (EN):         A Boston Terrier is running on lush green grass in front of a white fence.
  Reference (DE):      Ein Boston Terrier läuft über saftig-grünes Gras vor einem weißen Zaun.
  Your Model's Output: Ein Surfer läuft auf einem grünen Gras vor einem weißen Zaun .
--------------------
Example 3:
  Source (EN):         A girl in karate uniform breaking a stick with a front kick.
  Reference (DE):      Ein Mädchen in einem Karateanzug bricht ein Brett mit einem Tritt.
  Your Model's Output: Ein Mädchen in einem Trikot macht einen Stock vor einem Stock .
--------------------





In [9]:
import sacrebleu

# The list of translations generated by YOUR trained model
hypotheses = my_model_translations

# The list of correct reference translations
references = [reference_translations]

# Calculate the BLEU score
bleu = sacrebleu.corpus_bleu(hypotheses, references)

print("--- Final Evaluation of Your Baseline Model ---")
print(f"BLEU Score: {bleu.score:.2f}")
print("---------------------------------------------")

That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


--- Final Evaluation of Your Baseline Model ---
BLEU Score: 22.12
---------------------------------------------
