# Inference 

This notebook demonstrates how to use your custom Transformer model for inference.

---

In [15]:
#Import necessary modules, including custom Transformer class, PyTorch utilities, and the HuggingFace tokenizer.
import torch
import torch.nn as nn
from my_transformers.Transformers_archi import Transformer 
from torch.utils.data import DataLoader,Dataset
from transformers import AutoTokenizer
import time

In [16]:
# Example of tiny English-French parallel dataset
train_data = [
    ("Hello", "Bonjour"),
    ("How are you?", "Comment ça va?"),
    ("I am fine", "Je vais bien"),
    ("What is your name?", "Quel est ton nom?"),
    ("I love programming", "J'adore la programmation"),
    ("This is a test", "C'est un test"),
    ("Good morning", "Bonjour"),
    ("Good night", "Bonne nuit"),
    ("I need help", "J'ai besoin d'aide"),
    ("Thank you", "Merci"),
    ("Please", "S'il vous plaît"),
    ("Goodbye", "Au revoir"),
    ("Sorry", "Désolé"),
    ("I am happy.", "Je suis heureux."),
    ("She is reading a book.", "Elle lit un livre."),
    ("We are going to school.", "Nous allons à l'école."),
    ("They are playing outside.", "Ils jouent dehors."),
    ("It is raining.", "Il pleut."),
    ("What is your name?", "Comment tu t'appelles ?"),
    ("I don't understand.", "Je ne comprends pas."),
    ("Please help me.", "S'il vous plaît, aidez-moi."),
    ("I love you.", "Je t'aime."),
    ("Where is the bathroom?", "Où sont les toilettes ?")
]

In [3]:
# Define a custom dataset for tokenized source and target sentences
class CustomDataset(Dataset):
    def __init__(self, src_token, trg_tokens):
        # Store input IDs and attention masks for source, and input IDs for target
        self.src_mask = src_token["attention_mask"]
        self.src_ids = src_token["input_ids"]
        self.trg_ids = trg_tokens["input_ids"]
        
    def __len__(self):
        # Return the number of samples in the dataset
        return len(self.src_ids)
    
    def __getitem__(self, index):
        # Return a dictionary with source input IDs, attention mask, and target labels for the given index
        return {
            "input_ids": self.src_ids[index],
            "attention_mask": self.src_mask[index],
            "labels": self.trg_ids[index] }
        

In [4]:
# Import HuggingFace's AutoTokenizer pretrained English-French tokenizer for tokenizing sentences

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")



In [5]:
actuals,preds= zip(*train_data)

## Tokenize the Data
Set a maximum sequence length and tokenize both the actual (source) and predicted (target) sentences, padding or truncating as necessary. The tokens are returned as PyTorch tensors.

In [6]:
# Set the maximum sequence length for tokenization
max_length = 10

# Tokenize the actual (input/source) sentences
actual_tokens = tokenizer(
    actuals,
    max_length=max_length,
    padding="max_length",
    return_tensors="pt",
    truncation=True)

# Tokenize the predicted (target) sentences
pred_tokens = tokenizer(
    preds,
    max_length=max_length,
    padding="max_length",
    return_tensors="pt",
    truncation=True)


In [7]:
# Create a dataset from the tokenized actual and predicted sentences
dataset = CustomDataset(actual_tokens, pred_tokens)

# Wrap the dataset in a DataLoader for batching and shuffling
data = DataLoader(dataset, batch_size= 2, shuffle=True)

In [8]:
# Initialize the custom Transformer model with specified hyperparameters
model = Transformer(
    d_model=128,
    num_heads=2,
    num_encoder_layer=4,
    src_vocab_size=tokenizer.vocab_size,
    trg_vocab_size=tokenizer.vocab_size,
    seq_len=max_length,
    decoder_layer=2 )


In [13]:
# Set up the Adam optimizer for the model's parameters with a learning rate of 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Use CrossEntropyLoss as the loss function for training/evaluation
criterion = nn.CrossEntropyLoss()

In [18]:
# Set the model to training mode
model.train()
epochs = 20
start = time.time()

for e in range(epochs):
    total_loss = 0
    for batch in data:
        input_ids = batch["input_ids"]           # [B, T]
        attention_mask = batch["attention_mask"] # [B, T]
        labels = batch["labels"]                 # [B, T]

        # Forward pass: output shape [B, T, vocab_size]
        output = model(input_ids, attention_mask, labels)

        # Permute output for CrossEntropyLoss: [B, vocab_size, T]
        output = output.permute(0, 2, 1)  

        # Compute loss
        loss = criterion(output, labels)
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
    # Print loss after each epoch
    print(f"Epoch {e+1} ---->> LOSS {total_loss:.4f}")

# Print total training time
print(f"Total time taken: {(time.time() - start) / 60:.2f} minutes")

Epoch 1 ---->> LOSS 55.3920
Epoch 2 ---->> LOSS 49.2084
Epoch 3 ---->> LOSS 53.1253
Epoch 4 ---->> LOSS 52.7743
Epoch 5 ---->> LOSS 46.5644
Epoch 6 ---->> LOSS 46.0022
Epoch 7 ---->> LOSS 45.5231
Epoch 8 ---->> LOSS 44.1990
Epoch 9 ---->> LOSS 44.5378
Epoch 10 ---->> LOSS 43.5150
Epoch 11 ---->> LOSS 38.9621
Epoch 12 ---->> LOSS 39.3133
Epoch 13 ---->> LOSS 40.0459
Epoch 14 ---->> LOSS 40.2793
Epoch 15 ---->> LOSS 37.3698
Epoch 16 ---->> LOSS 36.6405
Epoch 17 ---->> LOSS 39.4404
Epoch 18 ---->> LOSS 36.1151
Epoch 19 ---->> LOSS 38.0210
Epoch 20 ---->> LOSS 34.8968
Total time taken: 1.20 minutes


In [19]:
# Tokenize a single source sentence for model input
src_sentence = "hello world"
src_inputs = tokenizer(
    src_sentence,
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=max_length)

In [20]:
# Get token ids and attention mask for the source sentence
input_ids = src_inputs["input_ids"]
attention_mask = src_inputs["attention_mask"]

# Pass tokens to the encoder (no gradients needed for inference)
with torch.no_grad():
    encoder_out = model.enc(input_ids, attention_mask)

In [21]:
# Start decoder input with the pad token (can also be sos token for some models)
dec_inp_id = torch.tensor([[tokenizer.pad_token_id]])  # shape: (1, seqlen)

for _ in range(max_length):
    # Get logits from the decoder
    logits = model.dec(dec_inp_id, encoder_out)  # logits: (1, seq_len, vocab_size)

    # Pick the most probable next token (greedy decoding)
    next_token = torch.argmax(logits[:, -1, :], dim=-1)  # shape: (1,)

    # Concatenate the new token to the decoder input
    dec_inp_id = torch.cat([dec_inp_id, next_token.unsqueeze(0)], dim=1)
    # .unsqueeze(0) ensures next_token shape matches for concatenation

    # Stop if end-of-sequence token is generated
    if next_token.item() == tokenizer.eos_token_id:
        break

In [22]:
# Convert generated token IDs to a flat list
output_tokens = dec_inp_id[0].tolist()

# Decode the token IDs to a string, skipping special tokens like <pad> or <eos>
decoded_text = tokenizer.decode(output_tokens, skip_special_tokens=True)

# The final generated text
decoded_text

'- de en▁are cette in leur leur de en'