# Week 3: NLP Transformer Architecture


Applied Learning Assignment 2:
Apply transformers to real-world machine translation and explore
fine-tuning for improved performance.
1. Use the MarianMT model to translate English sentences into
another language of your choice (e.g., French, Spanish, or Arabic).
2. Fine-tune the model using domain-specific text (e.g., medical,
business, or legal vocabulary).
3. Evaluate the translations before and after fine-tuning. Highlight
improvements in handling terminology and context.

## 1. Import Libraries and Setup

In [None]:
import pandas as pd
import warnings
import logging
from datasets import Dataset, DatasetDict
from transformers import MarianTokenizer, MarianMTModel, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from sacrebleu import corpus_bleu

# Quiet the Transformers logger so routine warnings (e.g. the 'Trainer.tokenizer'
# notice) do not clutter cell output.
# NOTE: this raises the level of the whole "transformers" logger, so every
# message below ERROR is hidden, not only that one warning.
logging.getLogger("transformers").setLevel(logging.ERROR)

# Dependencies: install these from a terminal, inside the virtual environment,
# BEFORE running the import lines at the top of this cell:
# pip install datasets transformers scikit-learn sacremoses sacrebleu
# NOTE(review): pandas and torch are imported above but are not in this list;
# add them if they are not already installed. MarianTokenizer presumably also
# needs `sentencepiece` - confirm.

## 2. Load the Dataset

In [None]:
# Location of the parallel English/French CSV; point this at your own copy
file_path = "english_french.csv"

# Read the corpus and discard every row that contains a missing value
data = pd.read_csv(file_path).dropna()

# Keep a reproducible random 1% of the rows so the example stays quick to run
sampled_data = data.sample(random_state=42, frac=0.01)

## 3. Prepare Training and Evaluation Data

In [None]:
# Cut the sampled rows by position: first 80% for training, remainder for evaluation
train_size = int(0.8 * len(sampled_data))
train_data, eval_data = sampled_data[:train_size], sampled_data[train_size:]

# Turn each split into the column-name -> list-of-sentences mapping that
# Dataset.from_dict consumes
train_dict = {column: train_data[column].tolist() for column in ("English", "French")}
eval_dict = {column: eval_data[column].tolist() for column in ("English", "French")}

# Bundle both splits so later steps can process them together
datasets = DatasetDict(
    {
        split: Dataset.from_dict(rows)
        for split, rows in (("train", train_dict), ("eval", eval_dict))
    }
)

## 4. Load the Translation Model and Tokenizer

In [None]:
# Everything in this notebook runs on the CPU
device = torch.device("cpu")

# Pretrained English -> French MarianMT checkpoint and its matching tokenizer
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)

# Leave the model as the last expression so the cell displays its architecture
model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(59514, 512, padding_idx=59513)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(59514, 512, padding_idx=59513)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

## 5. Tokenize the Dataset

In [None]:
# Tokenize data
def tokenize_function(examples):
    """Tokenize a batch of English sources together with their French targets.

    Args:
        examples: batch mapping with "English" and "French" keys, each holding
            a list of sentences (the form ``datasets.map(..., batched=True)``
            passes in).

    Returns:
        The tokenizer output for the batch; the tokenized French targets are
        stored under "labels" because they are passed as ``text_target``.

    Padding is deliberately NOT applied here. Padding at this stage fills the
    labels with the real pad token id, which the loss does not ignore, so the
    model would be trained to predict padding. Leaving sequences unpadded lets
    DataCollatorForSeq2Seq pad each training batch on the fly and mask the
    label padding with -100.
    """
    return tokenizer(
        examples["English"],
        text_target=examples["French"],
        truncation=True,  # clip over-long sentences to the model's maximum length
    )

# Map tokenization over every split, processing the rows in batches
tokenized_datasets = datasets.map(tokenize_function, batched=True)

                                                                 

## 6. Configure Training Settings

In [None]:
# Collator that pads each batch dynamically when it is assembled
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Hyperparameters and bookkeeping options for the fine-tuning run
training_args = Seq2SeqTrainingArguments(
    # Where run artifacts and logs are written
    output_dir="./results",
    logging_dir="./logs",
    # Schedule: a single pass over the training data in batches of 8
    num_train_epochs=1,
    per_device_train_batch_size=8,
    # Evaluate at the end of each epoch, producing predictions with generate()
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # Report the training loss every 10 steps; no external experiment tracker
    logging_steps=10,
    report_to="none",
    # Checkpointing is switched off; save_total_limit would only matter if it
    # were turned back on
    save_strategy="no",
    save_total_limit=1,
)




## 7. Define Evaluation Metrics

In [None]:
# Define compute_metrics function for evaluation
def compute_metrics(eval_preds):
    """Score generated translations against the reference translations.

    Args:
        eval_preds: pair ``(predictions, labels)`` of token-id batches. Either
            element may be a torch tensor or an array/list of id sequences;
            ``predictions`` may also be a tuple whose first item holds the ids.

    Returns:
        dict with the keys "accuracy", "f1", "precision", "recall" and "bleu".

    Notes:
        The scikit-learn metrics receive whole decoded sentences as class
        labels, so they measure exact sentence matches only. With
        ``zero_division=1`` an undefined per-class score is counted as 1, which
        can inflate precision. BLEU is the metric that reflects partial
        translation quality.
    """
    predictions, labels = eval_preds

    # Some models hand back a tuple of outputs; the token ids come first
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Convert tensors to numpy arrays if necessary
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()

    def _drop_ignored(batch):
        # -100 is the "ignore" filler used for padding; it is not a vocabulary
        # id and cannot be decoded, so strip it before decoding
        return [[token for token in sequence if token != -100] for sequence in batch]

    # Predictions can carry -100 filler just like labels, so clean both
    decoded_preds = tokenizer.batch_decode(
        _drop_ignored(predictions), skip_special_tokens=True
    )
    decoded_labels = tokenizer.batch_decode(
        _drop_ignored(labels), skip_special_tokens=True
    )

    # Sentence-level exact-match metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        decoded_labels, decoded_preds, average="weighted", zero_division=1
    )
    acc = accuracy_score(decoded_labels, decoded_preds)

    # Corpus-level BLEU: hypotheses first, then a list of reference streams
    bleu = corpus_bleu(decoded_preds, [decoded_labels]).score

    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall, "bleu": bleu}


## 8. Initialize Trainer and Train the Model

In [None]:
# Wire the model, data, collator and metrics into a sequence-to-sequence trainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets["eval"],
    train_dataset=tokenized_datasets["train"],
)

# Run fine-tuning; as the last expression, the training summary is displayed
trainer.train()

{'loss': 1.3233, 'grad_norm': 4.788868427276611, 'learning_rate': 4.782608695652174e-05, 'epoch': 0.043478260869565216}
{'loss': 0.1844, 'grad_norm': 1.119953989982605, 'learning_rate': 4.565217391304348e-05, 'epoch': 0.08695652173913043}
{'loss': 0.1815, 'grad_norm': 3.042304515838623, 'learning_rate': 4.347826086956522e-05, 'epoch': 0.13043478260869565}
{'loss': 0.2033, 'grad_norm': 1.426414132118225, 'learning_rate': 4.130434782608696e-05, 'epoch': 0.17391304347826086}
{'loss': 0.196, 'grad_norm': 1.8878957033157349, 'learning_rate': 3.91304347826087e-05, 'epoch': 0.21739130434782608}
{'loss': 0.1707, 'grad_norm': 1.6308921575546265, 'learning_rate': 3.695652173913043e-05, 'epoch': 0.2608695652173913}
{'loss': 0.1605, 'grad_norm': 1.6243313550949097, 'learning_rate': 3.478260869565218e-05, 'epoch': 0.30434782608695654}
{'loss': 0.1331, 'grad_norm': 1.640490174293518, 'learning_rate': 3.260869565217392e-05, 'epoch': 0.34782608695652173}
{'loss': 0.1661, 'grad_norm': 1.800739049911499

TrainOutput(global_step=230, training_loss=0.21841997892960258, metrics={'train_runtime': 676.2788, 'train_samples_per_second': 2.718, 'train_steps_per_second': 0.34, 'train_loss': 0.21841997892960258, 'epoch': 1.0})

## 9. Evaluate the Model

In [None]:
# Run the trainer's evaluation pass over the eval split it was constructed with.
# The returned dict holds the eval loss, the values from compute_metrics
# (prefixed with "eval_") and runtime statistics.
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

{'eval_loss': 0.19931964576244354, 'eval_accuracy': 0.3217391304347826, 'eval_f1': 0.3217391304347826, 'eval_precision': 1.0, 'eval_recall': 0.3217391304347826, 'eval_bleu': 56.35052131690946, 'eval_runtime': 112.0336, 'eval_samples_per_second': 4.106, 'eval_steps_per_second': 0.518, 'epoch': 1.0}
Evaluation Results: {'eval_loss': 0.19931964576244354, 'eval_accuracy': 0.3217391304347826, 'eval_f1': 0.3217391304347826, 'eval_precision': 1.0, 'eval_recall': 0.3217391304347826, 'eval_bleu': 56.35052131690946, 'eval_runtime': 112.0336, 'eval_samples_per_second': 4.106, 'eval_steps_per_second': 0.518, 'epoch': 1.0}


## 10. Generate Sample Translations

In [None]:
# Generate sample translations
sample_texts = ["Hello, how are you?", "I love machine learning.", "Let's go to the market."]

# Inference mode: make sure dropout is disabled whichever cells ran before this one
model.eval()

# Send the batch to the device the model is on NOW. The trainer may have moved
# the model off the `device` chosen earlier, and mismatched devices make
# generate() fail.
inputs = tokenizer(sample_texts, return_tensors="pt", padding=True).to(model.device)
translated = model.generate(**inputs)
translations = tokenizer.batch_decode(translated, skip_special_tokens=True)

# Display each source sentence next to its translation
for src, tgt in zip(sample_texts, translations):
    print(f"Source: {src}\nTranslation: {tgt}\n")

Source: Hello, how are you?
Translation: Bonjour, comment allez-vous ?

Source: I love machine learning.
Translation: J'adore l'apprentissage automatique.

Source: Let's go to the market.
Translation: Allons au marché.

