<a href="https://colab.research.google.com/github/SIDHI04/Books-Python/blob/main/Cross%20Lingual%20Question%20Answering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets evaluate sentencepiece accelerate scikit-learn

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline
)

In [None]:
# Environment flags for this session: set WANDB_DISABLED to "true".
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
# Checkpoint identifier passed to both AutoTokenizer.from_pretrained and
# AutoModelForSeq2SeqLM.from_pretrained.
MODEL_CHECKPOINT: str = "bigscience/mt0-small"
# Per-device batch size, used for training and for evaluation.
BATCH_SIZE: int = 8
# Token budget for the "question: ... context: ..." input; longer inputs are truncated.
MAX_INPUT_LENGTH: int = 512
# Token budget for the tokenized answer (label) sequence.
MAX_TARGET_LENGTH: int = 32
# Directory used for trainer checkpoints and for the final saved model/tokenizer.
OUTPUT_DIR: str = "./final_cross_lingual_qa"

In [None]:
# NOTE: `tokenizer` is loaded in a later cell, so nothing in this cell may touch
# it at definition time (doing so raises NameError on a fresh kernel and leaves
# compute_metrics undefined). compute_metrics resolves the tokenizer only when
# it is called and falls back to id 0 if the tokenizer has no pad token.


def _token_f1(prediction, reference):
    """Return the token-overlap F1 of two whitespace-tokenised strings (0.0-1.0)."""
    pred_tokens = prediction.split()
    ref_tokens = reference.split()
    if not pred_tokens or not ref_tokens:
        # Two empty answers agree perfectly; exactly one empty answer scores 0.
        return float(pred_tokens == ref_tokens)

    # Multiset intersection: a repeated token counts as often as it matches.
    # (A plain set intersection would give "a a" vs "a a" an F1 of 0.5.)
    overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)


def compute_metrics(eval_preds):
    """Compute exact-match and token-level F1 (both in percent) for an eval run.

    Args:
        eval_preds: pair ``(preds, labels)`` of integer token-id arrays. ``preds``
            may also be a tuple, in which case its first element is used.
            Positions equal to -100 in either array are treated as padding.

    Returns:
        dict with keys ``"exact_match"`` and ``"f1"``, each a float in [0, 100].
        Both are 0.0 when there are no examples.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    # -100 is not a valid token id and cannot be decoded, so map it to the pad
    # id in the predictions as well as in the labels; the cast to plain ints
    # avoids the "integral type" decode error.
    preds = np.where(preds != -100, preds, pad_token_id).astype(int)
    labels = np.where(labels != -100, labels, pad_token_id).astype(int)

    decoded_preds = tokenizer.batch_decode(preds.tolist(), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels.tolist(), skip_special_tokens=True)

    # --- DEBUG PRINT START ---
    # Show up to three prediction/label pairs (fewer if the eval set is smaller).
    print("\n******** DEBUG PREDICTIONS ********")
    for i in range(min(3, len(decoded_preds), len(decoded_labels))):
        print(f"Pred: '{decoded_preds[i]}'")
        print(f"Label: '{decoded_labels[i]}'")
    print("***********************************\n")
    # --- DEBUG PRINT END ---

    em_scores = []
    f1_scores = []
    for pred, label in zip(decoded_preds, decoded_labels):
        pred_clean = pred.strip().lower()
        label_clean = label.strip().lower()
        em_scores.append(1 if pred_clean == label_clean else 0)
        f1_scores.append(_token_f1(pred_clean, label_clean))

    if not em_scores:
        # np.mean([]) would return nan (with a RuntimeWarning).
        return {"exact_match": 0.0, "f1": 0.0}

    return {
        "exact_match": float(np.mean(em_scores)) * 100,
        "f1": float(np.mean(f1_scores)) * 100,
    }

NameError: name 'tokenizer' is not defined

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of QA examples into model inputs and labels.

    Args:
        examples: batch mapping with parallel lists under ``"question"``,
            ``"context"`` and ``"answers"``; each answers entry is a mapping
            whose ``"text"`` list holds the reference answers.

    Returns:
        The tokenizer output for the ``"question: ... context: ..."`` prompts
        (padded/truncated to MAX_INPUT_LENGTH) with an added ``"labels"`` entry:
        the first reference answer tokenized to MAX_TARGET_LENGTH, where padding
        positions are replaced by -100.
    """
    inputs = [
        f"question: {q} context: {c}"
        for q, c in zip(examples["question"], examples["context"])
    ]

    # Use the first reference answer; fall back to "" when none is given.
    targets = [
        a["text"][0] if len(a["text"]) > 0 else ""
        for a in examples["answers"]
    ]

    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length",
    )

    labels = tokenizer(
        targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length",
    )

    # Labels are padded to a fixed length here, so the padding must be masked
    # with -100 (the label value the seq2seq loss ignores); otherwise the model
    # is also trained to predict pad tokens. Sequence length is unchanged.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_token_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

print("Step 1: Loading dataset...")
dataset = load_dataset("squad")

# ⚠️ TRAINING ON 2000 SAMPLES
# Only the first 2000 training and first 400 validation examples are kept.
dataset["train"] = dataset["train"].select(range(2000))
dataset["validation"] = dataset["validation"].select(range(400))

print(f"Step 2: Loading Tokenizer for {MODEL_CHECKPOINT}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# preprocess_function pads every sequence to max_length, which requires a pad
# token. Apply the EOS fallback here, immediately after the tokenizer exists
# and before any tokenization happens.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

print("Step 3: Processing dataset...")
# Drop the raw text columns so only the tokenized fields remain.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

print(f"Step 4: Loading Multilingual Model ({MODEL_CHECKPOINT})...")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

# Keep the model's special-token ids in sync with the tokenizer; decoding
# starts from the pad token.
model.config.decoder_start_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    # Evaluate and checkpoint once per epoch (the two strategies must match for
    # load_best_model_at_end to work).
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,     # 5 Epochs is enough for demo
    load_best_model_at_end=True,

    # --- GENERATION SETTINGS USED DURING EVALUATION ---
    # Evaluate on generated token ids so compute_metrics can decode them.
    predict_with_generate=True,
    # Greedy decoding (a single beam) is faster than beam search.
    generation_num_beams=1,
    # Allow generated answers to be as long as the tokenized labels; a shorter
    # cap would truncate predictions and make longer answers impossible to match.
    generation_max_length=MAX_TARGET_LENGTH,
    # ---------------------------------------------------

    fp16=False,
    logging_steps=50,
    report_to="none",
)

# Batches tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# NOTE(review): newer transformers releases rename the `tokenizer` argument to
# `processing_class` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # exact-match / F1 on decoded generations
)

In [None]:
# Stays empty if training or evaluation fails.
final_metrics = {}

print("\nSTARTING TRAINING...")
try:
    trainer.train()
    # One more evaluation pass after training, kept for later reporting.
    final_metrics = trainer.evaluate()
    print(f"FINAL METRICS: {final_metrics}")
except Exception as err:
    # Best effort: report the failure and still save whatever state the model
    # is in below.
    print(f"\n⚠️ Training interrupted: {err}")
    print("Proceeding to save model and graphs...")

print("\nStep 5: Saving model...")
# Write the model first, then the tokenizer, into the same directory.
for save_to_dir in (trainer.save_model, tokenizer.save_pretrained):
    save_to_dir(OUTPUT_DIR)
print("Model Saved.")

In [None]:
from transformers import pipeline

# Folder the trainer saved the model and tokenizer into.
# NOTE(review): this literal duplicates OUTPUT_DIR — keep the two in sync.
model_path = "./final_cross_lingual_qa"

print("Loading model for inference...")
# Build the text-to-text pipeline from the saved files; model weights and
# tokenizer are both read from the same directory.
qa_pipeline = pipeline(
    task="text2text-generation",
    model=model_path,
    tokenizer=model_path,
)

def ask_question(question, context, max_length=64):
    """Run one question/context pair through ``qa_pipeline`` and print the answer.

    Args:
        question: question text.
        context: passage the answer should be drawn from.
        max_length: generation length limit forwarded to the pipeline
            (defaults to 64, the value previously hard-coded).

    Returns:
        None. The context preview, question and generated answer are printed.
    """
    # Same prompt format as the one used in preprocess_function.
    input_text = f"question: {question} context: {context}"
    result = qa_pipeline(input_text, max_length=max_length)

    # Show at most the first 60 characters; add the ellipsis only when the
    # context was actually cut.
    preview = context[:60]
    if len(context) > 60:
        preview += "..."

    print(f"Context: {preview}")
    print(f"Q: {question}")
    print(f"A: {result[0]['generated_text']}")
    print("-" * 30)

# --- TEST 1: ENGLISH ---
context_en = "The Taj Mahal is an ivory-white marble mausoleum on the right bank of the river Yamuna in Agra, India. It was commissioned in 1632 by the Mughal emperor Shah Jahan."

# --- TEST 2: HINDI (Cross-Lingual Check) ---
# Context: "Solar energy is energy derived from the sun's radiation."
context_hi = "सौर ऊर्जा वह ऊर्जा है जो सूर्य की विकिरण से प्राप्त होती है। यह एक नवीकरणीय ऊर्जा स्रोत है।"

# --- TEST 3: BENGALI (Bangla) ---
# Context: "Kolkata is the capital of West Bengal. It is located on the banks of the Hooghly River."
context_bn = "কলকাতা পশ্চিমবঙ্গের রাজধানী। এটি হুগলি নদীর তীরে অবস্থিত।"

# (question, context) pairs, asked in the order English -> Hindi -> Bengali.
demo_cases = [
    ("Who commissioned the Taj Mahal?", context_en),
    ("सौर ऊर्जा कहाँ से आती है?", context_hi),
    # Question: "What is the capital of West Bengal?"
    ("পশ্চিমবঙ্গের রাজধানী কী?", context_bn),
]
for demo_question, demo_context in demo_cases:
    ask_question(demo_question, demo_context)