In [1]:
# Install required packages in Google Colab
!pip install evaluate datasets transformers torch scikit-learn

# Import after installation
import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForQuestionAnswering,
    AutoModelForSeq2SeqLM,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    pipeline
)
import evaluate
from transformers import logging
from sklearn.metrics import f1_score
import json

# Silence warnings for cleaner output
logging.set_verbosity_error()

# Use GPU if available (Colab usually provides GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
Using device: cuda


In [2]:
# Verify GPU
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
else:
    print("No GPU available - training will be slower")

GPU: Tesla T4
Memory: 14.7 GB


In [4]:
# Setup for Google Colab with Hugging Face authentication
!pip install -q evaluate datasets transformers torch scikit-learn

# Authenticate with Hugging Face
from google.colab import userdata
import os

# Get HF token from Colab secrets
hf_token = userdata.get('HF_TOKEN')
os.environ['HUGGINGFACE_HUB_TOKEN'] = hf_token

# Login to Hugging Face Hub
from huggingface_hub import login
login(token=hf_token)

print("✅ Successfully authenticated with Hugging Face!")

SecretNotFoundError: Secret HF_TOKEN does not exist.

In [None]:
# =============================================================================
# PART 1: DATASET PREPARATION
# =============================================================================

print("\n" + "="*60)
print("PART 1: DATASET PREPARATION - SQuAD v1.1")
print("="*60)

# Load SQuAD v1.1 dataset
print("Loading SQuAD v1.1 dataset...")
squad_dataset = load_dataset("squad")

# For demonstration and faster training, use smaller subsets
train_size = 1000  # Reduced for faster training
val_size = 200
train_dataset = squad_dataset["train"].shuffle(seed=42).select(range(train_size))
val_dataset = squad_dataset["validation"].shuffle(seed=42).select(range(val_size))

print(f"Dataset splits: {squad_dataset.keys()}")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"\nExample record:")
example = train_dataset[0]
print(f"Context: {example['context'][:200]}...")
print(f"Question: {example['question']}")
print(f"Answer: {example['answers']}")

# =============================================================================
# DECODER-ONLY MODEL: GPT-2 for Generative QA
# =============================================================================

print("\n" + "="*60)
print("DECODER-ONLY MODEL: GPT-2 for Generative QA")
print("="*60)

# Load GPT-2 tokenizer and model
gpt2_model_name = "gpt2"
gpt2_tokenizer = AutoTokenizer.from_pretrained(gpt2_model_name)
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

def preprocess_gpt2_qa(examples):
    """
    Preprocess SQuAD data for GPT-2 generative QA
    Format: "Context: [context] Question: [question] Answer: [answer]"
    """
    inputs = []
    for context, question, answers in zip(examples["context"], examples["question"], examples["answers"]):
        # Take the first answer for training
        answer_text = answers["text"][0] if answers["text"] else ""

        # Format: Context: ... Question: ... Answer: ...
        input_text = f"Context: {context} Question: {question} Answer: {answer_text}"
        inputs.append(input_text)

    # Tokenize
    model_inputs = gpt2_tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # For causal LM, labels are the same as input_ids
    model_inputs["labels"] = model_inputs["input_ids"].copy()

    return model_inputs

print("Preprocessing data for GPT-2...")
train_gpt2 = train_dataset.map(preprocess_gpt2_qa, batched=True, remove_columns=train_dataset.column_names)
val_gpt2 = val_dataset.map(preprocess_gpt2_qa, batched=True, remove_columns=val_dataset.column_names)

print("Sample tokenized GPT-2 input:")
print(gpt2_tokenizer.decode(train_gpt2[0]["input_ids"][:150], skip_special_tokens=True))

# Load GPT-2 model
gpt2_model = AutoModelForCausalLM.from_pretrained(gpt2_model_name)

# Training arguments for GPT-2
training_args_gpt2 = TrainingArguments(
    output_dir="./gpt2-qa",
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=50,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=1,
    warmup_steps=50,
    gradient_accumulation_steps=4,
    fp16=torch.cuda.is_available(),
    report_to=[],
    load_best_model_at_end=True,
)

# Data collator for causal LM
data_collator_gpt2 = DataCollatorForLanguageModeling(tokenizer=gpt2_tokenizer, mlm=False)

# Create trainer
trainer_gpt2 = Trainer(
    model=gpt2_model,
    args=training_args_gpt2,
    train_dataset=train_gpt2,
    eval_dataset=val_gpt2,
    data_collator=data_collator_gpt2,
)

print("Training GPT-2 for QA...")
trainer_gpt2.train()

# Test GPT-2 generation
print("\nTesting GPT-2 generation:")
test_input = "Context: The sky is blue during the day. Question: What color is the sky? Answer:"
inputs = gpt2_tokenizer(test_input, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = gpt2_model.generate(
        inputs["input_ids"],
        max_length=inputs["input_ids"].shape[1] + 20,
        temperature=0.7,
        pad_token_id=gpt2_tokenizer.eos_token_id
    )
generated_text = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated: {generated_text}")

# =============================================================================
# ENCODER-ONLY MODEL: BERT for Extractive QA
# =============================================================================

print("\n" + "="*60)
print("ENCODER-ONLY MODEL: BERT for Extractive QA")
print("="*60)

# Load BERT for Question Answering
bert_model_name = "bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

def preprocess_bert_qa(examples):
    """
    Preprocess SQuAD data for BERT extractive QA
    """
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]

    # Tokenize questions and contexts together
    inputs = bert_tokenizer(
        questions,
        contexts,
        max_length=512,
        truncation="only_second",  # Truncate context if needed
        padding="max_length",
        return_overflowing_tokens=False,
        return_offsets_mapping=True,
        stride=128,
    )

    # Find start and end positions for answers
    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, (answer, offset) in enumerate(zip(examples["answers"], offset_mapping)):
        if answer["answer_start"]:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            # Find token start and end positions
            token_start = None
            token_end = None

            for idx, (start_offset, end_offset) in enumerate(offset):
                if start_offset <= start_char < end_offset and token_start is None:
                    token_start = idx
                if start_offset < end_char <= end_offset and token_end is None:
                    token_end = idx
                    break

            if token_start is None:
                token_start = 0
            if token_end is None:
                token_end = 0

            start_positions.append(token_start)
            end_positions.append(token_end)
        else:
            start_positions.append(0)
            end_positions.append(0)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

print("Preprocessing data for BERT...")
train_bert = train_dataset.map(preprocess_bert_qa, batched=True, remove_columns=train_dataset.column_names)
val_bert = val_dataset.map(preprocess_bert_qa, batched=True, remove_columns=val_dataset.column_names)

# Load BERT QA model
bert_model = AutoModelForQuestionAnswering.from_pretrained(bert_model_name)

# Training arguments for BERT
training_args_bert = TrainingArguments(
    output_dir="./bert-qa",
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=50,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=1,
    warmup_steps=100,
    fp16=torch.cuda.is_available(),
    report_to=[],
    load_best_model_at_end=True,
)

# Create trainer for BERT
trainer_bert = Trainer(
    model=bert_model,
    args=training_args_bert,
    train_dataset=train_bert,
    eval_dataset=val_bert,
    tokenizer=bert_tokenizer,
)

print("Training BERT for QA...")
trainer_bert.train()

# Test BERT QA
print("\nTesting BERT QA:")
qa_pipeline = pipeline("question-answering", model=bert_model, tokenizer=bert_tokenizer)
test_context = "The sky is blue during the day because of the way sunlight interacts with the atmosphere."
test_question = "What color is the sky?"
result = qa_pipeline(question=test_question, context=test_context)
print(f"Question: {test_question}")
print(f"Answer: {result['answer']} (confidence: {result['score']:.3f})")

# =============================================================================
# ENCODER-DECODER MODEL: T5 for Generative QA
# =============================================================================

print("\n" + "="*60)
print("ENCODER-DECODER MODEL: T5 for Generative QA")
print("="*60)

# Load T5 tokenizer and model
t5_model_name = "t5-small"
t5_tokenizer = AutoTokenizer.from_pretrained(t5_model_name)

def preprocess_t5_qa(examples):
    """
    Preprocess SQuAD data for T5 generative QA
    Format: "question: [question] context: [context]"
    """
    inputs = []
    targets = []

    for context, question, answers in zip(examples["context"], examples["question"], examples["answers"]):
        # Input format for T5
        input_text = f"question: {question} context: {context}"
        inputs.append(input_text)

        # Target is the answer
        answer_text = answers["text"][0] if answers["text"] else ""
        targets.append(answer_text)

    # Tokenize inputs
    model_inputs = t5_tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # Tokenize targets
    labels = t5_tokenizer(
        targets,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

print("Preprocessing data for T5...")
train_t5 = train_dataset.map(preprocess_t5_qa, batched=True, remove_columns=train_dataset.column_names)
val_t5 = val_dataset.map(preprocess_t5_qa, batched=True, remove_columns=val_dataset.column_names)

print("Sample T5 input:")
sample_input = t5_tokenizer.decode(train_t5[0]["input_ids"][:100], skip_special_tokens=True)
sample_target = t5_tokenizer.decode(train_t5[0]["labels"][:50], skip_special_tokens=True)
print(f"Input: {sample_input}")
print(f"Target: {sample_target}")

# Load T5 model
t5_model = AutoModelForSeq2SeqLM.from_pretrained(t5_model_name)

# Data collator for seq2seq
data_collator_t5 = DataCollatorForSeq2Seq(tokenizer=t5_tokenizer, model=t5_model)

# Training arguments for T5
training_args_t5 = TrainingArguments(
    output_dir="./t5-qa",
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=50,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=1,
    warmup_steps=100,
    gradient_accumulation_steps=4,
    fp16=torch.cuda.is_available(),
    report_to=[],
    load_best_model_at_end=True,
)

# Create trainer for T5
trainer_t5 = Trainer(
    model=t5_model,
    args=training_args_t5,
    train_dataset=train_t5,
    eval_dataset=val_t5,
    data_collator=data_collator_t5,
    tokenizer=t5_tokenizer,
)

print("Training T5 for QA...")
trainer_t5.train()

# Test T5 generation
print("\nTesting T5 QA:")
test_input = "question: What color is the sky? context: The sky is blue during the day."
inputs = t5_tokenizer(test_input, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = t5_model.generate(**inputs, max_length=50)
generated_answer = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Input: {test_input}")
print(f"Generated Answer: {generated_answer}")


PART 1: DATASET PREPARATION - SQuAD v1.1
Loading SQuAD v1.1 dataset...


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset splits: dict_keys(['train', 'validation'])
Training samples: 1000
Validation samples: 200

Example record:
Context: The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan indepen...
Question: What percentage of Egyptians polled support death penalty for those leaving Islam?
Answer: {'text': ['84%'], 'answer_start': [468]}

DECODER-ONLY MODEL: GPT-2 for Generative QA


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Preprocessing data for GPT-2...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Sample tokenized GPT-2 input:
Context: The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan independent agency of the US government, has placed Egypt on its watch list of countries that require close monitoring due to the nature and extent of violations of religious freedom engaged in or tolerated by the government. According to a 2010 Pew Global Attitudes survey, 84% of Egyptians polled supported the death penalty for those who leave Islam; 77% supported whippings and cutting off of hands for theft and robbery; and 82% support stoning a person who commits adultery. Question: What percentage of Egyptians polled support death penalty for those leaving Islam? Answer: 84%


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Training GPT-2 for QA...
{'loss': 3.4158, 'grad_norm': 5.833383560180664, 'learning_rate': 4.9e-05, 'epoch': 0.4}
{'loss': 3.283, 'grad_norm': 4.93460750579834, 'learning_rate': 3.775e-05, 'epoch': 0.8}
{'eval_loss': 3.0946037769317627, 'eval_runtime': 4.5548, 'eval_samples_per_second': 43.91, 'eval_steps_per_second': 21.955, 'epoch': 0.8}
{'loss': 3.1636, 'grad_norm': 5.608552932739258, 'learning_rate': 2.525e-05, 'epoch': 1.2}
{'loss': 3.0766, 'grad_norm': 5.3256611824035645, 'learning_rate': 1.2750000000000002e-05, 'epoch': 1.6}
{'eval_loss': 3.097227096557617, 'eval_runtime': 4.5296, 'eval_samples_per_second': 44.154, 'eval_steps_per_second': 22.077, 'epoch': 1.6}
{'loss': 3.0799, 'grad_norm': 5.2873053550720215, 'learning_rate': 2.5000000000000004e-07, 'epoch': 2.0}
{'train_runtime': 159.6111, 'train_samples_per_second': 12.53, 'train_steps_per_second': 1.566, 'train_loss': 3.2038064575195313, 'epoch': 2.0}

Testing GPT-2 generation:
Generated: Context: The sky is blue during the 

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Preprocessing data for BERT...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  trainer_bert = Trainer(


Training BERT for QA...
{'loss': 5.7506, 'grad_norm': 10.413178443908691, 'learning_rate': 2.45e-05, 'epoch': 0.2}
{'loss': 4.3824, 'grad_norm': 33.19166946411133, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.4}
{'eval_loss': 3.8427538871765137, 'eval_runtime': 1.7354, 'eval_samples_per_second': 115.249, 'eval_steps_per_second': 28.812, 'epoch': 0.4}
{'loss': 3.7985, 'grad_norm': 54.58707809448242, 'learning_rate': 4.6230769230769234e-05, 'epoch': 0.6}
{'loss': 3.2482, 'grad_norm': 42.9770393371582, 'learning_rate': 4.238461538461539e-05, 'epoch': 0.8}
{'eval_loss': 3.0566883087158203, 'eval_runtime': 1.6756, 'eval_samples_per_second': 119.359, 'eval_steps_per_second': 29.84, 'epoch': 0.8}
{'loss': 3.0676, 'grad_norm': 33.57253646850586, 'learning_rate': 3.853846153846154e-05, 'epoch': 1.0}
{'loss': 2.2667, 'grad_norm': 36.60321044921875, 'learning_rate': 3.4692307692307694e-05, 'epoch': 1.2}
{'eval_loss': 3.059708595275879, 'eval_runtime': 1.6501, 'eval_samples_per_second': 121

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Preprocessing data for T5...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Sample T5 input:
Input: question: What percentage of Egyptians polled support death penalty for those leaving Islam? context: The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan independent agency of the US government, has placed Egypt on its watch list of countries that require close monitoring due to the nature and extent of violations of religious freedom engaged in or tolerated by the government. According to 
Target: 84%


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer_t5 = Trainer(


Training T5 for QA...
{'loss': 22.0202, 'grad_norm': 81.15165710449219, 'learning_rate': 2.45e-05, 'epoch': 0.4}
{'loss': 7.415, 'grad_norm': 11.007974624633789, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.8}
{'eval_loss': 0.1619606912136078, 'eval_runtime': 2.4083, 'eval_samples_per_second': 83.046, 'eval_steps_per_second': 41.523, 'epoch': 0.8}
{'loss': 0.348, 'grad_norm': 0.8957287669181824, 'learning_rate': 4.109090909090909e-05, 'epoch': 1.2}
{'loss': 0.1773, 'grad_norm': 0.8289073705673218, 'learning_rate': 3.2000000000000005e-05, 'epoch': 1.6}
{'eval_loss': 0.11287793517112732, 'eval_runtime': 2.4497, 'eval_samples_per_second': 81.643, 'eval_steps_per_second': 40.822, 'epoch': 1.6}
{'loss': 0.1263, 'grad_norm': 1.025316596031189, 'learning_rate': 2.290909090909091e-05, 'epoch': 2.0}
{'loss': 0.0895, 'grad_norm': 0.6960127353668213, 'learning_rate': 1.3818181818181818e-05, 'epoch': 2.4}
{'eval_loss': 0.039813559502363205, 'eval_runtime': 2.3894, 'eval_samples_per_second':

In [None]:
# Clear memory before evaluation
import gc
torch.cuda.empty_cache()
gc.collect()
print(f"Memory cleared. GPU usage: {torch.cuda.memory_allocated(0)/1024**3:.2f} GB")

Memory cleared. GPU usage: 3.35 GB


In [5]:
# Fix for evaluate library issue - add this cell before running Part 2
EVALUATE_AVAILABLE = False
print("Using manual metric implementations (bypassing evaluate library)")

# Also define the manual functions if not already defined
import string
import re
from collections import Counter
import numpy as np

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def f1_score_squad(prediction, ground_truth):
    """Compute F1 score between prediction and ground truth."""
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()

    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        return int(prediction_tokens == ground_truth_tokens)

    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())

    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)

    return f1

def exact_match_score(prediction, ground_truth):
    """Compute exact match score."""
    return normalize_answer(prediction) == normalize_answer(ground_truth)

def compute_squad_metrics(predictions, references):
    """Compute SQuAD metrics manually."""
    f1_scores = []
    em_scores = []

    for pred, ref in zip(predictions, references):
        pred_text = pred["prediction_text"]
        ref_answers = ref["answers"]["text"]

        # Take the best score among all reference answers
        max_f1 = max([f1_score_squad(pred_text, ans) for ans in ref_answers])
        max_em = max([exact_match_score(pred_text, ans) for ans in ref_answers])

        f1_scores.append(max_f1)
        em_scores.append(max_em)

    return {
        "exact_match": np.mean(em_scores) * 100,
        "f1": np.mean(f1_scores) * 100
    }

def compute_bleu_score(predictions, references):
    """Compute BLEU score manually (simplified version)."""
    total_bleu = 0
    count = 0

    for pred, ref_list in zip(predictions, references):
        pred_tokens = pred.split()

        # Handle reference format
        if isinstance(ref_list[0], list):
            ref_tokens = ref_list[0][0].split()  # Take first reference
        else:
            ref_tokens = ref_list[0].split()

        if len(pred_tokens) == 0 or len(ref_tokens) == 0:
            continue

        # Simple unigram BLEU for robustness
        pred_set = set(pred_tokens)
        ref_set = set(ref_tokens)

        if len(pred_set) > 0:
            overlap = len(pred_set & ref_set)
            bleu = overlap / len(pred_set)
        else:
            bleu = 0.0

        total_bleu += bleu
        count += 1

    return total_bleu / count if count > 0 else 0.0

print("✅ Manual metrics loaded successfully!")

Using manual metric implementations (bypassing evaluate library)
✅ Manual metrics loaded successfully!


In [10]:
# --- Recovery cell: reload models/tokenizers for evaluation ---
import os, torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForQuestionAnswering,
    AutoModelForSeq2SeqLM,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_or_fallback(model_dir, base_name, cls):
    if os.path.isdir(model_dir):
        try:
            print(f"Loading {cls.__name__} from {model_dir} ...")
            return cls.from_pretrained(model_dir)
        except Exception as e:
            print(f"⚠️ Could not load from {model_dir}: {e}. Falling back to {base_name}.")
    print(f"Loading {cls.__name__} from hub: {base_name} ...")
    return cls.from_pretrained(base_name)

# Tokenizers (prefer checkpoint dirs so special tokens/configs match)
if "gpt2_tokenizer" not in globals():
    gpt2_tokenizer = AutoTokenizer.from_pretrained("./gpt2-qa" if os.path.isdir("./gpt2-qa") else "gpt2")
    if gpt2_tokenizer.pad_token is None:
        gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

if "bert_tokenizer" not in globals():
    bert_tokenizer = AutoTokenizer.from_pretrained("./bert-qa" if os.path.isdir("./bert-qa") else "bert-base-uncased")

if "t5_tokenizer" not in globals():
    t5_tokenizer = AutoTokenizer.from_pretrained("./t5-qa" if os.path.isdir("./t5-qa") else "t5-small")

# Models (prefer your fine-tuned checkpoints)
if "gpt2_model" not in globals():
    gpt2_model = load_or_fallback("./gpt2-qa", "gpt2", AutoModelForCausalLM).to(device)

if "bert_model" not in globals():
    bert_model = load_or_fallback("./bert-qa", "bert-base-uncased", AutoModelForQuestionAnswering).to(device)

if "t5_model" not in globals():
    t5_model = load_or_fallback("./t5-qa", "t5-small", AutoModelForSeq2SeqLM).to(device)

print("✅ Models ready:",
      type(gpt2_model).__name__, "|",
      type(bert_model).__name__, "|",
      type(t5_model).__name__)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Loading AutoModelForCausalLM from hub: gpt2 ...


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loading AutoModelForQuestionAnswering from hub: bert-base-uncased ...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading AutoModelForSeq2SeqLM from hub: t5-small ...


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

✅ Models ready: GPT2LMHeadModel | BertForQuestionAnswering | T5ForConditionalGeneration
