In [2]:
!pip install -q transformers datasets pandas torch sentencepiece nltk

In [6]:
import random
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
import torch
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')
nltk.download('punkt_tab') # Added this line to download the missing resource
from nltk.tokenize import sent_tokenize
import gc

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


<torch._C.Generator at 0x7c1a80e0a810>

In [4]:
 #SECTION 2: LOAD DATASET
# ============================================================================

print("Loading SQuAD dataset...")
squad = load_dataset("squad")
print("Dataset loaded!")

# ============================================================================
# SECTION 3: UTILITY FUNCTIONS
# ============================================================================

def is_valid_context(text, min_words=40, max_words=250):
    if not text or not isinstance(text, str):
        return False
    word_count = len(text.split())
    return min_words <= word_count <= max_words

def clean_text(text):
    return ' '.join(text.replace('\n', ' ').replace('\r', ' ').split()).strip()

Loading SQuAD dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset loaded!


In [7]:
# SECTION 4: FAST DATASET GENERATION
# ============================================================================

print("\n" + "="*70)
print("GENERATING TRAINING DATA")
print("="*70)

train_data = []
target_size = 6000  # Small dataset for fast training

# Process data efficiently
for i in range(min(8000, len(squad["train"]))):
    sample = squad["train"][i]
    context = clean_text(sample["context"])

    if not is_valid_context(context):
        continue

    # Task 1: Summarization (30% of data)
    if len(train_data) < target_size * 0.3:
        sentences = sent_tokenize(context)
        if len(sentences) >= 3:
            short = sentences[0]
            long = ' '.join(sentences[:3])
            if short != long:
                train_data.append({
                    "input_text": f"summarize: {context}",
                    "target_text": f"short: {short} | long: {long}"
                })

    # Task 2: Question Generation (40% of data)
    elif len(train_data) < target_size * 0.7:
        if len(sample["answers"]["text"]) > 0:
            answer = sample["answers"]["text"][0]
            question = sample["question"]
            train_data.append({
                "input_text": f"generate question: context: {context} answer: {answer}",
                "target_text": question
            })

    # Task 3: MCQ (30% of data)
    else:
        if len(sample["answers"]["text"]) > 0:
            question = sample["question"]
            answer = sample["answers"]["text"][0]

            # Simple distractor generation
            other_answers = []
            for j in range(max(0, i-50), min(len(squad["train"]), i+50)):
                if j != i and len(squad["train"][j]["answers"]["text"]) > 0:
                    other_ans = squad["train"][j]["answers"]["text"][0]
                    if other_ans != answer and len(other_ans.split()) <= 10:
                        other_answers.append(other_ans)

            if len(other_answers) >= 3:
                distractors = random.sample(other_answers, 3)
                options = [answer] + distractors
                random.shuffle(options)

                train_data.append({
                    "input_text": f"generate mcq: question: {question} context: {context}",
                    "target_text": f"options: {' | '.join(options)} | answer: {answer}"
                })

    if len(train_data) >= target_size:
        break

    if len(train_data) % 1000 == 0:
        print(f"Generated {len(train_data)} examples...")

print(f"\nTotal examples: {len(train_data)}")

# Clean memory
del squad
gc.collect()


GENERATING TRAINING DATA
Generated 1000 examples...
Generated 2000 examples...
Generated 3000 examples...
Generated 4000 examples...
Generated 5000 examples...

Total examples: 6000


150

In [8]:
# SECTION 5: SPLIT DATA
# ============================================================================

print("\n" + "="*70)
print("SPLITTING DATA")
print("="*70)

train_df, val_df = train_test_split(pd.DataFrame(train_data), test_size=0.1, random_state=42)

print(f"Train: {len(train_df)} | Val: {len(val_df)}")

dataset_dict = DatasetDict({
    'train': Dataset.from_pandas(train_df.reset_index(drop=True)),
    'validation': Dataset.from_pandas(val_df.reset_index(drop=True))
})

del train_df, val_df, train_data
gc.collect()

# ============================================================================
# SECTION 6: LOAD MODEL
# ============================================================================

print("\n" + "="*70)
print("LOADING MODEL")
print("="*70)

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

print(f"Model: {model_name}")
print(f"Parameters: {model.num_parameters():,}")


SPLITTING DATA
Train: 5400 | Val: 600

LOADING MODEL


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model: t5-small
Parameters: 60,506,624


In [9]:
# SECTION 7: TOKENIZATION
# ============================================================================

print("\n" + "="*70)
print("TOKENIZING")
print("="*70)

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=384,  # Reduced
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        examples["target_text"],
        max_length=96,  # Reduced
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=["input_text", "target_text"],
    num_proc=2  # Parallel processing
)

print("Tokenization complete!")


TOKENIZING


Map (num_proc=2):   0%|          | 0/5400 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/600 [00:00<?, ? examples/s]

Tokenization complete!


In [10]:
# SECTION 8: TRAINING CONFIG (OPTIMIZED FOR SPEED)
# ============================================================================

print("\n" + "="*70)
print("TRAINING CONFIGURATION")
print("="*70)

training_args = TrainingArguments(
    output_dir="./notes_quiz_model",
    eval_strategy="steps",
    eval_steps=500,  # Evaluate frequently
    learning_rate=1e-4,  # Higher for faster convergence
    per_device_train_batch_size=32,  # Large batch
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    num_train_epochs=2,  # Only 2 epochs
    weight_decay=0.01,
    save_total_limit=1,  # Save only best model
    save_steps=500,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    warmup_steps=200,  # Fewer warmup steps
    max_grad_norm=1.0,
    fp16=True,  # Enable for speed on GPU
    dataloader_num_workers=2,  # Parallel data loading
    report_to="none",
    save_safetensors=True
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

print("Config complete!")


TRAINING CONFIGURATION


  trainer = Trainer(


Config complete!


In [11]:
# SECTION 9: TRAIN
# ============================================================================

print("\n" + "="*70)
print("STARTING TRAINING")
print("="*70)
print("Expected time: 45-60 minutes")
print("Target validation loss: 0.4-0.9")
print("="*70 + "\n")

trainer.train()

print("\nTraining complete!")


STARTING TRAINING
Expected time: 45-60 minutes
Target validation loss: 0.4-0.9



Step,Training Loss,Validation Loss



Training complete!


In [12]:
# SECTION 10: SAVE MODEL
# ============================================================================

print("\n" + "="*70)
print("SAVING MODEL")
print("="*70)

model.save_pretrained("./final_notes_quiz_model")
tokenizer.save_pretrained("./final_notes_quiz_model")

print("Model saved!")


SAVING MODEL
Model saved!


In [14]:
# SECTION 11: QUICK TEST
# ============================================================================

print("\n" + "="*70)
print("TESTING MODEL")
print("="*70)

def generate_output(input_text, max_length=96):
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=384, truncation=True)
    # Move input_ids to the same device as the model
    input_ids = input_ids.to(model.device)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_beams=3,
            early_stopping=True
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test with validation data
val_data = load_dataset("squad", split="validation[:5]")

# Test 1: Summarization
print("\nTest 1: Summarization")
print("-" * 50)
test_text = val_data[0]["context"]
print(f"Input: {test_text[:120]}...")
result = generate_output(f"summarize: {test_text}")
print(f"Output: {result}")

# Test 2: Question Generation
print("\n\nTest 2: Question Generation")
print("-" * 50)
test_answer = val_data[1]["answers"]["text"][0]
print(f"Answer: {test_answer}")
result = generate_output(f"generate question: context: {val_data[1]['context']} answer: {test_answer}")
print(f"Generated Question: {result}")

# Test 3: MCQ
print("\n\nTest 3: MCQ Generation")
print("-" * 50)
test_q = val_data[2]["question"]
print(f"Question: {test_q}")
result = generate_output(f"generate mcq: question: {test_q} context: {val_data[2]['context']}")
print(f"Generated MCQ: {result}")

print("\n" + "="*70)
print("ALL DONE!")
print("="*70)
print("\nDownload Instructions:")
print("1. Click folder icon (left sidebar)")
print("2. Right-click 'final_notes_quiz_model'")
print("3. Download")
print("\nYour model is ready for the Streamlit app!")


TESTING MODEL

Test 1: Summarization
--------------------------------------------------
Input: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015...
Output: super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title.


Test 2: Question Generation
--------------------------------------------------
Answer: Carolina Panthers
Generated Question: What did Denver Broncos beat Denver Broncos?


Test 3: MCQ Generation
--------------------------------------------------
Question: Where did Super Bowl 50 take place?
Generated MCQ: options: The American football game to determine the champion of the National Football League (NFL) for the 2015 season

ALL DONE!

Download Instruction