Section 1: Install & Import Libraries

In [86]:
!pip install kagglehub datasets transformers accelerate sentencepiece faiss-cpu sentence-transformers pypdf2 tqdm



import os
import json
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from PyPDF2 import PdfReader




Section 2: Download the Dataset

In [87]:
import kagglehub

path = kagglehub.dataset_download("buildformacarov/squad-20")
print("Dataset Path:", path)

train_path = os.path.join(path, "train-v2.0.json")
dev_path   = os.path.join(path, "dev-v2.0.json")

with open(train_path, "r") as f:
    train_data = json.load(f)

with open(dev_path, "r") as f:
    dev_data = json.load(f)


Using Colab cache for faster access to the 'squad-20' dataset.
Dataset Path: /kaggle/input/squad-20


Section 4: Extract Context–Question pairs

In [89]:
def extract_pairs(data):
    contexts = []
    questions = []
    for article in data["data"]:
        for p in article["paragraphs"]:
            context = p["context"]
            for qa in p["qas"]:
                if qa["is_impossible"] == False:
                    contexts.append(context)
                    questions.append(qa["question"])
    return contexts, questions

train_contexts, train_questions = extract_pairs(train_data)
dev_contexts, dev_questions     = extract_pairs(dev_data)

print("Train samples:", len(train_contexts))
subset = 8000
train_contexts = train_contexts[:subset]
train_questions = train_questions[:subset]


Train samples: 86821


Section 5: Load T5 Model

In [90]:
model_name = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model     = T5ForConditionalGeneration.from_pretrained(model_name)
model.to("cuda")


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

Section 6: Dataset and Encoding

In [91]:
class QGDataset(Dataset):
    def __init__(self, contexts, questions, tokenizer, max_len=256):
        self.contexts = contexts
        self.questions = questions
        self.tokenizer = tokenizer
        self.max_len  = max_len

    def __len__(self):
        return len(self.contexts)

    def __getitem__(self, idx):
        prompt = "generate question: " + self.contexts[idx]

        inputs = self.tokenizer(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        labels = self.tokenizer(
            self.questions[idx],
            truncation=True,
            padding="max_length",
            max_length=64,
            return_tensors="pt"
        )["input_ids"]

        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": labels.squeeze()
        }

train_ds = QGDataset(train_contexts, train_questions, tokenizer)
dev_ds   = QGDataset(dev_contexts,   dev_questions,   tokenizer)

train_dl = DataLoader(train_ds, batch_size=4, shuffle=True)
dev_dl   = DataLoader(dev_ds, batch_size=4)


SECTION 7 — Train the Model

In [92]:
optimizer = AdamW(model.parameters(), lr=2e-4)

epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0

    pbar = tqdm(train_dl, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in pbar:
        batch = {k: v.to("cuda") for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        pbar.set_postfix({"loss": loss.item()})

    print(f"=== Epoch {epoch+1}/{epochs} | Avg Loss: {total_loss/len(train_dl):.4f} ===")


Epoch 1/3:   0%|          | 0/2000 [00:00<?, ?it/s]

=== Epoch 1/3 | Avg Loss: 2.1068 ===


Epoch 2/3:   0%|          | 0/2000 [00:00<?, ?it/s]

=== Epoch 2/3 | Avg Loss: 1.8005 ===


Epoch 3/3:   0%|          | 0/2000 [00:00<?, ?it/s]

=== Epoch 3/3 | Avg Loss: 1.5961 ===


SECTION 8 — Save Model

In [93]:
model.save_pretrained("qg_t5_model")
tokenizer.save_pretrained("qg_t5_model")


('qg_t5_model/tokenizer_config.json',
 'qg_t5_model/special_tokens_map.json',
 'qg_t5_model/spiece.model',
 'qg_t5_model/added_tokens.json')

SECTION 9 — Question Generation Function (Single Context)

In [95]:
def generate_question(context, max_len=50):
    input_text = "generate question: " + context

    tokens = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    out = model.generate(
        tokens,
        max_length=max_len,
        num_beams=4,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

    return tokenizer.decode(out[0], skip_special_tokens=True)


SECTION 10 — RAG EMBEDDINGS

In [96]:
print("Building embeddings. This will take a few minutes...")

embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(train_contexts, convert_to_numpy=True, show_progress_bar=True)

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print("FAISS index built:", index.ntotal)


Building embeddings. This will take a few minutes...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/250 [00:00<?, ?it/s]

FAISS index built: 8000


SECTION 11 — Retrieval Function

In [103]:
def retrieve_context(query, top_k=3):
    q_emb = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(q_emb, top_k)
    return [train_contexts[i] for i in I[0]]


SECTION 12 — PDF Text Extraction

In [123]:
from PyPDF2 import PdfReader

def extract_pdf_text(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        txt = page.extract_text()
        if txt:
            text += txt + "\n"
    return text


SECTION 13 — Chunk PDF

In [124]:
def chunk_text(text, chunk_size=300):
    """
    Break text into chunks of ~chunk_size words.
    Each chunk should still represent a coherent idea/paragraph.
    """
    chunks = []
    words = text.split()
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i+chunk_size])
        chunks.append(chunk.strip())
    return chunks


Build Embeddings and FAISS Index from PDF only

In [130]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Change this path to the PDF you want to index
#pdf_path = "/content/week1to10.pdf"  # or "/content/week51to54.pdf"

print("Reading and chunking PDF...")
pdf_text = extract_pdf_text(pdf_path)
pdf_chunks = chunk_text(pdf_text, chunk_size=300)

print(f"Total PDF chunks: {len(pdf_chunks)}")

print("Building embeddings from PDF chunks... This will take a bit.")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

pdf_embeddings = embedder.encode(
    pdf_chunks,
    convert_to_numpy=True,
    show_progress_bar=True
)

dimension = pdf_embeddings.shape[1]
pdf_index = faiss.IndexFlatL2(dimension)
pdf_index.add(pdf_embeddings)

print("PDF FAISS index built with", pdf_index.ntotal, "entries.")


Reading and chunking PDF...
Total PDF chunks: 26
Building embeddings from PDF chunks... This will take a bit.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

PDF FAISS index built with 26 entries.


Retrieval from PDF + Question Generation

In [131]:
def retrieve_similar_chunks(chunk, top_k=2):
    """
    Given one chunk of text, find top_k similar chunks from the same PDF.
    """
    q_emb = embedder.encode([chunk], convert_to_numpy=True)
    D, I = pdf_index.search(q_emb, top_k)
    return [pdf_chunks[i] for i in I[0]]


def generate_questions_from_pdf(n_questions=10, top_k=2, questions_per_chunk=1):
    """
    Go through PDF chunks, use RAG to add similar context,
    and generate questions using the fine-tuned T5 model.
    """
    generated = []

    for ch in pdf_chunks:
        # retrieve similar chunks from the same PDF
        similar_chunks = retrieve_similar_chunks(ch, top_k=top_k)
        merged_context = ch + "\n" + "\n".join(similar_chunks)

        # generate multiple questions per chunk if needed
        for _ in range(questions_per_chunk):
            q = generate_question(merged_context)
            generated.append(q)

            if len(generated) >= n_questions:
                return generated

    return generated


Ask User & Print with Serial Numbers

In [132]:
pdf_path = input("Enter the PDF file path: ")
num_q = int(input("How many questions do you want to generate? "))

# Step 1 — Read and chunk PDF
pdf_text = extract_pdf_text(pdf_path)
pdf_chunks = chunk_text(pdf_text, chunk_size=300)

# Step 2 — Build embeddings on PDF chunks
print("Creating PDF embeddings. Please wait...")
pdf_embeddings = embedder.encode(
    pdf_chunks,
    convert_to_numpy=True,
    show_progress_bar=True
)

dimension = pdf_embeddings.shape[1]
pdf_index = faiss.IndexFlatL2(dimension)
pdf_index.add(pdf_embeddings)

# Step 3 — Generate questions
questions = generate_questions_from_pdf(
    n_questions=num_q,
    top_k=2,
    questions_per_chunk=1
)

print("\n===== Generated Questions =====\n")
for i, q in enumerate(questions, start=1):
    print(f"{i}. {q}")


Enter the PDF file path: /content/week11to30.pdf
How many questions do you want to generate? 25
Creating PDF embeddings. Please wait...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


===== Generated Questions =====

1. The CART Algorithm for Classification is an ensemble learning method that combines multiple decision trees to improve what?
2. What is a common method for building robust and reliable classification models?
3. The library provides the class for building decision tree models Key parameters to tune in the tree include what?s?
4. What is a common technique to estimate the model's performance?
5. The tree structure provides a visual representation of the decision - making process , with binary splits at each node The size of the nodes and the proportion of each class label indicate what?
6. What is a common method to estimate the model's performance?
7. What kind of model is used to predict credit risk prediction?
8. What type of model is used for credit risk prediction?
9. What is a method used to estimate the model's performance?
10. Classification and Regression Trees are a tree-like model that recursively partition the data based on what feature?
11