In [1]:
# Install necessary libraries from Hugging Face and PyTorch
!pip install transformers datasets accelerate bitsandbytes torch pypdf



In [2]:
# Log in to your Hugging Face account.
# notebook_login() renders an interactive login widget in the cell output.
# NOTE(review): presumably needed because the teacher checkpoint requires an
# authenticated download from the Hub — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
from datasets import load_dataset

# Hub id of the teacher model: its tokenizer drives the chunking and the model
# itself generates the Q&A data.
TEACHER_MODEL = "google/gemma-3-4b-it"
# Hub id of the student model (the same checkpoint the final preprocessing cell
# tokenizes for).
STUDENT_MODEL = "distilbert-base-uncased"

In [4]:
# PDF Loader
from pypdf import PdfReader

def extract_text_from_pdf(pdf_path):
  """Return the text of every page of a PDF as one string.

  Args:
    pdf_path: Location of the PDF; passed unchanged to ``PdfReader``.

  Returns:
    The extracted text of each page, in page order, with a newline appended
    after every page. A page that yields no text contributes only its newline.
  """
  reader = PdfReader(pdf_path)
  # Guard against a page yielding None/"" (e.g. an image-only page) so the
  # concatenation cannot fail on `None + str`.
  # A single join also avoids re-copying a growing string on every page.
  return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

# Read the whole book into a single string; the path is relative to the
# notebook's current working directory.
full_text = extract_text_from_pdf("./book.pdf")

In [5]:
# Text Chunking

from transformers import AutoTokenizer



# Tokenizer of the teacher model, so chunk sizes are counted in the teacher's tokens.
# NOTE: the final preprocessing cell rebinds the name `tokenizer` to the student's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(TEACHER_MODEL)

# Maximum number of tokens per chunk.
CHUNK_SIZE = 10000
# Number of tokens shared between the end of one chunk and the start of the next.
CHUNK_OVERLAP = 500

def chunk_text_by_tokens(text, tokenizer, max_tokens, overlap):
    """
    Splits text into chunks of a maximum token size with overlap.

    Args:
        text: The full text to split.
        tokenizer: Object that can be called on a string (returning a mapping
            with an 'input_ids' sequence) and that provides ``decode``.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens repeated at the start of the following
            chunk; must satisfy ``0 <= overlap < max_tokens``.

    Returns:
        The decoded chunks in document order. Empty text yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` would make the sliding
            window stall or move backwards.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got overlap={overlap}, max_tokens={max_tokens}"
        )

    # Plain token ids are all that slicing and decoding need; no tensor round-trip.
    token_ids = tokenizer(text, add_special_tokens=False)['input_ids']
    total_tokens = len(token_ids)
    stride = max_tokens - overlap

    chunks = []
    for start in range(0, total_tokens, stride):
        window = token_ids[start : start + max_tokens]
        chunks.append(tokenizer.decode(window, skip_special_tokens=True))

        # This window already reaches the end of the text; another step would
        # only emit a tail that is fully contained in the chunk just added.
        if start + max_tokens >= total_tokens:
            break

    return chunks

# Split the book into overlapping, token-bounded chunks (one teacher prompt per chunk).
text_chunks = chunk_text_by_tokens(full_text, tokenizer, CHUNK_SIZE, CHUNK_OVERLAP)

# Report how many chunks (and therefore teacher calls) the book produced.
print(f"Total text divided into {len(text_chunks)} chunks of ~{CHUNK_SIZE} tokens each.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Total text divided into 4 chunks of ~10000 tokens each.


In [6]:
import json
import re
from typing import List, Dict, Any

def parse_gemma_json_output(gemma_response: str) -> List[Dict[str, str]]:
    """Extract the list stored under ``"qa_pairs"`` from a raw model response.

    The response may wrap the JSON in prose or markdown fences. Each ``{`` in
    the response is tried as the start of a JSON value, and the first decoded
    object that has a ``"qa_pairs"`` key wins. Text after the object (even
    text containing braces) is ignored.

    Args:
        gemma_response: Raw text produced by the model.

    Returns:
        The dict entries of the ``"qa_pairs"`` array. An empty list is
        returned when no such object can be decoded or when ``"qa_pairs"`` is
        not an array; a short diagnostic is printed in the failure cases.
    """
    decoder = json.JSONDecoder()
    first_decode_error = None
    saw_brace = False

    for brace in re.finditer(r'\{', gemma_response):
        saw_brace = True
        try:
            # raw_decode parses exactly one JSON value starting at this index
            # and tolerates arbitrary trailing text.
            candidate, _ = decoder.raw_decode(gemma_response, brace.start())
        except json.JSONDecodeError as e:
            if first_decode_error is None:
                first_decode_error = e
            continue

        if isinstance(candidate, dict) and 'qa_pairs' in candidate:
            qa_pairs = candidate['qa_pairs']
            if not isinstance(qa_pairs, list):
                print("Error: 'qa_pairs' is not a JSON array.")
                return []
            # Callers extend a list of records with the result, so drop
            # anything that is not a JSON object.
            return [pair for pair in qa_pairs if isinstance(pair, dict)]

    if not saw_brace:
        print("Error: Could not find a valid JSON structure in the response.")
    elif first_decode_error is not None:
        print(f"Error decoding JSON: {first_decode_error}")
    return []


In [7]:
from transformers import AutoTokenizer, Gemma3ForConditionalGeneration, BitsAndBytesConfig

# Load the teacher's weights in 8-bit through bitsandbytes.
# NOTE(review): the saved outputs of later cells show
# "CUDA error: device-side assert triggered"; presumably related to this load
# configuration (dtype/quantization) or to the prompt size — confirm by
# re-running on a fresh runtime with CUDA_LAUNCH_BLOCKING=1.
quant_config = BitsAndBytesConfig(
    load_in_8bit=True
)

# A separate tokenizer handle for generation; both objects are read as globals
# by gemma_inference_call.
gemma_tokenizer = AutoTokenizer.from_pretrained(TEACHER_MODEL)
gemma_model = Gemma3ForConditionalGeneration.from_pretrained(
    TEACHER_MODEL,
    device_map="auto",  # let the loader decide device placement
    quantization_config=quant_config
).eval() # Set to evaluation mode

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
def gemma_inference_call(
    full_prompt_text: str,
    max_new_tokens: int = 1500,
    temperature: float = 0.8,
) -> str:
    """Send one user message to the teacher model and return its reply text.

    Uses the module-level ``gemma_tokenizer`` and ``gemma_model``.

    Args:
        full_prompt_text: Complete prompt (instructions plus source text) sent
            as a single user turn.
        max_new_tokens: Upper bound on generated tokens. The default of 1500
            was estimated for about 10 Q&A pairs plus reasoning.
        temperature: Sampling temperature; the fairly high default favours
            diverse questions.

    Returns:
        The decoded continuation only (prompt tokens excluded), stripped of
        surrounding whitespace.
    """
    # Wrap the prompt in the chat message format expected by the template.
    messages = [
        {"role": "user", "content": full_prompt_text}
    ]

    # return_dict=True yields input_ids together with the attention mask, so
    # generate() does not have to guess which positions are padding.
    inputs = gemma_tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(gemma_model.device)

    # eos_token_id is deliberately NOT overridden: passing only the tokenizer's
    # EOS id would replace the stop tokens from the model's generation config.
    # NOTE(review): for Gemma chat checkpoints that config is expected to also
    # list the end-of-turn token — verify against the checkpoint's generation_config.
    with torch.inference_mode():
        output_ids = gemma_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    decoded_output = gemma_tokenizer.decode(
        output_ids[0][prompt_length:],
        skip_special_tokens=True
    )

    return decoded_output.strip()

In [14]:
# Clear Torch Cache
def clear_gpu_memory():
    """Release cached GPU memory held by PyTorch, if a CUDA device is available.

    Garbage collection runs BEFORE the cache is emptied: empty_cache() can only
    hand back blocks that no live tensor occupies, so unreachable tensors
    (e.g. those kept alive by reference cycles) have to be collected first.
    Prints a short status message in every case and returns None.
    """
    import gc

    if not torch.cuda.is_available():
        print("CUDA is not available. No GPU memory to clear.")
        return

    print("Clearing CUDA memory cache...")
    gc.collect()
    torch.cuda.empty_cache()
    print("CUDA memory cleared.")

# Call the function:
# NOTE(review): the saved output shows this call itself failing with a CUDA
# device-side assert; presumably the CUDA context was already in an error state
# from an earlier cell, in which case a runtime restart is needed — confirm.
clear_gpu_memory()

Clearing CUDA memory cache...


AcceleratorError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [15]:
# Teacher Inference: Dataset Generation

# This will hold the FINAL, combined, structured dataset
# Number of Q&A pairs requested per chunk (the generation budget of
# gemma_inference_call was estimated for about this many).
QA_PAIRS_PER_CHUNK = 10

# Instructions prepended to every chunk. The field names match what the parser
# ('qa_pairs') and the preprocessing cell ('context_text', 'question',
# 'answer', 'thinking_process') read.
QA_PROMPT_INSTRUCTIONS = (
    "You are generating training data for an extractive question-answering model.\n"
    f"Read the TEXT below and write {QA_PAIRS_PER_CHUNK} question/answer pairs about it.\n"
    "Respond with ONLY a JSON object of this exact shape, with no text before or after it:\n"
    '{"qa_pairs": [{"context_text": "...", "question": "...", "answer": "...", "thinking_process": "..."}]}\n'
    "Rules:\n"
    "- context_text: a short passage copied verbatim from the TEXT that contains the answer.\n"
    "- question: a question that can be answered from context_text alone.\n"
    "- answer: a short span copied verbatim, character for character, from context_text.\n"
    "- thinking_process: one or two sentences explaining how the answer follows from context_text.\n"
)


def build_qa_prompt(chunk: str) -> str:
    """Return the full teacher prompt: the Q&A instructions followed by the chunk."""
    return f"{QA_PROMPT_INSTRUCTIONS}\nTEXT:\n{chunk}"


# This will hold the FINAL, combined, structured dataset
gemma_parsed_list = []

for i, chunk in enumerate(text_chunks):
    # Send instructions + chunk; the bare chunk alone gives the model no reason
    # to answer with the JSON structure the parser looks for.
    gemma_raw_response = gemma_inference_call(build_qa_prompt(chunk))
    qa_pairs_for_chunk = parse_gemma_json_output(gemma_raw_response)
    gemma_parsed_list.extend(qa_pairs_for_chunk)

    print(f"Processed Chunk {i+1}. Added {len(qa_pairs_for_chunk)} Q&A pairs.")


AcceleratorError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Final preprocessing

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Student checkpoint, taken from the notebook-level constant so the model id is
# defined in one place only.
MODEL_CHECKPOINT = STUDENT_MODEL # "distilbert-base-uncased"; Or "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# 'gemma_parsed_list' is the list of dictionaries produced by the teacher-inference cell.
# Example structure: [{'id': '001', 'context_text': '...', 'question': '...', 'answer': '...', 'thinking_process': '...'}, ...]
# It is intentionally NOT reassigned here: overwriting it with a placeholder
# would discard the generated data.
REQUIRED_COLUMNS = ["context_text", "question", "answer", "thinking_process"]

if not gemma_parsed_list:
    raise ValueError(
        "gemma_parsed_list is empty: run the teacher-inference cell (or load saved Q&A pairs) first."
    )

# 1. Convert to DataFrame and add Enriched Context
df = pd.DataFrame(gemma_parsed_list)

missing_columns = [column for column in REQUIRED_COLUMNS if column not in df.columns]
if missing_columns:
    raise KeyError(f"Generated Q&A records are missing required fields: {missing_columns}")

# Drop records in which the teacher omitted a required field, and rebuild the
# index so no stray index column ends up in the Dataset.
df = df.dropna(subset=REQUIRED_COLUMNS).reset_index(drop=True)

# Augment context by injecting Gemma's reasoning (the distilled knowledge)
df['enriched_context'] = (
    df['context_text'].astype(str) + " [REASONING] " + df['thinking_process'].astype(str)
)

# Convert to Hugging Face Dataset format
raw_datasets = Dataset.from_pandas(df)

# 2. Define the Preprocessing Function
def preprocess_training_examples(examples):
    """Tokenize (question, enriched context) pairs and attach answer-span labels.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with equally long lists under "question",
            "enriched_context" and "answer".

    Returns:
        The tokenizer output (offset mapping included) with two added lists,
        "start_positions" and "end_positions": the token indices of the first
        and last token of the answer. An example is labelled (0, 0) when the
        answer is empty, is not found verbatim in the context, or does not lie
        completely inside the part of the context that survived truncation.
    """
    # Tokenize the question and the ENRICHED context
    tokenized_examples = tokenizer(
        examples["question"],
        examples["enriched_context"],
        truncation="only_second", # Only truncate the context if needed
        max_length=512, # DistilBERT's max length
        return_offsets_mapping=True, # CRITICAL: Needed to map answer spans
        padding="max_length"
    )

    start_positions = []
    end_positions = []

    for i, (context, answer) in enumerate(zip(examples["enriched_context"], examples["answer"])):
        offsets = tokenized_examples["offset_mapping"][i]
        # Sequence ids distinguish question tokens (0) from context tokens (1);
        # special and padding tokens belong to neither.
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Character span of the answer in the context (-1 when absent or empty).
        start_char = context.find(answer) if answer else -1
        end_char = start_char + len(answer) if answer else -1

        context_token_indices = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]

        if start_char == -1 or not context_token_indices:
            start_positions.append(0)
            end_positions.append(0)
            continue

        context_start = context_token_indices[0]
        context_end = context_token_indices[-1]

        # The answer must lie entirely within the (possibly truncated) context
        # window; otherwise there is no valid span to point at.
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward over CONTEXT tokens only. Special/padding tokens carry
        # offset (0, 0) and would otherwise keep satisfying the comparison.
        idx = context_start
        while idx <= context_end and offsets[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # Walk backward over context tokens to the last token of the answer.
        idx = context_end
        while idx >= context_start and offsets[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    # Add the labels to the tokenized output
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

# 3. Apply the preprocessing function to the entire dataset
tokenized_datasets = raw_datasets.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets.column_names,
    # You may add `num_proc=X` for faster processing if you have multiple CPU cores
)

# Fixed seed so the train/eval split is identical on every Restart & Run All.
SPLIT_SEED = 42

# 4. Split the data into Training and Evaluation sets
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_set = tokenized_datasets['train']
eval_set = tokenized_datasets['test']
print(f"✅ Training set size: {len(train_set)} samples")
print(f"✅ Evaluation set size: {len(eval_set)} samples")