In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import re
from joblib import Parallel, delayed
import json
import random
from sentence_transformers import SentenceTransformer, util
from torch.utils.data import Dataset, DataLoader
import math
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import logging

# Hide the tokenizer warning
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

Context window size: 1024


# Base Model

In [3]:
model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [4]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device);

context_window_size = model.config.n_ctx
print(f'Context window size: {context_window_size}')

Context window size: 1024


# RAG

## Chunk Overlap
Instead of splitting the text into disjointed pieces, you make each new chunk start a sentence or two before the previous one ended.

- Original Context: [S1, S2, S3, S4, S5, S6]
- Chunking (no overlap):
    - Chunk 1: [S1, S2, S3]
    - Chunk 2: [S4, S5, S6]
- Chunking (with an overlap of 1 sentence):
    - Chunk 1: [S1, S2, S3]
    - Chunk 2: [S3, S4, S5]
    - Chunk 3: [S5, S6]
Why this helps: Let's say S2 introduces "Judi Dench" and S3 uses "she". With overlap, if a user's query matches the content of S4, you retrieve Chunk 2, which is [S3, S4, S5]. Even though S2 isn't there, the overlapping sentence S3 might still contain enough context to resolve the pronoun "she" used within it. It significantly increases the probability that a chunk is self-contained enough for the model to understand.



## Other, More Advanced Options:
Sentence Windowing: During retrieval, after you find the single best chunk, you also grab the chunk immediately before and after it to create a larger, more context-rich passage for the LLM.
Coreference Resolution: The most advanced technique is to use a separate NLP model to perform coreference resolution before chunking. This would process the text and replace pronouns like "she" with the noun they refer to ("Judi Dench"). This is powerful but adds a lot of complexity.

In [1]:
def split_context(context, min_context_length=20):
    cleanen_context = re.split(r'(?<=[.?!])\s+', context)
    cleanen_context = [x.strip() for x in cleanen_context if len(x) > min_context_length]
    return cleanen_context

def chunker(sentences, chunk_size=768, overlap_sentences=1):
    chunks = []
    current_chunk_sentences = []
    current_length = 0

    for sentence in sentences:
        # Use add_special_tokens=False to get the pure token length of the sentence
        tokenized_sentence = tokenizer.encode(sentence, add_special_tokens=False)
        tokenized_length = len(tokenized_sentence)

        if tokenized_length > chunk_size:
            # If there's a pending chunk, finalize it first
            if current_chunk_sentences:
                chunks.append(" ".join(current_chunk_sentences))
                current_chunk_sentences = []
                current_length = 0
            
            # Split the oversized sentence's tokens into smaller chunks
            for i in range(0, tokenized_length, chunk_size):
                sub_chunk_tokens = tokenized_sentence[i : i + chunk_size]
                # Decode the sub-chunk back to a string
                chunks.append(tokenizer.decode(sub_chunk_tokens))
            
            # Continue to the next sentence
            continue

        # If adding the next sentence overflows the chunk, finalize the current one
        if current_length + tokenized_length > chunk_size:
            chunks.append(" ".join(current_chunk_sentences))
            
            # Start new chunk with overlap
            if overlap_sentences > 0 and len(current_chunk_sentences) >= overlap_sentences:
                current_chunk_sentences = current_chunk_sentences[-overlap_sentences:]
            else:
                current_chunk_sentences = []
            
            # Recalculate length of the new chunk (with overlap)
            current_length = 0
            for s in current_chunk_sentences:
                current_length += len(tokenizer.encode(s, add_special_tokens=False))

        # Add the sentence to the current chunk
        current_chunk_sentences.append(sentence)
        current_length += tokenized_length

    # Add the last remaining chunk if it exists
    if current_chunk_sentences:
        chunks.append(" ".join(current_chunk_sentences))

    return chunks
    

def process_single_context(context, chunk_size, overlap_sentences):
    """Takes a single context string and returns a list of its chunks."""
    if not isinstance(context, str):
        return [] # Return an empty list for non-string inputs
    sentences = split_context(context)
    chunks = chunker(sentences, chunk_size=chunk_size, overlap_sentences=overlap_sentences)
    return chunks

# Chunking Data

In [2]:
INPUT_CSV_PATH = './Data/movie_trivia_qa.csv'
OUTPUT_JSONL_PATH = './Data/processed_chunks_with_qa.jsonl'
CHUNK_SIZE_FOR_FUNC = 768
OVERLAP_SENTENCES_FOR_FUNC = 1
BATCH_SIZE = 500

# --- Streaming Pipeline (Corrected to include Q&A) ---

# Create an iterator that reads the CSV in batches
csv_iterator = pd.read_csv(INPUT_CSV_PATH, chunksize=BATCH_SIZE)

with open(OUTPUT_JSONL_PATH, 'w') as outfile:
    print("Starting streaming processing to create complete training data source...")
    
    for i, df_batch in enumerate(csv_iterator):
        print(f"  - Processing batch {i+1}...")

        # Keep track of all relevant columns from the batch
        contexts_batch = df_batch['context'].tolist()
        questions_batch = df_batch['question'].tolist()
        answers_batch = df_batch['answer'].tolist()

        # Process only the contexts in parallel to get the chunks
        batch_chunk_results = Parallel(n_jobs=-1)(
            delayed(process_single_context)(
                context,
                chunk_size=CHUNK_SIZE_FOR_FUNC,
                overlap_sentences=OVERLAP_SENTENCES_FOR_FUNC
            ) for context in contexts_batch
        )

        # Zip everything together and write to the JSONL file
        for question, answer, chunks in zip(questions_batch, answers_batch, batch_chunk_results):
            output_record = {
                'question': question,
                'answer': answer,
                'chunks': chunks
            }
            outfile.write(json.dumps(output_record) + '\n')

print(f"Processing complete. Full data saved to {OUTPUT_JSONL_PATH}")

NameError: name 'pd' is not defined

In [None]:


# --- Configuration ---
INPUT_JSONL_PATH = './Data/processed_chunks_with_qa.jsonl'
TRAIN_OUTPUT_PATH = './Data/training_data.jsonl'
VAL_OUTPUT_PATH = './Data/validation_data.jsonl'
RETRIEVER_MODEL_NAME = 'all-MiniLM-L6-v2'
VALIDATION_SPLIT_RATIO = 0.1 
random.seed(42)

# --- Load the Retriever Model ---
print(f"Loading retriever model: {RETRIEVER_MODEL_NAME}...")
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
retriever_model = SentenceTransformer(RETRIEVER_MODEL_NAME, device=device)
print(f"Model loaded on device: {device}")


def find_best_chunk_for_record(record, model):
    question = record.get('question')
    chunks = record.get('chunks')
    
    if not chunks or not question:
        return None

    # Encode question and chunks into vector embeddings
    question_embedding = model.encode(question, convert_to_tensor=True)
    chunk_embeddings = model.encode(chunks, convert_to_tensor=True)

    # Compute cosine similarity
    cosine_scores = util.cos_sim(question_embedding, chunk_embeddings)

    # Find the chunk with the highest score
    best_chunk_index = cosine_scores.argmax()
    best_chunk = chunks[best_chunk_index]

    return {
        'question': question,
        'answer': record.get('answer'),
        'context': best_chunk
    }

# --- Main Streaming, Processing, and Splitting Loop ---
# This will read the large input file once and write to two output files.
with open(INPUT_JSONL_PATH, 'r') as infile, \
     open(TRAIN_OUTPUT_PATH, 'w') as train_outfile, \
     open(VAL_OUTPUT_PATH, 'w') as val_outfile:

    print("Starting retrieval and splitting into training and validation sets...")

    for i, line in enumerate(infile):
        # Give a progress update every 5000 records
        if (i + 1) % 5000 == 0:
            print(f"  - Processed {i+1} records...")

        # 1. Load the record
        record = json.loads(line)
        
        # 2. Find the best chunk for this record
        final_record = find_best_chunk_for_record(record, retriever_model)

        # 3. Write the final record to the appropriate file
        if final_record:
            # Use a random roll to decide if this goes to validation or training
            if random.random() < VALIDATION_SPLIT_RATIO:
                val_outfile.write(json.dumps(final_record) + '\n')
            else:
                train_outfile.write(json.dumps(final_record) + '\n')

print("---")
print("Processing complete!")
print(f"Final training data saved to: {TRAIN_OUTPUT_PATH}")
print(f"Final validation data saved to: {VAL_OUTPUT_PATH}")

Loading retriever model: all-MiniLM-L6-v2...
Model loaded on device: mps
Starting retrieval and splitting into training and validation sets...
  - Processed 5000 records...
  - Processed 10000 records...
  - Processed 15000 records...
  - Processed 20000 records...
  - Processed 25000 records...
  - Processed 30000 records...
  - Processed 35000 records...
  - Processed 40000 records...
  - Processed 45000 records...
  - Processed 50000 records...
  - Processed 55000 records...
  - Processed 60000 records...
  - Processed 65000 records...
  - Processed 70000 records...
  - Processed 75000 records...
  - Processed 80000 records...
  - Processed 85000 records...
  - Processed 90000 records...
  - Processed 95000 records...
  - Processed 100000 records...
---
Processing complete!
Final training data saved to: ./Data/training_data.jsonl
Final validation data saved to: ./Data/validation_data.jsonl


# Dataset

## The Two Main Training Strategies
There are two main ways to train a model like this:

- "Retrieve-then-Train" (The Simpler, Pragmatic Approach):

  - Process: This is the "Step 2" I proposed. We do the similarity search offline, one time, as part of building the final training set. For each question in our training data, we find the single "best" chunk from its context. We then create a clean, static dataset where every entry is a perfect (question, best_chunk, answer) triplet.
  - Training: The model is then trained on these perfect examples. It learns one primary skill: "Given a question and a highly relevant piece of context, extract or generate the answer."
  - Inference: When a user asks a new question, you perform a live similarity search against your entire 13GB database of chunks to find the best one, and feed that to the model.

- "Joint" or "End-to-End" Training (The Advanced RAG Approach):
  - Process: This is a far more complex setup. During the training loop itself, for each question, you would dynamically perform a similarity search. You would retrieve not only the "positive" (correct) chunk but also several "hard negative" chunks (ones that are similar to the question but don't contain the answer).
  - Training: The model is trained on this complex input (question, positive_chunk, negative_chunk_1, negative_chunk_2, ..., answer). It has to learn two skills simultaneously: 1) To pay more attention to the positive chunk and ignore the negatives, and 2) To generate the answer from the correct context. This trains both the "retrieval" and "generation" aspects of the model.

In [9]:
class TriviaQADataset(Dataset):
    def __init__(self, file_path: str, tokenizer: AutoTokenizer, max_length: int = 1024):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.records = []

        # Load the data from the JSONL file
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                self.records.append(json.loads(line))

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        record = self.records[idx]
        context = record.get('context', '')
        question = record.get('question', '')
        answer = record.get('answer', '')

        input_text = f"Context: {context}\n\nQuestion: {question}\n\nAnswer: {answer}"

        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        # For language modeling, the model predicts the next token, so labels are the input_ids.
        # Squeeze to remove the batch dimension, as DataLoader will add it back.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        
        
        labels = input_ids.clone()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [13]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


train_dataset = TriviaQADataset(
    file_path='./Data/training_data.jsonl', 
    tokenizer=tokenizer
)

validation_dataset = TriviaQADataset(
    file_path='./Data/validation_data.jsonl', 
    tokenizer=tokenizer
)

# LoRA

In [14]:
class LoRALayer(torch.nn.Module):
    def __init__(self, in_features, out_features, rank=16, alpha=16):
        super().__init__()
        self.rank = rank
        self.alpha = alpha
        self.scaling = self.alpha / self.rank

        self.lora_A = torch.nn.Parameter(torch.zeros(rank, in_features))
        self.lora_B = torch.nn.Parameter(torch.zeros(out_features, rank))
        
        # Initialize weights
        torch.nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))


    def forward(self, x):
        lora_A_T = self.lora_A.T.to(dtype=x.dtype)
        lora_B_T = self.lora_B.T.to(dtype=x.dtype)
        
        return (x @ lora_A_T @ lora_B_T) * self.scaling

In [15]:
def add_lora_to_model(model, rank=16, alpha=16, device='mps'):

    for param in model.parameters():
        param.requires_grad = False  # Freeze original model parameters

    linear_layers = []
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            linear_layers.append((name, module))

    for name, module in linear_layers:
            
            lora_layer = LoRALayer(
                in_features=module.in_features,
                out_features=module.out_features,
                rank=rank,
                alpha=alpha
            ).to(device)
            
            # Register the LoRA layer as a submodule of the linear layer
            setattr(module, 'lora_layer', lora_layer)
            
            # Store original forward method
            original_forward = module.forward
            
            # Define new forward method
            def create_new_forward(original_fwd, lora_l):
                def new_forward(x):
                    return original_fwd(x) + lora_l(x) 
                return new_forward
            
            # Replace the forward method
            module.forward = create_new_forward(original_forward, lora_layer)
    return model

# Training

In [16]:
lora_model = add_lora_to_model(model, device=device)


lora_params = []
for name, param in lora_model.named_parameters():
    if param.requires_grad:
        lora_params.append(param)
        print(f"Training parameter: {name}")

optimizer = torch.optim.AdamW(lora_params, lr=1e-4)

total_params = sum(p.numel() for p in lora_model.parameters())
trainable_params = sum(p.numel() for p in lora_params)

print(f"\nTotal parameters in model: {total_params}")
print(f"Trainable LoRA parameters: {trainable_params}")
print(f"Percentage of trainable parameters: {100 * trainable_params / total_params:.4f}%")

Training parameter: lm_head.lora_layer.lora_A
Training parameter: lm_head.lora_layer.lora_B

Total parameters in model: 125256208
Trainable LoRA parameters: 816400
Percentage of trainable parameters: 0.6518%


In [17]:
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=4)

In [36]:
NUM_EPOCHS = 3
lora_model.to(device)

for epoch in range(NUM_EPOCHS):
    print(f"--- Epoch {epoch + 1}/{NUM_EPOCHS} ---")
    
    # --- Training ---
    lora_model.train()
    total_train_loss = 0
    for batch in tqdm(train_dataloader, desc="Training"):
        optimizer.zero_grad()
        
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = lora_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
        
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} | Average Training Loss: {avg_train_loss:.4f}")

    # --- Validation ---
    lora_model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(validation_dataloader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = lora_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()
            
    avg_val_loss = total_val_loss / len(validation_dataloader)
    print(f"Epoch {epoch+1} | Average Validation Loss: {avg_val_loss:.4f}")

--- Epoch 1/3 ---


Training:   0%|          | 0/22707 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Training:   0%|          | 58/22707 [01:25<9:15:35,  1.47s/it] 


KeyboardInterrupt: 

In [18]:
def generate_answer(model, tokenizer, question, context, device):
    """
    Generates an answer given a question and a retrieved context.
    """
    # Format the input for the model
    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
    
    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    print(f"--- Generating answer for: '{question}' ---")
    
    # Generate text using the model
    output_sequences = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=50,  # Limit the length of the generated answer
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )
    
    # Decode the generated sequence
    generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    
    # Extract just the answer part
    # Handle cases where "Answer:" might not be present or is at the end
    if "Answer:" in generated_text:
        answer = generated_text.split("Answer:")[1].strip()
    else:
        answer = generated_text.strip() # Fallback if "Answer:" isn't found
    
    print(f"Generated Answer: {answer}\n")
    return answer

In [23]:
def evaluate_on_validation_data(model, tokenizer, validation_file_path, device, num_samples=10):
    """
    Loads validation data, generates answers, and compares them to the actual answers.
    """
    print(f"\n--- Starting Evaluation on {num_samples} Samples from Validation Set ---")
    
    try:
        with open(validation_file_path, 'r') as f:
            for i, line in enumerate(f):
                if i >= num_samples:
                    break

                record = json.loads(line)
                question = record.get('question', '')
                context = record.get('context', '')
                actual_answer = record.get('answer', '')

                # Format the input for the model
                prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
                
                # Tokenize the input
                inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True) # Truncate long contexts


                print(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True))


                input_ids = inputs.input_ids.to(device)
                attention_mask = inputs.attention_mask.to(device)
                
                # Generate text using the model
                output_sequences = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=50,
                    no_repeat_ngram_size=2,
                    pad_token_id=tokenizer.eos_token_id,
                    temperature=0.1, # Lower temperature for more deterministic, factual answers
                    top_k=40,
                )
                
                # Decode the generated sequence
                generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
                
                # Extract just the answer part
                if "Answer:" in generated_text:
                    generated_answer = generated_text.split("Answer:")[1].strip()
                else:
                    generated_answer = "Could not parse answer from model output."

                # Print the comparison
                print("-" * 40)
                print(f"Sample #{i+1}")
                print(f"Question: {question}")
                print(f"Actual Answer:   {actual_answer}")
                print(f"Generated Answer: {generated_answer}")
                print("-" * 40 + "\n")

    except FileNotFoundError:
        print(f"Error: Validation file not found at '{validation_file_path}'.")
        print("Please ensure the path is correct.")
    except Exception as e:
        print(f"An error occurred during evaluation: {e}")


# --- Main Execution ---

# Configuration
MODEL_NAME = 'gpt2'
LORA_WEIGHTS_PATH = 'lora_gpt2_triviaqa_weights.pth'
VALIDATION_DATA_PATH = './Data/validation_data.jsonl'
NUM_SAMPLES_TO_EVALUATE = 10 # Adjust as needed

# 1. Load Model and Tokenizer
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

print(f"Loading base model '{MODEL_NAME}' and tokenizer on device: {device}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token 

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.to(device)
print("Base model loaded.")

# 2. Apply LoRA structure to the model
lora_rank = 16
lora_alpha = 16
print(f"Applying LoRA structure to the model with rank={lora_rank}, alpha={lora_alpha}...")
model = add_lora_to_model(model, rank=lora_rank, alpha=lora_alpha, device=device)
print("LoRA structure applied.")

# 3. Load the fine-tuned LoRA weights
print(f"Loading LoRA weights from '{LORA_WEIGHTS_PATH}'...")

lora_state_dict = torch.load(LORA_WEIGHTS_PATH, map_location=device)
model.load_state_dict(lora_state_dict, strict=False)
print("LoRA weights loaded successfully.")


# Set the model to evaluation mode
model.eval()

# 4. Run evaluation
evaluate_on_validation_data(
    model=model,
    tokenizer=tokenizer,
    validation_file_path=VALIDATION_DATA_PATH,
    device=device,
    num_samples=NUM_SAMPLES_TO_EVALUATE
)

Loading base model 'gpt2' and tokenizer on device: mps...
Base model loaded.
Applying LoRA structure to the model with rank=16, alpha=16...
LoRA structure applied.
Loading LoRA weights from 'lora_gpt2_triviaqa_weights.pth'...
LoRA weights loaded successfully.

--- Starting Evaluation on 10 Samples from Validation Set ---
Context: One of the characteristics of the Cabindan independence movement is its constant fragmentation, into smaller and smaller factions. Economy

Angola has a rich subsoil heritage, from diamonds, oil, gold, copper, and a rich wildlife (dramatically impoverished during the civil war), forest, and fossils. Since independence, oil and diamonds have been the most important economic resource. Smallholder and plantation agriculture have dramatically dropped because of the Angolan Civil War, but have begun to recover after 2002. The transformation industry that had come into existence in the late colonial period collapsed at independence, because of the exodus of most of 

  lora_state_dict = torch.load(LORA_WEIGHTS_PATH, map_location=device)


----------------------------------------
Sample #1
Question: From which country did Angola achieve independence in 1975?
Actual Answer:   Portugal
Generated Answer: Angola
----------------------------------------

Context: Joan Alexandra Molinsky  (June 8, 1933 – September 4, 2014), better known as Joan Rivers, was an American comedian, actress, writer, producer, and television host noted for her often controversial comedic persona—where she was alternately self-deprecating or sharply acerbic, especially toward celebrities and politicians. Rivers came to prominence in 1965 as a guest on The Tonight Show. Hosted by her mentor, Johnny Carson, the show established Rivers' comedic style. In 1986, with her own rival program, The Late Show with Joan Rivers, Rivers became the first woman to host a late night network television talk show. She subsequently hosted The Joan Rivers Show (1989–1993), winning a Daytime Emmy for Outstanding Talk Show Host. Having become widely known for her comedic r