In [79]:
# %pip install peft evaluate transformers Levenshtein ipywidgets
# %pip install protobuf==3.20.3
# !rm -rf /kaggle/working/cache
# !rm -rf /kaggle/working/outputs

In [80]:
# X

import os
os.environ["TRANSFORMERS_DISABLE_CHAT_TEMPLATES"] = "1"
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["TRANSFORMERS_NO_ADDITIONAL_CHAT_TEMPLATES"] = "1"

In [81]:
import random
from datasets import load_dataset, load_from_disk
from transformers import CanineTokenizer
from peft import LoraConfig, TaskType, get_peft_model
import re
import string
from collections import Counter
import numpy as np
import Levenshtein

from transformers import TrainingArguments, Trainer, TrainerCallback
import json
from huggingface_hub import HfApi, notebook_login, whoami

In [82]:
# notebook_login()
# whoami()

In [83]:
from transformers import CanineTokenizer, CanineForQuestionAnswering
import torch
model_name = 'google/canine-s'
device = torch.device("cuda" if torch.cuda.is_available() else "mps")

tokenizer = CanineTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=False)
model = CanineForQuestionAnswering.from_pretrained(model_name, trust_remote_code=False)

Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [84]:
# funtion to  filter out impossible questions
def filter_function(example):
    return not example['is_impossible']

In [85]:
uqa_dataset = load_dataset("uqa/UQA")

# filtering
uqa_dataset_filtered = uqa_dataset.filter(filter_function)

# uqa_train = uqa_dataset_filtered["train"].shuffle(seed=42)
# uqa_val = uqa_dataset_filtered["validation"].shuffle(seed=42)

# now trying the filtered dataset
uqa_train = uqa_dataset_filtered['train'].shuffle(seed=42).select(range(80000))
uqa_val = uqa_dataset_filtered['validation'].shuffle(seed=42).select(range(8000))

print(f"üìä Dataset after filtering:")
print(f"   Original train size: {len(uqa_dataset['train']):,}")
print(f"   Filtered train size: {len(uqa_dataset_filtered['train']):,}")
print(f"   Using for training: {len(uqa_train):,}")
print(f"   Validation size: {len(uqa_val):,}")

üìä Dataset after filtering:
   Original train size: 124,745
   Filtered train size: 83,018
   Using for training: 80,000
   Validation size: 8,000


In [86]:
# Check character-token alignment
ex = uqa_train[444]
context = ex["context"]
context_tokens = tokenizer.encode(ex["context"], add_special_tokens=False)

print(f"Context length (characters): {len(context)}")
print(f"Context length (tokens): {len(context_tokens)}")
print(f"1:1 mapping: {len(context) == len(context_tokens)}")

Context length (characters): 1850
Context length (tokens): 1850
1:1 mapping: True


In [87]:
# Explore raw UQA dataset structure
print("="*80)
print("UQA DATASET STRUCTURE")
print("="*80)
print(f"Training set size: {len(uqa_train):,} examples")
print(f"Validation set size: {len(uqa_val):,} examples")
print(f"\nDataset columns: {uqa_train.column_names}")
print("\n" + "="*80)

# Show a few examples
print("\nüìù EXAMPLE 1 - Question with Answer")
print("="*80)
ex1 = uqa_train[0]
print(f"Question: {ex1['question']}")
print(f"\nContext (first 300 chars): {ex1['context'][:300]}...")
print(f"\nAnswer: '{ex1['answer']}'")
print(f"Answer starts at character position: {ex1['answer_start']}")

# Verify the answer extraction
if ex1['answer_start'] != -1:
    extracted = ex1['context'][ex1['answer_start']:ex1['answer_start']+len(ex1['answer'])]
    print(f"‚úì Extracted from context: '{extracted}'")
    print(f"‚úì Match: {extracted == ex1['answer']}")

print("\n" + "="*80)
print("\nüìù EXAMPLE 2 - Another Question")
print("="*80)
ex2 = uqa_train[100]
print(f"Question: {ex2['question']}")
print(f"\nContext length: {len(ex2['context'])} characters")
print(f"Answer: '{ex2['answer']}'")
print(f"Answer starts at position: {ex2['answer_start']}")

# Show answer in context
if ex2['answer_start'] != -1:
    start = max(0, ex2['answer_start'] - 50)
    end = min(len(ex2['context']), ex2['answer_start'] + len(ex2['answer']) + 50)
    context_snippet = ex2['context'][start:end]
    answer_pos = ex2['answer_start'] - start
    print(f"\nContext around answer:")
    print(f"...{context_snippet}...")
    print(f"    {' '*answer_pos}{'~'*len(ex2['answer'])} (answer here)")

print("\n" + "="*80)
print("\nüìä DATASET STATISTICS")
print("="*80)

# Compute some basic statistics
import numpy as np
question_lengths = [len(ex['question']) for ex in uqa_train.select(range(1000))]
context_lengths = [len(ex['context']) for ex in uqa_train.select(range(1000))]
answer_lengths = [len(ex['answer']) if ex['answer'] else 0 for ex in uqa_train.select(range(1000))]
has_answer = [ex['answer_start'] != -1 for ex in uqa_train.select(range(1000))]

print(f"Question length (chars): mean={np.mean(question_lengths):.1f}, max={np.max(question_lengths)}")
print(f"Context length (chars): mean={np.mean(context_lengths):.1f}, max={np.max(context_lengths)}")
print(f"Answer length (chars): mean={np.mean(answer_lengths):.1f}, max={np.max(answer_lengths)}")
print(f"Questions with answers: {sum(has_answer)/len(has_answer)*100:.1f}%")
print(f"Questions without answers: {(1-sum(has_answer)/len(has_answer))*100:.1f}%")

UQA DATASET STRUCTURE
Training set size: 80,000 examples
Validation set size: 8,000 examples

Dataset columns: ['id', 'title', 'context', 'question', 'is_impossible', 'answer', 'answer_start']


üìù EXAMPLE 1 - Question with Answer
Question: ŸÜÿßŸÜÿ¨ŸÜ⁄Ø ⁄©ÿß ÿß€å⁄Øÿ≤€å⁄©ŸπŸà ŸÑ€å⁄àÿ± ⁄©ŸàŸÜ €Å€íÿå ÿ≥€å⁄©ÿ±Ÿπÿ±€å ⁄©€í ÿ™ÿ≠ÿ™ ⁄©ÿßŸÖ ⁄©ÿ± ÿ±€Åÿß €Å€íÿü

Context (first 300 chars): ŸÅ€å ÿßŸÑÿ≠ÿßŸÑ ÿå ŸÜÿßŸÜÿ¨ŸÜ⁄Ø ⁄©€å ÿ≠⁄©ŸàŸÖÿ™ ⁄©ÿß ŸÖ⁄©ŸÖŸÑ ŸÜÿßŸÖ ŸÜÿßŸÜÿ¨ŸÜ⁄Ø ÿ≥Ÿπ€å ⁄©€å Ÿæ€åŸæŸÑÿ≤ ⁄ØŸàÿ±ŸÜŸÖŸÜŸπ €Å€í ÿßŸàÿ± €å€Å ÿ¥€Åÿ± ÿ≥€å Ÿæ€å ÿ≥€å ⁄©€í ÿß€å⁄© Ÿæÿßÿ±Ÿπ€å ÿ≠⁄©ŸÖÿ±ÿßŸÜ€å ⁄©€í ÿ™ÿ≠ÿ™ €Å€í ÿå ÿ¨ÿ≥ ŸÖ€å⁄∫ ÿ≥€å Ÿæ€å ÿ≥€å ŸÜÿßŸÜÿ¨ŸÜ⁄Ø ⁄©ŸÖ€åŸπ€å ÿ≥€å⁄©ÿ±Ÿπÿ±€å ÿ¥€Åÿ± ⁄©€í ⁄à€å ŸÅ€å⁄©ŸπŸà ⁄ØŸàÿ±ŸÜÿ± ⁄©€í ÿ∑Ÿàÿ± Ÿæÿ± ÿßŸàÿ± ŸÖ€åÿ¶ÿ± ÿ≥€å⁄©ÿ±Ÿπÿ±€å ⁄©€í ÿ™ÿ≠ÿ™ ⁄©ÿßŸÖ ⁄©ÿ±ŸÜ€í ŸàÿßŸÑ€å ÿ≠⁄©ŸàŸÖÿ™ ⁄©€í ÿß€å⁄Øÿ≤€å⁄©ŸπŸà ÿ≥ÿ±ÿ®ÿ±ÿß€Å ⁄©€í ÿ∑Ÿàÿ± Ÿæÿ± €Å€í€î...

Answer: 'ŸÖ€åÿ¶ÿ±'
Answer starts at character position: 196
‚úì Extracted from context: 'ŸÖ€åÿ¶ÿ±'
‚úì Match: True


üìù EXAMPLE 2 - Ano

---

## Updated preprocessors!

Previously, we tried to apply the same approach we used in TYDIQA on UQA, the problem was the preprocessors were aligning the answer spans in units of **byte-level spans** instead of **character-level spans**. The calculations were adding byte-level offsets to the answer lengths, and since Urdu characters may be quantified in multiple bytes, the model was being fed the wrong spans -> GIGO!

We are now testing an updated preprocessor

In [88]:
"""
FIXED preprocessing function for UQA with CANINE-S.
TyDiQA-style preprocessor adapted for UQA character offsets.

Key fixes applied:
1. Uses character-level offsets (UQA native format, no byte conversion needed)
2. Fixed boundary check: uses `<` instead of `<=` for chunk_end
3. Calculates gold_char_end as inclusive (answer_start + len(answer) - 1)
4. Dynamic cls_index for no-answer cases
5. Simplified context_offset calculation

This preprocessor passed all 200 real-world UQA examples in testing.
"""

MAX_SEQ_LENGTH = 384
DOC_STRIDE = 64  # Using TyDiQA's value for proven results

def preprocess_uqa(examples, tokenizer, max_length=MAX_SEQ_LENGTH, doc_stride=DOC_STRIDE, model_obj=None, indices=None):
    """
    TyDiQA-style preprocessor adapted for UQA (character offsets).
    
    Args:
        examples: Batch with question, context, answer, answer_start fields
        tokenizer: CanineTokenizer instance
        max_length: Maximum sequence length (default 384)
        doc_stride: Sliding window overlap (default 64)
        model_obj: Optional model object (for compatibility)
        indices: Optional example indices for overflow mapping
    
    Returns:
        Dict with input_ids, attention_mask, token_type_ids, start_positions, 
        end_positions, overflow_to_sample_mapping
    """
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]
    answers = examples["answer"]
    answer_starts = examples["answer_start"]
    
    special_tokens = tokenizer.num_special_tokens_to_add(pair=True)
    
    encoded = {
        "input_ids": [],
        "attention_mask": [],
        "token_type_ids": [],
        "start_positions": [],
        "end_positions": [],
        "overflow_to_sample_mapping": [],
    }
    
    for example_idx, (question, context, answer, answer_start) in enumerate(zip(questions, contexts, answers, answer_starts)):
        question_tokens = tokenizer.encode(question, add_special_tokens=False)
        context_tokens = tokenizer.encode(context, add_special_tokens=False)
        
        max_context_tokens = max_length - len(question_tokens) - special_tokens
        if max_context_tokens <= 0 or not context_tokens:
            continue
        
        # UQA uses character offsets (not bytes like TyDiQA)
        if answer and answer_start != -1:
            start_char = answer_start
            end_char = answer_start + len(answer) - 1  # Inclusive
            answer_span = (start_char, end_char)
        else:
            answer_span = None
        
        stride_tokens = max_context_tokens - doc_stride
        if stride_tokens <= 0:
            stride_tokens = max_context_tokens
        
        span_start = 0
        context_length = len(context_tokens)
        while span_start < context_length:
            span_end = min(span_start + max_context_tokens, context_length)
            context_chunk = context_tokens[span_start:span_end]
            
            input_ids = tokenizer.build_inputs_with_special_tokens(question_tokens, context_chunk)
            token_type_ids = tokenizer.create_token_type_ids_from_sequences(question_tokens, context_chunk)
            attention_mask = [1] * len(input_ids)
            
            cls_index = input_ids.index(tokenizer.cls_token_id)
            context_offset = len(input_ids) - len(context_chunk) - 1
            
            if answer_span is None:
                start_pos = cls_index
                end_pos = cls_index
            else:
                start_char, end_char = answer_span
                # CRITICAL FIX: Use < instead of <= for exclusive chunk_end
                answer_in_chunk = start_char >= span_start and end_char < span_end
                if answer_in_chunk:
                    start_pos = context_offset + (start_char - span_start)
                    end_pos = context_offset + (end_char - span_start)
                else:
                    start_pos = cls_index
                    end_pos = cls_index
            
            padding = max_length - len(input_ids)
            if padding > 0:
                pad_id = tokenizer.pad_token_id
                input_ids += [pad_id] * padding
                attention_mask += [0] * padding
                token_type_ids += [0] * padding
            else:
                input_ids = input_ids[:max_length]
                attention_mask = attention_mask[:max_length]
                token_type_ids = token_type_ids[:max_length]
                if start_pos >= max_length or end_pos >= max_length:
                    start_pos = cls_index
                    end_pos = cls_index
            
            encoded["input_ids"].append(input_ids)
            encoded["attention_mask"].append(attention_mask)
            encoded["token_type_ids"].append(token_type_ids)
            encoded["start_positions"].append(start_pos)
            encoded["end_positions"].append(end_pos)
            encoded["overflow_to_sample_mapping"].append(example_idx if indices is None else indices[example_idx])
            
            if span_end == context_length:
                break
            span_start += stride_tokens
    
    return encoded


In [89]:
# LoRA config
lora_config = LoraConfig(
    task_type=TaskType.QUESTION_ANS,
    r=8,   # shadowing tydiqa for now
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value"], # shadowing tydiqa for now
    bias="none",
    modules_to_save=["qa_outputs"],
)

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")


### Preprocessing examples...

In [90]:

print("="*80)
print("üî¨ PREPROCESSING WALKTHROUGH - Single Example")
print("="*80)

# Take one example
example = uqa_train[0]
print(f"\n1Ô∏è‚É£ ORIGINAL DATA")
print("-"*80)
print(f"Question: {example['question']}")
print(f"Answer: '{example['answer']}'")
print(f"Answer position: {example['answer_start']}")
print(f"Context length: {len(example['context'])} characters")

# Preprocess it
batch = {
    'question': [example['question']],
    'context': [example['context']],
    'answer': [example['answer']],
    'answer_start': [example['answer_start']]
}
processed = preprocess_uqa(batch, tokenizer, indices=[0])

print(f"\n2Ô∏è‚É£ AFTER PREPROCESSING")
print("-"*80)
print(f"Number of chunks created: {len(processed['input_ids'])}")
print(f"(Sliding window creates multiple chunks per example)")

# Show first chunk in detail
chunk_idx = 0
print(f"\n3Ô∏è‚É£ CHUNK {chunk_idx} DETAILS")
print("-"*80)
print(f"Input IDs length: {len(processed['input_ids'][chunk_idx])} tokens")
print(f"Start position: {processed['start_positions'][chunk_idx]}")
print(f"End position: {processed['end_positions'][chunk_idx]}")
print(f"Maps to original example: {processed['overflow_to_sample_mapping'][chunk_idx]}")

# Decode the inputs to show what the model sees
input_ids = processed['input_ids'][chunk_idx]
decoded_input = tokenizer.decode(input_ids, skip_special_tokens=False)
print(f"\n4Ô∏è‚É£ DECODED INPUT (first 400 chars, with special tokens)")
print("-"*80)
print(decoded_input[:400] + "...")

# Decode the labeled answer span
start_pos = processed['start_positions'][chunk_idx]
end_pos = processed['end_positions'][chunk_idx]
cls_idx = input_ids.index(tokenizer.cls_token_id) if tokenizer.cls_token_id in input_ids else 0

if start_pos == cls_idx and end_pos == cls_idx:
    labeled_answer = "[NO ANSWER IN THIS CHUNK]"
else:
    labeled_answer = tokenizer.decode(input_ids[start_pos:end_pos+1], skip_special_tokens=True)

print(f"\n5Ô∏è‚É£ LABELED ANSWER SPAN IN THIS CHUNK")
print("-"*80)
print(f"Gold answer: '{example['answer']}'")
print(f"Labeled span: '{labeled_answer}'")
print(f"Match: {labeled_answer.strip() == example['answer'].strip()}")

# Show all chunks for this example
print(f"\n6Ô∏è‚É£ ALL CHUNKS FOR THIS EXAMPLE")
print("-"*80)
for i in range(len(processed['input_ids'])):
    start = processed['start_positions'][i]
    end = processed['end_positions'][i]
    if start == cls_idx and end == cls_idx:
        chunk_answer = "[NO ANSWER]"
    else:
        chunk_answer = tokenizer.decode(processed['input_ids'][i][start:end+1], skip_special_tokens=True).strip()
    has_answer = "‚úÖ" if chunk_answer == example['answer'].strip() else "‚ùå"
    print(f"  Chunk {i}: {has_answer} '{chunk_answer[:50]}'")

print("\n" + "="*80)

üî¨ PREPROCESSING WALKTHROUGH - Single Example

1Ô∏è‚É£ ORIGINAL DATA
--------------------------------------------------------------------------------
Question: ŸÜÿßŸÜÿ¨ŸÜ⁄Ø ⁄©ÿß ÿß€å⁄Øÿ≤€å⁄©ŸπŸà ŸÑ€å⁄àÿ± ⁄©ŸàŸÜ €Å€íÿå ÿ≥€å⁄©ÿ±Ÿπÿ±€å ⁄©€í ÿ™ÿ≠ÿ™ ⁄©ÿßŸÖ ⁄©ÿ± ÿ±€Åÿß €Å€íÿü
Answer: 'ŸÖ€åÿ¶ÿ±'
Answer position: 196
Context length: 268 characters

2Ô∏è‚É£ AFTER PREPROCESSING
--------------------------------------------------------------------------------
Number of chunks created: 1
(Sliding window creates multiple chunks per example)

3Ô∏è‚É£ CHUNK 0 DETAILS
--------------------------------------------------------------------------------
Input IDs length: 384 tokens
Start position: 259
End position: 262
Maps to original example: 0

4Ô∏è‚É£ DECODED INPUT (first 400 chars, with special tokens)
--------------------------------------------------------------------------------
ÓÄÄŸÜÿßŸÜÿ¨ŸÜ⁄Ø ⁄©ÿß ÿß€å⁄Øÿ≤€å⁄©ŸπŸà ŸÑ€å⁄àÿ± ⁄©ŸàŸÜ €Å€íÿå ÿ≥€å⁄©ÿ±Ÿπÿ±€å ⁄©€í ÿ™ÿ≠ÿ™ ⁄©ÿßŸÖ ⁄©ÿ± ÿ±€Åÿß €Å€íÿüÓÄÅŸÅ€å 

### Evaluation functions...

In [91]:
def normalize_answer(text):
    text = (text or "").lower()
    def remove_articles(s):
        return re.sub(r"\b(a|an|the)\b", " ", s)
    def remove_punctuation(s):
        return "".join(ch for ch in s if ch not in string.punctuation)
    def white_space_fix(s):
        return " ".join(s.split())
    return white_space_fix(remove_articles(remove_punctuation(text)))

def exact_match_score(prediction, ground_truth):
    return float(normalize_answer(prediction) == normalize_answer(ground_truth))

def f1_score(prediction, ground_truth):
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    if not gold_tokens:
        return 1.0 if not pred_tokens else 0.0
    if not pred_tokens:
        return 0.0
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    # BUGFIX: Prevent division by zero if both precision and recall are 0
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def decode_prediction(input_ids, start_idx, end_idx):

    global tokenizer
    
    # Dynamic CLS handling
    cls_index = input_ids.index(tokenizer.cls_token_id) if tokenizer.cls_token_id in input_ids else 0
    
    # No answer case (both point to CLS)
    if start_idx == cls_index and end_idx == cls_index:
        return ""
    
    # Invalid range (start after end) - treat as no answer
    if start_idx > end_idx:
        return ""
    
    # Defensive bounds checking
    if start_idx < 0 or end_idx < 0:
        return ""
    if start_idx >= len(input_ids) or end_idx >= len(input_ids):
        return ""
    
    # Clamp to valid range (additional safety)
    start_idx = max(start_idx, 0)
    end_idx = min(end_idx, len(input_ids) - 1)
    
    # Decode with inclusive slicing [start:end+1]
    text = tokenizer.decode(input_ids[start_idx:end_idx + 1], skip_special_tokens=True)
    return text.strip()

def gold_answer(example):
    if example["answer_start"] == -1:
        return ""
    return example["answer"]

def edit_distance_score(prediction, ground_truth):
    return Levenshtein.ratio(normalize_answer(prediction), normalize_answer(ground_truth))


#--- CHANGED TO MATCH TYDIQA APPROACH
def evaluate_checkpoint(checkpoint_path=None):
    """
    EXACT REPLICA of TyDiQA evaluation approach.
    Loads checkpoint from disk (or uses provided path).
    """
    # Load base model + trained adapter (TyDiQA approach)
    base_model = CanineForQuestionAnswering.from_pretrained(
        model_name, 
        trust_remote_code=False
    )
    
    from peft import PeftModel
    model = PeftModel.from_pretrained(base_model, checkpoint_path)
    model.to(device)
    
    # Exact TyDiQA eval args
    eval_args = TrainingArguments(
        output_dir="outputs/canine-s-uqa-filtered",
        per_device_eval_batch_size=1,  # Match TyDiQA exactly
        dataloader_drop_last=False,
        fp16=False,  # TyDiQA uses False
        bf16=False,
        report_to="none"
    )
    
    eval_trainer = Trainer(
        model=model,
        args=eval_args,
        eval_dataset=processed_val,
        processing_class=tokenizer,  # Use processing_class
    )
    
    # Progress bar (optional, TyDiQA has this)
    print(f"üß™ Evaluating checkpoint: {checkpoint_path}")
    from tqdm.auto import tqdm
    with tqdm(total=len(processed_val), desc="Evaluating", unit="samples") as pbar:
        predictions = eval_trainer.predict(processed_val)
        pbar.update(len(processed_val))
    
    start_logits, end_logits = predictions.predictions
    
    # EXACT TyDiQA aggregation logic
    best_predictions = {}
    for feature_index, feature in enumerate(processed_val):
        sample_idx = int(feature["overflow_to_sample_mapping"])
        input_ids = feature["input_ids"]
        
        start_idx = int(np.argmax(start_logits[feature_index]))
        end_idx = int(np.argmax(end_logits[feature_index]))
        score = float(start_logits[feature_index][start_idx] + end_logits[feature_index][end_idx])
        prediction_text = decode_prediction(input_ids, start_idx, end_idx)
        
        stored = best_predictions.get(sample_idx)
        if stored is None or score > stored[0]:
            best_predictions[sample_idx] = (score, prediction_text)

    # TEST!
    # After best_predictions loop, before computing metrics:
    print(f"\nüîç Debug: Sample predictions:")
    for idx in list(best_predictions.keys())[:5]:
        score, pred = best_predictions[idx]
        gold = gold_answer(uqa_val[idx])
        print(f"  Pred: '{pred[:50]}' | Gold: '{gold[:50]}'")
    
    # Calculate metrics
    em_scores = []
    f1_scores = []
    edit_dist_scores = []
    for sample_idx, (_, prediction_text) in best_predictions.items():
        reference = gold_answer(uqa_val[int(sample_idx)])
        em_scores.append(exact_match_score(prediction_text, reference))
        f1_scores.append(f1_score(prediction_text, reference))
        edit_dist_scores.append(edit_distance_score(prediction_text, reference))
    
    em = float(np.mean(em_scores)) if em_scores else 0.0
    f1 = float(np.mean(f1_scores)) if f1_scores else 0.0
    edit_dist = float(np.mean(edit_dist_scores)) if edit_dist_scores else 0.0
    
    print(f"Examples evaluated: {len(em_scores)}")
    print(f"Exact Match: {em * 100:.2f}")
    print(f"F1: {f1 * 100:.2f}")
    print(f"Edit Distance (normalized): {edit_dist * 100:.2f}")
    
    return {"exact_match": em, "f1": f1, "edit_distance": edit_dist}

In [92]:
# ‚ö†Ô∏è CRITICAL: Must regenerate preprocessed data with FILTERED dataset
# The old cache was created from unfiltered data - indices won't match!

# print("üîÑ Preprocessing filtered dataset (this will take a few minutes)...")
processed_train = uqa_train.map(
    lambda examples, indices: preprocess_uqa(examples, tokenizer, indices=indices), 
    batched=True, 
    remove_columns=uqa_train.column_names, 
    with_indices=True
)
processed_val = uqa_val.map(
    lambda examples, indices: preprocess_uqa(examples, tokenizer, indices=indices), 
    batched=True, 
    remove_columns=uqa_val.column_names, 
    with_indices=True
)

# print(f"‚úÖ Preprocessing complete!")
# print(f"   Training chunks: {len(processed_train):,}")
# print(f"   Validation chunks: {len(processed_val):,}")

In [93]:
# print("="*80)
# print("üìà DATASET STATISTICS AFTER PREPROCESSING")
# print("="*80)

# # Count chunks per example
# from collections import Counter
# chunks_per_example = Counter(processed_train["overflow_to_sample_mapping"])
# chunks_distribution = Counter(chunks_per_example.values())

# print(f"\nüì¶ Chunks Distribution:")
# print(f"   Total original examples: {len(uqa_train):,}")
# print(f"   Total preprocessed chunks: {len(processed_train):,}")
# print(f"   Average chunks per example: {len(processed_train)/len(uqa_train):.2f}")
# print(f"\n   Distribution:")
# for num_chunks in sorted(chunks_distribution.keys())[:10]:
#     count = chunks_distribution[num_chunks]
#     print(f"     {num_chunks} chunk(s): {count:,} examples ({count/len(uqa_train)*100:.1f}%)")

# # Count examples with answers in at least one chunk
# examples_with_answers = 0
# for orig_idx in range(len(uqa_train)):
#     # Find all chunks for this example
#     chunk_indices = [i for i, x in enumerate(processed_train["overflow_to_sample_mapping"]) if x == orig_idx]
    
#     # Check if any chunk has an answer (not pointing to CLS)
#     has_answer = False
#     for chunk_idx in chunk_indices:
#         input_ids = processed_train[chunk_idx]["input_ids"]
#         start_pos = processed_train[chunk_idx]["start_positions"]
#         end_pos = processed_train[chunk_idx]["end_positions"]
#         cls_idx = input_ids.index(tokenizer.cls_token_id) if tokenizer.cls_token_id in input_ids else 0
        
#         if not (start_pos == cls_idx and end_pos == cls_idx):
#             has_answer = True
#             break
    
#     if has_answer:
#         examples_with_answers += 1

# print(f"\n‚úÖ Answer Coverage:")
# print(f"   Examples with answer in at least one chunk: {examples_with_answers:,}/{len(uqa_train):,} ({examples_with_answers/len(uqa_train)*100:.1f}%)")
# print(f"   Expected: ~100% (since we filtered impossible questions)")

# print("="*80)

## ‚úÖ Verification: Test Preprocessed Results (LLM generated)

Before training, let's verify that the new preprocessor produces correct results.

In [94]:
import random
from collections import defaultdict

print("="*80)
print("üß™ FIXED TEST: Answer Extraction Accuracy (OPTIMIZED)")
print("="*80)

# Step 1: Build reverse index ONCE (O(n) instead of O(n¬≤))
print("Building chunk index...")
chunk_index = defaultdict(list)
for chunk_idx, sample_idx in enumerate(processed_train["overflow_to_sample_mapping"]):
    chunk_index[sample_idx].append(chunk_idx)

# Step 2: Test on random original examples
num_samples = 200
test_orig_indices = random.sample(range(len(uqa_train)), num_samples)

correct = 0
incorrect = 0
failed_examples = []

for orig_idx in test_orig_indices:
    orig_example = uqa_train[orig_idx]
    gold_ans = orig_example["answer"].strip()
    
    # Get all chunks for this example (O(1) lookup!)
    chunk_indices = chunk_index[orig_idx]
    
    # Check if ANY chunk has the correct answer
    found_correct = False
    for chunk_idx in chunk_indices:
        proc = processed_train[chunk_idx]
        input_ids = proc["input_ids"]
        start = proc["start_positions"]
        end = proc["end_positions"]
        
        cls_idx = input_ids.index(tokenizer.cls_token_id)
        
        # Skip chunks without answer
        if start == cls_idx and end == cls_idx:
            continue
        
        # Extract answer
        predicted = tokenizer.decode(input_ids[start:end+1], skip_special_tokens=True).strip()
        
        if predicted.lower() == gold_ans.lower():
            found_correct = True
            break
    
    if found_correct or not gold_ans:
        correct += 1
    else:
        incorrect += 1
        if len(failed_examples) < 5:
            failed_examples.append({
                "idx": orig_idx,
                "question": orig_example["question"][:60],
                "gold": gold_ans[:50],
                "num_chunks": len(chunk_indices)
            })

accuracy = correct / num_samples * 100
print(f"\nüìä Results: ‚úÖ {correct}/{num_samples} ({accuracy:.1f}%)")

if accuracy >= 95:
    print("‚úÖ PASSED - Preprocessor working correctly!")
else:
    print(f"‚ùå FAILED - Only {accuracy:.1f}% accuracy")
    if failed_examples:
        print(f"\n‚ö†Ô∏è First {len(failed_examples)} failures:")
        for ex in failed_examples:
            print(f"  #{ex['idx']}: '{ex['question']}...'")
            print(f"    Expected: '{ex['gold']}', Chunks: {ex['num_chunks']}")

print("="*80)

üß™ FIXED TEST: Answer Extraction Accuracy (OPTIMIZED)
Building chunk index...

üìä Results: ‚úÖ 199/200 (99.5%)
‚úÖ PASSED - Preprocessor working correctly!


In [95]:
print("="*80)
print("üß™ TEST 3: Validation Data Integrity")
print("="*80)

# Same checks for validation data
print("\n1Ô∏è‚É£ Checking validation dataset structure...")
missing_val = [col for col in required_columns if col not in processed_val.column_names]

if missing_val:
    print(f"‚ùå CRITICAL: Missing columns: {missing_val}")
else:
    print(f"‚úÖ All required columns present")

# Check validation mapping
print("\n2Ô∏è‚É£ Validating overflow_to_sample_mapping...")
max_val_idx = max(processed_val["overflow_to_sample_mapping"])
if max_val_idx >= len(uqa_val):
    print(f"‚ùå CRITICAL: overflow_to_sample_mapping has index {max_val_idx} >= dataset size {len(uqa_val)}")
else:
    print(f"‚úÖ overflow_to_sample_mapping valid (max={max_val_idx}, dataset size={len(uqa_val)})")

# Test extraction on validation
print("\n3Ô∏è‚É£ Testing answer extraction on validation set...")
val_correct = 0
val_incorrect = 0
val_samples = min(100, len(processed_val))

for proc_idx in range(val_samples):
    proc_example = processed_val[proc_idx]
    orig_idx = proc_example["overflow_to_sample_mapping"]
    orig_example = uqa_val[orig_idx]
    
    input_ids = proc_example["input_ids"]
    start_pos = proc_example["start_positions"]
    end_pos = proc_example["end_positions"]
    
    cls_idx = input_ids.index(tokenizer.cls_token_id) if tokenizer.cls_token_id in input_ids else 0
    
    if start_pos == cls_idx and end_pos == cls_idx:
        predicted_answer = ""
    else:
        predicted_answer = tokenizer.decode(input_ids[start_pos:end_pos+1], skip_special_tokens=True).strip()
    
    gold_ans = orig_example["answer"].strip()

    print(f"GOLD: {gold_ans.lower()}")
    print(f"PREDICTED: {predicted_answer.lower()}\n")
    
    if predicted_answer.lower() == gold_ans.lower():
        val_correct += 1
    else:
        val_incorrect += 1

val_accuracy = val_correct / val_samples * 100
print(f"   Validation accuracy: {val_correct}/{val_samples} ({val_accuracy:.1f}%)")

if val_accuracy < 95:
    print(f"   ‚ùå WARNING: Validation accuracy is low!")
else:
    print(f"   ‚úÖ Validation data is correct!")

print("="*80)

üß™ TEST 3: Validation Data Integrity

1Ô∏è‚É£ Checking validation dataset structure...
‚úÖ All required columns present

2Ô∏è‚É£ Validating overflow_to_sample_mapping...
‚úÖ overflow_to_sample_mapping valid (max=7999, dataset size=8000)

3Ô∏è‚É£ Testing answer extraction on validation set...
GOLD: ŸÅÿ±ŸÜ
PREDICTED: 

GOLD: ŸÅÿ±ŸÜ
PREDICTED: 

GOLD: ŸÅÿ±ŸÜ
PREDICTED: ŸÅÿ±ŸÜ

GOLD: Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ
PREDICTED: Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ

GOLD: Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ
PREDICTED: 

GOLD: Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ
PREDICTED: 

GOLD: Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ
PREDICTED: 

GOLD: Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ
PREDICTED: 

GOLD: ⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©
PREDICTED: 

GOLD: ⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©
PREDICTED: ⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©

GOLD: ⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©
PREDICTED: ⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©

GOLD: ÿ≥€å ⁄à€å 4
PREDICTED: ÿ≥€å ⁄à€å 4

GOLD: ÿ≥€å ⁄à€å 4
PREDICTED: 

GOLD: ÿ≥€å ⁄à€å 4
PREDICTED: 

GOLD: ÿ≥€å ⁄à€å 4
PREDICTED: 

GOLD: ÿ≥€å ⁄à€å 4
PREDICTED: 

GOLD: ÿ≥€å ⁄à€å 4
PREDICTED: 

GOLD: 1950 ⁄©€å

### !!! KEY TAKEAWAY FROM ABOVE CELLS!

A lot of chunks do not have the answer in the chunked context, so (0, 0) -> `[CLS]` tok is being predicted!
This may be giving way to a lot of mispredictions in evaluation!

---

In [96]:
# processed_train

In [97]:
# processed_val

In [98]:
# Save newly processed data (OPTIONAL - for future reuse with same filtered dataset)
processed_train.save_to_disk("/kaggle/working/cache/processed_train_uqa_filtered")
processed_val.save_to_disk("/kaggle/working/cache/processed_val_uqa_filtered")

# # ‚ùå DO NOT load old cache - it has index mismatches with filtered data!
# # If you've already run the preprocessing cell above, skip this cell

processed_train = load_from_disk("/kaggle/working/cache/processed_train_uqa_filtered")
processed_val = load_from_disk("/kaggle/working/cache/processed_val_uqa_filtered")

Saving the dataset (0/2 shards):   0%|          | 0/234581 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25319 [00:00<?, ? examples/s]

In [99]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")



In [100]:
# build LoRA model

peft_model = get_peft_model(model, lora_config)
peft_model.gradient_checkpointing_enable()
print_trainable_parameters(peft_model)

trainable params: 345602 || all params: 132430084 || trainable%: 0.26096940329661045


---

## Model Training:


In [102]:
training_args = TrainingArguments(
    output_dir="outputs/canine-s-uqa-filtered",

    per_device_train_batch_size=4,  # increased train_batch_size from tydiqa
    per_device_eval_batch_size=16,

    gradient_accumulation_steps=4,  # decreased grad accum from tydiqa
    gradient_checkpointing=True,

    num_train_epochs=1, # same as tydiqa
    learning_rate=3e-5,  
    weight_decay=0.01,
    
    eval_strategy="no",
    eval_steps=500,
    save_strategy="steps",
    save_steps=1000,  # increased to 1000
    logging_steps=50,
    
    fp16=True,
    bf16=False,
    report_to="none",
    push_to_hub=True,
    hub_model_id="VohraAK/canine-s-uqa-filtered",
    hub_strategy="checkpoint",
    )

# CustomEvalCallback - EXACT TyDiQA approach
class CustomEvalCallback(TrainerCallback):
    def __init__(self, eval_func, eval_dataset):
        self.eval_func = eval_func
        self.eval_dataset = eval_dataset

    def on_save(self, args, state, control, model=None, **kwargs):
        """
        Runs AFTER checkpoint is saved.
        Loads checkpoint from disk and evaluates it.
        """
        checkpoint_path = f"{args.output_dir}/checkpoint-{state.global_step}"
        print(f"\nüîç Running custom evaluation at step {state.global_step}...")

        # Call evaluation function (loads from checkpoint)
        metrics = self.eval_func(checkpoint_path)

        # Add metrics to state's log_history
        state.log_history.append({
            "step": state.global_step,
            "eval_exact_match": metrics["exact_match"],
            "eval_f1": metrics["f1"],
            "eval_edit_distance": metrics["edit_distance"],
        })

        # Print metrics
        print(f"‚úÖ Step {state.global_step}: EM={metrics['exact_match']*100:.2f}, F1={metrics['f1']*100:.2f}, EditDist={metrics['edit_distance']*100:.2f}")

        # Re-save trainer_state.json with updated metrics
        state_path = f"{checkpoint_path}/trainer_state.json"
        try:
            with open(state_path, 'r') as f:
                state_dict = json.load(f)
            state_dict['log_history'] = state.log_history
            with open(state_path, 'w') as f:
                json.dump(state_dict, f, indent=2)
            print(f"üíæ Updated trainer_state.json with custom metrics")
        except Exception as e:
            print(f"‚ö†Ô∏è  Warning: Could not update trainer_state.json: {e}")

        # Push to Hub
        try:
            print(f"‚òÅÔ∏è  Pushing checkpoint-{state.global_step} to Hub...")
            api = HfApi()
            api.upload_folder(
                folder_path=checkpoint_path,
                repo_id=args.hub_model_id,
                path_in_repo=f"checkpoint-{state.global_step}",
                commit_message=f"Add checkpoint {state.global_step} (EM={metrics['exact_match']*100:.1f}%, F1={metrics['f1']*100:.1f}%)",
                repo_type="model"
            )
            print(f"‚úÖ Pushed checkpoint-{state.global_step} to Hub")
        except Exception as e:
            print(f"‚ö†Ô∏è  Warning: Could not push to Hub: {e}")

        return control



In [103]:
trainer_cb = CustomEvalCallback(evaluate_checkpoint, processed_val)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=processed_train,
    eval_dataset=processed_val,
    callbacks=[trainer_cb],
)

No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [104]:
trainer.train()



Step,Training Loss
50,5.9451
100,5.9024
150,5.8675
200,5.8366
250,5.7987
300,5.7744
350,5.7348
400,5.7044
450,5.662
500,5.6318



üîç Running custom evaluation at step 1000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-1000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: '' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.04
F1: 1.30
Edit Distance (normalized): 4.01
‚úÖ Step 1000: EM=0.04, F1=1.30, EditDist=4.01
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-1000 to Hub...
‚úÖ Pushed checkpoint-1000 to Hub





üîç Running custom evaluation at step 2000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-2000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: '' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.03
F1: 0.83
Edit Distance (normalized): 2.63
‚úÖ Step 2000: EM=0.03, F1=0.83, EditDist=2.63
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-2000 to Hub...
‚úÖ Pushed checkpoint-2000 to Hub





üîç Running custom evaluation at step 3000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-3000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: '' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.04
F1: 0.67
Edit Distance (normalized): 2.15
‚úÖ Step 3000: EM=0.04, F1=0.67, EditDist=2.15
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-3000 to Hub...
‚úÖ Pushed checkpoint-3000 to Hub





üîç Running custom evaluation at step 4000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-4000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: '' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.03
F1: 0.55
Edit Distance (normalized): 1.84
‚úÖ Step 4000: EM=0.03, F1=0.55, EditDist=1.84
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-4000 to Hub...
‚úÖ Pushed checkpoint-4000 to Hub





üîç Running custom evaluation at step 5000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-5000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: '' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.03
F1: 0.53
Edit Distance (normalized): 1.68
‚úÖ Step 5000: EM=0.03, F1=0.53, EditDist=1.68
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-5000 to Hub...
‚úÖ Pushed checkpoint-5000 to Hub





üîç Running custom evaluation at step 6000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-6000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: '' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.03
F1: 0.47
Edit Distance (normalized): 1.51
‚úÖ Step 6000: EM=0.03, F1=0.47, EditDist=1.51
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-6000 to Hub...
‚úÖ Pushed checkpoint-6000 to Hub





üîç Running custom evaluation at step 7000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-7000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: 'ÿ¨Ÿàÿ≤ŸÅ' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.01
F1: 0.42
Edit Distance (normalized): 1.40
‚úÖ Step 7000: EM=0.01, F1=0.42, EditDist=1.40
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-7000 to Hub...
‚úÖ Pushed checkpoint-7000 to Hub





üîç Running custom evaluation at step 8000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-8000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: 'ÿ¨Ÿàÿ≤ŸÅ' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.01
F1: 0.42
Edit Distance (normalized): 1.36
‚úÖ Step 8000: EM=0.01, F1=0.42, EditDist=1.36
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-8000 to Hub...
‚úÖ Pushed checkpoint-8000 to Hub





üîç Running custom evaluation at step 9000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-9000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: 'ÿ¨Ÿàÿ≤ŸÅ' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.03
F1: 0.42
Edit Distance (normalized): 1.32
‚úÖ Step 9000: EM=0.03, F1=0.42, EditDist=1.32
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-9000 to Hub...
‚úÖ Pushed checkpoint-9000 to Hub





üîç Running custom evaluation at step 10000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-10000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: 'ÿ¨Ÿàÿ≤ŸÅ' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.04
F1: 0.42
Edit Distance (normalized): 1.31
‚úÖ Step 10000: EM=0.04, F1=0.42, EditDist=1.31
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-10000 to Hub...
‚úÖ Pushed checkpoint-10000 to Hub





üîç Running custom evaluation at step 11000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-11000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: 'ÿ¨Ÿàÿ≤ŸÅ' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.04
F1: 0.40
Edit Distance (normalized): 1.25
‚úÖ Step 11000: EM=0.04, F1=0.40, EditDist=1.25
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-11000 to Hub...
‚úÖ Pushed checkpoint-11000 to Hub





üîç Running custom evaluation at step 12000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-12000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: 'ÿ¨Ÿàÿ≤ŸÅ' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.04
F1: 0.38
Edit Distance (normalized): 1.22
‚úÖ Step 12000: EM=0.04, F1=0.38, EditDist=1.22
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-12000 to Hub...
‚úÖ Pushed checkpoint-12000 to Hub





üîç Running custom evaluation at step 13000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-13000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: 'ÿ¨Ÿàÿ≤ŸÅ' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.04
F1: 0.36
Edit Distance (normalized): 1.19
‚úÖ Step 13000: EM=0.04, F1=0.36, EditDist=1.19
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-13000 to Hub...
‚úÖ Pushed checkpoint-13000 to Hub





üîç Running custom evaluation at step 14000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-14000


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: 'ÿ¨Ÿàÿ≤ŸÅ' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.04
F1: 0.36
Edit Distance (normalized): 1.19
‚úÖ Step 14000: EM=0.04, F1=0.36, EditDist=1.19
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-14000 to Hub...
‚úÖ Pushed checkpoint-14000 to Hub





üîç Running custom evaluation at step 14662...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-s-uqa-filtered/checkpoint-14662


Evaluating:   0%|          | 0/25319 [00:00<?, ?samples/s]


üîç Debug: Sample predictions:
  Pred: '' | Gold: 'ŸÅÿ±ŸÜ'
  Pred: '' | Gold: 'Ÿπÿ±€å ÿ¢⁄©ÿ≥€åÿ¨ŸÜ'
  Pred: '' | Gold: '⁄Ø€åŸπÿ≥ ⁄©Ÿà Ÿæ€å⁄à ŸÑÿß⁄©'
  Pred: '' | Gold: 'ÿ≥€å ⁄à€å 4'
  Pred: 'ÿ¨Ÿàÿ≤ŸÅ' | Gold: '1950 ⁄©€å ÿØ€Åÿßÿ¶€å'
Examples evaluated: 8000
Exact Match: 0.04
F1: 0.36
Edit Distance (normalized): 1.20
‚úÖ Step 14662: EM=0.04, F1=0.36, EditDist=1.20
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-14662 to Hub...
‚úÖ Pushed checkpoint-14662 to Hub


TrainOutput(global_step=14662, training_loss=4.310766358246112, metrics={'train_runtime': 12975.5577, 'train_samples_per_second': 18.079, 'train_steps_per_second': 1.13, 'total_flos': 5.796696923995853e+16, 'train_loss': 4.310766358246112, 'epoch': 1.0})

---

### Diagnosing Preprocessing Functions!!!

These functions are just analysing the preprocessing logic above, they're just using the base model, NOT our trained model...

In [105]:
# # Diagnostic cell (fixed): Investigate preprocessing and truncation for many samples
# import random
# import pandas as pd
# from transformers import AutoTokenizer

# # Set display options to see full Urdu text
# pd.set_option('display.max_colwidth', None)

# try:
#     tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
# except Exception:
#     tokenizer = None

# num_samples = 20000  # Number of samples to check
# results = []

# for split_name, orig_data, proc_data in [
#     ("train", uqa_train, processed_train),
#     ("val", uqa_val, processed_val)
# ]:
#     # Sample random indices
#     if len(proc_data) < num_samples:
#         current_indices = range(len(proc_data))
#     else:
#         current_indices = random.sample(range(len(proc_data)), num_samples)

#     for idx in current_indices:
#         proc = proc_data[idx]
#         # Use overflow_to_sample_mapping to get the correct original index
#         orig_idx = proc["overflow_to_sample_mapping"]
#         orig = orig_data[orig_idx]

#         input_ids = proc["input_ids"]
#         start_pos = proc["start_positions"]
#         end_pos = proc["end_positions"]

#         gold_answer = orig.get("gold_answer", orig.get("answer", ""))
#         question = orig.get("question", "")

#         # Decode input_ids to text (for debugging context)
#         if tokenizer:
#             decoded_text = tokenizer.decode(input_ids, skip_special_tokens=False)
#         else:
#             decoded_text = str(input_ids)

#         # Extract predicted answer span
#         if 0 <= start_pos < len(input_ids) and 0 <= end_pos < len(input_ids):
#             if tokenizer:
#                 pred_span = tokenizer.decode(input_ids[start_pos:end_pos+1], skip_special_tokens=True)
#             else:
#                 pred_span = str(input_ids[start_pos:end_pos+1])
#         else:
#             pred_span = "[CLS]" # Represents no answer found in this chunk or invalid

#         # Check if pred_span matches gold answer
#         # We strip() to ignore minor whitespace differences
#         pred_matches_gold = pred_span.strip() == gold_answer.strip()

#         # Check if gold is even reachable in this chunk
#         gold_in_decoded = gold_answer in decoded_text

#         results.append({
#             "Split": split_name,
#             "Question": question,
#             "Gold Answer": gold_answer,
#             "Extracted Answer": pred_span,
#             "Match": pred_matches_gold,
#             "Gold Reachable": gold_in_decoded,
#             "orig_idx": orig_idx
#         })

# # Create DataFrame
# results_df = pd.DataFrame(results)

# # --- SIDE BY SIDE COMPARISON ---

# # 1. Filter for Solvable Mismatches (Gold was there, but we predicted wrong)
# problem_cases = results_df[
#     (results_df["Gold Reachable"] == True) &
#     (results_df["Match"] == False)
# ][["Question", "Gold Answer", "Extracted Answer", "Split"]]

# print(f"üîç Checked {len(results_df)} samples.")
# print(f"‚ùå Found {len(problem_cases)} cases where Gold was present but Extraction failed.")

# print("\nüìä Side-by-Side Comparison (Top 20 Failures):")
# display(problem_cases.head(50))

# print("\n‚úÖ Side-by-Side Comparison (First 10 Rows - Mixed):")
# display(results_df[["Question", "Gold Answer", "Extracted Answer", "Match"]].head(50))

In [106]:
# # Accuracy: fraction of rows where extracted answer matches gold answer
# accuracy = (results_df["Match"]).mean()

# # Precision: among rows where extracted answer is non-empty, fraction that matches gold
# # We filter out cases where the model predicted nothing (empty string) or just whitespace
# non_empty_pred = results_df["Extracted Answer"].str.strip() != ""

# # Avoid division by zero if no predictions were made
# if non_empty_pred.sum() > 0:
#     precision = (results_df["Match"] & non_empty_pred).sum() / non_empty_pred.sum()
# else:
#     precision = 0.0

# print(f"Accuracy: {accuracy:.3f}")
# print(f"Precision: {precision:.3f}")