# CANINE-S Hybrid - UQA

In [4]:
# %pip install peft evaluate transformers Levenshtein ipywidgets
# %pip install protobuf==3.20.3
# !rm -rf /kaggle/working/cache

In [5]:
# X

import os
os.environ["TRANSFORMERS_DISABLE_CHAT_TEMPLATES"] = "1"
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["TRANSFORMERS_NO_ADDITIONAL_CHAT_TEMPLATES"] = "1"

In [6]:
from datasets import load_dataset, load_from_disk
# from UQA.canine_utils import preprocess_uqa, lora_config, print_trainable_parameters, normalize_answer, exact_match_score, f1_score, edit_distance_score, gold_answer, decode_prediction
from transformers import CanineTokenizer
from peft import LoraConfig, TaskType, get_peft_model
import re
import string
from collections import Counter
import numpy as np
import Levenshtein

from transformers import TrainingArguments, Trainer, TrainerCallback
import json
from huggingface_hub import HfApi, notebook_login, whoami

In [7]:
# notebook_login()
# whoami()

In [8]:
from transformers import CanineTokenizer, CanineForQuestionAnswering
import torch
model_name = 'google/canine-s'
device = torch.device("cuda" if torch.cuda.is_available() else "mps")

tokenizer = CanineTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=False)
model = CanineForQuestionAnswering.from_pretrained(model_name, trust_remote_code=False)

tokenizer_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/657 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/528M [00:00<?, ?B/s]

Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
uqa_dataset = load_dataset("uqa/UQA")
uqa_train = uqa_dataset["train"].shuffle(seed=42).select(range(40000))
uqa_val = uqa_dataset["validation"].shuffle(seed=42).select(range(2000))

README.md:   0%|          | 0.00/898 [00:00<?, ?B/s]

data/train-00000-of-00001-bac007e8ca7192(‚Ä¶):   0%|          | 0.00/30.2M [00:00<?, ?B/s]

data/validation-00000-of-00001-cf8a6960d(‚Ä¶):   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/124745 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16824 [00:00<?, ? examples/s]

In [None]:
# Explore raw UQA dataset structure
print("="*80)
print("UQA DATASET STRUCTURE")
print("="*80)
print(f"Training set size: {len(uqa_train):,} examples")
print(f"Validation set size: {len(uqa_val):,} examples")
print(f"\nDataset columns: {uqa_train.column_names}")
print("\n" + "="*80)

# Show a few examples
print("\nüìù EXAMPLE 1 - Question with Answer")
print("="*80)
ex1 = uqa_train[0]
print(f"Question: {ex1['question']}")
print(f"\nContext (first 300 chars): {ex1['context'][:300]}...")
print(f"\nAnswer: '{ex1['answer']}'")
print(f"Answer starts at character position: {ex1['answer_start']}")

# Verify the answer extraction
if ex1['answer_start'] != -1:
    extracted = ex1['context'][ex1['answer_start']:ex1['answer_start']+len(ex1['answer'])]
    print(f"‚úì Extracted from context: '{extracted}'")
    print(f"‚úì Match: {extracted == ex1['answer']}")

print("\n" + "="*80)
print("\nüìù EXAMPLE 2 - Another Question")
print("="*80)
ex2 = uqa_train[100]
print(f"Question: {ex2['question']}")
print(f"\nContext length: {len(ex2['context'])} characters")
print(f"Answer: '{ex2['answer']}'")
print(f"Answer starts at position: {ex2['answer_start']}")

# Show answer in context
if ex2['answer_start'] != -1:
    start = max(0, ex2['answer_start'] - 50)
    end = min(len(ex2['context']), ex2['answer_start'] + len(ex2['answer']) + 50)
    context_snippet = ex2['context'][start:end]
    answer_pos = ex2['answer_start'] - start
    print(f"\nContext around answer:")
    print(f"...{context_snippet}...")
    print(f"    {' '*answer_pos}{'~'*len(ex2['answer'])} (answer here)")

print("\n" + "="*80)
print("\nüìä DATASET STATISTICS")
print("="*80)

# Compute some basic statistics
import numpy as np
question_lengths = [len(ex['question']) for ex in uqa_train.select(range(1000))]
context_lengths = [len(ex['context']) for ex in uqa_train.select(range(1000))]
answer_lengths = [len(ex['answer']) if ex['answer'] else 0 for ex in uqa_train.select(range(1000))]
has_answer = [ex['answer_start'] != -1 for ex in uqa_train.select(range(1000))]

print(f"Question length (chars): mean={np.mean(question_lengths):.1f}, max={np.max(question_lengths)}")
print(f"Context length (chars): mean={np.mean(context_lengths):.1f}, max={np.max(context_lengths)}")
print(f"Answer length (chars): mean={np.mean(answer_lengths):.1f}, max={np.max(answer_lengths)}")
print(f"Questions with answers: {sum(has_answer)/len(has_answer)*100:.1f}%")
print(f"Questions without answers: {(1-sum(has_answer)/len(has_answer))*100:.1f}%")

## üîç Data Exploration: Understanding the UQA Dataset

Let's explore what the raw dataset looks like before preprocessing.

---

## Updated preprocessors!

Previously, we tried to apply the same approach we used in TYDIQA on UQA, the problem was the preprocessors were aligning the answer spans in units of **byte-level spans** instead of **character-level spans**. The calculations were adding byte-level offsets to the answer lengths, and since Urdu characters may be quantified in multiple bytes, the model was being fed the wrong spans -> GIGO!

In [None]:
"""
FIXED preprocessing function for UQA with CANINE-S.
Copy this into Train_CANINE_S_LoRA_UQA.ipynb cell 8 to replace the existing preprocess_uqa function.

Key fixes:
1. Added byte-to-char conversion helpers (from TyDiQA)
2. Support both byte-based and character-based offsets via use_byte_offsets parameter
3. Changed gold_char_end calculation to be inclusive (removed +1, added -1 after len(answer))
4. Use dynamic cls_index for no-answer cases instead of hardcoded 0
5. Fixed answer chunk boundary check (< instead of <=)
6. Removed incorrect -1 subtraction from end_pos calculation
"""

from bisect import bisect_right

MAX_SEQ_LENGTH = 384
DOC_STRIDE = 64

def _build_byte_to_char_index(text: str) -> list:
    """Build cumulative UTF-8 byte offsets for each character boundary."""
    cumulative = [0]
    for char in text:
        cumulative.append(cumulative[-1] + len(char.encode("utf-8")))
    return cumulative

def _byte_to_char(cumulative_bytes: list, byte_index: int) -> int:
    """Map a byte offset to the nearest character index (floor)."""
    position = bisect_right(cumulative_bytes, byte_index) - 1
    return max(position, 0)

def preprocess_uqa(examples, tokenizer, max_length=MAX_SEQ_LENGTH, doc_stride=DOC_STRIDE, model_obj=None, indices=None, use_byte_offsets=False):
    """
    Robust preprocessing for UQA (Urdu Question Answering) with CANINE-S.
    
    Args:
        examples: Batch of examples with question, context, answer, answer_start fields
        tokenizer: CanineTokenizer instance
        max_length: Maximum sequence length
        doc_stride: Sliding window stride
        model_obj: Optional model object (unused, for compatibility)
        indices: Optional example indices for overflow mapping
        use_byte_offsets: If True, treats answer_start as byte offset (like TyDiQA)
                         If False, treats as character offset (default UQA behavior)
    
    Returns:
        Dict with input_ids, attention_mask, token_type_ids, start_positions, 
        end_positions, overflow_to_sample_mapping
    """
    # Handle tokenizer/model limits safely
    tokenizer_max = getattr(tokenizer, "model_max_length", max_length)
    model_max = getattr(model_obj.config, "max_position_embeddings", None) if model_obj is not None else None
    max_allowed = max_length
    if tokenizer_max is not None and tokenizer_max > 0:
        max_allowed = min(max_allowed, tokenizer_max)
    if model_max is not None and model_max > 0:
        max_allowed = min(max_allowed, model_max)

    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]
    answers = examples["answer"]
    answer_starts = examples["answer_start"]

    encoded = {
        "input_ids": [],
        "attention_mask": [],
        "token_type_ids": [],
        "start_positions": [],
        "end_positions": [],
        "overflow_to_sample_mapping": []
    }

    for i, (question, context, answer, answer_start) in enumerate(zip(questions, contexts, answers, answer_starts)):
        example_idx = indices[i] if indices is not None else i

        # CANINE encodes to characters directly (1 char = 1 token)
        question_ids = tokenizer.encode(question, add_special_tokens=False)
        context_ids = tokenizer.encode(context, add_special_tokens=False)

        # 1. Setup Targets - Convert offsets to character indices
        if answer and answer_start != -1:
            if use_byte_offsets:
                # UQA might use byte offsets for multi-byte Urdu characters
                byte_map = _build_byte_to_char_index(context)
                gold_char_start = _byte_to_char(byte_map, answer_start)
                answer_end_byte = answer_start + len(answer.encode('utf-8'))
                gold_char_end = _byte_to_char(byte_map, answer_end_byte - 1)
            else:
                # Standard character-based offsets
                gold_char_start = answer_start
                # CRITICAL FIX: gold_char_end is INCLUSIVE (points to last char, not past it)
                gold_char_end = answer_start + len(answer) - 1
        else:
            gold_char_start = -1
            gold_char_end = -1

        # 2. Calculate Window Size
        special_tokens_count = tokenizer.num_special_tokens_to_add(pair=True)
        max_context_length = max_allowed - len(question_ids) - special_tokens_count

        if max_context_length <= 0:
            continue

        # 3. Sliding Window Loop
        stride_step = max_context_length - doc_stride
        if stride_step <= 0:
            stride_step = max_context_length

        for chunk_start_idx in range(0, len(context_ids), stride_step):
            chunk_end_idx = min(chunk_start_idx + max_context_length, len(context_ids))
            context_chunk = context_ids[chunk_start_idx:chunk_end_idx]

            # Build inputs with special tokens: [CLS] question [SEP] context [SEP]
            input_ids = tokenizer.build_inputs_with_special_tokens(question_ids, context_chunk)
            token_type_ids = tokenizer.create_token_type_ids_from_sequences(question_ids, context_chunk)
            attention_mask = [1] * len(input_ids)

            # Find where context starts in input_ids
            sep_indices = [k for k, x in enumerate(input_ids) if x == tokenizer.sep_token_id]
            if not sep_indices:
                continue
            context_offset_in_input = sep_indices[0] + 1
            
            # Find CLS position dynamically (should be 0 for CANINE, but be safe)
            cls_index = input_ids.index(tokenizer.cls_token_id) if tokenizer.cls_token_id in input_ids else 0

            # 4. Label Assignment
            # Check if answer is ENTIRELY within this chunk (both start and end)
            is_answer_in_chunk = (
                gold_char_start >= chunk_start_idx and
                gold_char_end <= chunk_end_idx and  # Inclusive: answer must fit within chunk
                gold_char_start != -1
            )

            if is_answer_in_chunk:
                # Map global context indices to local input_ids indices
                start_pos = context_offset_in_input + (gold_char_start - chunk_start_idx)
                end_pos = context_offset_in_input + (gold_char_end - chunk_start_idx)
                # NO -1 here because gold_char_end is already INCLUSIVE
            else:
                # No answer in this chunk - point to [CLS] token
                start_pos = cls_index
                end_pos = cls_index

            # 5. Padding
            pad_len = max_allowed - len(input_ids)
            if pad_len > 0:
                input_ids += [tokenizer.pad_token_id] * pad_len
                attention_mask += [0] * pad_len
                token_type_ids += [0] * pad_len

            # 6. Final Safety Truncation
            if len(input_ids) > max_allowed:
                input_ids = input_ids[:max_allowed]
                attention_mask = attention_mask[:max_allowed]
                token_type_ids = token_type_ids[:max_allowed]
                if start_pos >= max_allowed or end_pos >= max_allowed:
                    start_pos = cls_index
                    end_pos = cls_index

            encoded["input_ids"].append(input_ids)
            encoded["attention_mask"].append(attention_mask)
            encoded["token_type_ids"].append(token_type_ids)
            encoded["start_positions"].append(start_pos)
            encoded["end_positions"].append(end_pos)
            encoded["overflow_to_sample_mapping"].append(example_idx)

            # Break if we've covered the entire context
            if chunk_end_idx >= len(context_ids):
                break

    return encoded


# USAGE EXAMPLE:
# First, test which offset type UQA uses:
# Run: python diagnose_uqa_offsets.py
#
# If character-based (expected):
# processed_train = uqa_train.map(
#     lambda examples, indices: preprocess_uqa(examples, tokenizer, indices=indices, use_byte_offsets=False),
#     batched=True, remove_columns=uqa_train.column_names, with_indices=True
# )
#
# If byte-based (like TyDiQA):
# processed_train = uqa_train.map(
#     lambda examples, indices: preprocess_uqa(examples, tokenizer, indices=indices, use_byte_offsets=True),
#     batched=True, remove_columns=uqa_train.column_names, with_indices=True
# )


In [None]:
# üîç Inspect CANINE-S Architecture to determine layer count
print("="*80)
print("üîç INSPECTING CANINE-S ARCHITECTURE")
print("="*80)

# Count encoder layers
num_encoder_layers = 0
for name, _ in model.named_parameters():
    if "encoder.layer." in name:
        layer_num = int(name.split("encoder.layer.")[1].split(".")[0])
        num_encoder_layers = max(num_encoder_layers, layer_num + 1)

print(f"\nüìä Model Architecture:")
print(f"  Total encoder layers: {num_encoder_layers}")
print(f"  Will apply LoRA to: All {num_encoder_layers} layers (Q, K, V)")
print(f"  Will fully unfreeze: Layer {num_encoder_layers - 1} (last layer only)")
print(f"  Will keep frozen: Layers 0-{num_encoder_layers - 2}")

print(f"\nüéØ Hybrid Strategy:")
print(f"  - LoRA adapters provide parameter-efficient adaptation")
print(f"  - Last layer unfrozen for maximum task-specific expressiveness")
print(f"  - Expected trainable params: ~6-8M (~5% of 133M total)")
print(f"  - Training time increase: ~25% (acceptable trade-off)")
print("\n" + "="*80)

In [None]:
# LoRA config
lora_config = LoraConfig(
    task_type=TaskType.QUESTION_ANS,
    r=16,   # changed from 8
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value", "key"],
    bias="none",
    modules_to_save=["qa_outputs"],
)

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")


In [11]:
# LoRA config
lora_config = LoraConfig(
    task_type=TaskType.QUESTION_ANS,
    r=16,   # changed from 8
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value", "key"],
    bias="none",
    modules_to_save=["qa_outputs"],
)

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")


In [None]:
# Let's manually preprocess ONE example to see what happens step by step
print("="*80)
print("üî¨ PREPROCESSING WALKTHROUGH - Single Example")
print("="*80)

# Take one example
example = uqa_train[0]
print(f"\n1Ô∏è‚É£ ORIGINAL DATA")
print("-"*80)
print(f"Question: {example['question']}")
print(f"Answer: '{example['answer']}'")
print(f"Answer position: {example['answer_start']}")
print(f"Context length: {len(example['context'])} characters")

# Preprocess it
batch = {
    'question': [example['question']],
    'context': [example['context']],
    'answer': [example['answer']],
    'answer_start': [example['answer_start']]
}
processed = preprocess_uqa(batch, tokenizer, indices=[0])

print(f"\n2Ô∏è‚É£ AFTER PREPROCESSING")
print("-"*80)
print(f"Number of chunks created: {len(processed['input_ids'])}")
print(f"(Sliding window creates multiple chunks per example)")

# Show first chunk in detail
chunk_idx = 0
print(f"\n3Ô∏è‚É£ CHUNK {chunk_idx} DETAILS")
print("-"*80)
print(f"Input IDs length: {len(processed['input_ids'][chunk_idx])} tokens")
print(f"Start position: {processed['start_positions'][chunk_idx]}")
print(f"End position: {processed['end_positions'][chunk_idx]}")
print(f"Maps to original example: {processed['overflow_to_sample_mapping'][chunk_idx]}")

# Decode the inputs to show what the model sees
input_ids = processed['input_ids'][chunk_idx]
decoded_input = tokenizer.decode(input_ids, skip_special_tokens=False)
print(f"\n4Ô∏è‚É£ DECODED INPUT (first 400 chars, with special tokens)")
print("-"*80)
print(decoded_input[:400] + "...")

# Decode the labeled answer span
start_pos = processed['start_positions'][chunk_idx]
end_pos = processed['end_positions'][chunk_idx]
cls_idx = input_ids.index(tokenizer.cls_token_id) if tokenizer.cls_token_id in input_ids else 0

if start_pos == cls_idx and end_pos == cls_idx:
    labeled_answer = "[NO ANSWER IN THIS CHUNK]"
else:
    labeled_answer = tokenizer.decode(input_ids[start_pos:end_pos+1], skip_special_tokens=True)

print(f"\n5Ô∏è‚É£ LABELED ANSWER SPAN IN THIS CHUNK")
print("-"*80)
print(f"Gold answer: '{example['answer']}'")
print(f"Labeled span: '{labeled_answer}'")
print(f"Match: {labeled_answer.strip() == example['answer'].strip()}")

# Show all chunks for this example
print(f"\n6Ô∏è‚É£ ALL CHUNKS FOR THIS EXAMPLE")
print("-"*80)
for i in range(len(processed['input_ids'])):
    start = processed['start_positions'][i]
    end = processed['end_positions'][i]
    if start == cls_idx and end == cls_idx:
        chunk_answer = "[NO ANSWER]"
    else:
        chunk_answer = tokenizer.decode(processed['input_ids'][i][start:end+1], skip_special_tokens=True).strip()
    has_answer = "‚úÖ" if chunk_answer == example['answer'].strip() else "‚ùå"
    print(f"  Chunk {i}: {has_answer} '{chunk_answer[:50]}'")

print("\n" + "="*80)

## üîß Preprocessing Exploration: Raw Data ‚Üí Model Input

Now let's see what happens during preprocessing - how we convert text to token IDs and create training labels.

In [12]:
# preprocess the train and val splits
processed_train = uqa_train.map(lambda examples, indices: preprocess_uqa(examples, tokenizer, indices=indices), batched=True, remove_columns=uqa_train.column_names, with_indices=True)
processed_val = uqa_val.map(lambda examples, indices: preprocess_uqa(examples, tokenizer, indices=indices), batched=True, remove_columns=uqa_val.column_names, with_indices=True)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3179 > 2048). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [13]:
# processed_train

In [None]:
# Explore preprocessed dataset structure
print("="*80)
print("üì¶ PREPROCESSED DATASET STRUCTURE")
print("="*80)
print(f"Original training examples: {len(uqa_train):,}")
print(f"Preprocessed training chunks: {len(processed_train):,}")
print(f"Expansion ratio: {len(processed_train)/len(uqa_train):.2f}x")
print(f"(Each example creates ~{len(processed_train)//len(uqa_train):.1f} chunks due to sliding window)")

print(f"\nFeatures in preprocessed data: {processed_train.column_names}")
print(f"\nFeature shapes (for one chunk):")
for col in processed_train.column_names:
    sample = processed_train[0][col]
    if isinstance(sample, list):
        print(f"  - {col}: list of {len(sample)} elements")
    else:
        print(f"  - {col}: scalar value = {sample}")

# Analyze label distribution
print("\n" + "="*80)
print("üìä LABEL DISTRIBUTION IN PREPROCESSED DATA")
print("="*80)

sample_size = min(5000, len(processed_train))
cls_idx = 0  # CLS is at position 0 for CANINE

no_answer_chunks = 0
answer_chunks = 0

for i in range(sample_size):
    start = processed_train[i]['start_positions']
    end = processed_train[i]['end_positions']
    if start == cls_idx and end == cls_idx:
        no_answer_chunks += 1
    else:
        answer_chunks += 1

print(f"Chunks with answer: {answer_chunks:,} ({answer_chunks/sample_size*100:.1f}%)")
print(f"Chunks without answer: {no_answer_chunks:,} ({no_answer_chunks/sample_size*100:.1f}%)")
print(f"\nüí° This is expected! Most chunks don't contain the answer due to sliding window.")
print(f"   Each question gets ~3-5 chunks, but only 1 contains the answer.")

# Show a few preprocessed examples
print("\n" + "="*80)
print("üîç SAMPLE PREPROCESSED CHUNKS")
print("="*80)

for i in [0, 10, 20]:
    chunk = processed_train[i]
    orig_idx = chunk['overflow_to_sample_mapping']
    original = uqa_train[orig_idx]
    
    input_ids = chunk['input_ids']
    start_pos = chunk['start_positions']
    end_pos = chunk['end_positions']
    
    # Decode
    if start_pos == 0 and end_pos == 0:
        labeled = "[NO ANSWER]"
    else:
        labeled = tokenizer.decode(input_ids[start_pos:end_pos+1], skip_special_tokens=True).strip()
    
    print(f"\nChunk {i} (from example {orig_idx}):")
    print(f"  Question: {original['question'][:60]}...")
    print(f"  Gold answer: '{original['answer']}'")
    print(f"  This chunk's label: '{labeled}'")
    print(f"  Positions: [{start_pos}, {end_pos}]")

print("\n" + "="*80)

## üì¶ Understanding the Preprocessed Dataset

Let's explore the full preprocessed dataset structure that gets fed into the model.

In [14]:
# processed_val

In [None]:
# Build Hybrid Model: LoRA + Unfrozen Last Layer
print("="*80)
print("üî® BUILDING HYBRID MODEL")
print("="*80)

# Step 1: Apply LoRA to all layers
print("\n1Ô∏è‚É£ Applying LoRA adapters to all encoder layers...")
peft_model = get_peft_model(model, lora_config)
peft_model.gradient_checkpointing_enable()

print("\nüìä After LoRA (before unfreezing):")
print_trainable_parameters(peft_model)

# Step 2: Unfreeze ONLY the last encoder layer
last_layer_idx = num_encoder_layers - 1
print(f"\n2Ô∏è‚É£ Unfreezing last encoder layer (layer {last_layer_idx})...")

unfrozen_param_count = 0
for name, param in peft_model.named_parameters():
    if f"encoder.layer.{last_layer_idx}." in name:
        param.requires_grad = True
        unfrozen_param_count += param.numel()

print(f"   Unfrozen {unfrozen_param_count:,} additional parameters from layer {last_layer_idx}")

print("\nüìä Final Hybrid Model (LoRA + Unfrozen Last Layer):")
print_trainable_parameters(peft_model)

print("\n‚úÖ Hybrid model ready for training!")
print("="*80)

In [None]:
# Show what the model sees during training
print("="*80)
print("üéì MODEL TRAINING DATA FLOW")
print("="*80)

# Take one batch from preprocessed data
batch_size = 4
sample_batch = processed_train.select(range(batch_size))

print(f"\n1Ô∏è‚É£ BATCH STRUCTURE")
print("-"*80)
print(f"Batch size: {batch_size} chunks")
print(f"Each chunk in the batch contains:")

# Show batch structure
for key in sample_batch.column_names:
    sample_value = sample_batch[0][key]
    if isinstance(sample_value, list):
        print(f"  - {key}: shape ({batch_size}, {len(sample_value)})")
    else:
        print(f"  - {key}: shape ({batch_size},)")

print(f"\n2Ô∏è‚É£ WHAT THE MODEL RECEIVES (for 1 chunk in batch)")
print("-"*80)
example_idx = 0
print(f"Input IDs: {len(sample_batch[example_idx]['input_ids'])} tokens")
print(f"  First 10 token IDs: {sample_batch[example_idx]['input_ids'][:10]}")
print(f"\nAttention mask: {sample_batch[example_idx]['attention_mask'][:20]}...")
print(f"  (1=attend to token, 0=ignore padding)")
print(f"\nToken type IDs: {sample_batch[example_idx]['token_type_ids'][:20]}...")
print(f"  (0=question tokens, 1=context tokens)")

print(f"\n3Ô∏è‚É£ TRAINING TARGETS (what model learns to predict)")
print("-"*80)
print(f"Target start position: {sample_batch[example_idx]['start_positions']}")
print(f"Target end position: {sample_batch[example_idx]['end_positions']}")
print(f"\nüí° The model learns to output these exact positions!")

print("\n" + "="*80)

## üß† Model Training: How Data Flows Through CANINE

Let's understand what happens during training - how the preprocessed chunks get fed into the model and what it learns.

In [None]:
def normalize_answer(text):
    text = (text or "").lower()
    def remove_articles(s):
        return re.sub(r"\b(a|an|the)\b", " ", s)
    def remove_punctuation(s):
        return "".join(ch for ch in s if ch not in string.punctuation)
    def white_space_fix(s):
        return " ".join(s.split())
    return white_space_fix(remove_articles(remove_punctuation(text)))

def exact_match_score(prediction, ground_truth):
    return float(normalize_answer(prediction) == normalize_answer(ground_truth))

def f1_score(prediction, ground_truth):
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    if not gold_tokens:
        return 1.0 if not pred_tokens else 0.0
    if not pred_tokens:
        return 0.0
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    # BUGFIX: Prevent division by zero if both precision and recall are 0
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def decode_prediction(input_ids, start_idx, end_idx, tokenizer):
    # Dynamic CLS handling
    cls_index = input_ids.index(tokenizer.cls_token_id) if tokenizer.cls_token_id in input_ids else 0
    
    # No answer case (both point to CLS)
    if start_idx == cls_index and end_idx == cls_index:
        return ""
    
    # Invalid range (start after end) - treat as no answer
    if start_idx > end_idx:
        return ""
    
    # Defensive bounds checking
    if start_idx < 0 or end_idx < 0:
        return ""
    if start_idx >= len(input_ids) or end_idx >= len(input_ids):
        return ""
    
    # Clamp to valid range (additional safety)
    start_idx = max(start_idx, 0)
    end_idx = min(end_idx, len(input_ids) - 1)
    
    # Decode with inclusive slicing [start:end+1]
    text = tokenizer.decode(input_ids[start_idx:end_idx + 1], skip_special_tokens=True)
    return text.strip()

def gold_answer(example):
    if example["answer_start"] == -1:
        return ""
    return example["answer"]

def edit_distance_score(prediction, ground_truth):
    return Levenshtein.ratio(normalize_answer(prediction), normalize_answer(ground_truth))


def evaluate_checkpoint(checkpoint_path=None, model_instance=None, eval_dataset=None):
    """Evaluate either a checkpoint path (loads model) or a provided model instance.

    - checkpoint_path: path to checkpoint folder
    - model_instance: an in-memory model (preferably a PeftModel or CanineForQuestionAnswering)
    - eval_dataset: optional dataset to evaluate; if None the default processed_val will be used
    """
    if eval_dataset is None:
        eval_dataset = processed_val

    # If a model_instance is given, use it directly (avoid re-loading a fresh base model)
    if model_instance is not None:
        eval_model = model_instance
    else:
        base_model = CanineForQuestionAnswering.from_pretrained(model_name, trust_remote_code=False)
        eval_model = get_peft_model(base_model, lora_config)
        # Try loading adapter weights; fall back to PeftModel.from_pretrained if needed
        try:
            eval_model.load_adapter(checkpoint_path)
        except Exception:
            from peft import PeftModel
            eval_model = PeftModel.from_pretrained(base_model, checkpoint_path)
        
        # HYBRID: Unfreeze last layer to match training configuration
        # Discover layer count
        num_layers = 0
        for name, _ in eval_model.named_parameters():
            if "encoder.layer." in name:
                layer_num = int(name.split("encoder.layer.")[1].split(".")[0])
                num_layers = max(num_layers, layer_num + 1)
        
        # Unfreeze last layer
        if num_layers > 0:
            last_layer_idx = num_layers - 1
            for name, param in eval_model.named_parameters():
                if f"encoder.layer.{last_layer_idx}." in name:
                    param.requires_grad = True

    eval_model.to(device)

    eval_args = TrainingArguments(
        # Small evaluation config; uses cpu/mps if no gpu during eval
        output_dir="outputs/canine-s-uqa-hybrid",
        per_device_eval_batch_size=16,
        dataloader_drop_last=False,
        fp16=True,
        bf16=False,
        report_to="none",
    )

    # Run evaluation via a lightweight Trainer so prediction loop is standard
    eval_trainer = Trainer(
        model=eval_model,
        args=eval_args,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    predictions = eval_trainer.predict(eval_dataset)
    start_logits, end_logits = predictions.predictions
    
    # BUGFIX: Validate logits shape before processing
    if len(start_logits) == 0 or len(end_logits) == 0:
        print("‚ö†Ô∏è Warning: Empty logits received from model!")
        return {"exact_match": 0.0, "f1": 0.0, "edit_distance": 0.0}
    
    if start_logits.shape[0] != end_logits.shape[0]:
        print(f"‚ö†Ô∏è Warning: Mismatched logits shapes: {start_logits.shape} vs {end_logits.shape}")
        return {"exact_match": 0.0, "f1": 0.0, "edit_distance": 0.0}
    
    best_predictions = {}
    for feature_index, feature in enumerate(eval_dataset):
        # Defensive check: ensure feature_index is within logits bounds
        if feature_index >= len(start_logits) or feature_index >= len(end_logits):
            print(f"‚ö†Ô∏è Warning: Feature index {feature_index} out of bounds (logits length: {len(start_logits)})")
            continue
            
        sample_idx = int(feature["overflow_to_sample_mapping"])
        input_ids = feature["input_ids"]
        
        # BUGFIX: Validate logits arrays are non-empty before argmax
        if len(start_logits[feature_index]) == 0 or len(end_logits[feature_index]) == 0:
            print(f"‚ö†Ô∏è Warning: Empty logits at feature {feature_index}, skipping")
            continue
        
        start_idx = int(np.argmax(start_logits[feature_index]))
        end_idx = int(np.argmax(end_logits[feature_index]))
        score = float(start_logits[feature_index][start_idx] + end_logits[feature_index][end_idx])
        prediction_text = decode_prediction(input_ids, start_idx, end_idx, tokenizer=tokenizer)
        stored = best_predictions.get(sample_idx)
        if stored is None or score > stored[0]:
            best_predictions[sample_idx] = (score, prediction_text)

    em_scores = []
    f1_scores = []
    edit_dist_scores = []
    for sample_idx, (_, prediction_text) in best_predictions.items():
        # BUGFIX: Validate sample_idx is within dataset bounds
        if sample_idx >= len(uqa_val):
            print(f"‚ö†Ô∏è Warning: sample_idx {sample_idx} out of bounds (dataset size: {len(uqa_val)})")
            continue

            

        reference = gold_answer(uqa_val[int(sample_idx)])    return {"exact_match": em, "f1": f1, "edit_distance": edit_dist}

        em_scores.append(exact_match_score(prediction_text, reference))    print(f"Edit Distance (normalized): {edit_dist * 100:.2f}")

        f1_scores.append(f1_score(prediction_text, reference))    print(f"F1: {f1 * 100:.2f}")

        edit_dist_scores.append(edit_distance_score(prediction_text, reference))    print(f"Exact Match: {em * 100:.2f}")

    print(f"Examples evaluated: {len(em_scores)}")

    em = float(np.mean(em_scores)) if em_scores else 0.0    edit_dist = float(np.mean(edit_dist_scores)) if edit_dist_scores else 0.0
    f1 = float(np.mean(f1_scores)) if f1_scores else 0.0

In [None]:
training_args = TrainingArguments(
    output_dir="outputs/canine-s-uqa-hybrid",  # Changed for hybrid

    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,

    gradient_accumulation_steps=4,
    gradient_checkpointing=True,

    num_train_epochs=1,
    learning_rate=4e-5,     # Reduced from 5e-5 for hybrid (more trainable params)
    warmup_ratio=0.06,      # Added warmup for stability (~420 steps)
    weight_decay=0.01,
    eval_strategy="no",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    logging_steps=50,
    fp16=True,
    bf16=False,
    report_to="none",
    push_to_hub=True,
    hub_model_id="VohraAK/canine-s-uqa-hybrid",  # Changed for hybrid
    hub_strategy="checkpoint",
    )

class CustomEvalCallback(TrainerCallback):
    def __init__(self, eval_func, eval_dataset, use_in_memory_model=True, verbose=True):
        self.eval_func = eval_func
        self.eval_dataset = eval_dataset
        self.use_in_memory_model = use_in_memory_model
        self.verbose = verbose
        # trainer reference (set after trainer exists)
        self.trainer = None

    def on_save(self, args, state, control, model=None, **kwargs):
        checkpoint_path = f"{args.output_dir}/checkpoint-{state.global_step}"
        if self.verbose:
            print(f"\nüîç Running custom evaluation at step {state.global_step}...")

        # Prefer evaluating the in-memory trainer model (fast + avoids re-loading)
        if self.use_in_memory_model and self.trainer is not None:
            if self.verbose:
                print("Using in-memory model for evaluation (no reloading).")
            try:
                metrics = self.eval_func(checkpoint_path=None, model_instance=self.trainer.model, eval_dataset=self.eval_dataset)
            except Exception as e:
                print("‚ö†Ô∏è in-memory evaluation failed, falling back to checkpoint load:", e)
                metrics = self.eval_func(checkpoint_path)
        else:
            metrics = self.eval_func(checkpoint_path)

        # record metrics in state.log_history
        state.log_history.append({
            "step": state.global_step,
            "eval_exact_match": metrics.get("exact_match"),
            "eval_f1": metrics.get("f1"),
            "eval_edit_distance": metrics.get("edit_distance"),
        })

        if self.verbose:
            print(f"‚úÖ Step {state.global_step}: EM={metrics.get('exact_match',0)*100:.2f}, F1={metrics.get('f1',0)*100:.2f}, EditDist={metrics.get('edit_distance',0)*100:.2f}")

        # Update trainer_state.json to include custom metrics
        state_path = f"{checkpoint_path}/trainer_state.json"
        try:
            with open(state_path, 'r') as f:
                state_dict = json.load(f)
            state_dict['log_history'] = state.log_history
            with open(state_path, 'w') as f:
                json.dump(state_dict, f, indent=2)
            if self.verbose:
                print(f"üíæ Updated trainer_state.json with custom metrics")
        except Exception as e:
            if self.verbose:
                print(f"‚ö†Ô∏è  Warning: Could not update trainer_state.json: {e}")

        try:
            if self.verbose:
                print(f"‚òÅÔ∏è  Pushing checkpoint-{state.global_step} to Hub...")
            api = HfApi()
            api.upload_folder(
                folder_path=checkpoint_path,
                repo_id=args.hub_model_id,
                path_in_repo=f"checkpoint-{state.global_step}",
                commit_message=f"Add checkpoint {state.global_step} (EM={metrics.get('exact_match',0)*100:.1f}%, F1={metrics.get('f1',0)*100:.1f}%)",
                repo_type="model"
            )
            if self.verbose:
                print(f"‚úÖ Pushed checkpoint-{state.global_step} to Hub")
        except Exception as e:
            if self.verbose:
                print(f"‚ö†Ô∏è  Warning: Could not push to Hub: {e}")

        return control

In [20]:
trainer_cb = CustomEvalCallback(evaluate_checkpoint, processed_val, use_in_memory_model=True)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=processed_train,
    eval_dataset=processed_val,
    callbacks=[trainer_cb],
)


No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [21]:
trainer.train()



Step,Training Loss
50,5.9179
100,5.8341
150,5.7689
200,5.6774
250,5.6209
300,5.5737
350,5.4919
400,5.4166
450,5.3625
500,5.338



üîç Running custom evaluation at step 500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 25.30
F1: 26.26
Edit Distance (normalized): 28.09
‚úÖ Step 500: EM=25.30, F1=26.26, EditDist=28.09
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-500 to Hub...
‚úÖ Pushed checkpoint-500 to Hub





üîç Running custom evaluation at step 1000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 29.95
F1: 30.50
Edit Distance (normalized): 31.41
‚úÖ Step 1000: EM=29.95, F1=30.50, EditDist=31.41
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-1000 to Hub...
‚úÖ Pushed checkpoint-1000 to Hub





üîç Running custom evaluation at step 1500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 30.90
F1: 31.27
Edit Distance (normalized): 31.92
‚úÖ Step 1500: EM=30.90, F1=31.27, EditDist=31.92
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-1500 to Hub...
‚úÖ Pushed checkpoint-1500 to Hub





üîç Running custom evaluation at step 2000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 31.35
F1: 31.63
Edit Distance (normalized): 32.17
‚úÖ Step 2000: EM=31.35, F1=31.63, EditDist=32.17
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-2000 to Hub...
‚úÖ Pushed checkpoint-2000 to Hub





üîç Running custom evaluation at step 2500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 31.75
F1: 31.94
Edit Distance (normalized): 32.36
‚úÖ Step 2500: EM=31.75, F1=31.94, EditDist=32.36
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-2500 to Hub...
‚úÖ Pushed checkpoint-2500 to Hub





üîç Running custom evaluation at step 3000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 32.25
F1: 32.46
Edit Distance (normalized): 32.87
‚úÖ Step 3000: EM=32.25, F1=32.46, EditDist=32.87
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-3000 to Hub...
‚úÖ Pushed checkpoint-3000 to Hub





üîç Running custom evaluation at step 3500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 32.35
F1: 32.51
Edit Distance (normalized): 32.86
‚úÖ Step 3500: EM=32.35, F1=32.51, EditDist=32.86
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-3500 to Hub...
‚úÖ Pushed checkpoint-3500 to Hub


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: f77a8601-32db-4ced-a2f7-ce00d8972974)')' thrown while requesting HEAD https://huggingface.co/google/canine-s/resolve/main/config.json
Retrying in 1s [Retry 1/5].



üîç Running custom evaluation at step 4000...


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: ea454141-2b46-42a9-82d4-6615effc264f)')' thrown while requesting HEAD https://huggingface.co/google/canine-s/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 32.50
F1: 32.66
Edit Distance (normalized): 33.01
‚úÖ Step 4000: EM=32.50, F1=32.66, EditDist=33.01
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-4000 to Hub...
‚úÖ Pushed checkpoint-4000 to Hub





üîç Running custom evaluation at step 4500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 32.65
F1: 32.79
Edit Distance (normalized): 33.08
‚úÖ Step 4500: EM=32.65, F1=32.79, EditDist=33.08
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-4500 to Hub...
‚úÖ Pushed checkpoint-4500 to Hub





üîç Running custom evaluation at step 5000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 32.75
F1: 32.87
Edit Distance (normalized): 33.14
‚úÖ Step 5000: EM=32.75, F1=32.87, EditDist=33.14
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-5000 to Hub...
‚úÖ Pushed checkpoint-5000 to Hub





üîç Running custom evaluation at step 5500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 32.80
F1: 32.92
Edit Distance (normalized): 33.19
‚úÖ Step 5500: EM=32.80, F1=32.92, EditDist=33.19
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-5500 to Hub...
‚úÖ Pushed checkpoint-5500 to Hub





üîç Running custom evaluation at step 6000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 32.75
F1: 32.85
Edit Distance (normalized): 33.09
‚úÖ Step 6000: EM=32.75, F1=32.85, EditDist=33.09
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-6000 to Hub...
‚úÖ Pushed checkpoint-6000 to Hub





üîç Running custom evaluation at step 6500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 32.70
F1: 32.80
Edit Distance (normalized): 33.06
‚úÖ Step 6500: EM=32.70, F1=32.80, EditDist=33.06
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-6500 to Hub...
‚úÖ Pushed checkpoint-6500 to Hub





üîç Running custom evaluation at step 7000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 32.70
F1: 32.80
Edit Distance (normalized): 33.06
‚úÖ Step 7000: EM=32.70, F1=32.80, EditDist=33.06
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-7000 to Hub...
‚úÖ Pushed checkpoint-7000 to Hub





üîç Running custom evaluation at step 7313...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Examples evaluated: 2000
Exact Match: 32.70
F1: 32.80
Edit Distance (normalized): 33.06
‚úÖ Step 7313: EM=32.70, F1=32.80, EditDist=33.06
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-7313 to Hub...
‚úÖ Pushed checkpoint-7313 to Hub


TrainOutput(global_step=7313, training_loss=4.007603400027102, metrics={'train_runtime': 2205.5529, 'train_samples_per_second': 53.046, 'train_steps_per_second': 3.316, 'total_flos': 2.9095953406848e+16, 'train_loss': 4.007603400027102, 'epoch': 1.0})

---

### Diagnosing Preprocessing Functions!!!

These functions are just analysing the preprocessing logic above, they're just using the base model, NOT our trained model...

In [26]:
# Diagnostic cell (fixed): Investigate preprocessing and truncation for many samples
import random
import pandas as pd
from transformers import AutoTokenizer

# Set display options to see full Urdu text
pd.set_option('display.max_colwidth', None)

try:
    tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
except Exception:
    tokenizer = None

num_samples = 20000  # Number of samples to check
results = []

for split_name, orig_data, proc_data in [
    ("train", uqa_train, processed_train),
    ("val", uqa_val, processed_val)
]:
    # Sample random indices
    if len(proc_data) < num_samples:
        current_indices = range(len(proc_data))
    else:
        current_indices = random.sample(range(len(proc_data)), num_samples)

    for idx in current_indices:
        proc = proc_data[idx]
        # Use overflow_to_sample_mapping to get the correct original index
        orig_idx = proc["overflow_to_sample_mapping"]
        orig = orig_data[orig_idx]

        input_ids = proc["input_ids"]
        start_pos = proc["start_positions"]
        end_pos = proc["end_positions"]

        gold_answer = orig.get("gold_answer", orig.get("answer", ""))
        question = orig.get("question", "")

        # Decode input_ids to text (for debugging context)
        if tokenizer:
            decoded_text = tokenizer.decode(input_ids, skip_special_tokens=False)
        else:
            decoded_text = str(input_ids)

        # Extract predicted answer span
        if 0 <= start_pos < len(input_ids) and 0 <= end_pos < len(input_ids):
            if tokenizer:
                pred_span = tokenizer.decode(input_ids[start_pos:end_pos+1], skip_special_tokens=True)
            else:
                pred_span = str(input_ids[start_pos:end_pos+1])
        else:
            pred_span = "[CLS]" # Represents no answer found in this chunk or invalid

        # Check if pred_span matches gold answer
        # We strip() to ignore minor whitespace differences
        pred_matches_gold = pred_span.strip() == gold_answer.strip()

        # Check if gold is even reachable in this chunk
        gold_in_decoded = gold_answer in decoded_text

        results.append({
            "Split": split_name,
            "Question": question,
            "Gold Answer": gold_answer,
            "Extracted Answer": pred_span,
            "Match": pred_matches_gold,
            "Gold Reachable": gold_in_decoded,
            "orig_idx": orig_idx
        })

# Create DataFrame
results_df = pd.DataFrame(results)

# --- SIDE BY SIDE COMPARISON ---

# 1. Filter for Solvable Mismatches (Gold was there, but we predicted wrong)
problem_cases = results_df[
    (results_df["Gold Reachable"] == True) &
    (results_df["Match"] == False)
][["Question", "Gold Answer", "Extracted Answer", "Split"]]

print(f"üîç Checked {len(results_df)} samples.")
print(f"‚ùå Found {len(problem_cases)} cases where Gold was present but Extraction failed.")

print("\nüìä Side-by-Side Comparison (Top 20 Failures):")
display(problem_cases.head(50))

print("\n‚úÖ Side-by-Side Comparison (First 10 Rows - Mixed):")
display(results_df[["Question", "Gold Answer", "Extracted Answer", "Match"]].head(50))

üîç Checked 26317 samples.
‚ùå Found 624 cases where Gold was present but Extraction failed.

üìä Side-by-Side Comparison (Top 20 Failures):


Unnamed: 0,Question,Gold Answer,Extracted Answer,Split
43,Close Encounters ŸÜ€í ⁄©ÿ™ŸÜ€í ÿ¢ÿ≥⁄©ÿ± ÿ¨€åÿ™€íÿü,ÿØŸà,,train
160,€å€Å ÿßÿ≥ÿßÿ™ÿ∞€Å ⁄©€í ÿ™ÿßÿ¨ÿ± ÿß⁄©ÿ´ÿ± ⁄©€åÿß ⁄©ÿ±ÿ™€í ÿ™⁄æ€íÿü,ŸÖ€åŸÜ⁄àŸàŸÑ€åŸÜ ÿ¢ÿ±⁄©ÿ≥Ÿπÿ±ÿß,,train
161,ÿ®ÿ±⁄© ⁄©€Åÿß⁄∫ ŸÅ⁄©ÿ± ŸÖŸÜÿØ ÿ™⁄æÿß ⁄©€Å ÿ®ÿ±ÿ∑ÿßŸÜ€å€Å ÿ¨ŸÜ⁄Ø ŸÜ€Å€å⁄∫ ÿ¨€åÿ™ ÿ≥⁄©ÿ™ÿßÿü,ÿßŸÖÿ±€å⁄©€Å,,train
205,1964 ŸÖ€å⁄∫ ÿß€åŸÜ ÿß€í ÿß€åŸÖ ⁄©ÿß ÿ±€ÅŸÜŸÖÿß ⁄©ŸàŸÜ ŸÜÿßŸÖÿ≤ÿØ ⁄©€åÿß ⁄Ø€åÿß ÿ™⁄æÿßÿü,ŸÜÿßÿµÿ±,,train
287,ŸæŸÜ⁄©⁄æŸà⁄∫ ŸàÿßŸÑ€í ⁄©€å⁄ëŸà⁄∫ ⁄©Ÿà ⁄©€åÿß ⁄©€Åÿß ÿ¨ÿßÿ™ÿß €Å€íÿü,ŸæŸπ€åÿ±⁄ØŸàŸπÿß,,train
386,ÿ®€åŸàŸÜÿ≥€å ŸÜ€í 2006 ŸÖ€å⁄∫ ⁄©ÿ≥ ⁄©Ÿà ÿÆÿ±ÿßÿ¨ ÿ™ÿ≠ÿ≥€åŸÜ Ÿæ€åÿ¥ ⁄©€åÿßÿü,ŸÖÿßÿ¶€å⁄©ŸÑ ÿ¨€å⁄©ÿ≥ŸÜ,,train
465,⁄©ŸàŸÜ ÿ≥ÿß ŸÖŸÑ⁄© Ÿæ€ÅŸÑÿß ŸÖŸÑ⁄© ÿ™⁄æÿß ÿ¨ÿ≥ Ÿæÿ± ÿßŸÖÿ±€å⁄©€Å ŸÜ€í ÿ¨ŸÜ⁄Ø ⁄©ÿß ÿßÿπŸÑÿßŸÜ ⁄©€åÿß ÿ™⁄æÿßÿü,ÿ®ÿ±ÿ∑ÿßŸÜ€å€Å,,train
547,Ÿπ€å ⁄à€å ⁄Øÿßÿ±⁄àŸÜ ⁄©ÿ™ŸÜ€å Ÿπ€åŸÖŸà⁄∫ ⁄©ÿß ⁄Ø⁄æÿ± €Å€íÿü,ÿØŸà,,train
555,€Åÿßÿ¶€å⁄© ŸÜ€í ⁄©€Åÿß ÿ™⁄æÿß ⁄©€Å ŸÖÿπÿßÿ¥ÿ±€í ⁄©€í ŸÑ€å€í ÿ≠ŸÅÿßÿ∏ÿ™€å ÿ¨ÿßŸÑ ⁄©ŸàŸÜ ŸÅÿ±ÿß€ÅŸÖ ⁄©ÿ±€í ⁄Øÿßÿü,ÿ±€åÿßÿ≥ÿ™,,train
562,ÿ±Ÿàÿ≥ ÿßŸàÿ± ÿ™ÿ±⁄©€å ⁄©€í ŸÖÿßÿ®€åŸÜ ÿ™ÿµŸÅ€å€Å ⁄©€í ÿ®ÿπÿØ ÿå ÿ¢ÿ≥Ÿπÿ±€åÿß ŸÜ€í ⁄©ÿ≥ ⁄©€í ÿ≥ÿßÿ™⁄æ ÿ¥ÿßŸÖŸÑ €ÅŸàŸÜ€í ⁄©ÿß ŸÅ€åÿµŸÑ€Å ⁄©€åÿßÿü,ÿ™ÿ±⁄©€å,,train



‚úÖ Side-by-Side Comparison (First 10 Rows - Mixed):


Unnamed: 0,Question,Gold Answer,Extracted Answer,Match
0,⁄©ÿ≥ ÿ≥ÿßŸÑ ŸÖ€å⁄∫ ⁄à€åŸπŸàŸÜ ⁄©€í ÿÆÿ¥⁄© ÿ≥ÿßŸÖÿßŸÜ ÿ®ŸÜÿØ ⁄©ÿ± ÿØ€åÿß ⁄Ø€åÿß ÿ™⁄æÿßÿü,,,True
1,⁄ØŸàÿ±ÿ®ÿß⁄ÜŸàŸÅ ŸÜ€í ⁄©€åÿß ÿ™ÿÆŸÑ€åŸÇ ⁄©ÿ±ŸÜ€í ⁄©€å ÿßŸÖ€åÿØ ⁄©€å ÿ™⁄æ€åÿü,ŸÜ€åÿß ÿ≥Ÿæÿ±€åŸÖ ŸÇÿßŸÜŸàŸÜ ÿ≥ÿßÿ≤ ÿßÿØÿßÿ±€Å,ŸÜ€åÿß ÿ≥Ÿæÿ±€åŸÖ ŸÇÿßŸÜŸàŸÜ ÿ≥ÿßÿ≤ ÿßÿØÿßÿ±€Å,True
2,2006 ŸÖ€å⁄∫ ⁄©ŸàŸÜÿ≥ŸÑ ÿ¢ŸÜ ŸÅÿßÿ±ŸÜ ÿ±€åŸÑ€åÿ¥ŸÜÿ≤ ŸÖ€å⁄∫ ŸàÿßŸÜ ŸÜ€åŸàŸÖŸÜ ŸÜ€í ⁄©€åÿß ÿß€åÿ¨ŸÜ⁄àÿß Ÿæ€åÿ¥ ⁄©€åÿß ÿ™⁄æÿßÿü,,,True
3,€åÿ≥Ÿàÿπ ⁄©€í ÿ®ÿßŸÑÿ∫ €ÅŸàŸÜ€í ⁄©€å ŸÅŸÜ⁄©ÿßÿ±ÿßŸÜ€Å ÿ™ÿµŸà€åÿ± ⁄©ÿ¥€åŸà⁄∫ ⁄©Ÿà ⁄©€åÿß ⁄©€Åÿß ÿ¨ÿßÿ™ÿß €Å€íÿü,,,True
4,UF6 ⁄©ŸàŸÜ ÿ≥ÿß ŸÖÿ±⁄©ÿ® €Å€íÿü,€åŸàÿ±€åŸÜ€åŸÖ €Å€å⁄©ÿ≥ÿßŸÅŸÑŸàŸàÿ±ÿßÿ¶⁄à,€åŸàÿ±€åŸÜ€åŸÖ €Å€å⁄©ÿ≥ÿßŸÅŸÑŸàŸàÿ±ÿßÿ¶⁄à,True
5,⁄àÿ≥ŸæŸÑ€í ÿ±€åÿ≤ŸàŸÑŸàÿ¥ŸÜ ŸÖÿßÿ±⁄©€åŸπ ŸÖ€å⁄∫ ÿ≤€åÿßÿØ€Å ÿ™ÿ± ÿ≥ŸæŸÑÿßÿ¶ÿ±ÿ≤ ŸÜ€í 2010 ⁄©€å ÿØ€Åÿßÿ¶€å ŸÖ€å⁄∫ ⁄©€åÿß Ÿæ€åÿ¥ ⁄©€åÿßÿü,,,True
6,Algernon ÿ≥⁄àŸÜ€å ⁄©€åÿß ⁄©€í ŸÑÿ¶€í ÿß€å⁄© ÿÆÿ∑ÿ±€Å ÿ≥ŸÖÿ¨⁄æÿß ÿ¨ÿßÿ™ÿß ÿ™⁄æÿßÿü,,,True
7,⁄©Ÿàÿ≤ÿßŸÜ ⁄©ÿ≥ ⁄Ü€åÿ≤ ÿ≥€í ÿ®ŸÜ€í ÿ™⁄æ€íÿü,ŸÑŸà€Å€í €åÿß ⁄ÜŸÖ⁄ë€í,ŸÑŸà€Å€í €åÿß ⁄ÜŸÖ⁄ë€í,True
8,Cynanthus latirostris ⁄©ÿ≥ ŸÇÿ≥ŸÖ ⁄©ÿß Ÿæÿ±ŸÜÿØ€Å €Å€íÿü,€ÅŸàŸÖŸÜ⁄Ø ÿ®ÿ±⁄à,€ÅŸàŸÖŸÜ⁄Ø ÿ®ÿ±⁄à,True
9,ÿßÿ≥ ⁄©€å ŸÖŸàÿ™ ⁄©€í ÿ®ÿπÿØ Ÿà€åŸàÿ± ⁄©€í ŸÑÿ¶€í ⁄©ŸàŸÜ ŸÑ€í ŸÑ€åÿßÿü,ÿßŸÑÿ®ÿ±Ÿπ ⁄©€åÿ≥ŸÑŸÜ⁄Ø ÿßŸàÿ± €ÅŸÜÿ≥ ÿ¨Ÿàÿ±⁄ØŸÜ ÿßÿ≥ŸπŸÖŸæŸÅ,ÿßŸÑÿ®ÿ±Ÿπ ⁄©€åÿ≥ŸÑŸÜ⁄Ø ÿßŸàÿ± €ÅŸÜÿ≥ ÿ¨Ÿàÿ±⁄ØŸÜ ÿßÿ≥ŸπŸÖŸæŸÅ,True


In [24]:
# Accuracy: fraction of rows where extracted answer matches gold answer
accuracy = (results_df["Match"]).mean()

# Precision: among rows where extracted answer is non-empty, fraction that matches gold
# We filter out cases where the model predicted nothing (empty string) or just whitespace
non_empty_pred = results_df["Extracted Answer"].str.strip() != ""

# Avoid division by zero if no predictions were made
if non_empty_pred.sum() > 0:
    precision = (results_df["Match"] & non_empty_pred).sum() / non_empty_pred.sum()
else:
    precision = 0.0

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")

Accuracy: 0.583
Precision: 1.000


---

## üìã Hybrid Training Summary

**This notebook implements a cost-optimized hybrid training approach:**

### Changes from LoRA-Only Notebook:
1. ‚úÖ **Architecture Inspection Cell** - Discovers CANINE-S layer count programmatically
2. ‚úÖ **Hybrid Model Building** - LoRA adapters + last encoder layer fully unfrozen
3. ‚úÖ **Updated Evaluation** - Applies same unfreezing logic during checkpoint evaluation
4. ‚úÖ **Adjusted Hyperparameters:**
   - Learning rate: 5e-5 ‚Üí 4e-5 (20% reduction for stability)
   - Warmup: 0% ‚Üí 6% (~420 steps for smooth start)
5. ‚úÖ **Updated Paths:**
   - Output: `outputs/canine-s-uqa-hybrid`
   - Hub: `VohraAK/canine-s-uqa-hybrid`

### Expected Results:
- **Trainable Parameters:** ~6-8M (vs ~1M LoRA-only)
- **Training Time:** +25% (acceptable trade-off)
- **Memory:** Minimal increase (fp16 + gradient checkpointing)
- **Target Metrics:** EM/F1 > 40% (breaking through 33% plateau)

### Cost-Benefit Analysis:
| Approach | Params | Time | Memory | Expected EM/F1 |
|----------|--------|------|--------|----------------|
| LoRA Only | 1M | 1.0x | 1.0x | 33% ‚ùå |
| **Hybrid (1 layer)** | **6-8M** | **1.25x** | **1.1x** | **40-55% ‚úÖ** |
| Hybrid (2 layers) | 11-16M | 1.60x | 1.3x | 45-60% (risky) |

**Ready to train! üöÄ**