In [25]:
# %pip install peft evaluate transformers Levenshtein ipywidgets
# %pip install protobuf==3.20.3
# !rm -rf /kaggle/working/cache

In [26]:
import os
os.environ["TRANSFORMERS_DISABLE_CHAT_TEMPLATES"] = "1"
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["TRANSFORMERS_NO_ADDITIONAL_CHAT_TEMPLATES"] = "1"

In [27]:
from datasets import load_dataset, load_from_disk
# from UQA.canine_utils import preprocess_uqa, lora_config, print_trainable_parameters, normalize_answer, exact_match_score, f1_score, edit_distance_score, gold_answer, decode_prediction
from transformers import CanineTokenizer
from peft import LoraConfig, TaskType, get_peft_model
import re
import string
from collections import Counter
import numpy as np
import Levenshtein

from transformers import TrainingArguments, Trainer, TrainerCallback
import json
from huggingface_hub import HfApi, notebook_login, whoami

In [28]:

# notebook_login()
# whoami()

In [29]:
from transformers import CanineTokenizer, CanineForQuestionAnswering
import torch
model_name = 'google/canine-s'
device = torch.device("cuda" if torch.cuda.is_available() else "mps")

tokenizer = CanineTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=False)
model = CanineForQuestionAnswering.from_pretrained(model_name, trust_remote_code=False)

Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
uqa_dataset = load_dataset("uqa/UQA")
uqa_train = uqa_dataset["train"].shuffle(seed=42).select(range(40000))
uqa_val = uqa_dataset["validation"].shuffle(seed=42).select(range(10000))

---

## Updated preprocessors!

Previously, we tried to apply the same approach we used in TYDIQA on UQA, the problem was the preprocessors were aligning the answer spans in units of **byte-level spans** instead of **character-level spans**. The calculations were adding byte-level offsets to the answer lengths, and since Urdu characters may be quantified in multiple bytes, the model was being fed the wrong spans -> GIGO!

In [31]:
MAX_SEQ_LENGTH = 384
DOC_STRIDE = 64

def preprocess_uqa(examples, tokenizer, max_length=MAX_SEQ_LENGTH, doc_stride=DOC_STRIDE, model_obj=None, indices=None):
    # Handle tokenizer/model limits safely
    tokenizer_max = getattr(tokenizer, "model_max_length", max_length)
    model_max = getattr(model_obj.config, "max_position_embeddings", None) if model_obj is not None else None
    max_allowed = max_length
    if tokenizer_max is not None and tokenizer_max > 0:
        max_allowed = min(max_allowed, tokenizer_max)
    if model_max is not None and model_max > 0:
        max_allowed = min(max_allowed, model_max)

    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]
    answers = examples["answer"]
    answer_starts = examples["answer_start"]

    encoded = {
        "input_ids": [],
        "attention_mask": [],
        "token_type_ids": [],
        "start_positions": [],
        "end_positions": [],
        "overflow_to_sample_mapping": []
    }

    for i, (question, context, answer, answer_start) in enumerate(zip(questions, contexts, answers, answer_starts)):
        example_idx = indices[i] if indices is not None else i

        # CANINE encodes to characters directly.
        # add_special_tokens=False gives us raw character IDs.
        question_ids = tokenizer.encode(question, add_special_tokens=False)
        context_ids = tokenizer.encode(context, add_special_tokens=False)

        # 1. Setup Targets (DIRECT MAPPING)
        # UQA answer_start is a CHARACTER index. CANINE tokens are CHARACTERS.
        # Therefore, answer_start maps 1:1 to the context_ids index.
        if answer and answer_start != -1:
            gold_char_start = answer_start
            gold_char_end = answer_start + len(answer) # Points to char AFTER the answer
        else:
            gold_char_start = -1
            gold_char_end = -1

        # 2. Calculate Window Size
        # [CLS] + Question + [SEP] + Context + [SEP]
        special_tokens_count = tokenizer.num_special_tokens_to_add(pair=True)
        max_context_length = max_allowed - len(question_ids) - special_tokens_count

        if max_context_length <= 0:
            # Edge case: Question is too long, skip or truncate question (skipping here for safety)
            continue

        # 3. Sliding Window Loop
        stride_step = max_context_length - doc_stride
        if stride_step <= 0: stride_step = max_context_length # Fallback if doc_stride is too big

        for chunk_start_idx in range(0, len(context_ids), stride_step):
            chunk_end_idx = min(chunk_start_idx + max_context_length, len(context_ids))
            context_chunk = context_ids[chunk_start_idx:chunk_end_idx]

            # Build inputs using tokenizer utility to handle special tokens correctly
            input_ids = tokenizer.build_inputs_with_special_tokens(question_ids, context_chunk)
            token_type_ids = tokenizer.create_token_type_ids_from_sequences(question_ids, context_chunk)
            attention_mask = [1] * len(input_ids)

            # Calculate Offset: Where does the context actually start in input_ids?
            # Typically: [CLS] (1) + Q_Len + [SEP] (1) = Start of Context
            # We calculate this dynamically to be safe:
            sep_indices = [k for k, x in enumerate(input_ids) if x == tokenizer.sep_token_id]
            if not sep_indices:
                continue # Should not happen
            context_offset_in_input = sep_indices[0] + 1

            # 4. Label Assignment
            # Check if the answer lies ENTIRELY within this specific chunk
            is_answer_in_chunk = (
                gold_char_start >= chunk_start_idx and
                gold_char_end <= chunk_end_idx and
                gold_char_start != -1
            )

            if is_answer_in_chunk:
                # Map global context index to local window index
                start_pos = context_offset_in_input + (gold_char_start - chunk_start_idx)
                # -1 because end_positions is usually inclusive in HF Trainers
                end_pos = context_offset_in_input + (gold_char_end - chunk_start_idx) - 1
            else:
                # Label as [CLS] (index 0) if answer is not here
                start_pos = 0
                end_pos = 0

            # 5. Padding
            pad_len = max_allowed - len(input_ids)
            if pad_len > 0:
                input_ids += [tokenizer.pad_token_id] * pad_len
                attention_mask += [0] * pad_len
                token_type_ids += [0] * pad_len

            # 6. Final Safety Truncation (just in case)
            if len(input_ids) > max_allowed:
                input_ids = input_ids[:max_allowed]
                attention_mask = attention_mask[:max_allowed]
                token_type_ids = token_type_ids[:max_allowed]
                # If labels were pushed out by truncation, reset to 0
                if start_pos >= max_allowed or end_pos >= max_allowed:
                    start_pos = 0
                    end_pos = 0

            encoded["input_ids"].append(input_ids)
            encoded["attention_mask"].append(attention_mask)
            encoded["token_type_ids"].append(token_type_ids)
            encoded["start_positions"].append(start_pos)
            encoded["end_positions"].append(end_pos)
            encoded["overflow_to_sample_mapping"].append(example_idx)

            # Break loop if this chunk reached the end of the context
            if chunk_end_idx >= len(context_ids):
                break

    return encoded


In [32]:
# LoRA config
lora_config = LoraConfig(
    task_type=TaskType.QUESTION_ANS,
    r=32,   # changed from 8
    lora_alpha=64,  # changed from 32
    lora_dropout=0.1,
    target_modules=["query", "value", "key"],   # added key, output.dense
    bias="none",
    modules_to_save=["qa_outputs"],
)

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")


In [33]:
# preprocess the train and val splits
processed_train = uqa_train.map(lambda examples, indices: preprocess_uqa(examples, tokenizer, indices=indices), batched=True, remove_columns=uqa_train.column_names, with_indices=True)
processed_val = uqa_val.map(lambda examples, indices: preprocess_uqa(examples, tokenizer, indices=indices), batched=True, remove_columns=uqa_val.column_names, with_indices=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3198 > 2048). Running this sequence through the model will result in indexing errors


In [34]:
# processed_train

In [35]:
# processed_val

In [36]:
processed_train.save_to_disk("/kaggle/working/cache/processed_train_uqa")
processed_val.save_to_disk("/kaggle/working/cache/processed_val_uqa")   # cached it


processed_train = load_from_disk("/kaggle/working/cache/processed_train_uqa")
processed_val = load_from_disk("/kaggle/working/cache/processed_val_uqa")

Saving the dataset (0/1 shards):   0%|          | 0/116995 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/31446 [00:00<?, ? examples/s]

In [37]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")



In [38]:
# build LoRA model

peft_model = get_peft_model(model, lora_config)
peft_model.gradient_checkpointing_enable()
print_trainable_parameters(peft_model)

trainable params: 2065922 || all params: 134150404 || trainable%: 1.5400043074040985


In [39]:
# evals


def normalize_answer(text):
    text = (text or "").lower()
    def remove_articles(s):
        return re.sub(r"\b(a|an|the)\b", " ", s)
    def remove_punctuation(s):
        return "".join(ch for ch in s if ch not in string.punctuation)
    def white_space_fix(s):
        return " ".join(s.split())
    return white_space_fix(remove_articles(remove_punctuation(text)))

def exact_match_score(prediction, ground_truth):
    return float(normalize_answer(prediction) == normalize_answer(ground_truth))

def f1_score(prediction, ground_truth):
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    if not gold_tokens:
        return 1.0 if not pred_tokens else 0.0
    if not pred_tokens:
        return 0.0
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

def edit_distance_score(prediction, ground_truth):
    pred_norm = normalize_answer(prediction)
    gold_norm = normalize_answer(ground_truth)
    if not gold_norm and not pred_norm:
        return 1.0
    if not gold_norm or not pred_norm:
        return 0.0
    distance = Levenshtein.distance(pred_norm, gold_norm)
    max_len = max(len(pred_norm), len(gold_norm))
    return 1.0 - (distance / max_len) if max_len > 0 else 1.0

def gold_answer(example):
    # Extracts the gold answer substring from the context using character offsets
    answer = example.get("answer")
    context = example.get("context")
    answer_start = example.get("answer_start", -1)
    if answer and answer_start is not None and answer_start != -1:
        return context[answer_start: answer_start + len(answer)]
    return "[CLS]"


def decode_prediction(input_ids, start_idx, end_idx, tokenizer=None):
    if start_idx > end_idx:
        start_idx, end_idx = end_idx, start_idx
    if tokenizer is None:
        raise ValueError("Tokenizer must be provided for decoding.")
    cls_index = input_ids.index(tokenizer.cls_token_id)
    # If both point to CLS token, return [CLS] sentinel
    if start_idx == cls_index and end_idx == cls_index:
        return "[CLS]"
    start_idx = max(start_idx, 0)
    end_idx = min(end_idx, len(input_ids) - 1)
    if start_idx > end_idx:
        return "[CLS]"
    text = tokenizer.decode(input_ids[start_idx:end_idx + 1], skip_special_tokens=True)
    text = text.strip()
    return text if text else "[CLS]"


def evaluate_checkpoint(checkpoint_path=None, model_instance=None, eval_dataset=None):
    """Evaluate either a checkpoint path (loads model) or a provided model instance.

    - checkpoint_path: path to checkpoint folder
    - model_instance: an in-memory model (preferably a PeftModel or CanineForQuestionAnswering)
    - eval_dataset: optional dataset to evaluate; if None the default processed_val will be used
    """
    if eval_dataset is None:
        eval_dataset = processed_val

    # If a model_instance is given, use it directly (avoid re-loading a fresh base model)
    if model_instance is not None:
        eval_model = model_instance
    else:
        base_model = CanineForQuestionAnswering.from_pretrained(model_name, trust_remote_code=False)
        eval_model = get_peft_model(base_model, lora_config)
        # Try loading adapter weights; fall back to PeftModel.from_pretrained if needed
        try:
            eval_model.load_adapter(checkpoint_path)
        except Exception:
            from peft import PeftModel
            eval_model = PeftModel.from_pretrained(base_model, checkpoint_path)

    eval_model.to(device)

    eval_args = TrainingArguments(
        # Small evaluation config; uses cpu/mps if no gpu during eval
        output_dir="outputs/canine-s-uqa",
        per_device_eval_batch_size=16,
        dataloader_drop_last=False,
        fp16=True,
        bf16=False,
        report_to="none",
    )

    # Run evaluation via a lightweight Trainer so prediction loop is standard
    eval_trainer = Trainer(
        model=eval_model,
        args=eval_args,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    predictions = eval_trainer.predict(eval_dataset)
    start_logits, end_logits = predictions.predictions
    best_predictions = {}
    for feature_index, feature in enumerate(eval_dataset):
        sample_idx = int(feature["overflow_to_sample_mapping"])
        input_ids = feature["input_ids"]
        start_idx = int(np.argmax(start_logits[feature_index]))
        end_idx = int(np.argmax(end_logits[feature_index]))
        score = float(start_logits[feature_index][start_idx] + end_logits[feature_index][end_idx])
        prediction_text = decode_prediction(input_ids, start_idx, end_idx, tokenizer=tokenizer)
        stored = best_predictions.get(sample_idx)
        if stored is None or score > stored[0]:
            best_predictions[sample_idx] = (score, prediction_text)

    em_scores = []
    f1_scores = []
    edit_dist_scores = []
    for sample_idx, (_, prediction_text) in best_predictions.items():
        reference = gold_answer(uqa_val[int(sample_idx)])
        em_scores.append(exact_match_score(prediction_text, reference))
        f1_scores.append(f1_score(prediction_text, reference))
        edit_dist_scores.append(edit_distance_score(prediction_text, reference))

    em = float(np.mean(em_scores)) if em_scores else 0.0
    f1 = float(np.mean(f1_scores)) if f1_scores else 0.0
    edit_dist = float(np.mean(edit_dist_scores)) if edit_dist_scores else 0.0
    print(f"Examples evaluated: {len(em_scores)}")
    print(f"Exact Match: {em * 100:.2f}")
    print(f"F1: {f1 * 100:.2f}")
    print(f"Edit Distance (normalized): {edit_dist * 100:.2f}")
    return {"exact_match": em, "f1": f1, "edit_distance": edit_dist}


In [40]:
training_args = TrainingArguments(
    output_dir="outputs/canine-s-uqa",

    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,

    gradient_accumulation_steps=4,
    gradient_checkpointing=True,

    num_train_epochs=1,
    learning_rate=3e-4,
    weight_decay=0.01,
    eval_strategy="no",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    logging_steps=25,
    fp16=True,
    bf16=False,
    report_to="none",
    push_to_hub=True,
    hub_model_id="VohraAK/canine-s-uqa",
    hub_strategy="checkpoint",
    )

class CustomEvalCallback(TrainerCallback):
    def __init__(self, eval_func, eval_dataset, use_in_memory_model=True, verbose=True):
        self.eval_func = eval_func
        self.eval_dataset = eval_dataset
        self.use_in_memory_model = use_in_memory_model
        self.verbose = verbose
        # trainer reference (set after trainer exists)
        self.trainer = None

    def on_save(self, args, state, control, model=None, **kwargs):
        checkpoint_path = f"{args.output_dir}/checkpoint-{state.global_step}"
        if self.verbose:
            print(f"\nüîç Running custom evaluation at step {state.global_step}...")

        # Prefer evaluating the in-memory trainer model (fast + avoids re-loading)
        if self.use_in_memory_model and self.trainer is not None:
            if self.verbose:
                print("Using in-memory model for evaluation (no reloading).")
            try:
                metrics = self.eval_func(checkpoint_path=None, model_instance=self.trainer.model, eval_dataset=self.eval_dataset)
            except Exception as e:
                print("‚ö†Ô∏è in-memory evaluation failed, falling back to checkpoint load:", e)
                metrics = self.eval_func(checkpoint_path)
        else:
            metrics = self.eval_func(checkpoint_path)

        # record metrics in state.log_history
        state.log_history.append({
            "step": state.global_step,
            "eval_exact_match": metrics.get("exact_match"),
            "eval_f1": metrics.get("f1"),
            "eval_edit_distance": metrics.get("edit_distance"),
        })

        if self.verbose:
            print(f"‚úÖ Step {state.global_step}: EM={metrics.get('exact_match',0)*100:.2f}, F1={metrics.get('f1',0)*100:.2f}, EditDist={metrics.get('edit_distance',0)*100:.2f}")

        # Update trainer_state.json to include custom metrics
        state_path = f"{checkpoint_path}/trainer_state.json"
        try:
            with open(state_path, 'r') as f:
                state_dict = json.load(f)
            state_dict['log_history'] = state.log_history
            with open(state_path, 'w') as f:
                json.dump(state_dict, f, indent=2)
            if self.verbose:
                print(f"üíæ Updated trainer_state.json with custom metrics")
        except Exception as e:
            if self.verbose:
                print(f"‚ö†Ô∏è  Warning: Could not update trainer_state.json: {e}")

        try:
            if self.verbose:
                print(f"‚òÅÔ∏è  Pushing checkpoint-{state.global_step} to Hub...")
            api = HfApi()
            api.upload_folder(
                folder_path=checkpoint_path,
                repo_id=args.hub_model_id,
                path_in_repo=f"checkpoint-{state.global_step}",
                commit_message=f"Add checkpoint {state.global_step} (EM={metrics.get('exact_match',0)*100:.1f}%, F1={metrics.get('f1',0)*100:.1f}%)",
                repo_type="model"
            )
            if self.verbose:
                print(f"‚úÖ Pushed checkpoint-{state.global_step} to Hub")
        except Exception as e:
            if self.verbose:
                print(f"‚ö†Ô∏è  Warning: Could not push to Hub: {e}")

        return control

In [41]:
trainer_cb = CustomEvalCallback(evaluate_checkpoint, processed_val, use_in_memory_model=True)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=processed_train,
    eval_dataset=processed_val,
    callbacks=[trainer_cb],
)


In [42]:
trainer.train()



Step,Training Loss
25,5.8404
50,5.6255
75,5.3901
100,5.2163
125,5.061
150,4.9508
175,4.8116
200,4.5768
225,4.5701
250,4.4409



üîç Running custom evaluation at step 500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 30.15
F1: 30.52
Edit Distance (normalized): 30.96
‚úÖ Step 500: EM=30.15, F1=30.52, EditDist=30.96
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-500 to Hub...
‚úÖ Pushed checkpoint-500 to Hub





üîç Running custom evaluation at step 1000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 31.22
F1: 31.43
Edit Distance (normalized): 31.73
‚úÖ Step 1000: EM=31.22, F1=31.43, EditDist=31.73
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-1000 to Hub...
‚úÖ Pushed checkpoint-1000 to Hub





üîç Running custom evaluation at step 1500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 31.79
F1: 31.95
Edit Distance (normalized): 32.22
‚úÖ Step 1500: EM=31.79, F1=31.95, EditDist=32.22
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-1500 to Hub...
‚úÖ Pushed checkpoint-1500 to Hub





üîç Running custom evaluation at step 2000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 32.18
F1: 32.32
Edit Distance (normalized): 32.55
‚úÖ Step 2000: EM=32.18, F1=32.32, EditDist=32.55
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-2000 to Hub...
‚úÖ Pushed checkpoint-2000 to Hub





üîç Running custom evaluation at step 2500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 32.47
F1: 32.57
Edit Distance (normalized): 32.79
‚úÖ Step 2500: EM=32.47, F1=32.57, EditDist=32.79
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-2500 to Hub...
‚úÖ Pushed checkpoint-2500 to Hub





üîç Running custom evaluation at step 3000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 32.65
F1: 32.74
Edit Distance (normalized): 32.95
‚úÖ Step 3000: EM=32.65, F1=32.74, EditDist=32.95
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-3000 to Hub...
‚úÖ Pushed checkpoint-3000 to Hub





üîç Running custom evaluation at step 3500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 32.76
F1: 32.83
Edit Distance (normalized): 33.04
‚úÖ Step 3500: EM=32.76, F1=32.83, EditDist=33.04
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-3500 to Hub...
‚úÖ Pushed checkpoint-3500 to Hub





üîç Running custom evaluation at step 4000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 32.92
F1: 33.00
Edit Distance (normalized): 33.20
‚úÖ Step 4000: EM=32.92, F1=33.00, EditDist=33.20
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-4000 to Hub...
‚úÖ Pushed checkpoint-4000 to Hub





üîç Running custom evaluation at step 4500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 32.92
F1: 32.98
Edit Distance (normalized): 33.17
‚úÖ Step 4500: EM=32.92, F1=32.98, EditDist=33.17
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-4500 to Hub...
‚úÖ Pushed checkpoint-4500 to Hub





üîç Running custom evaluation at step 5000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 32.93
F1: 32.99
Edit Distance (normalized): 33.18
‚úÖ Step 5000: EM=32.93, F1=32.99, EditDist=33.18
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-5000 to Hub...
‚úÖ Pushed checkpoint-5000 to Hub





üîç Running custom evaluation at step 5500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 32.95
F1: 33.00
Edit Distance (normalized): 33.20
‚úÖ Step 5500: EM=32.95, F1=33.00, EditDist=33.20
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-5500 to Hub...
‚úÖ Pushed checkpoint-5500 to Hub





üîç Running custom evaluation at step 6000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 33.01
F1: 33.07
Edit Distance (normalized): 33.26
‚úÖ Step 6000: EM=33.01, F1=33.07, EditDist=33.26
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-6000 to Hub...
‚úÖ Pushed checkpoint-6000 to Hub





üîç Running custom evaluation at step 6500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 33.01
F1: 33.06
Edit Distance (normalized): 33.26
‚úÖ Step 6500: EM=33.01, F1=33.06, EditDist=33.26
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-6500 to Hub...
‚úÖ Pushed checkpoint-6500 to Hub





üîç Running custom evaluation at step 7000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 33.01
F1: 33.06
Edit Distance (normalized): 33.26
‚úÖ Step 7000: EM=33.01, F1=33.06, EditDist=33.26
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-7000 to Hub...
‚úÖ Pushed checkpoint-7000 to Hub





üîç Running custom evaluation at step 7313...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(


Examples evaluated: 10000
Exact Match: 33.01
F1: 33.06
Edit Distance (normalized): 33.26
‚úÖ Step 7313: EM=33.01, F1=33.06, EditDist=33.26
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-7313 to Hub...
‚úÖ Pushed checkpoint-7313 to Hub


TrainOutput(global_step=7313, training_loss=2.690779531701358, metrics={'train_runtime': 3842.2612, 'train_samples_per_second': 30.45, 'train_steps_per_second': 1.903, 'total_flos': 2.937418744905216e+16, 'train_loss': 2.690779531701358, 'epoch': 1.0})

### Diagnosing Preprocessing Functions!!!

In [43]:
# Diagnostic cell (fixed): Investigate preprocessing and truncation for many samples
import random
import pandas as pd
from transformers import AutoTokenizer

# Set display options to see full Urdu text
pd.set_option('display.max_colwidth', None)

try:
    tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
except Exception:
    tokenizer = None

num_samples = 20000  # Number of samples to check
results = []

for split_name, orig_data, proc_data in [
    ("train", uqa_train, processed_train),
    ("val", uqa_val, processed_val)
]:
    # Sample random indices
    if len(proc_data) < num_samples:
        current_indices = range(len(proc_data))
    else:
        current_indices = random.sample(range(len(proc_data)), num_samples)

    for idx in current_indices:
        proc = proc_data[idx]
        # Use overflow_to_sample_mapping to get the correct original index
        orig_idx = proc["overflow_to_sample_mapping"]
        orig = orig_data[orig_idx]

        input_ids = proc["input_ids"]
        start_pos = proc["start_positions"]
        end_pos = proc["end_positions"]

        gold_answer = orig.get("gold_answer", orig.get("answer", ""))
        question = orig.get("question", "")

        # Decode input_ids to text (for debugging context)
        if tokenizer:
            decoded_text = tokenizer.decode(input_ids, skip_special_tokens=False)
        else:
            decoded_text = str(input_ids)

        # Extract predicted answer span
        if 0 <= start_pos < len(input_ids) and 0 <= end_pos < len(input_ids):
            if tokenizer:
                pred_span = tokenizer.decode(input_ids[start_pos:end_pos+1], skip_special_tokens=True)
            else:
                pred_span = str(input_ids[start_pos:end_pos+1])
        else:
            pred_span = "[CLS]" # Represents no answer found in this chunk or invalid

        # Check if pred_span matches gold answer
        # We strip() to ignore minor whitespace differences
        pred_matches_gold = pred_span.strip() == gold_answer.strip()

        # Check if gold is even reachable in this chunk
        gold_in_decoded = gold_answer in decoded_text

        results.append({
            "Split": split_name,
            "Question": question,
            "Gold Answer": gold_answer,
            "Extracted Answer": pred_span,
            "Match": pred_matches_gold,
            "Gold Reachable": gold_in_decoded,
            "orig_idx": orig_idx
        })

# Create DataFrame
results_df = pd.DataFrame(results)

# --- SIDE BY SIDE COMPARISON ---

# 1. Filter for Solvable Mismatches (Gold was there, but we predicted wrong)
problem_cases = results_df[
    (results_df["Gold Reachable"] == True) &
    (results_df["Match"] == False)
][["Question", "Gold Answer", "Extracted Answer", "Split"]]

print(f"üîç Checked {len(results_df)} samples.")
print(f"‚ùå Found {len(problem_cases)} cases where Gold was present but Extraction failed.")

print("\nüìä Side-by-Side Comparison (Top 20 Failures):")
display(problem_cases.head(50))

print("\n‚úÖ Side-by-Side Comparison (First 10 Rows - Mixed):")
display(results_df[["Question", "Gold Answer", "Extracted Answer", "Match"]].head(50))

üîç Checked 40000 samples.
‚ùå Found 1012 cases where Gold was present but Extraction failed.

üìä Side-by-Side Comparison (Top 20 Failures):


Unnamed: 0,Question,Gold Answer,Extracted Answer,Split
12,ÿ¥⁄©ÿßÿ±€å ÿßŸàÿ± ÿ¥⁄©ÿßÿ± ⁄©€å ÿ®ÿßÿ™ ⁄Ü€åÿ™ ÿ¨ÿ≥ ŸÖ€å⁄∫ ŸÖ€Åÿßÿ±ÿ™ ÿ¥ÿßŸÖŸÑ €ÅŸàÿ™€å €Å€í ÿßÿ≥ ⁄©€í ŸÜÿ™€åÿ¨€í ŸÖ€å⁄∫ ÿß€å⁄© ÿ™Ÿàÿßÿ≤ŸÜ €ÅŸàÿ™ÿß €Å€í ÿ¨ÿ≥€í ⁄©€Åÿß ÿ¨ÿßÿ™ÿß €Å€íÿü,ÿ™Ÿàÿßÿ≤ŸÜ,,train
43,⁄Ü€åŸÜ€åŸà⁄∫ ⁄©€å ÿ∑ÿ±ŸÅ ÿ≥€í ÿ≠ŸÖŸÑŸà⁄∫ ⁄©€å ÿß€å⁄© ŸÖÿ§ÿ´ÿ± ÿ≥€åÿ±€åÿ≤ ⁄©€í ÿ∑Ÿàÿ± Ÿæÿ± ⁄©€åÿß ÿ¥ÿ±Ÿàÿπ €ÅŸà ÿ≥⁄©ÿ™ÿß €Å€í ÿ¨ÿ≥ ŸÜ€í ⁄©ÿ≥ ÿπŸÑÿßŸÇ€í ⁄©Ÿà ÿØŸàÿ®ÿßÿ±€Å ÿ≠ÿßÿµŸÑ ⁄©€åÿßÿü,ŸÑÿßÿ¶ŸÜ,,train
56,ÿÆÿßŸÜ€Å ÿ¨ŸÜ⁄Ø€å ⁄©€í ÿ®ÿπÿØ ÿ™ÿ±ŸÇ€å ⁄©ÿ±ŸÜ€í ŸàÿßŸÑ€å Ÿæ€ÅŸÑ€å ÿ®⁄ë⁄æÿ™€å €ÅŸàÿ¶€å ÿµŸÜÿπÿ™Ÿà⁄∫ ŸÖ€å⁄∫ ÿ≥€í ÿß€å⁄© ⁄©€åÿß ÿ™⁄æ€åÿü,ÿ™ŸÖÿ®ÿß⁄©Ÿà,,train
60,ÿ®ŸÜÿØŸàŸÇ ⁄©€í ŸÜÿ∏ÿßŸÖ ⁄©Ÿà ŸÖ⁄©ŸÖŸÑ ÿ∑Ÿàÿ± Ÿæÿ± ÿ™ÿ®ÿØ€åŸÑ ⁄©ÿ±ŸÜ€í ⁄©ÿß ÿßŸÖ⁄©ÿßŸÜ ⁄©€åÿß €Å€íÿü,ŸÖ€åÿ≤ÿßÿ¶ŸÑ,,train
92,⁄©ÿ≥ ÿ≥ÿßŸÑ ⁄©€í ÿØŸàÿ±ÿßŸÜ ⁄Ü€åŸàÿßŸàÿß ⁄©Ÿà ÿØÿ¥ŸÖŸÜ ⁄©€í ⁄©ŸÜŸπÿ±ŸàŸÑ ÿ≥€í ÿ¢ÿ≤ÿßÿØ ŸÇÿ±ÿßÿ± ÿØ€åÿß ⁄Ø€åÿß ÿ™⁄æÿßÿü,1866,,train
111,"⁄©ŸàŸÜ ÿ≥ÿß ÿ¨ÿßŸÜŸàÿ± ŸÖÿπÿ¨ÿ≤ÿßÿ™€å ÿ∑Ÿàÿ± Ÿæÿ± ŸàÿßŸæÿ≥ ÿ¢ ⁄Ø€åÿß €Å€í ÿßŸàÿ± ÿßÿ≥€í ""ÿ¨ŸÜ⁄ØŸÑ ŸÖ€å⁄∫ ŸÖÿπÿØŸàŸÖ"" ÿ≥€í ""ŸÜŸÇÿµÿßŸÜ ÿØ€Å"" ŸÖ€å⁄∫ ÿßŸæ ⁄Øÿ±€å⁄à ⁄©€åÿß ⁄Ø€åÿß €Å€íÿü",ÿπÿ±ÿ® ÿßŸàÿ±€å⁄©ÿ≥,,train
120,ÿß€å⁄© ÿπŸÖŸÑ ⁄©€åÿß €Å€í ÿ¨Ÿà ÿ¨ÿ≥ŸÖ ⁄©€í ÿ∞ÿ±€åÿπ€í ⁄©€åÿß ÿ¨ÿßÿ™ÿß €Å€í ÿßŸàÿ± ÿßÿ≥ ŸÖ€å⁄∫ ÿ¨ÿßŸÜ ÿ®Ÿàÿ¨⁄æ ⁄©ÿ± ⁄©Ÿàÿ¥ÿ¥ ÿ¥ÿßŸÖŸÑ €ÅŸàÿ™€å €Å€íÿü,ÿ≥€åŸÑÿß,,train
222,⁄©ŸàŸÜ ÿßŸæŸÜ€å ÿ≤€åÿßÿØ€Å ÿ™ÿ± ŸÖŸàÿ≥€åŸÇ€å ÿÆŸàÿØ ŸÑ⁄©⁄æÿ™ÿß ÿßŸàÿ± ÿ™ÿÆŸÑ€åŸÇ ⁄©ÿ±ÿ™ÿß €Å€íÿü,ŸÖ€å⁄àŸàŸÜÿß,,train
243,⁄©ÿ™ŸÜ€í ÿßŸÖÿ±€å⁄©€å ÿ≥ÿ±ŸÖÿß€å€Å ⁄©ÿßÿ±€å ⁄©€í ÿ®€åŸÜ⁄©Ÿà⁄∫ ŸÜ€í 2004 ÿ≥€í 2007 ÿ™⁄© ÿßŸæŸÜ€í ŸÖÿßŸÑ€å ŸÑ€åŸàÿ±€åÿ¨ ŸÖ€å⁄∫ ŸÜŸÖÿß€åÿß⁄∫ ÿßÿ∂ÿßŸÅ€Å ⁄©€åÿßÿü,ŸæÿßŸÜ⁄Ü,,train
246,ÿßÿ≥ ⁄©€å ÿÆÿµŸàÿµ€åÿßÿ™ ⁄©Ÿà ÿ™ÿ®ÿØ€åŸÑ ⁄©ÿ±ŸÜ€í ⁄©€í ŸÑÿ¶€í ⁄©ÿ≥ ÿ∑ÿ±ÿ≠ ⁄©ÿß ÿπŸÑÿßÿ¨ ⁄©€åÿß ÿ¨ÿß ÿ≥⁄©ÿ™ÿß €Å€íÿü,⁄Øÿ±ŸÖ€å ⁄©€í ÿπŸÑÿßÿ¨,,train



‚úÖ Side-by-Side Comparison (First 10 Rows - Mixed):


Unnamed: 0,Question,Gold Answer,Extracted Answer,Match
0,ÿ±⁄Üÿ±⁄àÿ≥ŸÜ ŸÜ€í ÿØÿπŸà€åŸ∞ ⁄©€åÿß ⁄©€Å ÿ¥Ÿàÿßÿ±ÿ≤ŸÜ€å⁄Øÿ± ⁄©€í Ÿæÿ®ŸÑÿ≥ÿ≥Ÿπ ÿßŸàÿ± ÿßÿ≥ÿ≥ŸπŸÜŸπ ŸÜ€í ÿßÿ≥€í ÿ®ÿØŸÜÿßŸÖ ⁄©ÿ±ŸÜ€í ⁄©€í ŸÑÿ¶€í ⁄©ÿ≥ ÿßÿÆÿ®ÿßÿ± ⁄©ÿß ÿßÿ≥ÿ™ÿπŸÖÿßŸÑ ⁄©€åÿß ÿ™⁄æÿßÿü,ŸÑÿßÿ≥ ÿß€åŸÜÿ¨ŸÑÿ≥ Ÿπÿßÿ¶ŸÖÿ≤,ŸÑÿßÿ≥ ÿß€åŸÜÿ¨ŸÑÿ≥ Ÿπÿßÿ¶ŸÖÿ≤,True
1,⁄©€åÿß ŸÇÿßŸÜŸàŸÜ ŸÖŸÜÿ∏Ÿàÿ± ⁄©€åÿß ⁄Ø€åÿß ÿ™⁄æÿß ÿ¨Ÿà Plebeian ⁄©ŸàŸÜÿ≥ŸÑ patrician ÿ≥€åŸÜ€åŸπÿ±ÿ≤ ⁄©€å ŸÖŸÜÿ∏Ÿàÿ±€å ⁄©€í ÿ®ÿ∫€åÿ± ÿß€å⁄© ÿ®ŸÑ Ÿæÿ± ÿ∫Ÿàÿ± ⁄©ÿ±ŸÜ€í ⁄©€å ÿßÿ¨ÿßÿ≤ÿ™ ÿØ€åÿü,ŸÑ€å⁄©ÿ≥ €ÅŸàÿ±Ÿπ€åŸÜÿ≥€åÿß,,False
2,ÿ®€å⁄©ÿ± ⁄©ÿß ÿÆ€åÿßŸÑ ÿ™⁄æÿß ⁄©€Å ÿßŸÑ€å⁄©Ÿπÿ±ŸàŸÑÿßÿ¶Ÿπ⁄© ⁄©€åŸæÿ≥€åŸπÿ±ÿ≤ ⁄©ÿß ⁄©ŸàŸÜ ÿ≥ÿß ÿ¨ÿ≤Ÿà ŸæŸàÿ±ÿ≥ ⁄©ÿßÿ±ÿ®ŸÜ ÿßŸÑ€å⁄©Ÿπÿ±Ÿà⁄àÿ≥ ÿ≥€í ŸÖÿÆÿ™ŸÑŸÅ €Å€íÿü,,,True
3,⁄©ŸàŸÜ ÿ≥€å ÿßŸÜ⁄Øÿ±€åÿ≤€å ⁄©ÿßŸÑŸàŸÜ€å ŸÜ€í ÿß€åÿ≥€í ŸÇŸàÿßŸÜ€åŸÜ ŸÖŸÜÿ∏Ÿàÿ± ⁄©€å€í ÿ¨ŸÜ€ÅŸà⁄∫ ŸÜ€í ÿ®⁄ÜŸà⁄∫ ⁄©Ÿà ÿ®ÿßŸæ ⁄©€å ÿ≥ŸÖÿßÿ¨€å ÿ≠€åÿ´€åÿ™ ÿØ€åÿü,,,True
4,ÿß€å⁄© ŸÖÿ¥€åŸÜ ⁄©Ÿà ÿßÿ≥ ⁄©€í ÿµÿßÿ±ŸÅ ⁄©€å ÿ∑ÿ±ŸÅ ÿ≥€í ÿ±ÿ≥ÿßÿ¶€å ÿ≠ÿßÿµŸÑ ⁄©ÿ±ŸÜ€í ⁄©€å ÿßÿ¨ÿßÿ≤ÿ™ ÿØ€åÿ™ÿß €Å€íÿü,,,True
5,ÿ™ÿßÿ¨⁄©ÿ≥ÿ™ÿßŸÜ ⁄©€í ÿßÿ≥⁄©ŸàŸÑ ⁄©€í ŸÜÿ∏ÿßŸÖ ŸÖ€å⁄∫ ÿßÿ≥⁄©ŸàŸÑ ⁄©€í ⁄©ÿ™ŸÜ€í ÿ≥ÿßŸÑ €Å€å⁄∫ÿü,11 ÿ≥ÿßŸÑ ⁄©€å Ÿæÿ±ÿßÿ¶ŸÖÿ±€å ÿßŸàÿ± ÿ´ÿßŸÜŸà€å ÿ™ÿπŸÑ€åŸÖ,,False
6,10 ŸÖ€å⁄ØÿßŸàÿßŸπ ÿ≥€í ⁄©ŸÖ €Åÿßÿ¶€å⁄àÿ±Ÿà ŸæŸÑÿßŸÜŸπ ÿ≥€í ÿ®ÿ¨ŸÑ€å ⁄©€å ÿßŸàÿ≥ÿ∑ ŸÑÿß⁄Øÿ™ ⁄©€åÿß €Å€íÿü,,,True
7,ÿµŸÜÿπÿßÿ° ⁄©€í ŸÖÿÆÿ∑Ÿàÿ∑ÿßÿ™ ⁄©ÿ≥ ÿ≥ÿßŸÑ ÿ≥€í Ÿæ€ÅŸÑ€í ÿ™€åÿßÿ± ⁄©€å€í ⁄Øÿ¶€í ÿ™⁄æ€íÿü,671 AD,671 AD,True
8,ŸæŸàŸæ ŸÅÿ±ÿßŸÜÿ≥ÿ≥ ŸÜ€í ÿØŸÜ€åÿß ÿ®⁄æÿ± ŸÖ€å⁄∫ ÿ≥ŸÜÿ™Ÿà⁄∫ ⁄©€í ÿ™€ÅŸàÿßÿ±Ÿà⁄∫ ⁄©€í ÿ¨ŸÜÿ±ŸÑ ÿ±ŸàŸÖŸÜ ⁄©€åŸÑŸÜ⁄àÿ± ŸÖ€å⁄∫ ÿßŸæŸÜ€å ÿßÿÆÿ™€åÿßÿ±€å €åÿßÿØ⁄Øÿßÿ± ⁄©ÿ® ÿ¥ÿßŸÖŸÑ ⁄©€åÿü,11 ÿ≥ÿ™ŸÖÿ®ÿ± 2014,11 ÿ≥ÿ™ŸÖÿ®ÿ± 2014,True
9,13 Ÿà€å⁄∫ ÿßŸàÿ± 14 Ÿà€å⁄∫ ÿµÿØ€å ⁄©€í ÿØŸàÿ±ÿßŸÜ ŸÅÿ±ÿßŸÜÿ≥ ŸÖ€å⁄∫ ⁄©ŸàŸÜ ÿ≥ÿß ÿ®⁄ëÿß ÿ™ŸÜÿßÿ≤ÿπ€Å Ÿæ€åÿØÿß €ÅŸàÿßÿü,ÿ≥Ÿà ÿ≥ÿßŸÑ€Å ÿ¨ŸÜ⁄Ø,,False


In [44]:
# Accuracy: fraction of rows where extracted answer matches gold answer
accuracy = (results_df["Match"]).mean()

# Precision: among rows where extracted answer is non-empty, fraction that matches gold
# We filter out cases where the model predicted nothing (empty string) or just whitespace
non_empty_pred = results_df["Extracted Answer"].str.strip() != ""

# Avoid division by zero if no predictions were made
if non_empty_pred.sum() > 0:
    precision = (results_df["Match"] & non_empty_pred).sum() / non_empty_pred.sum()
else:
    precision = 0.0

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")

Accuracy: 0.578
Precision: 1.000
