---

In [8]:
!pip install peft evaluate transformers Levenshtein
!pip install protobuf==3.20.3



## Reproducing CANINE results on TyDiQA for QA and MasakhaNER for NER

In [None]:
import os
import json

os.environ["TRANSFORMERS_DISABLE_CHAT_TEMPLATES"] = "1"
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["TRANSFORMERS_NO_ADDITIONAL_CHAT_TEMPLATES"] = "1"

from collections import Counter
import re
import string
from typing import Optional

import numpy as np
from peft import PeftModel
from transformers import Trainer, TrainingArguments
from tqdm.auto import tqdm
from datasets import load_dataset, load_from_disk
from evaluate import load as load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, CanineTokenizer, CanineForQuestionAnswering, TrainerCallback
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
import torch
import Levenshtein

# get train and validation subsets
tydiqa_dataset = load_dataset("tydiqa", "primary_task")
tydiqa_train = tydiqa_dataset["train"]
tydiqa_val = tydiqa_dataset["validation"]

tydiqa_train = tydiqa_train.shuffle(seed=42).select(range(2000))
tydiqa_val = tydiqa_val.shuffle(seed=42).select(range(500))

In [10]:
from huggingface_hub import notebook_login, whoami, HfApi

# notebook_login()
whoami()

{'type': 'user',
 'id': '6783c3dea61d3631a3b02839',
 'name': 'VohraAK',
 'fullname': 'Abdullah Khurram Vohra',
 'isPro': False,
 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/noauth/md0nqnvyVM8wKPTAlguu9.png',
 'orgs': [],
 'auth': {'type': 'access_token',
  'accessToken': {'displayName': 'cs5316',
   'role': 'fineGrained',
   'createdAt': '2025-11-07T10:44:22.116Z',
   'fineGrained': {'canReadGatedRepos': True,
    'global': [],
    'scoped': [{'entity': {'_id': '6783c3dea61d3631a3b02839',
       'type': 'user',
       'name': 'VohraAK'},
      'permissions': ['collection.read',
       'repo.content.read',
       'repo.write',
       'inference.serverless.write',
       'inference.endpoints.infer.write',
       'inference.endpoints.write',
       'user.webhooks.read',
       'user.webhooks.write',
       'collection.write']}]}}}}

In [11]:
model_name = 'google/canine-c'
device = torch.device("cuda" if torch.cuda.is_available() else "mps")

Note: evaluating TyDiQA on its primary task. Adjust `MAX_TRAIN_SAMPLES`, `MAX_VAL_SAMPLES`, `MAX_SEQ_LENGTH`, and `MAX_TRAINING_STEPS` above when moving between local debugging and full-scale runs.

In [12]:
tokenizer = CanineTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=False)
model = CanineForQuestionAnswering.from_pretrained(model_name, trust_remote_code=False)

Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-c and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
print([name for name, _ in model.canine.named_modules() if "embedding" in name.lower()][:20])

['char_embeddings', 'char_embeddings.HashBucketCodepointEmbedder_0', 'char_embeddings.HashBucketCodepointEmbedder_1', 'char_embeddings.HashBucketCodepointEmbedder_2', 'char_embeddings.HashBucketCodepointEmbedder_3', 'char_embeddings.HashBucketCodepointEmbedder_4', 'char_embeddings.HashBucketCodepointEmbedder_5', 'char_embeddings.HashBucketCodepointEmbedder_6', 'char_embeddings.HashBucketCodepointEmbedder_7', 'char_embeddings.char_position_embeddings', 'char_embeddings.token_type_embeddings', 'char_embeddings.LayerNorm', 'char_embeddings.dropout']


In [14]:
model.config.use_cache = False

In [15]:
num_params = sum(p.numel() for p in model.parameters())
print(f"Number of model parameters: {num_params}")

Number of model parameters: 132084482


### Creating a preprocess function to align byte offsets to CANINE's character tokens (helped by LLMs)

In [16]:
MAX_SEQ_LENGTH = 384
DOC_STRIDE = 64

from bisect import bisect_right

def _build_byte_to_char_index(text: str) -> list[int]:
    """Return cumulative UTF-8 byte offsets for each character boundary."""
    cumulative = [0]
    for char in text:
        cumulative.append(cumulative[-1] + len(char.encode("utf-8")))
    return cumulative

def _byte_to_char(cumulative_bytes: list[int], byte_index: int) -> int:
    """Map a byte offset to the nearest character index (floor)."""
    position = bisect_right(cumulative_bytes, byte_index) - 1
    return max(position, 0)

def preprocess(examples, tokenizer, max_length: int = MAX_SEQ_LENGTH, doc_stride: int = DOC_STRIDE):
    """Prepare TyDiQA primary-task batches for CANINE fine-tuning."""
    questions = [q.strip() for q in examples["question_text"]]
    contexts = examples["document_plaintext"]
    annotations_list = examples["annotations"]

    special_tokens = tokenizer.num_special_tokens_to_add(pair=True)

    encoded = {
        "input_ids": [],
        "attention_mask": [],
        "token_type_ids": [],
        "start_positions": [],
        "end_positions": [],
        "overflow_to_sample_mapping": [],
    }

    for example_idx, (question, context, annotations) in enumerate(zip(questions, contexts, annotations_list)):
        question_tokens = tokenizer.encode(question, add_special_tokens=False)
        context_tokens = tokenizer.encode(context, add_special_tokens=False)

        max_context_tokens = max_length - len(question_tokens) - special_tokens
        if max_context_tokens <= 0 or not context_tokens:
            continue

        minimal_start = annotations.get("minimal_answers_start_byte", [])
        minimal_end = annotations.get("minimal_answers_end_byte", [])
        if minimal_start and minimal_start[0] != -1:
            byte_map = _build_byte_to_char_index(context)
            start_char = _byte_to_char(byte_map, minimal_start[0])
            end_char = _byte_to_char(byte_map, max(minimal_end[0] - 1, minimal_start[0]))
            answer_span = (start_char, end_char)
        else:
            answer_span = None

        stride_tokens = max_context_tokens - doc_stride
        if stride_tokens <= 0:
            stride_tokens = max_context_tokens

        span_start = 0
        context_length = len(context_tokens)
        while span_start < context_length:
            span_end = min(span_start + max_context_tokens, context_length)
            context_chunk = context_tokens[span_start:span_end]

            input_ids = tokenizer.build_inputs_with_special_tokens(question_tokens, context_chunk)
            token_type_ids = tokenizer.create_token_type_ids_from_sequences(question_tokens, context_chunk)
            attention_mask = [1] * len(input_ids)

            cls_index = input_ids.index(tokenizer.cls_token_id)
            context_offset = len(input_ids) - len(context_chunk) - 1

            if answer_span is None:
                start_pos = cls_index
                end_pos = cls_index
            else:
                start_char, end_char = answer_span
                answer_in_chunk = start_char >= span_start and end_char < span_end
                if answer_in_chunk:
                    start_pos = context_offset + (start_char - span_start)
                    end_pos = context_offset + (end_char - span_start)
                else:
                    start_pos = cls_index
                    end_pos = cls_index

            padding = max_length - len(input_ids)
            if padding > 0:
                pad_id = tokenizer.pad_token_id
                input_ids += [pad_id] * padding
                attention_mask += [0] * padding
                token_type_ids += [0] * padding
            else:
                input_ids = input_ids[:max_length]
                attention_mask = attention_mask[:max_length]
                token_type_ids = token_type_ids[:max_length]
                if start_pos >= max_length or end_pos >= max_length:
                    start_pos = cls_index
                    end_pos = cls_index

            encoded["input_ids"].append(input_ids)
            encoded["attention_mask"].append(attention_mask)
            encoded["token_type_ids"].append(token_type_ids)
            encoded["start_positions"].append(start_pos)
            encoded["end_positions"].append(end_pos)
            encoded["overflow_to_sample_mapping"].append(example_idx)

            if span_end == context_length:
                break
            span_start += stride_tokens

    return encoded

### Verifying byte mappings are correct... (helped by LLMs)

---

### Preprocessing train and test datasets...

In [17]:
# processed_train = tydiqa_train.map(lambda examples: preprocess(examples, tokenizer), batched=True, remove_columns=tydiqa_train.column_names)
# processed_val = tydiqa_val.map(lambda examples: preprocess(examples, tokenizer), batched=True, remove_columns=tydiqa_val.column_names,)

In [18]:
# Save preprocessed train and validation datasets to disk
# processed_train.save_to_disk("cache/processed_train")
# processed_val.save_to_disk("cache/processed_val")

# To reload later:
processed_train = load_from_disk("cache/processed_train")
processed_val = load_from_disk("cache/processed_val")

In [19]:
# processed_val

### Setting up LoRA config

In [20]:
lora_config = LoraConfig(
    task_type=TaskType.QUESTION_ANS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value"],
    bias="none",
    modules_to_save=["qa_outputs"],
)

In [21]:
def print_trainable_parameters(model):
    """
    From https://colab.research.google.com/drive/14xo6sj4dARk8lXZbOifHEn1f_70qNAwy?usp=sharing#scrollTo=4W1j6lxaNnxC
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [22]:
peft_model = get_peft_model(model, lora_config)
peft_model.gradient_checkpointing_enable()
print_trainable_parameters(peft_model)

trainable params: 345602 || all params: 132430084 || trainable%: 0.26096940329661045


In [23]:
# def _resolve_checkpoint_path(checkpoint_path: Optional[str] = None, base_dir: str = "outputs/canine-tydiqa") -> str:
#     if checkpoint_path is None:
#         if not os.path.isdir(base_dir):
#             raise ValueError(f"No base checkpoint dir found at {base_dir}")
#         checkpoints = [
#             os.path.join(base_dir, d)
#             for d in os.listdir(base_dir)
#             if d.startswith("checkpoint-") and os.path.isdir(os.path.join(base_dir, d))
#         ]
#         if not checkpoints:
#             raise ValueError(f"No checkpoints found in {base_dir}")
#         latest = sorted(checkpoints, key=lambda x: int(x.split("-")[-1]))[-1]
#         print(f"Using latest checkpoint: {latest}")
#         return latest
#     if os.path.isdir(checkpoint_path):
#         return checkpoint_path
#     raise ValueError(f"Invalid checkpoint path: {checkpoint_path}")

# def _check_checkpoint_files(path: str):
#     files = os.listdir(path)
#     print(f"Files in checkpoint {path}: {files[:20]} ...")
#     has_peft = "adapter_model.safetensors" in files or "adapter_model.bin" in files
#     has_full = "pytorch_model.bin" in files or "model.safetensors" in files
#     if not has_peft and not has_full:
#         raise ValueError(
#             "Checkpoint is missing adapter weights (adapter_model.bin/.safetensors) "
#             "and full model weights (pytorch_model.bin/model.safetensors). "
#             "Ensure you're pointing evaluate_checkpoint to a directory that holds "
#             "the trained adapter or full model."
#         )
#     return "adapter" if has_peft else "full"

def normalize_answer(text: str) -> str:
    text = (text or "").lower()

    def remove_articles(s: str) -> str:
        return re.sub(r"\b(a|an|the)\b", " ", s)

    def remove_punctuation(s: str) -> str:
        return "".join(ch for ch in s if ch not in string.punctuation)

    def white_space_fix(s: str) -> str:
        return " ".join(s.split())

    return white_space_fix(remove_articles(remove_punctuation(text)))

def exact_match_score(prediction: str, ground_truth: str) -> float:
    return float(normalize_answer(prediction) == normalize_answer(ground_truth))

def f1_score(prediction: str, ground_truth: str) -> float:
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    if not gold_tokens:
        return 1.0 if not pred_tokens else 0.0
    if not pred_tokens:
        return 0.0
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

def edit_distance_score(prediction: str, ground_truth: str) -> float:
    """Compute normalized Levenshtein distance (1.0 = perfect match, 0.0 = completely different)"""
    pred_norm = normalize_answer(prediction)
    gold_norm = normalize_answer(ground_truth)
    if not gold_norm and not pred_norm:
        return 1.0
    if not gold_norm or not pred_norm:
        return 0.0
    distance = Levenshtein.distance(pred_norm, gold_norm)
    max_len = max(len(pred_norm), len(gold_norm))
    return 1.0 - (distance / max_len) if max_len > 0 else 1.0

def _gold_answer(example) -> str:
    annotations = example["annotations"]
    minimal_start = annotations.get("minimal_answers_start_byte", [])
    minimal_end = annotations.get("minimal_answers_end_byte", [])
    if minimal_start and minimal_start[0] != -1:
        context = example["document_plaintext"]
        mapping = _build_byte_to_char_index(context)
        start_char = _byte_to_char(mapping, minimal_start[0])
        end_char = _byte_to_char(mapping, max(minimal_end[0] - 1, minimal_start[0]))
        return context[start_char:end_char + 1]
    return "[CLS]"

def _decode_prediction(input_ids, start_idx: int, end_idx: int) -> str:
    if start_idx > end_idx:
        start_idx, end_idx = end_idx, start_idx
    cls_index = input_ids.index(tokenizer.cls_token_id)
    if start_idx == cls_index and end_idx == cls_index:
        return "[CLS]"
    start_idx = max(start_idx, 0)
    end_idx = min(end_idx, len(input_ids) - 1)
    if start_idx > end_idx:
        return "[CLS]"
    text = tokenizer.decode(input_ids[start_idx:end_idx + 1], skip_special_tokens=True)
    text = text.strip()
    return text if text else "[CLS]"


def evaluate_checkpoint(checkpoint_path: Optional[str] = None):
    base_model = CanineForQuestionAnswering.from_pretrained(model_name, trust_remote_code=False)
    model = PeftModel.from_pretrained(base_model, checkpoint_path)
    model.to(device)

    eval_args = TrainingArguments(
        output_dir="outputs/canine-tydiqa",
        per_device_eval_batch_size=1,
        dataloader_drop_last=False,
        fp16=False,
        bf16=False,
        report_to="none"
    )

    eval_trainer = Trainer(
        model=model,
        args=eval_args,
        eval_dataset=processed_val,
        tokenizer=tokenizer,
    )

    # tqdm progress bar during evaluation
    print(f"üß™ Evaluating checkpoint: {checkpoint_path}")
    with tqdm(total=len(processed_val), desc="Evaluating", unit="samples") as pbar:
        predictions = eval_trainer.predict(processed_val)
        pbar.update(len(processed_val))

    start_logits, end_logits = predictions.predictions

    best_predictions = {}
    for feature_index, feature in enumerate(processed_val):
        sample_idx = int(feature["overflow_to_sample_mapping"])
        input_ids = feature["input_ids"]

        start_idx = int(np.argmax(start_logits[feature_index]))
        end_idx = int(np.argmax(end_logits[feature_index]))
        score = float(start_logits[feature_index][start_idx] + end_logits[feature_index][end_idx])
        prediction_text = _decode_prediction(input_ids, start_idx, end_idx)

        stored = best_predictions.get(sample_idx)
        if stored is None or score > stored[0]:
            best_predictions[sample_idx] = (score, prediction_text)

    em_scores = []
    f1_scores = []
    edit_dist_scores = []
    for sample_idx, (_, prediction_text) in best_predictions.items():
        reference = _gold_answer(tydiqa_val[int(sample_idx)])
        em_scores.append(exact_match_score(prediction_text, reference))
        f1_scores.append(f1_score(prediction_text, reference))
        edit_dist_scores.append(edit_distance_score(prediction_text, reference))

    em = float(np.mean(em_scores)) if em_scores else 0.0
    f1 = float(np.mean(f1_scores)) if f1_scores else 0.0
    edit_dist = float(np.mean(edit_dist_scores)) if edit_dist_scores else 0.0

    print(f"Examples evaluated: {len(em_scores)}")
    print(f"Exact Match: {em * 100:.2f}")
    print(f"F1: {f1 * 100:.2f}")
    print(f"Edit Distance (normalized): {edit_dist * 100:.2f}")

    return {"exact_match": em, "f1": f1, "edit_distance": edit_dist}


In [24]:
# peft_model = peft_model.to(dtype=torch.float16)

training_args = TrainingArguments(
    output_dir="outputs/canine-tydiqa",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=1,
    learning_rate=3e-5,
    weight_decay=0.01,
    eval_strategy="no",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    logging_steps=25,
    fp16=True,
    bf16=False,
    report_to="none",
    push_to_hub=True,
    hub_model_id="VohraAK/canine-tydiqa",
    hub_strategy="checkpoint",  # Automatically pushes each checkpoint
)

class CustomEvalCallback(TrainerCallback):
    def __init__(self, eval_func, eval_dataset):
        self.eval_func = eval_func
        self.eval_dataset = eval_dataset

    def on_save(self, args, state, control, model=None, **kwargs):
        """
        This runs AFTER checkpoint is saved, so we need to:
        1. Run evaluation
        2. Add metrics to state
        3. Re-save the trainer_state.json with updated metrics
        """
        checkpoint_path = f"{args.output_dir}/checkpoint-{state.global_step}"
        print(f"\nüîç Running custom evaluation at step {state.global_step}...")

        # Call your custom evaluation function
        metrics = self.eval_func(checkpoint_path)

        # Add metrics to state's log_history
        state.log_history.append({
            "step": state.global_step,
            "eval_exact_match": metrics["exact_match"],
            "eval_f1": metrics["f1"],
            "eval_edit_distance": metrics["edit_distance"],
        })

        # Print metrics for clarity
        print(f"‚úÖ Step {state.global_step}: EM={metrics['exact_match']*100:.2f}, F1={metrics['f1']*100:.2f}, EditDist={metrics['edit_distance']*100:.2f}")

        # Re-save the trainer_state.json with updated metrics
        state_path = f"{checkpoint_path}/trainer_state.json"
        try:
            # Read the existing state file
            with open(state_path, 'r') as f:
                state_dict = json.load(f)
            
            # Update with our new log_history
            state_dict['log_history'] = state.log_history
            
            # Write it back
            with open(state_path, 'w') as f:
                json.dump(state_dict, f, indent=2)
            
            print(f"üíæ Updated trainer_state.json with custom metrics")
        except Exception as e:
            print(f"‚ö†Ô∏è  Warning: Could not update trainer_state.json: {e}")

        # Manually push this specific checkpoint to Hub as a separate folder
        try:
            print(f"‚òÅÔ∏è  Pushing checkpoint-{state.global_step} to Hub...")
            api = HfApi()
            api.upload_folder(
                folder_path=checkpoint_path,
                repo_id=args.hub_model_id,
                path_in_repo=f"checkpoint-{state.global_step}",  # Each checkpoint in its own folder!
                commit_message=f"Add checkpoint {state.global_step} (EM={metrics['exact_match']*100:.1f}%, F1={metrics['f1']*100:.1f}%)",
                repo_type="model"
            )
            print(f"‚úÖ Pushed checkpoint-{state.global_step} to Hub")
        except Exception as e:
            print(f"‚ö†Ô∏è  Warning: Could not push to Hub: {e}")

        return control


trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=processed_train,
    eval_dataset=processed_val,
    callbacks=[CustomEvalCallback(evaluate_checkpoint, processed_val)],
)

No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# To resume training from a checkpoint, you can pass the checkpoint path to the train method:
trainer.train(resume_from_checkpoint="outputs/canine-tydiqa/checkpoint-3000")
# trainer.train()



Step,Training Loss
3025,1.9124
3050,1.9969
3075,1.9024
3100,1.8544
3125,1.9208
3150,1.8718
3175,1.8616
3200,1.9541
3225,1.8514
3250,1.8033



üîç Running custom evaluation at step 3500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-c and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-tydiqa/checkpoint-3500


Evaluating:   0%|          | 0/28096 [00:00<?, ?samples/s]

Examples evaluated: 500
Exact Match: 63.80
F1: 63.82
Edit Distance (normalized): 64.54
‚úÖ Step 3500: EM=63.80, F1=63.82, EditDist=64.54
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-3500 to Hub...
‚úÖ Pushed checkpoint-3500 to Hub





üîç Running custom evaluation at step 4000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-c and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-tydiqa/checkpoint-4000


Evaluating:   0%|          | 0/28096 [00:00<?, ?samples/s]

Examples evaluated: 500
Exact Match: 64.20
F1: 64.20
Edit Distance (normalized): 64.88
‚úÖ Step 4000: EM=64.20, F1=64.20, EditDist=64.88
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-4000 to Hub...
‚úÖ Pushed checkpoint-4000 to Hub





üîç Running custom evaluation at step 4500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-c and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-tydiqa/checkpoint-4500


Evaluating:   0%|          | 0/28096 [00:00<?, ?samples/s]

Examples evaluated: 500
Exact Match: 64.20
F1: 64.20
Edit Distance (normalized): 64.88
‚úÖ Step 4500: EM=64.20, F1=64.20, EditDist=64.88
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-4500 to Hub...
‚úÖ Pushed checkpoint-4500 to Hub





üîç Running custom evaluation at step 5000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-c and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-tydiqa/checkpoint-5000


Evaluating:   0%|          | 0/28096 [00:00<?, ?samples/s]

Examples evaluated: 500
Exact Match: 64.20
F1: 64.20
Edit Distance (normalized): 64.88
‚úÖ Step 5000: EM=64.20, F1=64.20, EditDist=64.88
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-5000 to Hub...
‚úÖ Pushed checkpoint-5000 to Hub





üîç Running custom evaluation at step 5500...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-c and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-tydiqa/checkpoint-5500


Evaluating:   0%|          | 0/28096 [00:00<?, ?samples/s]

Examples evaluated: 500
Exact Match: 64.20
F1: 64.20
Edit Distance (normalized): 64.88
‚úÖ Step 5500: EM=64.20, F1=64.20, EditDist=64.88
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-5500 to Hub...
‚úÖ Pushed checkpoint-5500 to Hub





üîç Running custom evaluation at step 6000...


Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-c and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


üß™ Evaluating checkpoint: outputs/canine-tydiqa/checkpoint-6000


Evaluating:   0%|          | 0/28096 [00:00<?, ?samples/s]

Examples evaluated: 500
Exact Match: 64.00
F1: 64.00
Edit Distance (normalized): 64.69
‚úÖ Step 6000: EM=64.00, F1=64.00, EditDist=64.69
üíæ Updated trainer_state.json with custom metrics
‚òÅÔ∏è  Pushing checkpoint-6000 to Hub...
‚úÖ Pushed checkpoint-6000 to Hub




### Predictions...