In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        file_path = os.path.join(root, name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Expose only the first GPU (index 0) through CUDA_VISIBLE_DEVICES.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [None]:
# Cell 1: Install Necessary Libraries
!pip install -q transformers datasets peft accelerate evaluate scikit-learn torch

# For Kaggle environments, you might also need to install the specific version of `bitsandbytes` if using QLoRA
!pip install -q bitsandbytes

In [None]:
# Cell 2: Import Libraries and Log in to Hugging Face
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from peft import PeftModel, PeftConfig # Make sure PeftModel is imported
import evaluate
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import os

# Optional: Log in to Hugging Face if the adapter repository is private.
# The token is looked up in Kaggle Secrets first (label HF_TOKEN) and then in
# the HF_TOKEN environment variable, so the cell also runs outside Kaggle.
from huggingface_hub import login

hf_token = None
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    hf_token = user_secrets.get_secret("HF_TOKEN")
except Exception as secret_error:
    # Best-effort lookup: the secrets client may raise (rather than return None)
    # when the secret is not attached or no Kaggle backend is reachable. Without
    # this guard the "token not found" message below could never be printed.
    print(f"Could not read HF_TOKEN from Kaggle Secrets ({type(secret_error).__name__}).")

if not hf_token:
    hf_token = os.environ.get("HF_TOKEN")

if hf_token:
    login(token=hf_token)
else:
    print("Hugging Face token not found. If the adapter is private, please add it as a Kaggle Secret (HF_TOKEN) and re-run.")

# Seed torch and numpy so repeated runs start from the same RNG state.
torch.manual_seed(42)
np.random.seed(42)

In [None]:
# Cell 3: Model / adapter identifiers and tokenizer setup
max_length = 2048  # truncation limit passed to the tokenizer during preprocessing
adaptor_repo_id = "te4bag/GRIT-Full-BoolQ-llama-3.2-3B-Energy-0.9"
base_model_id = "meta-llama/Llama-3.2-3B"

# The tokenizer always comes from the base model, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Batching needs a padding token; when the tokenizer ships without one,
# reuse the end-of-sequence token (both the string and its id).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Report the outcome, then fail loudly if padding is still undefined.
print(f"Tokenizer loaded. Pad token: '{tokenizer.pad_token}', Pad token ID: {tokenizer.pad_token_id}")
if tokenizer.pad_token is None:
    raise ValueError("Padding token is still not set after attempting to assign it. Please check tokenizer configuration.")

In [None]:
# Cell 4: Load the base model and attach the PEFT adapter
# Pick the GPU when one is visible, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 4-bit NF4 quantization with double quantization; matmuls are computed in bf16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Arguments shared by both code paths; quantization is only requested on GPU.
load_kwargs = {"num_labels": 2, "device_map": "auto"}
if device == "cuda":
    load_kwargs["quantization_config"] = bnb_config
    load_kwargs["torch_dtype"] = torch.bfloat16

# Load the base model with a 2-class sequence-classification head.
print(f"Loading base model: {base_model_id}...")
model = AutoModelForSequenceClassification.from_pretrained(base_model_id, **load_kwargs)

# The model config must know the padding id as well, so copy it from the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
print(f"Model's pad_token_id set to: {model.config.pad_token_id}")

# Wrap the base model with the adapter weights from the Hub.
# NOTE(review): presumably the adapter repo also stores the trained classification
# head; if it does not, the head stays randomly initialised - confirm.
print(f"Loading PEFT adapter from: {adaptor_repo_id}...")
model = PeftModel.from_pretrained(model, adaptor_repo_id)

# Switch to evaluation mode.
model.eval()

# The adapter is attached as a wrapper here; nothing in this cell merges it
# into the base weights, so the message must not claim a merge.
print("Base model loaded and PEFT adapter attached (not merged). Model set to eval mode.")

In [None]:
# Cell 5: Load and Preprocess the Dataset (BoolQ validation split)
dataset_id = "google/boolq"
dataset_split = "validation"
print(f"Loading dataset: {dataset_id} - {dataset_split} split...")
# Evaluation uses the 'validation' split; its 'answer' column provides the gold labels.
test_dataset = load_dataset(dataset_id, split=dataset_split)

print(f"Original test dataset size: {len(test_dataset)}")

def preprocess_function(examples):
    """Tokenize a batch of BoolQ rows as (question, passage) text pairs.

    Args:
        examples: Batch mapping that provides the "question" and "passage" columns.

    Returns:
        The tokenizer output for the pairs, truncated to `max_length` tokens.
        No padding is applied here.
    """
    return tokenizer(examples["question"], examples["passage"], truncation=True, max_length=max_length)

# Tokenize the whole split; batched=True hands the function many rows per call.
print("Tokenizing test dataset...")
processed_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Drop the raw text and expose only the label and model inputs as torch tensors.
processed_test_dataset = processed_test_dataset.remove_columns(["question", "passage"])
processed_test_dataset.set_format("torch", columns=["answer", "input_ids", "attention_mask"])

print("Dataset preprocessing complete for the BoolQ validation split.")

In [None]:
# Cell 6: Define Data Collator and Dataloader
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Pad every batch only up to the longest sequence it contains.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

# Evaluation batch size; kept small to limit memory use. Adjust as needed.
eval_batch_size = 8

# DataLoader over the tokenized evaluation split (default order, no shuffling requested).
test_dataloader = DataLoader(processed_test_dataset, batch_size=eval_batch_size, collate_fn=data_collator)
print("DataLoader created for the BoolQ validation set.")

In [None]:
# Cell 7: Perform Inference and Collect Predictions
print("Starting inference on the BoolQ validation set...")
predictions = []
references = []

# No model.to(device) here: the weights were already placed at load time by
# from_pretrained(device_map="auto"), so only the inputs need to be moved.

for batch in test_dataloader:
    # Only the model inputs go to the accelerator.
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    # Predicted class id per example (0 or 1).
    preds = torch.argmax(logits, dim=-1).cpu().numpy()

    # Gold labels are only needed for the metrics, so they never go to the GPU.
    # Cast to integers so references and predictions share the same 0/1 label space.
    labels = batch["answer"].to(torch.long).cpu().numpy()

    references.extend(labels)
    predictions.extend(preds)

print("Inference complete. Collected predictions and true labels.")

In [None]:
# Show only the first few gold labels and predictions instead of dumping every value.
references[:10], predictions[:10]

In [None]:
# Cell 8: Compute and Display Metrics for the BoolQ evaluation

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report
import numpy as np

# Normalise both sequences to integer arrays so boolean gold labels and integer
# predictions are compared in the same 0/1 label space.
y_true = np.asarray(references).astype(int)
y_pred = np.asarray(predictions).astype(int)

print(f"Unique labels in references: {np.unique(y_true)}")
print(f"Unique predictions: {np.unique(y_pred)}")

# Compute the metrics with scikit-learn; skip everything when nothing was collected.
if len(y_true) == 0:
    print("\nWARNING: No samples available for metric computation.")
else:
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='binary', pos_label=1)
    precision = precision_score(y_true, y_pred, average='binary', pos_label=1)
    recall = recall_score(y_true, y_pred, average='binary', pos_label=1)

    print("\n--- Evaluation Results for BoolQ (Validation Split) ---")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score (Binary for class 1): {f1:.4f}")
    print(f"Precision (Binary for class 1): {precision:.4f}")
    print(f"Recall (Binary for class 1): {recall:.4f}")

    print("\nNote: Metrics are computed on the labeled 'validation' split of google/boolq.")

    print("\n--- Detailed Classification Report ---")
    # labels=[0, 1] keeps the report well-defined even when only one class shows
    # up in the data; class 0 is the answer False and class 1 is the answer True.
    print(classification_report(y_true, y_pred, labels=[0, 1], target_names=["False (no)", "True (yes)"]))