<a href="https://colab.research.google.com/github/Rohit01-zoey/gemma270m-competition/blob/main/arcc-lora/lora_for_finetuned_arc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip uninstall -y transformers tokenizers peft -y
!pip install --upgrade transformers accelerate peft datasets bitsandbytes

Found existing installation: transformers 4.57.3
Uninstalling transformers-4.57.3:
  Successfully uninstalled transformers-4.57.3
Found existing installation: tokenizers 0.22.1
Uninstalling tokenizers-0.22.1:
  Successfully uninstalled tokenizers-0.22.1
Found existing installation: peft 0.18.0
Uninstalling peft-0.18.0:
  Successfully uninstalled peft-0.18.0
Collecting transformers
  Using cached transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting peft
  Using cached peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached transformers-4.57.3-py3-none-any.whl (12.0 MB)
Using cached peft-0.18.0-py3-none-any.whl (556 kB)
Using cached tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
Installing collected packages: tokenizers, transformers, peft
Successfully installed peft-0.18.

In [1]:
from datasets import load_dataset
import json

# Output file
out_path = "/content/arc_easy_sft.jsonl"

# Load ARC-Easy instead of ARC-Challenge
arc_easy = load_dataset("allenai/ai2_arc", "ARC-Easy")

# Helper: convert "choices" dict into letter‚Üítext mapping
def choices_to_string(choices):
    out = []
    for c in choices["label"]:
        idx = choices["label"].index(c)
        txt = choices["text"][idx]
        out.append(f"({c}) {txt}")
    return "\n".join(out)

# Build SFT lines
with open(out_path, "w", encoding="utf-8") as fout:
    for item in arc_easy["train"]:
        q = item["question"].strip()
        choices = item["choices"]
        answer = item["answerKey"].strip()

        # Format prompt
        prompt = q + "\nOptions:\n" + choices_to_string(choices)

        record = {
            "messages": [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": answer}
            ]
        }

        fout.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"‚úì Wrote ARC-Easy dataset to: {out_path}")
print(f"Total examples: {len(arc_easy['train'])}")

# Update the data path
SFT_DATA_PATH = "/content/arc_easy_sft.jsonl"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


‚úì Wrote ARC-Easy dataset to: /content/arc_easy_sft.jsonl
Total examples: 2251


In [2]:
from datasets import load_dataset
import json

out_path = "/content/boolq_sft.jsonl"

# Load BoolQ - yes/no questions (very different format)
boolq = load_dataset("google/boolq", split="train")

# Take a subset
boolq = boolq.shuffle(seed=42).select(range(2000))

with open(out_path, "w", encoding="utf-8") as fout:
    for item in boolq:
        question = item["question"].strip()
        passage = item["passage"].strip()
        answer = "Yes" if item["answer"] else "No"

        # Format: passage + question
        prompt = f"Passage: {passage}\n\nQuestion: {question}\nAnswer with Yes or No."

        record = {
            "messages": [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": answer}
            ]
        }
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"‚úì Wrote BoolQ dataset: {out_path}")

# Update paths
SFT_DATA_PATH = "/content/boolq_sft.jsonl"
#LORA_OUTPUT_DIR = "/content/drive/MyDrive/gemma3_lora_boolq"

‚úì Wrote BoolQ dataset: /content/boolq_sft.jsonl


In [3]:
IFT_CHECKPOINT = "/content/drive/MyDrive/checkpoint-85000-darsh"
LORA_OUTPUT_DIR = "/content/drive/MyDrive/gemma3_lora_sft_for_IFT_ARC-C"
EPOCHS = 3

In [4]:
import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import os
import json

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Fix tokenizer_config.json - REMOVE the problematic fields
tokenizer_config_path = os.path.join(IFT_CHECKPOINT, "tokenizer_config.json")

# Read the config
with open(tokenizer_config_path, 'r') as f:
    config = json.load(f)

# Remove ALL potentially problematic fields
problematic_fields = ["model_specific_special_tokens", "extra_special_tokens"]
for field in problematic_fields:
    if field in config:
        print(f"‚úì Removing {field} field")
        del config[field]

# Save the fixed config
with open(tokenizer_config_path, 'w') as f:
    json.dump(config, f, indent=2)

print("‚úì Fixed tokenizer_config.json")

# Clear any cached tokenizer
import importlib
import transformers
if hasattr(transformers, 'tokenization_utils_base'):
    importlib.reload(transformers.tokenization_utils_base)

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    IFT_CHECKPOINT,
    trust_remote_code=True,
    use_fast=True,
    local_files_only=True
)

# Ensure padding token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"‚úì Tokenizer loaded: {tokenizer.__class__.__name__}")

print("Loading model...")
# Load config first
config = AutoConfig.from_pretrained(
    IFT_CHECKPOINT,
    trust_remote_code=True
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    IFT_CHECKPOINT,
    device_map="auto",
    torch_dtype=torch.float32,  # Use full precision
    trust_remote_code=True
)

print("‚úì Model loaded successfully")

Using device: cuda
‚úì Fixed tokenizer_config.json
Loading tokenizer...


The tokenizer you are loading from '/content/drive/MyDrive/checkpoint-85000-darsh' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
`torch_dtype` is deprecated! Use `dtype` instead!


‚úì Tokenizer loaded: GemmaTokenizerFast
Loading model...
‚úì Model loaded successfully


In [5]:
print("Configuring LoRA...")

# For Gemma3, target the attention projection layers
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Configuring LoRA...
trainable params: 1,474,560 || all params: 269,574,656 || trainable%: 0.5470


In [6]:
from datasets import load_dataset

print("Re-tokenizing WITHOUT aggressive masking...")

raw_dataset = load_dataset("json", data_files={"train": SFT_DATA_PATH})
dataset = raw_dataset["train"]

def format_and_tokenize(example):
    # Apply chat template
    text = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False
    )

    # Tokenize the full text
    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
        add_special_tokens=True
    )

    # DON'T mask anything - train on full sequence
    # This is appropriate for your task
    tokenized["labels"] = tokenized["input_ids"].copy()

    return tokenized

tokenized_dataset = dataset.map(
    format_and_tokenize,
    remove_columns=dataset.column_names,
    desc="Tokenizing dataset"
)

print(f"‚úì Dataset tokenized: {len(tokenized_dataset)} examples")

# Verify
sample = tokenized_dataset[0]
masked = sum(1 for x in sample['labels'] if x == -100)
valid = sum(1 for x in sample['labels'] if x != -100)
print(f"  Masked tokens: {masked}")
print(f"  Valid tokens: {valid}")
print(f"  Masking ratio: {masked / len(sample['labels']) * 100:.1f}%")

Re-tokenizing WITHOUT aggressive masking...


Generating train split: 0 examples [00:00, ? examples/s]

Tokenizing dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

‚úì Dataset tokenized: 2000 examples
  Masked tokens: 0
  Valid tokens: 153
  Masking ratio: 0.0%


In [7]:
# Detailed diagnostic
print("\n" + "="*50)
print("TOKENIZATION DIAGNOSTIC")
print("="*50)

sample_idx = 0
sample = tokenized_dataset[sample_idx]

print(f"\n1. Lengths:")
print(f"   Input IDs: {len(sample['input_ids'])}")
print(f"   Labels: {len(sample['labels'])}")

print(f"\n2. Label masking:")
masked_count = sum(1 for x in sample['labels'] if x == -100)
actual_count = sum(1 for x in sample['labels'] if x != -100)
print(f"   Masked tokens (-100): {masked_count}")
print(f"   Actual labels: {actual_count}")
print(f"   Masking ratio: {masked_count / len(sample['labels']):.2%}")

print(f"\n3. Decoded sample:")
print("   Full input:")
full_text = tokenizer.decode(sample['input_ids'])
print(f"   {full_text[:300]}...")

print("\n   Labels only (non-masked):")
label_tokens = [tok if tok != -100 else tokenizer.pad_token_id for tok in sample['labels']]
actual_labels = [tok for tok in sample['labels'] if tok != -100]
if actual_labels:
    label_text = tokenizer.decode(actual_labels)
    print(f"   '{label_text}'")
    print(f"   Raw token IDs: {actual_labels}")

print("\n4. First 30 tokens comparison:")
for i in range(min(30, len(sample['input_ids']))):
    token = tokenizer.decode([sample['input_ids'][i]])
    label = sample['labels'][i]
    print(f"   {i:3d}: '{token:15s}' | Label: {label if label != -100 else 'MASKED'}")

print("="*50 + "\n")


TOKENIZATION DIAGNOSTIC

1. Lengths:
   Input IDs: 153
   Labels: 153

2. Label masking:
   Masked tokens (-100): 0
   Actual labels: 153
   Masking ratio: 0.00%

3. Decoded sample:
   Full input:
   <|im_start|><|im_start|>user
Passage: Henry Daniel Mills is a fictional character in ABC's television series Once Upon a Time. Henry is the boy Emma Swan gave up to adoption; Regina Mills adopted him. Henry was originally portrayed as a child by Jared S. Gilmore, who won the Young Artist Award for B...

   Labels only (non-masked):
   '<|im_start|><|im_start|>user
Passage: Henry Daniel Mills is a fictional character in ABC's television series Once Upon a Time. Henry is the boy Emma Swan gave up to adoption; Regina Mills adopted him. Henry was originally portrayed as a child by Jared S. Gilmore, who won the Young Artist Award for Best Performance in a TV Series -- Leading Young Actor in 2012. For the show's seventh and final season, Andrew J. West later took over the role of Henry as an ad

In [9]:
from dataclasses import dataclass
from typing import Any, Dict, List
import torch

# Custom data collator
@dataclass
class CustomDataCollator:
    tokenizer: AutoTokenizer

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # Extract input_ids and labels
        input_ids = [f["input_ids"] for f in features]
        labels = [f["labels"] for f in features]

        # Find max length in batch
        max_length = max(len(ids) for ids in input_ids)

        # Pad sequences
        padded_input_ids = []
        padded_labels = []
        attention_mask = []

        for ids, lbls in zip(input_ids, labels):
            padding_length = max_length - len(ids)

            # Pad input_ids and attention_mask
            padded_input_ids.append(ids + [self.tokenizer.pad_token_id] * padding_length)
            attention_mask.append([1] * len(ids) + [0] * padding_length)

            # Pad labels (use -100 for padding tokens so they're ignored in loss)
            padded_labels.append(lbls + [-100] * padding_length)

        return {
            "input_ids": torch.tensor(padded_input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(padded_labels, dtype=torch.long)
        }

# Create data collator
data_collator = CustomDataCollator(tokenizer=tokenizer)

print("‚úì Custom data collator created")

‚úì Custom data collator created


In [10]:
# Manual training step to see what's going on
print("=== MANUAL TRAINING STEP TEST ===\n")

model.train()

# Get a batch
batch = data_collator([tokenized_dataset[0], tokenized_dataset[1], tokenized_dataset[2], tokenized_dataset[3]])
batch = {k: v.to(device) for k, v in batch.items()}

print("1. Batch info:")
print(f"   Batch size: {batch['input_ids'].shape}")
print(f"   Device: {batch['input_ids'].device}")

print("\n2. Forward pass:")
outputs = model(**batch)
loss = outputs.loss

print(f"   Loss: {loss.item()}")
print(f"   Loss dtype: {loss.dtype}")
print(f"   Loss requires_grad: {loss.requires_grad}")

if loss.item() == 0.0:
    print("\n   ‚ö†Ô∏è Loss is exactly 0.0 - checking logits...")
    logits = outputs.logits
    print(f"   Logits shape: {logits.shape}")
    print(f"   Logits range: [{logits.min().item():.4f}, {logits.max().item():.4f}]")
    print(f"   Logits mean: {logits.mean().item():.4f}")
    print(f"   Logits std: {logits.std().item():.4f}")

    # Check predictions
    predictions = logits.argmax(dim=-1)
    print(f"\n   Predictions shape: {predictions.shape}")
    print(f"   First 20 predictions: {predictions[0, :20].tolist()}")
    print(f"   First 20 labels: {batch['labels'][0, :20].tolist()}")

    # Check accuracy
    matches = (predictions == batch['labels']).float()
    valid_mask = (batch['labels'] != -100)
    accuracy = matches[valid_mask].mean().item()
    print(f"\n   Accuracy on this batch: {accuracy:.2%}")

print("\n3. Backward pass:")
try:
    loss.backward()
    print("   ‚úì Backward pass successful")

    # Check if gradients exist
    has_grads = False
    for name, param in model.named_parameters():
        if param.requires_grad and param.grad is not None:
            has_grads = True
            print(f"   ‚úì Gradient found for: {name[:50]}... | Grad mean: {param.grad.mean().item():.6f}")
            break

    if not has_grads:
        print("   ‚úó NO GRADIENTS FOUND!")

except Exception as e:
    print(f"   ‚úó Backward pass failed: {e}")

print("\n" + "="*50)

=== MANUAL TRAINING STEP TEST ===

1. Batch info:
   Batch size: torch.Size([4, 218])
   Device: cuda:0

2. Forward pass:
   Loss: 3.488351583480835
   Loss dtype: torch.float32
   Loss requires_grad: True

3. Backward pass:
   ‚úì Backward pass successful
   ‚úì Gradient found for: base_model.model.model.layers.0.self_attn.q_proj.l... | Grad mean: 0.000000



In [11]:
# Recreate trainer with fresh optimizer state
training_args = TrainingArguments(
    output_dir=LORA_OUTPUT_DIR,
    per_device_train_batch_size=1,  # Smaller batch for stability
    gradient_accumulation_steps=16,  # Compensate with more accumulation
    learning_rate=1e-4,  # Lower LR for stability
    num_train_epochs=EPOCHS,
    fp16=False,  # NO FP16 - use full precision
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    optim="adamw_torch",
    warmup_steps=100,
    lr_scheduler_type="cosine",
    max_grad_norm=1.0  # Gradient clipping
)

# Recreate trainer (this resets optimizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

print("‚úì Trainer recreated with fresh optimizer")

# Train
print("\nStarting training with full label supervision...")
trainer.train()

# Save
trainer.save_model(LORA_OUTPUT_DIR)
tokenizer.save_pretrained(LORA_OUTPUT_DIR)
print(f"‚úì Training complete! LoRA adapters saved to: {LORA_OUTPUT_DIR}")

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


‚úì Trainer recreated with fresh optimizer

Starting training with full label supervision...


Step,Training Loss
10,3.0962
20,3.0697
30,3.0229
40,2.8547
50,2.7455
60,2.7606
70,2.657
80,2.6707
90,2.6187
100,2.4997


‚úì Training complete! LoRA adapters saved to: /content/drive/MyDrive/gemma3_lora_sft_for_IFT_ARC-C


In [12]:
# Debug: Check if loss is actually being computed
print("=== DEBUGGING TRAINING ===")

# 1. Check a single batch manually
sample_batch = data_collator([tokenized_dataset[0], tokenized_dataset[1]])

print("\n1. Sample batch shapes:")
print(f"   input_ids: {sample_batch['input_ids'].shape}")
print(f"   labels: {sample_batch['labels'].shape}")
print(f"   attention_mask: {sample_batch['attention_mask'].shape}")

# 2. Check label content
print("\n2. Labels content:")
print(f"   First 30 labels: {sample_batch['labels'][0][:30]}")
print(f"   Num valid labels (not -100): {(sample_batch['labels'] != -100).sum()}")
print(f"   Num -100 labels: {(sample_batch['labels'] == -100).sum()}")

# 3. Manual forward pass
print("\n3. Testing manual forward pass:")
model.eval()
with torch.no_grad():
    # Move batch to device
    batch_device = {k: v.to(device) for k, v in sample_batch.items()}
    outputs = model(**batch_device)
    loss = outputs.loss
    print(f"   Manual loss: {loss.item()}")

# 4. Check if model parameters are actually being updated
print("\n4. Checking if LoRA params are trainable:")
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"   Trainable parameters: {trainable_params:,}")

# 5. Check optimizer state
if hasattr(trainer, 'optimizer'):
    print("\n5. Optimizer learning rate:")
    print(f"   LR: {trainer.optimizer.param_groups[0]['lr']}")

=== DEBUGGING TRAINING ===

1. Sample batch shapes:
   input_ids: torch.Size([2, 153])
   labels: torch.Size([2, 153])
   attention_mask: torch.Size([2, 153])

2. Labels content:
   First 30 labels: tensor([262145, 262145,   2364,    107,   8653,    676, 236787,  12297,  13108,
         40161,    563,    496,  57728,   2872,    528,  21593, 236789, 236751,
         13617,   3605,   9920,  26831,    496,   7578, 236761,  12297,    563,
           506,   6938,  36569])
   Num valid labels (not -100): 276
   Num -100 labels: 30

3. Testing manual forward pass:
   Manual loss: 2.9825456142425537

4. Checking if LoRA params are trainable:
   Trainable parameters: 1,474,560

5. Optimizer learning rate:
   LR: 0.0


In [13]:
print("\n" + "="*50)
print("Testing inference with trained LoRA model...")
print("="*50)

# Reload tokenizer first (with fixes)
test_tokenizer = AutoTokenizer.from_pretrained(
    LORA_OUTPUT_DIR,  # Load from LoRA output dir which has the saved tokenizer
    trust_remote_code=True
)

if test_tokenizer.pad_token is None:
    test_tokenizer.pad_token = test_tokenizer.eos_token

# Reload base model
base_model = AutoModelForCausalLM.from_pretrained(
    IFT_CHECKPOINT,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

# Load LoRA adapters
lora_model = PeftModel.from_pretrained(base_model, LORA_OUTPUT_DIR)
lora_model.eval()

# Test prompt
test_messages = [
    {
        "role": "user",
        "content": "If John has 5 apples and buys 3 more, how many apples does he have?"
    }
]

test_prompt = test_tokenizer.apply_chat_template(
    test_messages,
    tokenize=False,
    add_generation_prompt=True
)

inputs = test_tokenizer(test_prompt, return_tensors="pt").to(device)

print(f"\nTest Input:\n{test_prompt}")
print("\nGenerating response...")

with torch.no_grad():
    outputs = lora_model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=False,  # Changed to greedy for debugging
        pad_token_id=test_tokenizer.pad_token_id,
        eos_token_id=test_tokenizer.eos_token_id
    )

response = test_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"\nModel Response:\n{response}")


Testing inference with trained LoRA model...


The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Test Input:
<|im_start|>user
If John has 5 apples and buys 3 more, how many apples does he have?<|im_end|>
<|im_start|>assistant


Generating response...

Model Response:
user
If John has 5 apples and buys 3 more, how many apples does he have?
assistant



In [20]:
print("\n" + "="*50)
print("Merging LoRA weights into base model...")
print("="*50)

merged_model = lora_model.merge_and_unload()
merged_output_path = "/content/drive/MyDrive/gemma3_lora_merged"

merged_model.save_pretrained(merged_output_path)
tokenizer.save_pretrained(merged_output_path)

print(f"‚úì Merged model saved to: {merged_output_path}")
print("\nAll done! üéâ")

Checking trainable parameters:
trainable params: 1,474,560 || all params: 269,574,656 || trainable%: 0.5470

Checking data collator output:
Input IDs shape: torch.Size([2, 76])
Labels shape: torch.Size([2, 76])
Labels sample (first 20): tensor([262145, 262145,   2364,    107,  38447,   8150,    531,   6962,    914,
          4916,   6077,    684,  71113,   1091, 236761,  15311,   5716,   3761,
           795,   6360])
Number of -100 in labels: 13
Number of valid labels: 63

Checking tokenized dataset:
Sample input_ids: [262145, 262145, 2364, 107, 38447, 8150, 531, 6962, 914, 4916, 6077, 684, 71113, 1091, 236761, 15311, 5716, 3761, 795, 6360]
Sample labels: [262145, 262145, 2364, 107, 38447, 8150, 531, 6962, 914, 4916, 6077, 684, 71113, 1091, 236761, 15311, 5716, 3761, 795, 6360]
Are they the same? True
