In [None]:
# Mount Google Drive at /content/drive; the checkpoint and output directories
# configured further down live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip uninstall -y transformers tokenizers peft -y
!pip install --upgrade transformers accelerate peft datasets bitsandbytes

Found existing installation: transformers 4.57.3
Uninstalling transformers-4.57.3:
  Successfully uninstalled transformers-4.57.3
Found existing installation: tokenizers 0.22.1
Uninstalling tokenizers-0.22.1:
  Successfully uninstalled tokenizers-0.22.1
Found existing installation: peft 0.18.0
Uninstalling peft-0.18.0:
  Successfully uninstalled peft-0.18.0
Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6

In [None]:
from datasets import load_dataset
import json

# Where the chat-formatted ARC-Easy training file is written.
out_path = "/content/arc_easy_sft.jsonl"

# Pull the "ARC-Easy" configuration (rather than ARC-Challenge) of allenai/ai2_arc.
arc_easy = load_dataset(path="allenai/ai2_arc", name="ARC-Easy")

def choices_to_string(choices):
    """Render a ``choices`` record as one ``(label) text`` line per option.

    Args:
        choices: Mapping with two parallel sequences, ``"label"`` and ``"text"``.

    Returns:
        The options joined with newlines in their original order, or an empty
        string when there are no options.
    """
    # Pair labels with texts by position. Looking each label up with
    # list.index() always resolves to the first match, so a repeated label
    # would repeat the first text instead of showing its own.
    return "\n".join(
        f"({label}) {text}"
        for label, text in zip(choices["label"], choices["text"])
    )

# Write the ARC-Easy train split as JSON Lines: each line is a two-turn chat
# (question plus lettered options -> answer key).
with open(out_path, "w", encoding="utf-8") as sink:
    for row in arc_easy["train"]:
        user_turn = {
            "role": "user",
            "content": row["question"].strip() + "\nOptions:\n" + choices_to_string(row["choices"]),
        }
        assistant_turn = {"role": "assistant", "content": row["answerKey"].strip()}

        line = json.dumps({"messages": [user_turn, assistant_turn]}, ensure_ascii=False)
        sink.write(line + "\n")

print(f"Wrote ARC-Easy dataset to: {out_path}")
print(f"Total examples: {len(arc_easy['train'])}")

# File that the tokenization cell reads.
# NOTE(review): the BoolQ cell below reassigns SFT_DATA_PATH, so whichever
# data-prep cell ran last decides the training data.
SFT_DATA_PATH = "/content/arc_easy_sft.jsonl"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

ARC-Easy/train-00000-of-00001.parquet:   0%|          | 0.00/331k [00:00<?, ?B/s]

ARC-Easy/test-00000-of-00001.parquet:   0%|          | 0.00/346k [00:00<?, ?B/s]

ARC-Easy/validation-00000-of-00001.parqu(…):   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2376 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/570 [00:00<?, ? examples/s]

✓ Wrote ARC-Easy dataset to: /content/arc_easy_sft.jsonl
Total examples: 2251


In [None]:
from datasets import load_dataset
import json

# Destination for the chat-formatted BoolQ training file.
out_path = "/content/boolq_sft.jsonl"

# Seeded shuffle of the BoolQ train split, keeping the first 2000 rows.
boolq = load_dataset("google/boolq", split="train").shuffle(seed=42).select(range(2000))

with open(out_path, "w", encoding="utf-8") as sink:
    for row in boolq:
        # Prompt layout: passage first, then the question and the answer instruction.
        user_content = (
            "Passage: " + row["passage"].strip()
            + "\n\nQuestion: " + row["question"].strip()
            + "\nAnswer with Yes or No."
        )
        if row["answer"]:
            verdict = "Yes"
        else:
            verdict = "No"

        chat = {
            "messages": [
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": verdict},
            ]
        }
        sink.write(json.dumps(chat, ensure_ascii=False) + "\n")

# NOTE(review): this replaces the SFT_DATA_PATH set by the ARC-Easy cell, so a
# top-to-bottom run trains on BoolQ — confirm that is intended.
SFT_DATA_PATH = "/content/boolq_sft.jsonl"

In [None]:
# Directory (on the mounted Drive) of the checkpoint that the tokenizer and
# model are loaded from.
IFT_CHECKPOINT = "/content/drive/MyDrive/checkpoint-85000-darsh"
# Directory used as the Trainer output_dir and for the final adapter/tokenizer save.
# NOTE(review): the name says "ARC-C" while the data-prep cell builds ARC-Easy — confirm.
LORA_OUTPUT_DIR = "/content/drive/MyDrive/gemma3_lora_sft_for_IFT_ARC-C"
# Alternative output directory, presumably for a BoolQ run — currently disabled.
#LORA_OUTPUT_DIR = "/content/drive/MyDrive/gemma3_lora_boolq"
# Passed to TrainingArguments as num_train_epochs.
EPOCHS = 3

In [None]:
import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import os
import json

# Report whether CUDA is available. In the visible cells `device` is only used
# for this log line; model placement is left to device_map="auto" below.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# --- Patch the checkpoint's tokenizer_config.json in place -------------------
# The file inside IFT_CHECKPOINT is read, two keys are dropped if present, and
# the result is written back over the original (no backup is kept).
tokenizer_config_path = os.path.join(IFT_CHECKPOINT, "tokenizer_config.json")

with open(tokenizer_config_path, 'r') as f:
    config = json.load(f)

# Keys removed before loading; presumably they prevent AutoTokenizer from
# loading this checkpoint with the installed transformers — TODO confirm.
problematic_fields = ["model_specific_special_tokens", "extra_special_tokens"]
for field in problematic_fields:
    if field in config:
        print(f"✓ Removing {field} field")
        del config[field]

# The file is rewritten on every run, even when no key was removed.
with open(tokenizer_config_path, 'w') as f:
    json.dump(config, f, indent=2)

print("✓ Fixed tokenizer_config.json")

# Reload transformers.tokenization_utils_base when that attribute exists.
# NOTE(review): presumably meant to discard state cached before the patch — confirm it is needed.
import importlib
import transformers
if hasattr(transformers, 'tokenization_utils_base'):
    importlib.reload(transformers.tokenization_utils_base)

# --- Tokenizer ----------------------------------------------------------------
# Loaded strictly from the local checkpoint directory (local_files_only=True).
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    IFT_CHECKPOINT,
    trust_remote_code=True,
    use_fast=True,
    local_files_only=True
)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"✓ Tokenizer loaded: {tokenizer.__class__.__name__}")

# --- Model --------------------------------------------------------------------
print("Loading model...")

# NOTE(review): this rebinds `config` (previously the tokenizer-config dict) to
# the model config, and the value is not passed to from_pretrained below.
config = AutoConfig.from_pretrained(
    IFT_CHECKPOINT,
    trust_remote_code=True
)

# Load the causal-LM weights in float32, letting device_map="auto" decide placement.
model = AutoModelForCausalLM.from_pretrained(
    IFT_CHECKPOINT,
    device_map="auto",
    torch_dtype=torch.float32,
    trust_remote_code=True
)

In [None]:
# LoRA settings: rank-16 adapters (alpha 32, dropout 0.05, no bias training)
# on the modules named q_proj / k_proj / v_proj / o_proj.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Rebind `model` to its PEFT-wrapped version and report the trainable share.
# NOTE(review): re-running this cell would pass the already wrapped model to
# get_peft_model again — restart from the model-loading cell instead.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
from datasets import load_dataset

# Read the JSONL file named by SFT_DATA_PATH back in as a split called "train".
raw_dataset = load_dataset(path="json", data_files={"train": SFT_DATA_PATH})
dataset = raw_dataset["train"]

def format_and_tokenize(example):
    """Render one chat record with the chat template and tokenize it.

    Args:
        example: Row containing a ``"messages"`` list of chat turns.

    Returns:
        The tokenizer output for the rendered conversation (truncated to 512
        tokens, not padded) with an added ``"labels"`` entry that is a copy of
        ``"input_ids"``.
    """
    # Render to a plain string first; no generation prompt is appended.
    rendered = tokenizer.apply_chat_template(
        example["messages"], tokenize=False, add_generation_prompt=False
    )

    # NOTE(review): add_special_tokens=True is applied to already templated
    # text; if the template itself emits BOS this may duplicate it — confirm.
    encoding = tokenizer(
        rendered,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
    )

    # Labels mirror every input token; nothing is set to -100 here.
    # NOTE(review): the diagnostics after the map call count -100 labels, so
    # prompt masking may have been intended — confirm.
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

# Tokenize every record and drop the original columns, keeping only what
# format_and_tokenize returns.
tokenized_dataset = dataset.map(
    format_and_tokenize,
    desc="Tokenizing dataset",
    remove_columns=dataset.column_names,
)

print(f"Dataset tokenized: {len(tokenized_dataset)} examples")

# Label statistics for the first example: how many positions carry -100.
sample = tokenized_dataset[0]
sample_labels = sample['labels']
masked = sample_labels.count(-100)
valid = len(sample_labels) - masked
print(f"  Masked tokens: {masked}")
print(f"  Valid tokens: {valid}")
print(f"  Masking ratio: {masked / len(sample['labels']) * 100:.1f}%")

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List
import torch

@dataclass
class CustomDataCollator:
    """Pad a list of tokenized examples into one rectangular batch of tensors.

    Every example is right-padded to the longest ``input_ids`` in the batch:
    ``input_ids`` with the tokenizer's pad token id, ``attention_mask`` with 0
    and ``labels`` with -100.
    """

    # Only ``pad_token_id`` is read from this object. Annotated as ``Any``
    # because ``AutoTokenizer`` is a loader class, not the type of the
    # tokenizer instances it returns.
    tokenizer: Any

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into ``input_ids``, ``attention_mask`` and ``labels``.

        Args:
            features: Examples, each with ``"input_ids"`` and ``"labels"``
                sequences of equal length.

        Returns:
            Dict of three ``torch.long`` tensors shaped (batch, longest example).

        Raises:
            ValueError: If ``features`` is empty, or if padding is required but
                the tokenizer has no ``pad_token_id``.
        """
        if not features:
            raise ValueError("CustomDataCollator received an empty batch")

        batch_len = max(len(feature["input_ids"]) for feature in features)
        pad_id = self.tokenizer.pad_token_id

        input_rows, mask_rows, label_rows = [], [], []
        for feature in features:
            # list() lets tuples (or other sequences) be concatenated below.
            ids = list(feature["input_ids"])
            lbls = list(feature["labels"])
            n_pad = batch_len - len(ids)

            # A missing pad id only matters when something has to be padded;
            # fail with a clear message instead of an opaque tensor error.
            if n_pad and pad_id is None:
                raise ValueError(
                    "tokenizer.pad_token_id is None but the batch needs padding; "
                    "set tokenizer.pad_token first"
                )

            input_rows.append(ids + [pad_id] * n_pad)
            mask_rows.append([1] * len(ids) + [0] * n_pad)
            # Label padding is -100 (same as the original code).
            label_rows.append(lbls + [-100] * n_pad)

        return {
            "input_ids": torch.tensor(input_rows, dtype=torch.long),
            "attention_mask": torch.tensor(mask_rows, dtype=torch.long),
            "labels": torch.tensor(label_rows, dtype=torch.long)
        }

# Collator instance handed to the Trainer below; it reads the pad token id
# from the tokenizer loaded earlier.
data_collator = CustomDataCollator(tokenizer=tokenizer)

In [None]:
training_args = TrainingArguments(
    output_dir=LORA_OUTPUT_DIR,
    report_to="none",
    logging_steps=10,

    # Optimisation: batch size 1 with 16 accumulation steps per device,
    # AdamW with a cosine schedule after 100 warmup steps, no fp16.
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    optim="adamw_torch",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    max_grad_norm=1.0,
    fp16=False,

    # Evaluate and checkpoint every 50 steps, keep at most two checkpoints,
    # and reload the checkpoint with the lowest loss when training ends.
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

# Hold out 10% of the tokenized examples (fixed seed) for evaluation.
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument of
    # Trainer; the install cell pulls the latest transformers, so the old
    # keyword cannot be relied on.
    processing_class=tokenizer,
    data_collator=data_collator
)

# Train, then write the final model (the LoRA adapter, since `model` is
# PEFT-wrapped) and the tokenizer to the output directory.
print("\nStarting training")
trainer.train()

trainer.save_model(LORA_OUTPUT_DIR)
tokenizer.save_pretrained(LORA_OUTPUT_DIR)
print(f"Training complete. LoRA adapters saved to: {LORA_OUTPUT_DIR}")

In [None]:
print("\n" + "="*50)
print("Merging LoRA weights into base model...")
print("="*50)

# The PEFT-wrapped model is bound to `model` (it was rebound by
# get_peft_model); no variable named `lora_model` exists in this notebook, so
# referencing it raised NameError. Merge the adapter into the base weights.
merged_model = model.merge_and_unload()
merged_output_path = "/content/drive/MyDrive/gemma3_lora_merged"

# Save the merged weights and the tokenizer side by side in one directory.
merged_model.save_pretrained(merged_output_path)
tokenizer.save_pretrained(merged_output_path)

print(f"✓ Merged model saved to: {merged_output_path}")
print("\nAll done! 🎉")