In [None]:
# Mount Google Drive at /content/drive. Later cells read the checkpoint from,
# and write training outputs to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip uninstall -y transformers tokenizers peft -y
!pip install --upgrade transformers accelerate peft datasets bitsandbytes

In [None]:
from datasets import load_dataset
import json

# Destination JSONL file for the chat-formatted ARC-Easy examples.
out_path = "/content/arc_easy_sft.jsonl"

# Load the "ARC-Easy" configuration of allenai/ai2_arc; only its "train" split
# is iterated when the JSONL file is written.
arc_easy = load_dataset("allenai/ai2_arc", "ARC-Easy")

def choices_to_string(choices):
    """Render multiple-choice options as one "(label) text" line per option.

    Args:
        choices: Mapping with two parallel sequences, ``choices["label"]``
            (option labels) and ``choices["text"]`` (option texts).

    Returns:
        The formatted option lines joined with newlines, in input order;
        an empty string when there are no options.
    """
    # Pair labels and texts by position. Looking each label up with
    # list.index() is quadratic and would repeat the first match's text
    # whenever a label occurs more than once.
    return "\n".join(
        f"({label}) {text}"
        for label, text in zip(choices["label"], choices["text"])
    )

# Serialize every ARC-Easy training question as one chat-style JSON line:
# the user turn carries the question followed by its options, the assistant
# turn carries the answer key.
with open(out_path, "w", encoding="utf-8") as jsonl_file:
    for example in arc_easy["train"]:
        options_block = choices_to_string(example["choices"])
        user_turn = {
            "role": "user",
            "content": example["question"].strip() + "\nOptions:\n" + options_block,
        }
        assistant_turn = {
            "role": "assistant",
            "content": example["answerKey"].strip(),
        }
        line = json.dumps({"messages": [user_turn, assistant_turn]}, ensure_ascii=False)
        jsonl_file.write(line + "\n")

print(f"Wrote ARC-Easy dataset to: {out_path}")
print(f"Total examples: {len(arc_easy['train'])}")

# Path read by the tokenization cell.
# NOTE(review): the BoolQ cell further down assigns SFT_DATA_PATH again, so on a
# top-to-bottom run that later value is the one training actually uses.
SFT_DATA_PATH = "/content/arc_easy_sft.jsonl"

In [None]:
from datasets import load_dataset
import json

# Destination JSONL file for the chat-formatted BoolQ examples.
out_path = "/content/boolq_sft.jsonl"

# Fixed-seed shuffle of the BoolQ training split, keeping the first 2000 rows.
boolq = load_dataset("google/boolq", split="train").shuffle(seed=42).select(range(2000))

# One JSON line per example: the user turn holds the passage and question,
# the assistant turn holds "Yes" or "No".
with open(out_path, "w", encoding="utf-8") as jsonl_file:
    for example in boolq:
        passage = example["passage"].strip()
        question = example["question"].strip()
        if example["answer"]:
            gold = "Yes"
        else:
            gold = "No"

        user_content = f"Passage: {passage}\n\nQuestion: {question}\nAnswer with Yes or No."
        payload = {
            "messages": [
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": gold},
            ]
        }
        jsonl_file.write(json.dumps(payload, ensure_ascii=False) + "\n")

# Path read by the tokenization cell; this overrides any earlier assignment.
SFT_DATA_PATH = "/content/boolq_sft.jsonl"

In [None]:
# Local checkpoint directory that the tokenizer and the base model are loaded from.
IFT_CHECKPOINT = "/content/drive/MyDrive/checkpoint-85000-darsh"
# Output directory passed to TrainingArguments and used for the final model/tokenizer save.
# NOTE(review): the directory name says "ARC-C" while the data cell builds ARC-Easy — confirm the intended name.
LORA_OUTPUT_DIR = "/content/drive/MyDrive/gemma3_lora_sft_for_IFT_ARC-C"
# Presumably the alternative output directory for a BoolQ run — uncomment to switch.
#LORA_OUTPUT_DIR = "/content/drive/MyDrive/gemma3_lora_boolq"
# Number of training epochs (fed to num_train_epochs).
EPOCHS = 3

In [None]:
import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import os
import json

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Strip fields from the checkpoint's tokenizer_config.json before the tokenizer
# is loaded. Presumably these fields break AutoTokenizer loading with the
# installed transformers version — TODO confirm.
tokenizer_config_path = os.path.join(IFT_CHECKPOINT, "tokenizer_config.json")
problematic_fields = ["model_specific_special_tokens", "extra_special_tokens"]

if os.path.isfile(tokenizer_config_path):
    with open(tokenizer_config_path, "r", encoding="utf-8") as f:
        # Kept under its own name so it is not confused with the model config
        # object that a later statement binds to `config`.
        tokenizer_config = json.load(f)

    removed_fields = [field for field in problematic_fields if field in tokenizer_config]
    for field in removed_fields:
        print(f"Removing {field} field")
        del tokenizer_config[field]

    if removed_fields:
        # Only touch the checkpoint on disk when something actually changed,
        # so re-running the cell does not rewrite the file needlessly.
        with open(tokenizer_config_path, "w", encoding="utf-8") as f:
            json.dump(tokenizer_config, f, indent=2)
        print("Fixed tokenizer_config.json")
    else:
        print("tokenizer_config.json needs no changes")
else:
    print(f"No tokenizer_config.json found at {tokenizer_config_path}; skipping fix")

import importlib
import transformers

# Reload transformers.tokenization_utils_base when the attribute is present.
# NOTE(review): presumably meant to make the tokenizer code pick up the edited
# tokenizer_config.json — confirm this reload is still needed.
tokenization_base = getattr(transformers, 'tokenization_utils_base', None)
if hasattr(transformers, 'tokenization_utils_base'):
    importlib.reload(tokenization_base)

print("Loading tokenizer...")
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run;
# only use it with checkpoints you trust.
tokenizer_load_kwargs = {
    "trust_remote_code": True,
    "use_fast": True,
    "local_files_only": True,
}
tokenizer = AutoTokenizer.from_pretrained(IFT_CHECKPOINT, **tokenizer_load_kwargs)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")

print("Loading model...")

# Model configuration read from the checkpoint directory.
config = AutoConfig.from_pretrained(IFT_CHECKPOINT, trust_remote_code=True)

# Load the causal-LM weights in float32, letting device_map="auto" decide placement.
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run.
model_load_kwargs = {
    "device_map": "auto",
    "torch_dtype": torch.float32,
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(IFT_CHECKPOINT, **model_load_kwargs)

In [None]:
# LoRA settings: rank 16, alpha 32, dropout 0.05, applied to the modules
# named q_proj / k_proj / v_proj / o_proj, for a causal-LM task.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Make the cell safe to re-run: `model` is rebound to the wrapped model, so a
# second execution would otherwise call get_peft_model on an existing PeftModel.
if isinstance(model, PeftModel):
    print("Model is already a PeftModel; skipping get_peft_model")
else:
    model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
from datasets import load_dataset

# Load the JSONL file at SFT_DATA_PATH as the "train" split of a JSON dataset.
raw_dataset = load_dataset("json", data_files={"train": SFT_DATA_PATH})
# The single split that gets tokenized below.
dataset = raw_dataset["train"]

def format_and_tokenize(example):
    """Render one chat example with the tokenizer's chat template and tokenize it.

    Args:
        example: Dataset row whose ``"messages"`` entry is the list of chat
            turns passed to ``tokenizer.apply_chat_template``.

    Returns:
        The tokenizer output (no tensors, no padding, truncated to 512 tokens)
        with an added ``"labels"`` entry that is a copy of ``"input_ids"``.
    """
    text = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False
    )

    # A chat template normally emits the special tokens it needs. If the
    # rendered text already starts with BOS, letting the tokenizer add special
    # tokens again would duplicate it, so only add them when BOS is absent.
    bos_token = getattr(tokenizer, "bos_token", None)
    template_has_bos = bool(bos_token) and text.startswith(bos_token)

    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
        add_special_tokens=not template_has_bos
    )

    # NOTE(review): labels cover the whole sequence (prompt included); nothing
    # is set to -100 here, so a "masked tokens" count on these labels is always 0.
    # Confirm whether prompt tokens were meant to be excluded from the loss.
    tokenized["labels"] = list(tokenized["input_ids"])

    return tokenized

# Tokenize every row, dropping the original columns so only tokenizer outputs remain.
tokenized_dataset = dataset.map(
    format_and_tokenize,
    desc="Tokenizing dataset",
    remove_columns=dataset.column_names,
)

print(f"Dataset tokenized: {len(tokenized_dataset)} examples")

# Sanity check on the first example: how many label positions are -100
# (ignored by the loss) versus real token ids.
sample = tokenized_dataset[0]
sample_labels = sample['labels']
masked = sample_labels.count(-100)
valid = len(sample_labels) - masked
print(f"  Masked tokens: {masked}")
print(f"  Valid tokens: {valid}")
print(f"  Masking ratio: {masked / len(sample['labels']) * 100:.1f}%")

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List
import torch

@dataclass
class CustomDataCollator:
    """Pad a batch of tokenized examples to the longest sequence in the batch.

    Attributes are documented inline. Calling the collator returns a dict of
    ``torch.long`` tensors: ``input_ids`` (right-padded with the tokenizer's pad
    id), ``attention_mask`` (1 for real tokens, 0 for padding) and ``labels``
    (right-padded with ``label_pad_token_id``).
    """

    # Any object exposing `pad_token_id`. Annotated as Any because AutoTokenizer
    # is a loader class, not the type of the tokenizer instances it returns.
    tokenizer: Any
    # Fill value for padded label positions; -100 is the value the notebook
    # treats as "masked" when counting labels.
    label_pad_token_id: int = -100

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into padded tensors.

        Args:
            features: Non-empty list of dicts, each with ``"input_ids"`` and
                ``"labels"`` sequences of equal length.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors
            of shape (len(features), longest sequence length).

        Raises:
            ValueError: If the batch is empty, the tokenizer has no pad token
                id, or an example's labels and input_ids differ in length.
        """
        if not features:
            raise ValueError("CustomDataCollator received an empty batch")

        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            # Fail early with a clear message instead of an opaque tensor-conversion error.
            raise ValueError("tokenizer.pad_token_id is None; set a pad token before collating")

        input_ids = [list(f["input_ids"]) for f in features]
        labels = [list(f["labels"]) for f in features]
        max_length = max(len(ids) for ids in input_ids)

        padded_input_ids = []
        padded_labels = []
        attention_mask = []

        for ids, lbls in zip(input_ids, labels):
            if len(lbls) != len(ids):
                raise ValueError(
                    f"labels length ({len(lbls)}) does not match input_ids length ({len(ids)})"
                )
            padding_length = max_length - len(ids)
            padded_input_ids.append(ids + [pad_id] * padding_length)
            attention_mask.append([1] * len(ids) + [0] * padding_length)
            padded_labels.append(lbls + [self.label_pad_token_id] * padding_length)

        return {
            "input_ids": torch.tensor(padded_input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(padded_labels, dtype=torch.long)
        }

# Collator instance handed to the Trainer; it pads each batch using the tokenizer's pad token id.
data_collator = CustomDataCollator(tokenizer=tokenizer)

In [None]:
# Evaluation and checkpointing share one interval so every saved checkpoint has
# a matching evaluation result (load_best_model_at_end relies on this).
EVAL_SAVE_EVERY_STEPS = 50

training_args = TrainingArguments(
    output_dir=LORA_OUTPUT_DIR,
    num_train_epochs=EPOCHS,

    # Batch size 1 with 16 accumulation steps per optimizer update.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,

    # Optimizer and schedule: AdamW, cosine decay after 100 warmup steps, gradient clipping at 1.0.
    optim="adamw_torch",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    max_grad_norm=1.0,
    fp16=False,

    # Step-based evaluation and checkpointing; keep two checkpoints and reload
    # the one with the lowest loss when training finishes.
    eval_strategy="steps",
    eval_steps=EVAL_SAVE_EVERY_STEPS,
    save_strategy="steps",
    save_steps=EVAL_SAVE_EVERY_STEPS,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,

    logging_steps=10,
    report_to="none",
)

import inspect

# Hold out 10% of the tokenized examples for evaluation (fixed seed for a repeatable split).
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Recent transformers releases take the tokenizer through `processing_class`;
# the older `tokenizer` keyword is deprecated and not accepted by every version.
# Pick whichever keyword the installed Trainer actually declares.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    **{tokenizer_kwarg: tokenizer}
)

# Run training with the Trainer configured above.
print("\nStarting training")
trainer.train()

# Save the trained model and the tokenizer side by side in LORA_OUTPUT_DIR.
trainer.save_model(LORA_OUTPUT_DIR)
tokenizer.save_pretrained(LORA_OUTPUT_DIR)
print(f"Training complete. LoRA adapters saved to: {LORA_OUTPUT_DIR}")

In [None]:
print("\n" + "="*50)
print("Merging LoRA weights into base model...")
print("="*50)

# `model` is the PEFT-wrapped model returned by get_peft_model and passed to the
# Trainer; this notebook defines no separate `lora_model` variable.
merged_model = model.merge_and_unload()
# Destination directory for the merged model and its tokenizer.
merged_output_path = "/content/drive/MyDrive/gemma3_lora_merged"

merged_model.save_pretrained(merged_output_path)
tokenizer.save_pretrained(merged_output_path)

print(f"Merged model saved to: {merged_output_path}")