In [1]:
%%capture
%pip install -U transformers datasets accelerate peft trl bitsandbytes wandb gradio

In [4]:
# Imports
import os
import torch
import random
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
    set_seed,
)
from peft import LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
import wandb


In [5]:
dataset = load_dataset("rvv-karma/Math-QA")


In [6]:
# Use only question-answer columns
def format_qa(example):
    return {"text": f"Question: {example['question']}\nAnswer: {example['answer']}"}

dataset = dataset.map(format_qa)
dataset = dataset["train"].train_test_split(test_size=0.1)
train_data = dataset["train"]
val_data = dataset["test"]

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [7]:
# Load Hugging Face and Weights & Biases tokens
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
wb_token = user_secrets.get_secret("wandb")

login(token=hf_token)
wandb.login(key=wb_token)

run = wandb.init(
    project='HOME Fine-tune Gemma-2- 2B on MMLU', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33muu0712[0m ([33muu0712-engineering-student-council[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
# ✅ Load tokenizer and model
model_id = "google/gemma-2-2b-it"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [9]:
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)


config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [10]:
model.gradient_checkpointing_enable()

In [11]:
# ✅ Prepare LoRA config
model = prepare_model_for_kbit_training(model)


In [12]:
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training


peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,597,440 || all params: 2,615,939,328 || trainable%: 0.0611


In [13]:
# ✅ Tokenize dataset
def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)

train_data = train_data.map(tokenize, batched=True)
val_data = val_data.map(tokenize, batched=True)

Map:   0%|          | 0/31500 [00:00<?, ? examples/s]

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

In [14]:
print (train_data[0])

{'topic': 'Fractal geometry', 'sub_topic': 'The Hausdorff dimension of the Cantor dust.', 'question': 'What is the Hausdorff dimension of the Cantor dust constructed by removing the middle third of a line segment of length 1, then removing the middle third of each remaining segment, and so on, for a total of 10 iterations?', 'answer': 'The Hausdorff dimension of the Cantor set can be calculated using the formula:\n\nHausdorff dimension = log(N) / log(1/r)\n\nwhere N is the number of self-similar pieces and r is the scaling factor.\n\nFor the Cantor set, after each iteration, the line segment is divided into two equal parts (N = 2) and the length of each part is 1/3 of the original length (r = 1/3).\n\nSo, the Hausdorff dimension of the Cantor set is:\n\nHausdorff dimension = log(2) / log(1/3) ≈ 0.6309\n\nThe number of iterations does not affect the Hausdorff dimension, as it is a property of the fractal itself. Therefore, the Hausdorff dimension of the Cantor dust after 10 iterations i

In [15]:
# ✅ Define training args
training_args = TrainingArguments(
    output_dir="./gemma-mathqa",
    per_device_train_batch_size=4,
    # per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    # eval_steps=1000,
    logging_steps=1,
    save_steps=200,
    num_train_epochs=3,
    # evaluation_strategy="steps",
    save_total_limit=1,
    learning_rate=2e-4,
    bf16=True,
    # fp16=True,
    report_to="wandb"
)

In [19]:
# ✅ Trainer
from transformers import DataCollatorForLanguageModeling
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer,
    DataCollatorForLanguageModeling, BitsAndBytesConfig
)


data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
# ✅ Start training
trainer.train()R

It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
1,0.5969
2,0.7437
3,0.7703
4,0.6531
5,0.7622
6,0.7887
7,0.8288
8,0.7567
9,0.9255
10,0.8946


KeyboardInterrupt: 