
<p align="center">
  <img src="https://github.com/Owaiskhan9654/Granite-4.0-Fine-Tuning/blob/main/ibm-granite-4-0-release.png?raw=true" 
       alt="Granite 4.0"
       width="450">
</p>

# **Install dependencies**

In [1]:
import os, sys

os.environ["XLA_FLAGS"] = "--xla_cpu_enable_fast_math=false"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"   # also suppresses XLA logs
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"

sys.stderr = open(os.devnull, "w")


In [2]:
!pip install -q --no-deps transformers accelerate datasets peft evaluate sentencepiece safetensors bitsandbytes protobuf==3.20.1

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
%%capture --no-stderr

import sys, os
import warnings
from pydantic._internal._generate_schema import UnsupportedFieldAttributeWarning
warnings.filterwarnings("ignore", category=UnsupportedFieldAttributeWarning)
import json
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling


from peft import LoraConfig, get_peft_model
from peft import PeftModel

# **Configurations**

In [4]:
# -----------------------
# Model trainings config
# -----------------------
MODEL_NAME = "ibm-granite/granite-4.0-micro"
DATA_PATH = "/kaggle/working/data.jsonl"   # upload your jsonl here or change path
OUTPUT_DIR = "/kaggle/working/granite-finetuned"
TRAIN_BATCH_SIZE = 1
GRAD_ACCUM = 8
EPOCHS = 300
LEARNING_RATE = 2e-4
MAX_LENGTH = 1024

os.makedirs(OUTPUT_DIR, exist_ok=True)


# **Data Preperations**

### **Example data format (instruction tuning)**


* {"instruction":"Summarize the following: ...", "input":"Long document ...", "output":"Short summary..."}
* {"instruction":"Translate to French", "input":"Hello", "output":"Bonjour"}


In [5]:
if not os.path.exists(DATA_PATH):
    print(f"{DATA_PATH} not found — writing a tiny sample so the notebook runs end-to-end.")
    sample = [
        {"instruction":"Summarize the text.", "input":"Kaggle free GPUs are T4s.", "output":"Kaggle gives free T4 GPUs with limits."},
        {"instruction":"Translate to French", "input":"Hello", "output":"Bonjour"}
    ]
    with open(DATA_PATH, "w", encoding="utf-8") as f:
        for ex in sample:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

print("Data file path:", DATA_PATH)


/kaggle/working/data.jsonl not found — writing a tiny sample so the notebook runs end-to-end.
Data file path: /kaggle/working/data.jsonl


# **Tokenizer & dataset loading**

In [6]:
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading dataset (jsonl)...")
# Robust loader for jsonlines
raw_ds = load_dataset("json", data_files={"train": DATA_PATH}, split="train")

# Build single text prompt for SFT; adjust template if you prefer a different format.
def build_prompt(example):
    instr = example.get("instruction", "")
    inp = example.get("input", "")
    out = example.get("output", "")
    prompt = f"### Instruction:\n{instr}\n\n### Input:\n{inp}\n\n### Response:\n{out}"
    return prompt

raw_ds = raw_ds.map(lambda ex: {"text": build_prompt(ex)}, remove_columns=raw_ds.column_names)

# Tokenize
print("Tokenizing...")
def tokenize_fn(batch):
    toks = tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH, padding="max_length")
    toks["labels"] = toks["input_ids"].copy()
    
    return toks

train_ds = raw_ds.map(tokenize_fn, batched=True, remove_columns=["text"]) 
print("Prepared dataset — examples:", len(train_ds))


Loading tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Loading dataset (jsonl)...


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Tokenizing...


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Prepared dataset — examples: 2


# **Model load (BitsAndBytesConfig / fallback)**

In [7]:
print("Preparing model load — trying bitsandbytes 4-bit first if available...")
use_bnb = False
quant_config = None
try:
    import bitsandbytes as bnb  # noqa: F401
    from transformers import BitsAndBytesConfig
    if torch.cuda.is_available():
        use_bnb = True
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
    else:
        print("bitsandbytes installed but CUDA not available — using fp16 fallback.")
except Exception as e:
    print("bitsandbytes import failed or not installed — using fp16 fallback. Error:", str(e))

if use_bnb and quant_config is not None:
    print("Loading model with quantization_config (4-bit)...")
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", quantization_config=quant_config)
else:
    print("Fallback load: loading model in fp16 on GPU if available (no bitsandbytes).")
    device_map = "auto" if torch.cuda.is_available() else None
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=(torch.float16 if torch.cuda.is_available() else torch.float32), device_map=device_map)

print("Model loaded — device map:", getattr(model, "hf_device_map", getattr(model, "device_map", None)))

Preparing model load — trying bitsandbytes 4-bit first if available...
Loading model with quantization_config (4-bit)...


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model loaded — device map: {'model.embed_tokens': 0, 'lm_head': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 1, 'model.layers.11': 1, 'model.layers.12': 1, 'model.layers.13': 1, 'model.layers.14': 1, 'model.layers.15': 1, 'model.layers.16': 1, 'model.layers.17': 1, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.layers.32': 1, 'model.layers.33': 1, 'model.layers.34': 1, 'model.layers.35': 1, 'model.layers.36': 1, 'model.layers.37': 1, 'model.layers.38': 1, 'model.layers.39': 1, 'model.norm': 1, 'model.rotary_emb': 1}


# **Apply LoRA (PEFT)**

In [8]:
print("Applying LoRA adapters (PEFT)...")
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.03,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.config.use_cache = False
print("PEFT/LoRA applied. Trainable params:", sum(p.numel() for p in model.parameters() if p.requires_grad))


Applying LoRA adapters (PEFT)...
PEFT/LoRA applied. Trainable params: 2621440


# **Training setup & run (Trainer)**

In [9]:
print("Setting up Trainer...")

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    fp16=torch.cuda.is_available(),
    logging_steps=10,
    save_steps=50,
    save_total_limit=3,
    remove_unused_columns=False,
    report_to="none",
    label_names=["labels"]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    data_collator=data_collator,
)

print("Starting training — this may take a while depending on data and GPU resources.")
trainer.train()

Setting up Trainer...
Starting training — this may take a while depending on data and GPU resources.


Step,Training Loss
10,1.9414
20,0.4685
30,0.3783
40,0.3083
50,0.2841
60,0.2673
70,0.2586
80,0.2877
90,0.0604
100,0.0937


TrainOutput(global_step=300, training_loss=0.2003529746333758, metrics={'train_runtime': 654.5458, 'train_samples_per_second': 0.917, 'train_steps_per_second': 0.458, 'total_flos': 1.160683978752e+16, 'train_loss': 0.2003529746333758, 'epoch': 300.0})

# **Save adapters + tokenizer**

In [10]:
print("Saving LoRA adapter and tokenizer to:", OUTPUT_DIR)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved.")

Saving LoRA adapter and tokenizer to: /kaggle/working/granite-finetuned
Saved.


# **Quick inference test**

In [11]:
base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
peft_model = PeftModel.from_pretrained(base, OUTPUT_DIR)
peft_model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GraniteMoeHybridForCausalLM(
      (model): GraniteMoeHybridModel(
        (embed_tokens): Embedding(100352, 2560, padding_idx=100256)
        (layers): ModuleList(
          (0-39): 40 x GraniteMoeHybridDecoderLayer(
            (input_layernorm): GraniteMoeHybridRMSNorm((2560,), eps=1e-05)
            (post_attention_layernorm): GraniteMoeHybridRMSNorm((2560,), eps=1e-05)
            (shared_mlp): GraniteMoeHybridMLP(
              (activation): SiLU()
              (input_linear): Linear(in_features=2560, out_features=16384, bias=False)
              (output_linear): Linear(in_features=8192, out_features=2560, bias=False)
            )
            (self_attn): GraniteMoeHybridAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=2560, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.03, inplace=False)
            

# **Evaluation**

In [12]:
prompt = "### Instruction:\nSummarize the text.\n\n### Input:\nKaggle T4 GPU.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(next(peft_model.parameters()).device)
out = peft_model.generate(**inputs, max_new_tokens=150)
print(tokenizer.decode(out[0], skip_special_tokens=True))

### Instruction:
Summarize the text.

### Input:
Kaggle T4 GPU.

### Response:
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
