# Training Jupyter Notebook

This model is trained on Google Colab, using a T4 GPU as the runtime.

In [1]:
!pip install -q -U torch transformers peft datasets bitsandbytes trl accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m98.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m67.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.1/193.1 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.6/63.6 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
import sys
import logging
import torch
import gc
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)

# Logging: root configuration writes timestamped records to stdout;
# this module's logger is set to emit INFO and above.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)
logger.setLevel("INFO")

########################
# T4-specific settings
########################
# Values shared by the model-loading call below: float16 weights/compute
# and the "eager" attention implementation.
attn_implementation = "eager"
torch_dtype = torch.float16

###################
# Hyper-parameters
###################
# Keyword arguments forwarded verbatim to SFTConfig below.
training_config = {
    # Mixed precision: fp16 on, bf16 off (matches the float16 settings above).
    "fp16": True,
    "bf16": False,
    # Training only; no evaluation pass is run.
    "do_eval": False,
    "learning_rate": 2.0e-04,
    # Logging: report the training loss every 5 optimizer steps.
    "log_level": "info",
    "logging_steps": 5,
    "logging_strategy": "steps",
    # Pin the reporting integrations explicitly. "all" is the current implicit
    # default; the library warns that the default becomes "none" in v5, so
    # leaving it unset would silently change behavior after an upgrade.
    # Set to "none" to disable integrations such as Weights & Biases.
    "report_to": "all",
    "lr_scheduler_type": "cosine",
    # One full pass over the data; max_steps=-1 leaves the step count to num_train_epochs.
    "num_train_epochs": 1,
    "max_steps": -1,
    # Checkpointing: write to ./checkpoint_dir every 50 steps, keep only the latest one.
    "output_dir": "./checkpoint_dir",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 4,
    # 2 sequences per step x 4 accumulation steps = 8 sequences per optimizer update per device.
    "per_device_train_batch_size": 2,
    "remove_unused_columns": True,
    "save_steps": 50,
    "save_total_limit": 1,
    "seed": 42,
    # Gradient checkpointing, using the non-reentrant implementation.
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
    "gradient_accumulation_steps": 4,
    "warmup_ratio": 0.03,
}

# Create the SFTConfig object
train_conf = SFTConfig(**training_config)

# LoRA adapter settings: rank 16, alpha 32, dropout 0.05, no bias terms,
# applied to every linear layer of a causal-LM; no extra modules are saved.
peft_config = dict(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    modules_to_save=None,
)
peft_conf = LoraConfig(**peft_config)

################
# Model Loading
################
checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"

# 4-bit NF4 quantization with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base model using the dtype / attention settings defined above.
# NOTE(review): trust_remote_code=True allows modeling code shipped with the
# checkpoint repository to be executed — confirm this is still required.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint_path,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation=attn_implementation,
    torch_dtype=torch_dtype,
    quantization_config=bnb_config,
)

# Tokenizer for the same checkpoint: right-side padding, 2048-token limit,
# and the unknown token reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.padding_side = "right"
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

##################
# Data Processing
##################
# Read the local JSON file with the "json" loader and take its "train" split.
dataset = load_dataset(
    path="json",
    split="train",
    data_files="stories_output.json",
)

def format_to_messages(example):
    """Build a two-turn chat from one record.

    Args:
        example: mapping with an "instruction" and an "output" entry.

    Returns:
        A dict with a single "messages" key holding the user turn
        (the instruction) followed by the assistant turn (the output).
    """
    user_turn = {"role": "user", "content": example["instruction"]}
    assistant_turn = {"role": "assistant", "content": example["output"]}
    return {"messages": [user_turn, assistant_turn]}

# Run format_to_messages over the loaded dataset to produce the chat-formatted records.
formatted_dataset = dataset.map(function=format_to_messages)

def apply_chat_template(example):
    """Render a record's chat messages to a single string.

    Passes example["messages"] through the module-level tokenizer's chat
    template (no tokenization, no generation prompt appended), stores the
    result under example["text"], and returns the same (mutated) example.
    """
    rendered = tokenizer.apply_chat_template(
        example["messages"],
        add_generation_prompt=False,
        tokenize=False,
    )
    example["text"] = rendered
    return example

# Add the rendered "text" field to every record, in a single process.
processed_dataset = formatted_dataset.map(
    function=apply_chat_template,
    desc="Applying chat template",
    num_proc=1,
)

# The whole processed dataset is used for training (no held-out split).
train_dataset = processed_dataset

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/f39ac1d28e925b323eae81227eaba4464caced4e/config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/f39ac1d28e925b323eae81227eaba4464caced4e/config.json
Model config Phi3Config {
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "dtype": "float16",
  "embd_pdrop": 0.0,
  "eo

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/f39ac1d28e925b323eae81227eaba4464caced4e/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}

Could not locate the custom_generate/generate.py inside microsoft/Phi-3-mini-4k-instruct.
loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/f39ac1d28e925b323eae81227eaba4464caced4e/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/f39ac1d28e925b323eae81227eaba4464caced4e/tokenizer.json
loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/f39ac1d28e925b323eae81227eaba4464caced4e/added_tokens.json
loading file spe

In [9]:
# Free memory before training. Collect unreachable Python objects first so the
# CUDA blocks they were holding can be released by empty_cache() afterwards.
gc.collect()
torch.cuda.empty_cache()

###########
# Training
###########
trainer = SFTTrainer(
    model=model,
    args=train_conf,
    train_dataset=train_dataset,
    processing_class=tokenizer,   # the tokenizer is passed under the `processing_class` argument
    peft_config=peft_conf,        # LoRA config; per the author's note, required to avoid a 'quantized' error
)

# T4 compatibility step: after the trainer has created the adapters, force every
# LoRA / modules_to_save parameter to float32 before training starts.
# Author's rationale: without this the run crashes on the T4, which does not
# support the checkpoint's native bfloat16.
print("Adjusting model for T4 compatibility (casting adapters to Float32)...")
for name, param in trainer.model.named_parameters():
    if "lora" in name or "modules_to_save" in name:
        param.data = param.data.to(torch.float32)

print(f"Starting training on {len(train_dataset)} samples...")
train_result = trainer.train()

# Metrics: log and persist the training metrics and the trainer state.
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

# Save Model: write the final model to the configured output directory.
print("Saving adapter model...")
trainer.save_model(train_conf.output_dir)
print(f"Model saved to {train_conf.output_dir}")

Tokenizing train dataset:   0%|          | 0/1106 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1106 [00:00<?, ? examples/s]

Using auto half precision backend
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Adjusting model for T4 compatibility (casting adapters to Float32)...
Starting training on 1106 samples...


The following columns in the Training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: greeting, text, output_mode, model, hash, instruction, output, queue_id, messages. If greeting, text, output_mode, model, hash, instruction, output, queue_id, messages are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1,106
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 139
  Number of trainable parameters = 25,165,824
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
5,2.0185
10,1.7751
15,1.6053
20,1.4403
25,1.348
30,1.348
35,1.3174
40,1.2379
45,1.2359
50,1.223


Saving model checkpoint to ./checkpoint_dir/checkpoint-50
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/f39ac1d28e925b323eae81227eaba4464caced4e/config.json
Model config Phi3Config {
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "dtype": "bfloat16",
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta":

***** train metrics *****
  entropy                  =     1.0463
  epoch                    =        1.0
  mean_token_accuracy      =     0.7158
  num_tokens               =   440599.0
  total_flos               =  9875106GF
  train_loss               =     1.2342
  train_runtime            = 0:19:25.49
  train_samples_per_second =      0.949
  train_steps_per_second   =      0.119
Saving adapter model...


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/f39ac1d28e925b323eae81227eaba4464caced4e/config.json
Model config Phi3Config {
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "dtype": "bfloat16",
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings

Model saved to ./checkpoint_dir


In [18]:
import torch

# 1. Setup your prompt
instruction = "Write a story about a young kid who is confused about life but reaches enlightenment by finding himself in a position where he feel nothing matters"
messages = [{"role": "user", "content": instruction}]

# 2. Tokenize: render the chat prompt (with the assistant generation prompt
#    appended) and move the tensors to the device the model lives on.
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# 3. Switch to evaluation mode. generate() does not change the module mode, and
#    the model was last used for training, so without this the LoRA dropout
#    (lora_dropout=0.05) would still be applied while sampling.
model.eval()

print("Generating story... (This might take about 1 minute)")

# 4. Generate: sampled decoding, no gradient tracking.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=2000,
        do_sample=True,
        temperature=0.4,
        top_p=0.9,
        repetition_penalty=1.1,
        use_cache=False  # per the author's note, disabling the KV cache avoids an error during generation
    )

# 5. Print the decoded sequence (prompt followed by the generated story).
print("-" * 50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
print("-" * 50)

Generating story... (This might take about 1 minute)
--------------------------------------------------
Write a story about a young kid who is confused about life but reaches enlightenment by finding himself in a position where he feel nothing matters The morning sun, still low on the eastern horizon, cast long shadows across the dusty path. Jaxon trudged slowly towards his small hut at the edge of the village settlement. His simple cotton tunic was already damp with sweat from the humid air and the early heat that promised to climb steadily throughout the day. He carried two clay pots filled with water for both drinking and cooking, their contents heavy under his slight frame.

As he approached his dwelling, an older woman named Nia sat outside her own home, mending a torn reed mat beside a well-worn wooden bench. She paused as she saw him coming, wiping her hands on her coarse cloth. "Rabb Rakha," she murmured softly before returning to her task. Jaxon offered no reply; it seemed unn

In [19]:
from google.colab import drive
import shutil
import os

# 1. Mount Google Drive
# You will see a pop-up asking for permission. Click "Connect to Google Drive".
drive.mount('/content/drive')

# 2. Define Source and Destination
source_file = "my_finetuned_model.zip"
destination_folder = "/content/drive/MyDrive/Colab_Exports"
destination_path = os.path.join(destination_folder, source_file)

# 3. Make sure the archive exists. No other cell in this notebook creates it,
#    so on a fresh run the copy below would fail with FileNotFoundError.
#    NOTE(review): assumes the archive is meant to contain the training output
#    directory (./checkpoint_dir, including any intermediate checkpoint kept
#    there) — confirm against the cell that originally produced the zip.
if not os.path.exists(source_file):
    adapter_dir = "./checkpoint_dir"
    if not os.path.isdir(adapter_dir):
        raise FileNotFoundError(
            f"Neither {source_file} nor {adapter_dir} exists; run the training cell first."
        )
    print(f"Creating {source_file} from {adapter_dir}...")
    shutil.make_archive(os.path.splitext(source_file)[0], "zip", root_dir=adapter_dir)

# 4. Create the folder in Drive if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# 5. Copy the archive into the mounted Drive folder
print(f"Copying {source_file} to Google Drive...")
shutil.copy(source_file, destination_path)

print(f"Success! File saved to: {destination_path}")
print("You can now go to drive.google.com and download it comfortably.")

Mounted at /content/drive
Copying my_finetuned_model.zip to Google Drive...
Success! File saved to: /content/drive/MyDrive/Colab_Exports/my_finetuned_model.zip
You can now go to drive.google.com and download it comfortably.
