In [None]:
# 1. Install the Standard, Stable Stack
!pip install -U torch "transformers>=4.45.0" "peft" "accelerate" "bitsandbytes" "trl" "datasets"

# 2. Restart Runtime manually (Click 'Runtime' -> 'Restart Session')

Collecting torch
  Downloading torch-2.10.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (31 kB)
Collecting transformers>=4.45.0
  Downloading transformers-5.2.0-py3-none-any.whl.metadata (32 kB)
Collecting trl
  Downloading trl-0.28.0-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-cupti-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cudnn-cu12==9.10.2.21 (from torch)
  Downloading nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training

# Hub id of the base checkpoint to fine-tune.
model_id = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"

# 0. Pick the half-precision format from the GPU that is actually attached.
# bfloat16 is only native on CUDA compute capability >= 8.0 (Ampere and newer).
# A T4 reports 7.5, so it falls back to float16 there instead of relying on
# bf16 emulation. Every dtype / mixed-precision switch below derives from this
# single decision so they cannot drift apart.
has_cuda = torch.cuda.is_available()
use_bf16 = has_cuda and torch.cuda.get_device_capability()[0] >= 8
use_fp16 = has_cuda and not use_bf16
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16

# 1. BitsAndBytes 4-bit settings (NF4 with double quantization).
# NOTE(review): the checkpoint name suggests it is already stored in bnb 4-bit
# form; transformers may prefer the quantization settings saved with the
# checkpoint over the ones given here -- confirm in the load-time warnings.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# 2. Load Model & Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Only fall back to EOS when the checkpoint ships no pad token of its own;
# overwriting a dedicated pad token would make padding indistinguishable from
# the end-of-sequence marker.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,  # dtype for the weights that are not 4-bit quantized
)

# PEFT's standard preparation step for training on top of a k-bit quantized
# base model (see the PEFT docs for the exact casts/hooks it applies).
model = prepare_model_for_kbit_training(model)

# 3. Simple LoRA Config: rank-16 adapters on the attention query/value projections.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# 4. Load Data
# Reads the local JSON file as the "train" split; the trainer config below
# reads the text from its "text" field.
dataset = load_dataset("json", data_files="train_data.json", split="train")

# 5. Trainer
sft_config = SFTConfig(
    output_dir="./outputs",
    max_steps=60,
    per_device_train_batch_size=2,
    learning_rate=2e-4,
    fp16=use_fp16,            # float16 mixed precision on GPUs without native bf16 (e.g. T4)
    bf16=use_bf16,            # bfloat16 mixed precision on Ampere-or-newer GPUs
    logging_steps=10,
    max_length=512,
    dataset_text_field="text",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=sft_config,
    peft_config=peft_config,
    # Train with the same tokenizer object that is configured above, so the
    # tokenizer saved next to the model matches the one used for training.
    processing_class=tokenizer,
)

trainer.train()

# 6. Persist the results and bundle them for download
import shutil

# The model held by the trainer and the tokenizer are written into the same
# folder, so the archive built below contains both.
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained("./student_model")

# Zip that folder; the archive is created as student_model.zip in the working directory.
shutil.make_archive(base_name="student_model", format='zip', root_dir="./student_model")
print("✅ SUCCESS! Your student model is ready. Download student_model.zip now.")



Loading weights:   0%|          | 0/146 [00:00<?, ?it/s]

Step,Training Loss
10,0.896157
20,0.771818
30,0.782433
40,0.771141
50,0.768306
60,0.668701


✅ SUCCESS! Your student model is ready. Download student_model.zip now.


In [None]:
# Attach Google Drive to the Colab filesystem.
from google.colab import drive

# Directory under which the Drive contents are made available.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)