In [None]:
!pip uninstall -y transformers accelerate peft bitsandbytes datasets trl scipy triton
!pip install --upgrade transformers==4.41.2 -q
!pip install --upgrade peft==0.11.1 -q
!pip install --upgrade accelerate==0.30.1 -q
!pip install bitsandbytes -q
!pip install --upgrade datasets==2.19.1 -q
!pip install --upgrade trl==0.8.6 -q
!pip install --upgrade scipy -q
!pip install --upgrade triton -q

!pip install bitsandbytes --prefer-binary --extra-index-url=https://jllllll.github.io/bitsandbytes-windows-webui

In [None]:
import os

# Send signal 9 to this very process, i.e. kill the running kernel.
# Presumably this forces a runtime restart so freshly installed packages are
# picked up — note that it also stops a "Run All" at this cell.
kernel_pid = os.getpid()
os.kill(kernel_pid, 9)

In [None]:
import torch
import gc
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer


# Start from a clean slate: release cached CUDA memory, then run a GC pass.
torch.cuda.empty_cache()
gc.collect()

# Report which accelerator (if any) is visible to PyTorch.
if not torch.cuda.is_available():
    print("No GPU detected. This will be very slow.")
else:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU Detected: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to GiB for display.
    print(f"   Memory: {gpu_props.total_memory / 1024**3:.2f} GB")


In [None]:
# QLoRA quantization config for memory efficiency: 4-bit NF4 weights with
# double quantization.
#
# Compute dtype: bfloat16 is kept as the default, but when a CUDA GPU is
# present that does not report bf16 support (e.g. the Turing-based T4, which
# is *not* an Ampere card) we fall back to float16 instead.
compute_dtype = torch.bfloat16
if torch.cuda.is_available() and not torch.cuda.is_bf16_supported():
    compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

print("\n Loading model and tokenizer...")

# NOTE(review): MODEL_NAME is not defined in the cells visible here; it must
# come from a configuration cell elsewhere — confirm before a fresh run.

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.padding_side = "right"
# Fall back to the EOS token for padding when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# --- Model (4-bit quantized) ---
model_load_kwargs = dict(
    quantization_config=bnb_config,  # quantization settings built earlier
    device_map="auto",               # let the loader decide device placement
    trust_remote_code=True,
    use_cache=False,                 # caching is switched off for training
)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_kwargs)
print("Model and tokenizer loaded successfully!")


In [None]:
# Get the quantized model ready for k-bit training before adapters are attached.
model = prepare_model_for_kbit_training(model)

# LoRA (Low-Rank Adaptation) hyper-parameters, collected in one place.
lora_settings = {
    "r": 16,                # rank of the low-rank update matrices
    "lora_alpha": 32,       # scaling factor applied to the LoRA weights
    "lora_dropout": 0.05,   # dropout probability inside the LoRA layers
    "bias": "none",
    "task_type": TaskType.CAUSAL_LM,
    # Names of the modules that receive adapters.
    "target_modules": ["q_proj", "k_proj", "v_proj", "dense"],
}
peft_config = LoraConfig(**lora_settings)

# Wrap the base model with the LoRA adapters.
model = get_peft_model(model, peft_config)

# Report how many parameters will actually be trained.
trainable_params, total_params = model.get_nb_trainable_parameters()
trainable_pct = 100 * trainable_params / total_params
print(f"\nTrainable parameters: {trainable_params:,} ({trainable_pct:.2f}%)")



In [None]:
# --- 5. Load and Prepare Dataset ---
# NOTE(review): DATASET_NAME and DATASET_SPLIT are not defined in the cells
# visible here; they must come from a configuration cell elsewhere.
print("\n🔄 Loading and preparing dataset...")
raw_dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)

# Hold out 10% of the samples for evaluation. The seed is fixed so that the
# train/eval partition is identical after every kernel restart.
SPLIT_SEED = 42
dataset = raw_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
print(f"✅ Training samples: {len(train_dataset)}, Evaluation samples: {len(eval_dataset)}")

In [None]:
# Mixed-precision mode: bf16 stays the default, but when a CUDA GPU is present
# that does not report bf16 support (e.g. a T4) fp16 is used instead, since
# requesting bf16 on such hardware can be rejected at argument validation.
use_fp16 = torch.cuda.is_available() and not torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=2,      # Can increase slightly if memory allows
    gradient_accumulation_steps=4,      # 2 x 4 = effective batch size of 8 per device
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    optim="paged_adamw_32bit",

    logging_strategy="steps",
    logging_steps=25,
    # Evaluation and checkpointing both happen once per epoch; the two
    # strategies are kept identical because load_best_model_at_end is enabled.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=2e-4,
    fp16=use_fp16,
    bf16=not use_fp16,
    max_grad_norm=0.3,
    # NOTE(review): the plain "constant" schedule likely ignores warmup, so
    # warmup_ratio may have no effect here — switch to "constant_with_warmup"
    # if a warmup phase is actually intended.
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    report_to=[],                       # no external experiment trackers
)

In [None]:
# Build the supervised fine-tuning trainer from the pieces prepared above.
sft_kwargs = dict(
    # what to train and how
    model=model,
    args=training_args,
    peft_config=peft_config,
    # data
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    # tokenization
    tokenizer=tokenizer,
    max_seq_length=512,
    packing=False,
)
trainer = SFTTrainer(**sft_kwargs)

print("\n🚀 Starting training...")

# Free whatever memory can be freed right before the training loop starts.
torch.cuda.empty_cache()
gc.collect()

# Run the fine-tuning loop.
trainer.train()

print("\n🎉 Training completed!")


In [None]:
print("\n🧪 Testing the fine-tuned model...")
# Re-enable the KV cache (it was disabled for training) and make sure the
# model is in eval mode so dropout layers are inactive during generation.
model.config.use_cache = True
model.eval()

prompt = "### Instruction:\nWrite a short story about a robot who discovers music.\n\n### Response:\n"

# Place the inputs on whatever device the model's weights live on instead of
# hard-coding "cuda".
input_device = next(model.parameters()).device
inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False).to(input_device)

# Generate text (max_length counts the prompt tokens as well).
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=250, pad_token_id=tokenizer.eos_token_id)

# Full decoded sequence (prompt + continuation), kept for later inspection.
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Isolate the continuation at the token level: slicing the decoded string by
# len(prompt) is fragile because decoding need not reproduce the prompt
# character-for-character.
prompt_token_count = inputs["input_ids"].shape[1]
generated_text = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

print("\n--- PROMPT ---")
print(prompt)
print("\n--- MODEL RESPONSE ---")
print(generated_text)  # Only the newly generated part
