In [None]:
# =============================================================================
# COLAB 2: LoRA FINE-TUNING (r=16) FOR SmolLM2-135M WITH UNSLOTH (FIXED)
# - Pre-tokenize + hard-truncate to <= 512 tokens (avoids CE shape mismatch)
# - W&B disabled by default (optional)
# - 4-bit friendly optimizer
# - Robust merge (supports different Unsloth versions)
# =============================================================================

# Cell 1: Install
# -----------------------------------------------------------------------------
# Bootstrap the notebook runtime. The `!` lines are IPython shell escapes, so
# this cell only works inside a notebook kernel, not as a plain Python script.
print("üì¶ Installing Unsloth and dependencies...")
# 1) Unsloth from the package index, together with its dependencies.
!pip install -q unsloth
# 2) Upgrade Unsloth itself to the GitHub head; --no-deps keeps the packages
#    resolved in step 1 untouched.
!pip install -q --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# 3) Libraries imported or relied on by the cells below.
!pip install -q datasets trl transformers accelerate bitsandbytes

print("‚úÖ Installation complete!")

# Cell 2: Imports & environment
# -----------------------------------------------------------------------------
import os
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# Weights & Biases reporting is switched off by default; change the value to
# "false" to turn it on (the TrainingArguments cell reads this variable).
os.environ["WANDB_DISABLED"] = "true"

# If TorchDynamo still interferes with Unsloth fused CE, uncomment:
# os.environ["TORCHDYNAMO_DISABLE"] = "1"

# Runtime report: framework version, CUDA availability, and the GPU name.
_cuda_ready = torch.cuda.is_available()
print(f"üî• PyTorch: {torch.__version__}")
print(f"üéÆ CUDA available: {_cuda_ready}")
_gpu_label = torch.cuda.get_device_name(0) if _cuda_ready else "None"
print(f"üéØ GPU: {_gpu_label}")

# Cell 3: Configuration
# -----------------------------------------------------------------------------
# Model loading knobs (forwarded to FastLanguageModel.from_pretrained).
max_seq_length = 512   # per-example token budget; also the truncation limit
dtype = None           # handed to the loader unchanged
load_in_4bit = True    # load the base weights 4-bit quantized

# LoRA adapter hyperparameters.
USE_LORA = True        # informational only: shown in the banner below
lora_r = 16
lora_alpha = 16
lora_dropout = 0.05

# Optimisation schedule.
batch_size = 2
gradient_accumulation_steps = 4
num_train_epochs = 1   # has no effect while max_steps > 0
learning_rate = 2e-4   # slightly higher than usual, as is common for LoRA
max_steps = 50

# Echo the effective settings so they are captured in the notebook output.
print("\n".join([
    "",
    "üîß Configuration:",
    f"   ‚Ä¢ Using LoRA: {USE_LORA}",
    f"   ‚Ä¢ LoRA Rank (r): {lora_r}",
    f"   ‚Ä¢ LoRA Alpha: {lora_alpha}",
    f"   ‚Ä¢ LoRA Dropout: {lora_dropout}",
    f"   ‚Ä¢ Max Sequence Length: {max_seq_length}",
    f"   ‚Ä¢ Batch Size: {batch_size}",
    f"   ‚Ä¢ Learning Rate: {learning_rate}",
    f"   ‚Ä¢ Max Steps: {max_steps}",
    "",
]))

# Cell 4: Load model & tokenizer
# -----------------------------------------------------------------------------
print("üì• Loading SmolLM2-135M model...")
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"

# Load the base checkpoint and its tokenizer through Unsloth, using the
# sequence length, dtype and 4-bit settings from the configuration cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Ensure PAD/EOS & truncation are explicit
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if getattr(model.config, "pad_token_id", None) is None:
    # Take the id from the tokenizer's pad token (guaranteed set by the branch
    # above) so model and tokenizer can never disagree about padding.
    model.config.pad_token_id = tokenizer.pad_token_id
tokenizer.model_max_length = max_seq_length
tokenizer.truncation_side = "right"

print(f"‚úÖ Model loaded: {model_name}")
try:
    print(f"üìä Base params: {model.num_parameters() / 1e6:.1f}M")
except Exception as exc:
    # The count is purely informational, so a failure must not stop the
    # notebook, but say why it is missing instead of hiding the error.
    # `Exception` (not a bare except) lets KeyboardInterrupt through.
    print(f"(base parameter count unavailable: {exc!r})")

# Cell 5: Apply LoRA adapters (attention + MLP only)
# -----------------------------------------------------------------------------
print("üîß Applying LoRA adapters...")
# Wrap the base model with LoRA adapters on the attention and MLP projections;
# rank, alpha and dropout come from the configuration cell.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # MLP projections
        "gate_proj", "up_proj", "down_proj",
    ],
    bias="none",
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)

# Count trainable vs. total parameters in a single walk over the model.
trainable_params = 0
total_params = 0
for _param in model.parameters():
    _size = _param.numel()
    total_params += _size
    if _param.requires_grad:
        trainable_params += _size

print(
    f"‚úÖ Trainable: {trainable_params/1e6:.2f}M / {total_params/1e6:.2f}M "
    f"({100*trainable_params/total_params:.2f}%)"
)

# Cell 6: Load dataset (same subset for fair comparison)
# -----------------------------------------------------------------------------
print("üìö Loading dataset (500 examples)...")
# Only the first 500 training rows are used.
raw_ds = load_dataset(path="yahma/alpaca-cleaned", split="train[:500]")
print(f"‚úÖ Dataset size: {len(raw_ds)}")
# Show the first record so the column layout is visible in the output.
print("üìù Sample:", raw_ds[0], sep="\n")

# Cell 7: Build prompts, pre-tokenize, hard-truncate to <= 512 tokens
# -----------------------------------------------------------------------------
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

# End-of-sequence marker appended to every training text.
EOS_TOKEN = tokenizer.eos_token


def build_texts(examples):
    """Render a batch of Alpaca records into full prompt strings.

    Args:
        examples: Batched mapping with parallel "instruction", "input" and
            "output" lists.

    Returns:
        ``{"text": [...]}`` with one formatted prompt per record, each ending
        in ``EOS_TOKEN``.
    """
    def _render(inst, inp, out):
        # A non-empty input is appended to the instruction on a new line.
        prompt_head = inst + "\n" + inp if inp else inst
        return alpaca_prompt.format(prompt_head, out) + EOS_TOKEN

    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {"text": [_render(*row) for row in rows]}


print("üß± Formatting to prompt strings...")
fmt_ds = raw_ds.map(build_texts, batched=True)

def tokenize_and_truncate(examples):
    """Tokenize a batch of prompt strings for causal-LM training.

    Sequences are truncated to ``max_seq_length`` and left unpadded; no special
    tokens are added by the tokenizer.

    Args:
        examples: Batched mapping containing a "text" list.

    Returns:
        The tokenizer output extended with "labels" (a copy of "input_ids")
        and, when the tokenizer did not supply one, an all-ones
        "attention_mask".
    """
    batch = tokenizer(
        examples["text"],
        add_special_tokens=False,
        padding=False,
        truncation=True,
        max_length=max_seq_length,
    )
    token_rows = batch["input_ids"]
    # Causal LM objective: the targets are the inputs themselves.
    batch["labels"] = [list(row) for row in token_rows]
    if "attention_mask" not in batch:
        batch["attention_mask"] = [[1] * len(row) for row in token_rows]
    return batch


print("‚úÇÔ∏è Tokenizing with hard truncation...")
# Drop every pre-existing column so only the tokenized fields remain.
tok_ds = fmt_ds.map(
    tokenize_and_truncate,
    remove_columns=list(fmt_ds.features),
    batched=True,
)

# Sanity check: the longest row must respect the truncation limit.
max_len = max(map(len, tok_ds["input_ids"]))
print("üîé Max tokenized length (should be ‚â§ 512):", max_len)

# Cell 8: Training arguments
# -----------------------------------------------------------------------------
print("‚öôÔ∏è Configuring TrainingArguments...")
# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
_bf16_ok = torch.cuda.is_bf16_supported()
# W&B reporting follows the WANDB_DISABLED variable (treated as "true" if unset).
_wandb_off = os.environ.get("WANDB_DISABLED", "true").lower() == "true"

training_args = TrainingArguments(
    output_dir="./lora_finetuned_smollm2",
    # batching
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # schedule (max_steps takes precedence over num_train_epochs)
    max_steps=max_steps,
    num_train_epochs=num_train_epochs,
    warmup_steps=5,
    lr_scheduler_type="linear",
    # optimiser
    learning_rate=learning_rate,
    weight_decay=0.01,
    optim="adamw_bnb_8bit",                   # 4-bit friendly
    # precision
    bf16=_bf16_ok,
    fp16=not _bf16_ok,
    # logging / checkpoints
    logging_steps=5,
    save_strategy="steps",
    save_steps=25,
    report_to=[] if _wandb_off else ["wandb"],
    seed=3407,
)

# Cell 9: Trainer (use tokenized dataset; no dataset_text_field)
# -----------------------------------------------------------------------------
print("üèãÔ∏è Initializing trainer...")
# The dataset is already tokenized and truncated, so no text field is named
# and packing stays off.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tok_ds,
    packing=False,
    max_seq_length=max_seq_length,
)
print("‚úÖ Trainer ready!")

# Cell 10: Train
# -----------------------------------------------------------------------------
print("üöÄ Starting LoRA fine-tuning...")
print("=" * 60)
train_result = trainer.train()
metrics = train_result.metrics or {}
print("=" * 60)
print("‚úÖ Training complete!")
print("üìä Stats:")
# The metrics dict carries no step count (the recorded run printed "N/A"), so
# fall back to the `global_step` field of the object returned by train().
steps_done = metrics.get(
    "train_steps",
    metrics.get("global_step", getattr(train_result, "global_step", "N/A")),
)
print("   ‚Ä¢ Steps:", steps_done)
print("   ‚Ä¢ Train loss:", metrics.get("train_loss", "N/A"))
print("   ‚Ä¢ Time (s):", metrics.get("train_runtime", "N/A"))

# Cell 11: Save LoRA adapters
# -----------------------------------------------------------------------------
# Write the fine-tuned model and the tokenizer into one directory.
# NOTE(review): presumably this stores only the LoRA adapter weights (the
# recorded run reports a 24M directory) - confirm against the PEFT version used.
print("üíæ Saving LoRA adapters...")
model.save_pretrained("smollm2_lora_adapters")
tokenizer.save_pretrained("smollm2_lora_adapters")
print("‚úÖ Adapters ‚Üí ./smollm2_lora_adapters")
print("üîç Size:")
# IPython shell escape (notebook-only): show the directory's on-disk size.
!du -sh smollm2_lora_adapters

# Cell 12: Save merged model (optional; single artifact)
# -----------------------------------------------------------------------------
print("\nüíæ Merging LoRA adapters with base model (optional)...")
merged_ok = False

# Stage 1: the Unsloth helper, when this Unsloth version provides it.
try:
    model.save_pretrained_merged(
        "smollm2_lora_merged",
        tokenizer,
        save_method="merged_16bit",
    )
except Exception as e:
    print("   save_pretrained_merged unavailable, trying manual merge:", repr(e))
else:
    merged_ok = True

# Stage 2: manual merge, attempted only if stage 1 failed. A failure here is
# reported but tolerated, because the adapters were already saved separately.
if not merged_ok:
    try:
        FastLanguageModel.merge_lora_weights(model)
        model.save_pretrained("smollm2_lora_merged")
        tokenizer.save_pretrained("smollm2_lora_merged")
    except Exception as e2:
        print("   Manual merge failed (not critical):", repr(e2))
    else:
        merged_ok = True

if merged_ok:
    print("‚úÖ Merged model ‚Üí ./smollm2_lora_merged")
else:
    print("‚ÑπÔ∏è Skipping merge (adapters are saved and usable).")

# Cell 13: Inference test
# -----------------------------------------------------------------------------
print("\nüß™ Inference test...\n")
# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)


def build_prompt_for_infer(instruction, input_text=""):
    """Return the Alpaca prompt for *instruction* with an empty response slot.

    A non-empty *input_text* is appended to the instruction on a new line,
    matching how the training prompts were built.
    """
    if input_text:
        instruction = instruction + "\n" + input_text
    return alpaca_prompt.format(instruction, "")

@torch.inference_mode()
def generate_response(instruction, input_text=""):
    """Sample a completion for *instruction* and return only the answer text.

    The prompt is built with ``build_prompt_for_infer``, up to 128 new tokens
    are sampled (temperature 0.7, top-p 0.9), and everything up to and
    including the last "### Response:" marker is stripped from the decoded
    output.
    """
    prompt = build_prompt_for_infer(instruction, input_text)
    encoded = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)

    # EOS serves both as the stop token and as the padding token.
    stop_id = tokenizer.eos_token_id
    sampling = dict(max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.9)
    generated = model.generate(
        **encoded,
        **sampling,
        pad_token_id=stop_id,
        eos_token_id=stop_id,
    )

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    # Keep what follows the final marker (the whole text if the marker is absent).
    _, _, answer = decoded.rpartition("### Response:")
    return answer.strip()

# Banner for the qualitative check.
print("=" * 60, "INFERENCE RESULTS (LoRA Fine-tuned Model)", "=" * 60, sep="\n")

# Instructions used to eyeball the fine-tuned model.
tests = [
    "What is machine learning?",
    "Write a Python function to calculate Fibonacci numbers.",
    "Explain the water cycle in simple terms.",
]
for i, instruction in enumerate(tests, start=1):
    # Print the header first so it is visible while generation is running.
    print(f"\n[Test {i}]")
    print(f"Instruction: {instruction}")
    print(f"Response: {generate_response(instruction)}")
    print("-" * 60)

# Cell 14: Summary
# -----------------------------------------------------------------------------
# Static recap of the run. Every value below is a literal, not read from the
# configuration variables, so update this text by hand when the model name,
# LoRA settings, dataset slice or step count change.
print("""
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë             LORA FINE-TUNING COMPLETE ‚Äî SUMMARY            ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
‚Ä¢ Model: HuggingFaceTB/SmolLM2-135M-Instruct
‚Ä¢ LoRA: r=16 over attention & MLP (dropout=0.05)
‚Ä¢ Data: 500 Alpaca examples (hard-truncated to 512 tokens)
‚Ä¢ Steps: 50 (demo)
‚Ä¢ Saved:
    - Adapters: ./smollm2_lora_adapters  (tiny files)
    - Merged model (optional): ./smollm2_lora_merged
""")


üì¶ Installing Unsloth and dependencies...
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
‚úÖ Installation complete!
üî• PyTorch: 2.8.0+cu126
üéÆ CUDA available: True
üéØ GPU: Tesla T4

üîß Configuration:
   ‚Ä¢ Using LoRA: True
   ‚Ä¢ LoRA Rank (r): 16
   ‚Ä¢ LoRA Alpha: 16
   ‚Ä¢ LoRA Dropout: 0.05
   ‚Ä¢ Max Sequence Length: 512
   ‚Ä¢ Batch Size: 2
   ‚Ä¢ Learning Rate: 0.0002
   ‚Ä¢ Max Steps: 50

üì• Loading SmolLM2-135M model...
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading 

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

‚úÇÔ∏è Tokenizing with hard truncation...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


üîé Max tokenized length (should be ‚â§ 512): 512
‚öôÔ∏è Configuring TrainingArguments...
üèãÔ∏è Initializing trainer...
‚úÖ Trainer ready!
üöÄ Starting LoRA fine-tuning...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 4,884,480 of 139,399,488 (3.50% trained)


Step,Training Loss
5,2.1522
10,1.838
15,1.9809
20,1.8436
25,1.662
30,1.6786
35,1.6251
40,1.6697
45,1.4621
50,1.5704


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


‚úÖ Training complete!
üìä Stats:
   ‚Ä¢ Steps: N/A
   ‚Ä¢ Train loss: 1.7482541847229003
   ‚Ä¢ Time (s): 122.2901
üíæ Saving LoRA adapters...
‚úÖ Adapters ‚Üí ./smollm2_lora_adapters
üîç Size:
24M	smollm2_lora_adapters

üíæ Merging LoRA adapters with base model (optional)...
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 1 files from cache to `smollm2_lora_merged`: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.52it/s]


Successfully copied all 1 files from cache to `smollm2_lora_merged`
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 304.73it/s]
Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:02<00:00,  2.96s/it]


Unsloth: Merge process complete. Saved to `/content/smollm2_lora_merged`
‚úÖ Merged model ‚Üí ./smollm2_lora_merged

üß™ Inference test...

INFERENCE RESULTS (LoRA Fine-tuned Model)

[Test 1]
Instruction: What is machine learning?
Response: Machine learning is a branch of artificial intelligence that focuses on enabling computers to learn from data and improve their performance over time. It involves using algorithms and statistical techniques to analyze and interpret data to identify patterns and relationships that can inform predictions, classify new data, and make informed decisions. Machine learning is used in a variety of applications, including but not limited to:

- Predicting stock prices and trading volumes
- Analyzing customer behavior and sales data
- Identifying spam and phishing attempts
- Understanding and classifying complex data sets

Machine learning is commonly used in various industries, such as:

- Healthcare: Predicting patient outcomes,
--------------------------