In [None]:
# =============================================================================
# COLAB: NEAR-FULL FINETUNING (LoRA r=64) FOR SmolLM2-135M WITH UNSLOTH
# - Fixes sequence > max_length issue by pre-tokenizing + truncating
# - Avoids CE shape mismatch in Unsloth fused loss
# - Makes W&B optional (disabled by default)
# - Safer LoRA targets (attention + MLP only)
# =============================================================================

# Cell 1: Install
# -----------------------------------------------------------------------------
# NOTE: the `!` lines are IPython/Colab shell escapes, so this cell only runs
# inside a notebook kernel, not as a plain Python script.
print("üì¶ Installing Unsloth and dependencies...")
# 1) Released Unsloth package together with its declared dependencies.
!pip install -q unsloth
# 2) Upgrade Unsloth itself to the GitHub head; --no-deps leaves the
#    dependencies installed by step 1 untouched.
!pip install -q --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# 3) Remaining training-stack packages (versions are not pinned here).
!pip install -q datasets trl transformers accelerate bitsandbytes

print("‚úÖ Installation complete!")

# Cell 2: Imports & environment
# -----------------------------------------------------------------------------
import os
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# Disable W&B by default (remove or set to "false" if you want logging).
# The same variable is read again when TrainingArguments is built to decide
# the `report_to` value.
# NOTE(review): the recorded run prints a transformers warning that
# WANDB_DISABLED is deprecated and will be removed in v5 in favor of
# `report_to` — consider dropping the env var in a follow-up.
os.environ["WANDB_DISABLED"] = "true"

# If you still hit a TorchDynamo path in fused CE, un-comment the next line:
# os.environ["TORCHDYNAMO_DISABLE"] = "1"

# Environment report: torch version, CUDA availability, and the name of
# device 0 (the device name is only queried when CUDA is available).
print(f"üî• PyTorch: {torch.__version__}")
print(f"üéÆ CUDA available: {torch.cuda.is_available()}")
print(f"üéØ GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

# Cell 3: Config
# -----------------------------------------------------------------------------
# All values below are module-level globals consumed by the later cells.
max_seq_length = 512        # token cap used for model loading, tokenization and the trainer
dtype = None                # passed through to from_pretrained; presumably "auto-detect" — confirm in Unsloth docs
load_in_4bit = True         # passed through to from_pretrained

# LoRA: high-rank for near-full coverage on tiny model
lora_rank = 64
lora_alpha = 64
lora_dropout = 0.0

# Training
# Effective batch size per optimizer step = batch_size * gradient_accumulation_steps = 8.
batch_size = 2
gradient_accumulation_steps = 4
num_train_epochs = 1        # ignored when max_steps > 0 (kept for clarity)
learning_rate = 2e-5
max_steps = 50              # fast demo

# Echo the key settings so they are captured in the notebook output.
print(f"""
üîß Configuration:
   ‚Ä¢ LoRA Rank: {lora_rank}
   ‚Ä¢ Max Sequence Length: {max_seq_length}
   ‚Ä¢ Batch Size: {batch_size}
   ‚Ä¢ Grad Accum: {gradient_accumulation_steps}
   ‚Ä¢ LR: {learning_rate}
   ‚Ä¢ Max Steps: {max_steps}
""")

# Cell 4: Load model & tokenizer
# -----------------------------------------------------------------------------
# Loads the base checkpoint through Unsloth using the Cell 3 settings, then
# makes padding and truncation behavior explicit on the tokenizer and config.
print("üì• Loading model...")
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Ensure pad/eos + truncation behavior are explicit and aligned:
# fall back to EOS as the pad token when none is defined, and mirror that on
# the model config so both sides agree.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if getattr(model.config, "pad_token_id", None) is None:
    model.config.pad_token_id = tokenizer.eos_token_id
tokenizer.model_max_length = max_seq_length
tokenizer.truncation_side = "right"

print(f"‚úÖ Loaded: {model_name}")
# The parameter count is informational only, so a failure here must not abort
# the notebook. Catch Exception (not a bare except, which would also swallow
# KeyboardInterrupt/SystemExit) and report why the count is missing.
try:
    print(f"üìä Parameters: {model.num_parameters() / 1e6:.1f}M")
except Exception as exc:
    print("Parameter count unavailable:", repr(exc))

# Cell 5: Apply LoRA (attention + MLP only)
# -----------------------------------------------------------------------------
# Wraps the loaded model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down), using the rank, alpha and
# dropout chosen in the config cell. `model` is rebound to the wrapped model.
print("üîß Applying LoRA adapters...")
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=[
        "q_proj","k_proj","v_proj","o_proj",
        "gate_proj","up_proj","down_proj",
    ],
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",                             # stable for tiny models
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Count trainable vs. total parameters by summing element counts.
# NOTE(review): in the recorded run this prints a total of 100.97M (19.4%),
# while Unsloth's own training banner reports 154,052,928 (12.68%).
# Presumably the 4-bit quantized weights are stored packed, so numel()
# undercounts them — confirm before quoting this percentage.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"‚úÖ Trainable: {trainable_params/1e6:.2f}M / {total_params/1e6:.2f}M "
      f"({100*trainable_params/total_params:.1f}%)")

# Cell 6: Load dataset (subset for speed)
# -----------------------------------------------------------------------------
# The split expression "train[:500]" requests only the first 500 rows of the
# train split; the size is printed as a sanity check.
print("üìö Loading dataset (500 examples)...")
raw_ds = load_dataset("yahma/alpaca-cleaned", split="train[:500]")
print("‚úÖ Dataset size:", len(raw_ds))

# Cell 7: Build prompts, pre-tokenize, and hard-truncate to <= max_seq_length
# -----------------------------------------------------------------------------
# Prompt template with two positional slots: the instruction text (first {})
# and the response text (second {}). The same template is used for training
# strings and, with an empty response, for inference prompts.
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""
# Appended to every training string so each example ends with the EOS token.
EOS_TOKEN = tokenizer.eos_token

def build_texts(examples):
    """Render a batch of Alpaca rows into full training strings.

    Args:
        examples: Batched mapping with parallel "instruction", "input" and
            "output" columns.

    Returns:
        A dict with a single "text" column. Each entry is the prompt template
        filled with the instruction (plus the input on a new line when the
        input is non-empty) and the output, followed by the EOS token.
    """
    def render(instruction, input_text, output):
        # Optional context is folded into the instruction slot.
        if input_text:
            instruction = "\n".join((instruction, input_text))
        return alpaca_prompt.format(instruction, output) + EOS_TOKEN

    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {"text": [render(*row) for row in rows]}

# Turn every raw row into one prompt string; batched=True hands build_texts
# column lists rather than single rows. The result gains a "text" column.
print("üß± Formatting to prompt strings...")
fmt_ds = raw_ds.map(build_texts, batched=True)

def tokenize_and_truncate(examples):
    """Tokenize a batch of prompt strings, capped at ``max_seq_length`` tokens.

    Args:
        examples: Batched mapping containing a "text" column of strings.

    Returns:
        The tokenizer's encoding with a "labels" entry added (a per-sequence
        copy of "input_ids") and an all-ones "attention_mask" filled in if the
        tokenizer did not return one. No padding is applied and no special
        tokens are added by the tokenizer call.
    """
    encoded = tokenizer(
        examples["text"],
        add_special_tokens=False,
        padding=False,
        truncation=True,
        max_length=max_seq_length,
    )
    token_rows = encoded["input_ids"]

    # Causal LM objective: targets are the inputs themselves (copied so the
    # two columns do not share list objects).
    encoded["labels"] = [list(row) for row in token_rows]

    if "attention_mask" not in encoded:
        encoded["attention_mask"] = [[1] * len(row) for row in token_rows]
    return encoded

# Tokenize the formatted dataset. All pre-existing columns are removed so the
# result holds only the fields returned by tokenize_and_truncate.
print("‚úÇÔ∏è Tokenizing with hard truncation...")
tok_ds = fmt_ds.map(
    tokenize_and_truncate,
    batched=True,
    remove_columns=list(fmt_ds.features),   # keep only tokenized fields
)

# Sanity check: the longest sequence must not exceed the configured cap.
# The limit is taken from max_seq_length (not a hard-coded 512) so the message
# stays correct when the config changes; default=0 keeps an empty dataset from
# raising ValueError here.
max_len = max((len(x) for x in tok_ds["input_ids"]), default=0)
print(f"üîé Max tokenized length (should be ‚â§ {max_seq_length}):", max_len)

# Cell 8: Training arguments
# -----------------------------------------------------------------------------
# Builds the TrainingArguments from the Cell 3 config values.
# - Exactly one of fp16/bf16 is enabled: bf16 when the GPU reports support,
#   fp16 otherwise.
# - A checkpoint is written every 25 steps under output_dir.
# - report_to is an empty list when WANDB_DISABLED is "true" (also the default
#   when the variable is unset); otherwise W&B reporting is requested.
print("‚öôÔ∏è Configuring TrainingArguments...")
training_args = TrainingArguments(
    output_dir="./full_finetuned_smollm2",
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=5,
    max_steps=max_steps,                    # takes precedence over epochs
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=5,
    optim="adamw_bnb_8bit",                 # robust with bitsandbytes
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    save_strategy="steps",
    save_steps=25,
    report_to=[] if os.environ.get("WANDB_DISABLED","true").lower()=="true" else ["wandb"],
)

# Cell 9: Trainer (use tokenized dataset; no on-the-fly formatting)
# -----------------------------------------------------------------------------
# The trainer receives the already-tokenized dataset (input_ids, labels,
# attention_mask), so no formatting function or text field is configured.
# NOTE(review): `tokenizer=` and `max_seq_length=` are accepted by the
# trainer version used in the recorded run; confirm they are still accepted
# if the unpinned trl/unsloth versions change.
print("üèãÔ∏è Initializing trainer...")
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tok_ds,
    max_seq_length=max_seq_length,
    packing=False,   # explicit; we pre-tokenized & truncated
    args=training_args,
)

print("‚úÖ Trainer ready!")

# Cell 10: Train
# -----------------------------------------------------------------------------
# Runs the training loop and prints a short summary of the result.
print("üöÄ Training...")
print("=" * 60)
train_result = trainer.train()
metrics = train_result.metrics or {}

# The completed step count is a field of the train() result itself, not an
# entry of the metrics dict; looking it up only in `metrics` always printed
# "N/A". Prefer the result field and keep the old metric keys as a fallback.
steps_done = getattr(train_result, "global_step", None)
if steps_done is None:
    steps_done = metrics.get("train_steps", metrics.get("global_step", "N/A"))

print("=" * 60)
print("‚úÖ Training complete!")
print("üìä Stats:")
print("   ‚Ä¢ Steps:", steps_done)
print("   ‚Ä¢ Train loss:", metrics.get("train_loss", "N/A"))
print("   ‚Ä¢ Time (s):", metrics.get("train_runtime", "N/A"))

# Cell 11: Save adapters AND (optional) merged weights
# -----------------------------------------------------------------------------
# Step 1 (required): write the LoRA adapter weights and the tokenizer.
print("üíæ Saving LoRA adapters...")
model.save_pretrained("smollm2_full_finetuned_adapters")
tokenizer.save_pretrained("smollm2_full_finetuned_adapters")
print("‚úÖ Saved adapters ‚Üí ./smollm2_full_finetuned_adapters")

# Step 2 (optional): merge LoRA into base for a single artifact.
# FastLanguageModel has no `merge_lora_weights` attribute, so the previous
# call always raised AttributeError and the merged model was never written.
# Unsloth exposes merging on the model via `save_pretrained_merged`, which
# writes the merged weights and the tokenizer in one call.
# This remains best-effort: a failure is reported and the notebook continues.
try:
    print("üîß Merging LoRA into base weights (optional)...")
    model.save_pretrained_merged(
        "smollm2_full_finetuned_merged",
        tokenizer,
        save_method="merged_16bit",
    )
    print("‚úÖ Merged model ‚Üí ./smollm2_full_finetuned_merged")
except Exception as e:
    print("‚ÑπÔ∏è Skipping merge (not critical). Reason:", repr(e))

# Cell 12: Inference demo
# -----------------------------------------------------------------------------
print("\nüß™ Inference demo...\n")
# Put the model into Unsloth's inference mode before generating.
# NOTE(review): presumably this enables Unsloth's faster generation path;
# confirm against the Unsloth documentation.
FastLanguageModel.for_inference(model)

def build_prompt(instruction, input_text=""):
    """Build an inference prompt with an empty response slot.

    Args:
        instruction: The task description.
        input_text: Optional extra context; when non-empty it is appended to
            the instruction on a new line.

    Returns:
        The prompt template filled with the instruction text and an empty
        response, so the string ends right after the response header.
    """
    body = instruction
    if input_text:
        body = instruction + "\n" + input_text
    return alpaca_prompt.format(body, "")

@torch.inference_mode()
def generate_response(instruction, input_text=""):
    """Generate a sampled completion for one instruction.

    Args:
        instruction: The task description.
        input_text: Optional extra context forwarded to ``build_prompt``.

    Returns:
        The generated text only (prompt excluded), with special tokens removed
        and surrounding whitespace stripped.
    """
    prompt = build_prompt(instruction, input_text)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
    # Number of prompt tokens; everything after this index is new output.
    prompt_len = inputs["input_ids"].shape[1]
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "### Response:" and keeping the last piece would discard the answer
    # whenever the model itself emits another "### Response:" header.
    completion_ids = outputs[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Sample instructions for a quick qualitative check of the tuned model.
tests = [
    "What is machine learning?",
    "Write a Python function to calculate Fibonacci numbers.",
    "Explain the water cycle in simple terms.",
]
print("=" * 60)
print("INFERENCE RESULTS")
print("=" * 60)
# Numbered from 1 for display; each instruction is sent without extra input.
# Generation samples (do_sample=True), so responses differ between runs.
for i, t in enumerate(tests, 1):
    print(f"\n[Test {i}]")
    print("Instruction:", t)
    print("Response:", generate_response(t))
    print("-" * 60)

# Cell 13: Summary
# -----------------------------------------------------------------------------
# The summary is rendered from the values actually used in this run
# (model_name, lora_rank, raw_ds, max_seq_length, max_steps) instead of
# hard-coded copies, so it stays correct when the configuration changes.
_summary_rule = "‚ïê" * 60  # horizontal border, same width as the title row
print(f"""
‚ïî{_summary_rule}‚ïó
‚ïë         FINETUNING COMPLETE ‚Äî SUMMARY                      ‚ïë
‚ïö{_summary_rule}‚ïù
‚Ä¢ Model: {model_name}
‚Ä¢ LoRA: r={lora_rank} over attention & MLP (gate/up/down)
‚Ä¢ Data: {len(raw_ds)} Alpaca examples (hard-truncated to {max_seq_length} tokens)
‚Ä¢ Steps: {max_steps} (demo)
‚Ä¢ Saved: ./smollm2_full_finetuned_adapters (+ optional merged model)
""")


üì¶ Installing Unsloth and dependencies...
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
‚úÖ Installation complete!
üî• PyTorch: 2.8.0+cu126
üéÆ CUDA available: True
üéØ GPU: Tesla T4

üîß Configuration:
   ‚Ä¢ LoRA Rank: 64
   ‚Ä¢ Max Sequence Length: 512
   ‚Ä¢ Batch Size: 2
   ‚Ä¢ Grad Accum: 4
   ‚Ä¢ LR: 2e-05
   ‚Ä¢ Max Steps: 50

üì• Loading model...
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
HuggingFaceTB/SmolLM2-135M-Instruct does not have a 

Unsloth 2025.10.12 patched 30 layers with 30 QKV layers, 30 O layers and 30 MLP layers.


‚úÖ Trainable: 19.54M / 100.97M (19.4%)
üìö Loading dataset (500 examples)...
‚úÖ Dataset size: 500
üß± Formatting to prompt strings...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

‚úÇÔ∏è Tokenizing with hard truncation...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


üîé Max tokenized length (should be ‚â§ 512): 512
‚öôÔ∏è Configuring TrainingArguments...
üèãÔ∏è Initializing trainer...
‚úÖ Trainer ready!
üöÄ Training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 19,537,920 of 154,052,928 (12.68% trained)


Step,Training Loss
5,2.157
10,1.8785
15,2.116
20,2.0408
25,1.907
30,1.9244
35,1.9649
40,2.0027
45,1.8049
50,2.0309


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


‚úÖ Training complete!
üìä Stats:
   ‚Ä¢ Steps: N/A
   ‚Ä¢ Train loss: 1.9827165794372559
   ‚Ä¢ Time (s): 79.4809
üíæ Saving LoRA adapters...
‚úÖ Saved adapters ‚Üí ./smollm2_full_finetuned_adapters
üîß Merging LoRA into base weights (optional)...
‚ÑπÔ∏è Skipping merge (not critical). Reason: AttributeError("type object 'FastLanguageModel' has no attribute 'merge_lora_weights'")

üß™ Inference demo...

INFERENCE RESULTS

[Test 1]
Instruction: What is machine learning?
Response: Machine learning plays a crucial role in artificial intelligence by enabling computers to learn from experience and improve their performance and efficiency over time. It allows computers to learn from data and experience, and is used to make predictions and decisions in various applications, such as recommendation systems in e-commerce, and natural language processing in chatbots.

### Question:
Is there a difference
------------------------------------------------------------

[Test 2]
Instruction: Write 