In [None]:
%%capture
# 1. Install Unsloth & Dependencies (Latest Stable Versions)
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
# ====================================================
# QWEN 2.5 3B: THE MASTER SCRIPT
# ====================================================
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# 1. CONFIGURATION
# ----------------
# These three names are reused by later cells, so they stay module-level.
max_seq_length = 2048   # context length handed to the loader and the trainer
dtype = None            # None = let the loader pick (original note: Float16 on a T4)
load_in_4bit = True     # 4-bit weights (original note: mandatory for 16GB VRAM)

# 2. LOAD MODEL
# ----------------
# The Instruct checkpoint is used as the starting point because it already
# follows instructions. All loader options are gathered in one mapping so
# they can be inspected before the (slow) download/load call runs.
base_model_options = dict(
    model_name="unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

# 3. ATTACH ADAPTERS (The Brain Surgery)
# ----------------
# Rank of the LoRA matrices (original note: high rank for vocabulary acquisition).
lora_rank = 64

# Every attention and MLP projection gets an adapter.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
    "gate_proj", "up_proj", "down_proj",      # MLP projections
]

model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=lora_target_modules,
    lora_alpha=2 * lora_rank,               # alpha is kept at twice the rank (= 128)
    lora_dropout=0,                         # original note: 0 is best for QLoRA
    bias="none",
    use_gradient_checkpointing="unsloth",   # original note: saves massive VRAM
    random_state=3407,
)

In [None]:
# 4. DATA FORMATTING (The Bilingual Bridge)
# ----------------
# Swap in the Qwen 2.5 chat template so that chat-style rows are rendered in
# the layout the base model was trained on (original note: this prevents
# "Gibberish" hallucinations).
tokenizer = get_chat_template(
    tokenizer,
    chat_template="qwen-2.5",
)

def formatting_prompts_func(examples):
    """Turn one batch of raw rows into the ``text`` column used for training.

    Each row is rendered in one of two layouts:

    * Chat mode - chosen when the row's ``source`` is ``"synthetic"`` or its
      instruction contains the Arabic keyword ``جاوب``. The instruction (plus
      the input on a new line, when the input is not blank) becomes the user
      turn and the output becomes the assistant turn; the conversation is
      rendered with the module-level ``tokenizer``'s chat template.
    * Meaning mode - every other row. Rendered as plain completion text:
      ``"<input>\\n<output><|im_end|>"``, or ``"<output><|im_end|>"`` when the
      input is blank. The explicit ``<|im_end|>`` marks where generation
      should stop.

    Args:
        examples: Column-oriented batch (mapping of column name to list) with
            ``instruction``, ``input`` and ``output`` columns and an optional
            ``source`` column.

    Returns:
        dict: ``{"text": [...]}`` with one formatted string per input row.
    """
    # Keyword that marks a chat-style instruction. It was previously stored as
    # mis-decoded (mojibake) text, which can never match correctly encoded
    # Arabic instructions.
    chat_keyword = "جاوب"

    inputs = examples["input"]
    # "source" is optional; rows without it are labelled "unknown".
    source_types = examples.get("source", ["unknown"] * len(inputs))

    texts = []
    for src, instruction, input_text, output in zip(
        source_types, examples["instruction"], inputs, examples["output"]
    ):
        # Null fields arrive as None. Treat them as empty strings so the
        # keyword test cannot raise and the literal text "None" is never
        # written into the training data.
        instruction = "" if instruction is None else instruction
        output = "" if output is None else output
        # Whitespace-only inputs count as "no input"; a real input is used as-is.
        has_input = bool(input_text and input_text.strip())

        if src == "synthetic" or chat_keyword in instruction:
            # === CHAT MODE ===
            user_msg = f"{instruction}\n{input_text}" if has_input else instruction
            convo = [
                {"role": "user", "content": user_msg},
                {"role": "assistant", "content": output},
            ]
            text = tokenizer.apply_chat_template(
                convo, tokenize=False, add_generation_prompt=False
            )
        elif has_input:
            # === MEANING MODE: context/English line, then the Darja line ===
            text = f"{input_text}\n{output}<|im_end|>"
        else:
            # === MEANING MODE: raw Darja text only ===
            text = f"{output}<|im_end|>"

        texts.append(text)
    return {"text": texts}

# Load the training set (the original note describes it as 7501 rows) from
# the local JSON file. The raw rows are kept under their own name so that
# re-running the formatting step never starts from already-formatted data.
raw_dataset = load_dataset("json", data_files="final_train.json", split="train")

# Fail early with a readable message instead of a KeyError raised from inside
# map(). "source" is optional and therefore not checked here.
missing_columns = {"instruction", "input", "output"} - set(raw_dataset.column_names)
if missing_columns:
    raise ValueError(
        f"final_train.json is missing required columns: {sorted(missing_columns)}"
    )

# Adds a "text" column holding the fully rendered training string for each row.
dataset = raw_dataset.map(formatting_prompts_func, batched = True)

print(f"✅ Loaded and formatted {len(dataset)} rows.")

In [None]:
# Install psutil just in case
!pip install psutil

# Force-feed psutil to the system
import psutil
import builtins
builtins.psutil = psutil  # <--- This makes it global everywhere

print("Fix applied. Now run your training cell.")

In [None]:
# 5. THE TRAINER
# ----------------
# Mixed precision: bf16 when the GPU supports it, fp16 otherwise. The
# capability is queried once so both flags are guaranteed to be consistent.
bf16_supported = torch.cuda.is_bf16_supported()

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",         # column holding the rendered training strings
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # FALSE is safer for mixed data types
    args = TrainingArguments(
        per_device_train_batch_size = 2, # original note: T4 limit
        gradient_accumulation_steps = 4, # Effective batch = 2 * 4 = 8
        # 2 epochs over ~7.5k rows at an effective batch of 8 is ~1875 optimizer
        # steps, so 50 warmup steps is roughly 2.7% of the run (not ~5%).
        warmup_steps = 50,
        num_train_epochs = 2,
        learning_rate = 2e-4,            # Standard QLoRA rate
        fp16 = not bf16_supported,
        bf16 = bf16_supported,
        logging_steps = 1,               # log the loss at every optimizer step
        optim = "adamw_8bit",            # Saves VRAM
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs_qwen",
    ),
)

print("🚀 Starting Qwen 2.5 Training...")
trainer.train()

In [None]:
# 6. SAVE ADAPTERS TO DRIVE (NOT GGUF YET)
import os

drive_mount_point = "/content/drive"
adapter_dir = os.path.join(drive_mount_point, "MyDrive", "Hackathon_Qwen-Darja_Adapters")

# Nothing earlier in this notebook mounts Google Drive. If it is not mounted,
# save_pretrained() would write to the VM's temporary disk and the adapters
# would be lost on the runtime restart requested below, so stop here instead.
# The trained model stays in memory: mount Drive, then re-run this cell.
if not os.path.ismount(drive_mount_point):
    raise RuntimeError(
        f"Google Drive is not mounted at {drive_mount_point}; "
        "mount it and re-run this cell before restarting the runtime."
    )

print("Saving adapters to Drive...")
model.save_pretrained(adapter_dir)
# Save the tokenizer next to the adapters so the chat template configured for
# training is available when the adapters are loaded again.
tokenizer.save_pretrained(adapter_dir)
print("✅ Adapters saved! Now RESTART your runtime.")