# Qwen/Qwen2.5-3B-Instruct

### Google colab code for rendering

In [None]:
from google.colab import drive

# Attach Google Drive at /content/drive (no forced remount).
drive.mount('/content/drive', force_remount=False)

import os
from pathlib import Path

# --- Search settings (edit these if the notebook was renamed) ---
# Exact notebook filename to look for; walk() compares it case-insensitively.
TARGET_BASENAME = "Qwen2.5-3B-Instruct.ipynb"
# Fallback: any .ipynb whose name contains this text is listed as well.
ALSO_MATCH_SUBSTR = "qwen"

# Drive locations that get scanned; a root that does not exist is skipped.
roots = [
    "/content/drive/MyDrive",
    "/content/drive/Shared drives",
]

# Result lists that walk() appends to.
hits_exact = []
hits_sub = []

def walk(root):
    """Recursively scan ``root`` for ``.ipynb`` files and record the matches.

    Appends to the module-level lists ``hits_exact`` (filename equals
    ``TARGET_BASENAME``, compared case-insensitively) and ``hits_sub``
    (filename contains ``ALSO_MATCH_SUBSTR``, case-insensitively; skipped
    when that setting is empty). A file can land in both lists.

    Args:
        root: Directory path to start the scan from.

    Returns:
        None. Results are collected in ``hits_exact`` / ``hits_sub``.
    """
    # Path fragments that mark noisy system folders we never search.
    noisy_markers = ("/.Trash", "/.ipynb_checkpoints", "/System Volume", "/.shortcut-targets-by-id")

    # Fold the search terms once instead of once per file.
    wanted_name = TARGET_BASENAME.casefold()
    wanted_part = ALSO_MATCH_SUBSTR.casefold() if ALSO_MATCH_SUBSTR else ""

    for dirpath, dirnames, filenames in os.walk(root):
        if any(marker in dirpath for marker in noisy_markers):
            # Everything below this folder would match the same marker, so
            # stop os.walk from descending instead of skipping folder by folder.
            dirnames[:] = []
            continue

        # Prune noisy sub-folders in place so os.walk never lists them.
        # Every marker starts with "/" and contains no other "/", so testing
        # "/<name>" selects exactly the folders the path check above would skip.
        dirnames[:] = [
            d for d in dirnames
            if not any(marker in f"/{d}" for marker in noisy_markers)
        ]

        for f in filenames:
            if not f.lower().endswith(".ipynb"):
                continue
            folded = f.casefold()
            p = Path(dirpath) / f
            # exact (case-insensitive) filename match
            if folded == wanted_name:
                hits_exact.append(p)
            # substring match for exploration
            if wanted_part and wanted_part in folded:
                hits_sub.append(p)

# Scan every search root that is actually present on this runtime.
for root_dir in filter(os.path.exists, roots):
    walk(root_dir)


def _show(paths):
    """Print a 1-based numbered list of paths, or "None" when the list is empty."""
    if not paths:
        print("None")
        return
    for number, path in enumerate(paths, 1):
        print(f"{number}. {path}")


print("=== Exact filename matches ===")
_show(hits_exact)

print("\n=== Substring matches (to help you spot it) ===")
_show(hits_sub)

print("\nTIP: In the left Files pane, right-click your notebook → Copy path, and paste it here if needed.")


### Check the GPU and install dependencies

In [None]:
# Show which GPU (if any) is attached to this session.
!nvidia-smi
# Quietly (-q) install the fine-tuning libraries; the version specifiers are quoted.
!pip -q install "transformers>=4.43.3" "trl>=0.9.6" "peft>=0.12.0" "accelerate>=0.33.0" bitsandbytes datasets sentencepiece

Fri Sep  5 09:01:37 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### Load the base model (Qwen/Qwen2.5-3B-Instruct) + set up LoRA

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig

In [None]:
# Hub id of the checkpoint; passed to the tokenizer and model loaders below.
BASE_MODEL = "Qwen/Qwen2.5-3B-Instruct"

**🔹 Why we picked this config**

We are on Colab Free (Tesla T4, 16 GB VRAM).
A normal Qwen2.5-3B model in half precision (fp16) needs ~12–14 GB just to load. That leaves almost no room for training (gradients, optimizer, dataset). You’d hit OOM (out of memory) quickly.

**So we do:**

load_in_4bit=True → shrinks model weights by ~4×.

bnb_4bit_quant_type="nf4" → keeps accuracy high, better than plain fp4.

bnb_4bit_use_double_quant=True → squeezes memory further, at tiny cost.

bnb_4bit_compute_dtype=torch.bfloat16 → still computes in higher precision so training doesn’t collapse.

Result: the 3B model fits comfortably in T4 memory, and you can actually fine-tune it with LoRA.

🔹 **What happens if we don’t do this?**

Case 1: Load in fp16 (no quantization)

VRAM use: ~12–14 GB just for weights.

With optimizer + gradients, you need ~20+ GB → T4 crashes (OOM).

You couldn’t fine-tune at all.

Case 2: Load in int8 (8-bit)

VRAM use: ~8–10 GB.

Might just fit, but leaves very little for training.

Training can still OOM unless you keep batch size = 1 and seq length tiny.

Accuracy slightly better than 4-bit, but slower and heavier.

Case 3: Load in int4 (our config)

VRAM use: ~4–5 GB.

Leaves ~10 GB free for training.

Stable with nf4 + bfloat16.

Perfect for Colab Free.

In [None]:
# bfloat16 is only native on GPUs with compute capability >= 8.0 (Ampere and
# newer). The Tesla T4 is 7.5, so fall back to float16 there; that also matches
# the fp16=True training setting used later in this notebook.
_bf16_native = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
_compute_dtype = torch.bfloat16 if _bf16_native else torch.float16

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,                      # 4-bit weights so the model fits on a free-Colab GPU
    bnb_4bit_use_double_quant=True,         # extra memory saving
    bnb_4bit_quant_type="nf4",              # recommended 4-bit format
    bnb_4bit_compute_dtype=_compute_dtype,  # dtype used for the actual computation
)

In [None]:
# Load tokenizer

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    # No dedicated pad token: reuse EOS so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the right whether or not a pad token already existed. This used to
# sit inside the `if`, so it was skipped whenever a pad token was present.
tokenizer.padding_side = "right"


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


**Self-Attention**

Figures out which tokens should pay attention to which others.

Uses 4 linear projections:

q_proj → turns tokens into queries

k_proj → turns tokens into keys

v_proj → turns tokens into values

o_proj → outputs the weighted result

**Feed-Forward (MLP)**

After attention, tokens go through a big neural network.

In Qwen/LLaMA-style models these are:

gate_proj

up_proj

down_proj

Models are built from millions or billions of parameters, but with LoRA we only need to tune a small, specific set of them: the projection layers listed above.

In [None]:
# LoRA config

lora_cfg = LoraConfig(
    r=16,                    # rank (size of the adapter matrices)
    lora_alpha=32,           # scaling factor
    lora_dropout=0.05,       # helps to avoid overfitting
    bias="none",             # must be the string "none" (not None): no bias terms are trained
    task_type="CAUSAL_LM",   # this is a causal language model
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # feed-forward (MLP) projections
    ],
)

### Load & prepare dataset

In [None]:
from datasets import load_dataset

In [None]:
from datasets import load_dataset

# Load the tweet sentiment dataset ("sentiment" configuration of tweet_eval).
# Its "train" and "validation" splits are used below.
raw_ds = load_dataset("tweet_eval", "sentiment")

# Integer class id -> sentiment word used as the target text.
label_map = {
    0: "negative",
    1: "neutral",
    2: "positive",
}


def to_instruction(example):
    """Turn one raw row into an instruction / input / output record.

    ``example["text"]`` becomes the input and ``example["label"]`` is
    translated to its word through ``label_map``; the instruction is fixed.
    """
    tweet = example["text"]
    sentiment = label_map[example["label"]]
    return dict(
        instruction="Classify the sentiment of this tweet.",
        input=tweet,
        output=sentiment,
    )

# Rewrite every split into instruction records, dropping the raw columns.
ds = raw_ds.map(
    to_instruction,
    remove_columns=raw_ds["train"].column_names,
)

# Free Colab is slow: keep only the first 200 rows of each split
# (or the whole split when it has fewer than 200 rows).
ds_small = {
    split: ds[split].select(range(min(200, len(ds[split]))))
    for split in ("train", "validation")
}

print("Samples:", len(ds_small["train"]), "train /", len(ds_small["validation"]), "val")
print(ds_small["train"][0])


Samples: 200 train / 200 val
{'instruction': 'Classify the sentiment of this tweet.', 'input': '"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"', 'output': 'positive'}


### Prompt building

In [None]:
from datasets import DatasetDict

In [None]:
from datasets import DatasetDict

def build_prompt(example):
    """Format one instruction record as a single training string.

    Reads the optional "instruction", "input" and "output" keys (a missing
    key or a ``None`` value counts as empty; surrounding whitespace is
    stripped) and returns ``{"text": ...}`` with the system, user and
    assistant sections wrapped in bracketed tags.
    """
    def _field(key):
        # A missing key and an explicit None both collapse to "".
        return (example.get(key) or "").strip()

    system = "You are a helpful assistant."
    instruction_text = _field("instruction")
    input_text = _field("input")
    answer = _field("output")

    # User turn: both parts when present, otherwise whichever one exists.
    if not (instruction_text and input_text):
        user_turn = instruction_text or input_text
    else:
        user_turn = f"{instruction_text}\n\nInput:\n{input_text}"

    pieces = (
        f"<s>[SYSTEM]\n{system}\n[/SYSTEM]\n",
        f"[USER]\n{user_turn}\n[/USER]\n",
        f"[ASSISTANT]\n{answer}</s>",
    )
    return {"text": "".join(pieces)}

# Add the formatted "text" column to both the train and validation subsets.
ds_text = DatasetDict(
    {split: ds_small[split].map(build_prompt) for split in ("train", "validation")}
)

# Peek at the first 400 characters of one formatted example.
print(ds_text["train"][0]["text"][:400])


<s>[SYSTEM]
You are a helpful assistant.
[/SYSTEM]
[USER]
Classify the sentiment of this tweet.

Input:
"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"
[/USER]
[ASSISTANT]
positive</s>


### Train (QLoRA on T4-safe settings)

In [None]:
from trl import SFTTrainer, SFTConfig

In [None]:
# Base model, quantized through bnb_cfg, with device_map="auto".
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, quantization_config=bnb_cfg, device_map="auto")

In [None]:
import inspect  # required by the signature probing below; it was missing before

# One sequence per step, with gradients accumulated over 16 steps.
BATCH   = 1
ACCUM   = 16

cfg = SFTConfig(
    num_train_epochs=2,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    gradient_accumulation_steps=ACCUM,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    fp16=True,
    warmup_ratio=0.03,
    weight_decay=0.0,
    gradient_checkpointing=True,
    packing=True,
    report_to=[],
    remove_unused_columns=False,
)


# Arguments that are always passed to the trainer.
kw = dict(
    model=model,
    args=cfg,
    train_dataset=ds_text["train"],
    eval_dataset=ds_text["validation"],
    peft_config=lora_cfg,
)

# The SFTTrainer constructor differs between TRL releases, so pass the
# optional arguments only when this installed version accepts them.
sig = inspect.signature(SFTTrainer.__init__).parameters
if "dataset_text_field" in sig:
    kw["dataset_text_field"] = "text"
if "processing_class" in sig:
    kw["processing_class"] = tokenizer
elif "tokenizer" in sig:
    kw["tokenizer"] = tokenizer

trainer = SFTTrainer(**kw)
print("✅ SFTTrainer constructed")



✅ SFTTrainer constructed


### Quick inference (chat)

In [None]:
# === Minimal Gradio chat UI for your fine-tuned LoRA model ===
!pip -q install gradio>=4.0.0

import os, torch, gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# --- Base checkpoint and the folders where trained adapters may have been saved ---
BASE_MODEL = "Qwen/Qwen2.5-3B-Instruct"
ADAPTER_DIRS = [
    "/content/qwen3b-qlora_trainer",
    "/content/qwen3b-qlora",
]  # the first one that exists is used
# Adapter folders present right now; the tokenizer is looked for here first.
TOK_DIRS = list(filter(os.path.isdir, ADAPTER_DIRS))

# --- Pick up the tokenizer/model left over from earlier cells, if any ---
tokenizer = globals().get("tokenizer")
model = globals().get("model")

def ensure_tokenizer():
    """Return the shared tokenizer, loading it on first use.

    When the module-level ``tokenizer`` is still ``None`` it is loaded from
    the first entry of ``TOK_DIRS`` that is a directory, or from
    ``BASE_MODEL`` when there is none. EOS is reused as the pad token if no
    pad token is defined, and padding is set to the right side.
    """
    global tokenizer
    if tokenizer is None:
        # A saved adapter folder wins over the base checkpoint.
        source = BASE_MODEL
        for candidate in TOK_DIRS:
            if os.path.isdir(candidate):
                source = candidate
                break
        tokenizer = AutoTokenizer.from_pretrained(source, use_fast=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"
    return tokenizer

def ensure_model():
    """Return the shared model, loading it on first use.

    If the module-level ``model`` is already set (for example left over from
    the training cells, possibly LoRA-wrapped) it is returned unchanged.
    Otherwise ``BASE_MODEL`` is loaded in 4-bit with float16 compute, and
    the adapters from the first existing folder in ``ADAPTER_DIRS`` are
    attached; with no adapter folder the plain base model is used.

    Returns:
        The loaded model (a ``PeftModel`` when adapters were attached).
    """
    global model
    # Make sure the shared tokenizer exists alongside the model.
    ensure_tokenizer()
    if model is not None:
        # Already loaded (possibly LoRA-wrapped)
        return model

    # Load base in 4-bit (T4 friendly)
    bnb_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,  # T4 prefers fp16
    )
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_cfg,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    # This model is only used for generation, so leave the KV cache enabled;
    # turning it off is a training-time setting and only slows decoding.
    base.config.use_cache = True

    # If we have saved adapters, attach them; else just use base
    adapter_src = next((d for d in ADAPTER_DIRS if os.path.isdir(d)), None)
    model = PeftModel.from_pretrained(base, adapter_src) if adapter_src else base
    return model

def build_prompt(instruction, user_input):
    """Build the generation prompt, ending right after the assistant tag.

    Both arguments are stripped. A non-empty ``user_input`` is appended to
    the instruction under an "Input:" heading. The returned string contains
    the system and user sections followed by an open assistant section.
    """
    system = "You are a helpful assistant."
    user = instruction.strip()
    extra = user_input.strip()
    if extra:
        user = f"{user}\n\nInput:\n{extra}"
    sections = [
        f"<s>[SYSTEM]\n{system}\n[/SYSTEM]\n",
        f"[USER]\n{user}\n[/USER]\n",
        f"[ASSISTANT]\n",
    ]
    return "".join(sections)

@torch.inference_mode()
def generate(instruction, user_input, max_new_tokens, temperature, top_p):
    """Generate the assistant reply for one instruction / input pair.

    Args:
        instruction: Task description placed in the user section.
        user_input: Optional extra text appended under "Input:".
        max_new_tokens: Upper bound on generated tokens (cast to int).
        temperature: Sampling temperature (cast to float). A value of 0 or
            below switches to greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold (cast to float); only used when
            sampling.

    Returns:
        The generated text up to the first "</s>", stripped of surrounding
        whitespace.
    """
    tok = ensure_tokenizer()
    mdl = ensure_model()
    prompt = build_prompt(instruction, user_input)
    inputs = tok(prompt, return_tensors="pt").to(mdl.device)

    gen_kwargs = dict(
        max_new_tokens=int(max_new_tokens),
        eos_token_id=tok.eos_token_id,
        pad_token_id=tok.pad_token_id,
    )
    temperature = float(temperature)
    if temperature > 0.0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=float(top_p))
    else:
        # Sampling requires a strictly positive temperature; the UI slider
        # allows 0.0, so treat that as a request for greedy decoding.
        gen_kwargs["do_sample"] = False

    out = mdl.generate(**inputs, **gen_kwargs)

    # Decode only the newly generated tokens so the prompt (including any
    # "[ASSISTANT]" text typed by the user) cannot leak into the answer, and
    # drop the tokenizer's special tokens from the visible text.
    prompt_len = inputs["input_ids"].shape[1]
    text = tok.decode(out[0][prompt_len:], skip_special_tokens=True)
    # The training prompts end the answer with a literal "</s>".
    return text.split("</s>")[0].strip()

# Build the Gradio page: two text inputs, three generation sliders,
# a button and an output box. Statement order defines the layout.
with gr.Blocks(title="Qwen2.5-3B LoRA — Test UI") as demo:
    gr.Markdown("## 🔧 Test your fine-tuned Qwen2.5-3B (LoRA)")
    # First row: the task instruction and the text it applies to.
    with gr.Row():
        instruction = gr.Textbox(label="Instruction", value="Classify the sentiment of this tweet.")
        user_input  = gr.Textbox(label="Input", value="I absolutely adore this product! ❤️")
    # Second row: generation settings; temperature can be set as low as 0.0.
    with gr.Row():
        max_new_tokens = gr.Slider(16, 512, value=128, step=1, label="max_new_tokens")
        temperature    = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
        top_p          = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
    run_btn = gr.Button("Generate")
    output  = gr.Textbox(label="Model Output")

    # Clicking the button calls generate() with the five widget values,
    # in this order, and writes its return value to the output box.
    run_btn.click(
        fn=generate,
        inputs=[instruction, user_input, max_new_tokens, temperature, top_p],
        outputs=[output]
    )

demo.launch(share=False)  # set share=True if you want a public link

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

