In [1]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import bitsandbytes as bnb

# Pick the GPU when CUDA reports one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

model_name = "Qwen/Qwen2.5-3B"

# Tokenizer for the same checkpoint, loaded with use_fast=False.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# 4-bit NF4 quantization with a bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True,  # let the CPU help with the load
)

# device_map="auto" leaves the placement of the weights to the loader.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
print("Model loaded with 4-bit quantization (QLoRA base).")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.42s/it]

Model loaded with 4-bit quantization (QLoRA base).





In [2]:
class LoRALinear(nn.Module):
    """Low-rank adapter that computes the LoRA update ``scaling * x @ (B @ A).T``.

    The module owns only the two trainable low-rank factors. Its output is the
    *delta* that is meant to be added to the output of a (frozen) base
    projection; it does not contain or call the base projection itself.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        r: Rank of the decomposition (inner dimension of ``B @ A``).
        alpha: Numerator of the scaling factor ``alpha / r``.
        dropout: Dropout probability applied to the adapter input.
    """

    def __init__(self, in_features, out_features, r=16, alpha=16, dropout=0.05):
        super().__init__()
        self.r = r
        self.alpha = alpha
        self.dropout = nn.Dropout(dropout)

        # Low-rank trainable matrices.
        # A gets a small random init while B starts at zero, so B @ A == 0 and
        # a freshly created adapter leaves the base model's output unchanged.
        # Gradients still reach B on the first step because A is non-zero.
        self.A = nn.Parameter(torch.randn(r, in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(out_features, r))
        self.scaling = alpha / r

    def forward(self, x):
        """Return the low-rank update for ``x``.

        Args:
            x: Tensor whose last dimension is ``in_features``.

        Returns:
            Tensor with the same leading dimensions and dtype as ``x`` and a
            last dimension of ``out_features``.
        """
        # The adapter weights may be kept in a different precision than the
        # activations (e.g. float32 weights, bfloat16 activations); do the math
        # in the weight dtype and hand back the caller's dtype.
        adapter_input = self.dropout(x.to(self.A.dtype))

        # Two thin matmuls through the rank-r bottleneck instead of building
        # the full [out_features, in_features] matrix B @ A on every call.
        bottleneck = adapter_input @ self.A.T          # [..., r]
        update = (bottleneck @ self.B.T) * self.scaling  # [..., out_features]
        return update.to(x.dtype)

In [3]:
# Show every submodule whose qualified name mentions attention, using either
# of the two spellings ("attn" / "attention"), compared case-insensitively.
for name, module in model.named_modules():
    lowered_name = name.lower()
    if any(tag in lowered_name for tag in ("attn", "attention")):
        print(name, module)


model.layers.0.self_attn Qwen2Attention(
  (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
  (k_proj): Linear4bit(in_features=2048, out_features=256, bias=True)
  (v_proj): Linear4bit(in_features=2048, out_features=256, bias=True)
  (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
)
model.layers.0.self_attn.q_proj Linear4bit(in_features=2048, out_features=2048, bias=True)
model.layers.0.self_attn.k_proj Linear4bit(in_features=2048, out_features=256, bias=True)
model.layers.0.self_attn.v_proj Linear4bit(in_features=2048, out_features=256, bias=True)
model.layers.0.self_attn.o_proj Linear4bit(in_features=2048, out_features=2048, bias=False)
model.layers.0.post_attention_layernorm Qwen2RMSNorm((2048,), eps=1e-06)
model.layers.1.self_attn Qwen2Attention(
  (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
  (k_proj): Linear4bit(in_features=2048, out_features=256, bias=True)
  (v_proj): Linear4bit(in_features=2048, out_features=25

In [4]:
# NOTE: the attention blocks of this model are named "...self_attn" (see the
# module listing printed earlier in the notebook), so the name filter has to
# accept "attn" as well as "attention".
def _first_param_device(module):
    """Return the device of ``module``'s first parameter, or ``None``.

    ``None`` is returned when the module has no parameters or when the
    parameter sits on the "meta" device, because moving a freshly initialised
    adapter there would discard its values.
    # NOTE(review): offloaded weights are presumed to show up as "meta" tensors
    # under device_map="auto" -- confirm against the loader in use.
    """
    first_param = next(module.parameters(), None)
    if first_param is None or first_param.device.type == "meta":
        return None
    return first_param.device


def add_lora_to_model(model, r=16, alpha=16, dropout=0.05):
    """Attach ``LoRALinear`` adapters next to the q/v projections of attention blocks.

    For every submodule whose qualified name contains "attn" or "attention"
    (case-insensitive) and that exposes a ``q_proj`` and/or ``v_proj``
    attribute, a new adapter is stored on that submodule as ``q_proj_lora`` /
    ``v_proj_lora``, sized from the projection's ``in_features`` and
    ``out_features``. An adapter that already exists under that name is
    replaced.

    The adapters are only attached as attributes: this function does not alter
    the attention forward pass, so their output still has to be added to the
    corresponding projection's output by the caller.

    Args:
        model: Module tree to scan and modify in place.
        r: Rank passed to each ``LoRALinear``.
        alpha: Scaling numerator passed to each ``LoRALinear``.
        dropout: Dropout probability passed to each ``LoRALinear``.
    """
    # Snapshot the module list first: attaching adapters adds new submodules,
    # and the live named_modules() generator would otherwise walk into them.
    for name, module in list(model.named_modules()):
        lowered_name = name.lower()
        if "attn" not in lowered_name and "attention" not in lowered_name:
            continue

        for proj_name in ("q_proj", "v_proj"):
            projection = getattr(module, proj_name, None)
            if projection is None:
                continue

            adapter = LoRALinear(
                projection.in_features,
                projection.out_features,
                r=r,
                alpha=alpha,
                dropout=dropout,
            )
            # Keep the adapter on the same device as the projection it
            # accompanies so the two outputs can be combined directly.
            target_device = _first_param_device(projection)
            if target_device is not None:
                adapter = adapter.to(target_device)

            setattr(module, f"{proj_name}_lora", adapter)

add_lora_to_model(model)

# Report what was actually attached rather than assuming success: the name
# filter inside add_lora_to_model can match no module at all, in which case
# the model is left without any adapter.
num_lora_modules = sum(isinstance(m, LoRALinear) for m in model.modules())
if num_lora_modules > 0:
    print(f"LoRA modules added to attention layers: {num_lora_modules}")
else:
    print(
        "WARNING: no LoRA modules were added; "
        "check the module-name filter in add_lora_to_model."
    )

LoRA modules added to attention layers.


In [5]:
from datasets import load_dataset

# GLUE benchmark, SST-2 configuration (sentence / label / idx columns).
dataset = load_dataset(path="glue", name="sst2")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})


In [6]:
# Training loop -- skeleton only, nothing is optimised yet.
# AdamW receives just the parameters currently flagged with requires_grad.
# NOTE(review): that flag is the only filter, so any base-model parameter that
# still requires grad is included too -- confirm this is intended.
optimizer = torch.optim.AdamW(
    [param for param in model.parameters() if param.requires_grad],
    lr=1e-4,
)

# TO IMPLEMENT
# Placeholder pass over the first two training examples.
for batch in dataset["train"].select(range(2)):
    # TODO: tokenize batch, move to device, compute loss
    optimizer.zero_grad()
    # loss.backward()
    # optimizer.step()

print("Notebook setup complete. Training loop placeholder ready.")


Notebook setup complete. Training loop placeholder ready.
