In [8]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
import bitsandbytes as bnb

# Pick the CUDA device when PyTorch reports one, otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Hugging Face model identifier shared by the tokenizer and the model below.
model_name = "Qwen/Qwen2.5-3B"

# use_fast=False requests the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Weights are loaded in bfloat16; no quantization config is passed here.
# NOTE(review): with device_map="auto" the recorded output reports that some
# parameters were offloaded to disk/CPU (meta device) — confirm this is intended
# before training.
model = AutoModelForCausalLM.from_pretrained(
    model_name, dtype=torch.bfloat16, device_map="auto"
)

print("Model loaded. 4-bit quantization not applied yet.")


Using device: cuda


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 47.51it/s]
Some parameters are on the meta device because they were offloaded to the disk and cpu.


Model loaded. 4-bit quantization not applied yet.


In [9]:
class LoRALinear(nn.Module):
    """Low-rank adapter that computes the LoRA update ``scaling * x @ (B @ A).T``.

    The module holds only the two low-rank factors; it has no base weight.
    Its output is the additive delta, so the caller must add it to the
    output of the projection it adapts.

    Args:
        in_features: Size of the last dimension of the input ``x``.
        out_features: Size of the last dimension of the returned delta.
        r: Rank of the decomposition. Must be positive.
        alpha: Scaling numerator; the delta is multiplied by ``alpha / r``.
        dropout: Dropout probability applied to the adapter input.

    Raises:
        ValueError: If ``r`` is not positive.
    """

    def __init__(self, in_features, out_features, r=16, alpha=16, dropout=0.05):
        super().__init__()
        if r <= 0:
            raise ValueError(f"LoRA rank r must be positive, got {r}")
        self.r = r
        self.alpha = alpha
        self.dropout = nn.Dropout(dropout)

        # Low-rank trainable matrices.
        # A gets a small random init; B starts at zero so that B @ A == 0 and
        # the adapter contributes nothing until training updates B.
        self.A = nn.Parameter(torch.randn(r, in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(out_features, r))
        self.scaling = alpha / r

    def forward(self, x):
        """Return the low-rank delta for ``x`` (last dim ``in_features``)."""
        # Dropout acts on the adapter input rather than on the produced delta.
        x = self.dropout(x)
        # Two thin matmuls (through rank r) instead of building the full
        # [out_features, in_features] matrix B @ A on every call.
        low_rank = x @ self.A.T          # [..., r]
        return (low_rank @ self.B.T) * self.scaling  # [..., out_features]

In [10]:
# NOTE: Hugging Face Qwen2-style decoder layers name their attention submodule
# `self_attn`, which does not contain the substring "attention"; both "attn"
# and "attention" are therefore accepted below.
def add_lora_to_model(model, r=16, alpha=16, dropout=0.05):
    """Attach LoRA adapters beside the q/v projections of attention modules.

    For every submodule whose qualified name contains "attention" or "attn"
    and which has a ``q_proj`` and/or ``v_proj`` attribute, a ``LoRALinear``
    with matching ``in_features``/``out_features`` is registered on that
    submodule as ``q_proj_lora`` / ``v_proj_lora``.

    This function only registers the adapters as attributes. It does not
    change any forward pass, does not alter ``requires_grad`` on existing
    parameters, and does not move the adapters to another device or dtype.

    Args:
        model: Module whose submodules are scanned via ``named_modules()``.
        r: LoRA rank forwarded to ``LoRALinear``.
        alpha: LoRA scaling numerator forwarded to ``LoRALinear``.
        dropout: Dropout probability forwarded to ``LoRALinear``.

    Returns:
        int: Number of adapters attached (0 means no module matched).
    """
    attached = 0
    # Iterate over a snapshot so the adapters registered inside the loop are
    # not themselves walked by the still-running generator.
    for name, module in list(model.named_modules()):
        lowered = name.lower()
        if "attention" not in lowered and "attn" not in lowered:
            continue
        for proj_name in ("q_proj", "v_proj"):
            proj = getattr(module, proj_name, None)
            if proj is None:
                continue
            adapter = LoRALinear(
                proj.in_features,
                proj.out_features,
                r=r,
                alpha=alpha,
                dropout=dropout,
            )
            setattr(module, f"{proj_name}_lora", adapter)
            attached += 1
    return attached

add_lora_to_model(model)

# Count the adapters that are actually registered on the model instead of
# reporting success unconditionally; add_lora_to_model registers them under
# the attribute names q_proj_lora / v_proj_lora.
lora_module_count = sum(
    name.endswith(("q_proj_lora", "v_proj_lora"))
    for name, _ in model.named_modules()
)
print(f"LoRA modules added to attention layers: {lora_module_count}")

LoRA modules added to attention layers.


In [11]:
from datasets import load_dataset

# Load the "sst2" configuration of the "glue" dataset and show its splits.
dataset = load_dataset(path="glue", name="sst2")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})


In [12]:
# Training loop — skeleton only; no forward or backward pass runs yet.

# AdamW receives every parameter that currently has requires_grad set.
# NOTE(review): no visible cell sets requires_grad=False on the base model,
# so this presumably includes the base weights as well — confirm before training.
optimizer = torch.optim.AdamW(
    (param for param in model.parameters() if param.requires_grad),
    lr=1e-4,
)

# TO IMPLEMENT
# Placeholder pass over the first two training rows.
# NOTE(review): iterating the selected Dataset presumably yields one example
# at a time rather than a batch — confirm when adding tokenization.
for batch in dataset["train"].select(range(2)):
    # TODO: tokenize batch, move to device, compute loss
    optimizer.zero_grad()
    # loss.backward()
    # optimizer.step()

print("Notebook setup complete. Training loop placeholder ready.")


Notebook setup complete. Training loop placeholder ready.
