In [None]:
!pip install -U \
  bitsandbytes \
  transformers \
  peft \
  accelerate \
  datasets


In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)


In [None]:
# --- Run configuration: the values a reader is most likely to tune ---

# Identifier of the base checkpoint handed to `from_pretrained`.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

# Directory holding the JSONL splits, and the two split files inside it.
DATA_DIR = "/kaggle/input/instruct-dataset"
TRAIN_FILE, VAL_FILE = (
    os.path.join(DATA_DIR, split_file)
    for split_file in ("train.jsonl", "val.jsonl")
)

# Output directory for the saved model/tokenizer files.
ADAPTER_DIR = "./adapters"

# Token cap used when tokenizing examples (longer inputs are truncated).
MAX_SEQ_LEN = 512

In [None]:
# Load the tokenizer published with the base checkpoint.
# NOTE(review): trust_remote_code=True allows repository code to run locally;
# presumably unnecessary for this model family -- confirm before removing.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Batch padding needs a pad token; fall back to EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Read both JSONL files with the "json" loader, keyed by split name.
dataset = load_dataset(
    "json",
    data_files=dict(train=TRAIN_FILE, validation=VAL_FILE),
)

# Show what was loaded.
print(dataset)


In [None]:
def format_prompt(example):
    """Render one example as a three-section training prompt.

    Args:
        example: Mapping with ``"instruction"``, ``"input"`` and ``"output"``
            entries; a missing key raises ``KeyError``.

    Returns:
        A string with "### Instruction:", "### Input:" and "### Response:"
        sections, each header followed by its value on the next line and the
        sections separated by one blank line.
    """
    sections = (
        ("### Instruction:", example["instruction"]),
        ("### Input:", example["input"]),
        ("### Response:", example["output"]),
    )
    return "\n\n".join(f"{header}\n{body}" for header, body in sections)


In [None]:
def tokenize_fn(example):
    """Tokenize one example into causal-LM training features.

    The example is rendered with ``format_prompt`` and the tokenizer's
    end-of-sequence token is appended, so the training text ends with an
    explicit stop marker after the response.

    Args:
        example: A single dataset row; passed unchanged to ``format_prompt``.

    Returns:
        The tokenizer output (truncated to ``MAX_SEQ_LEN`` tokens, unpadded)
        with an added ``"labels"`` entry that is a copy of ``"input_ids"``.
    """
    # Without an explicit EOS the sequence simply stops after the response and
    # the model gets no signal for when to end its answer.
    # Assumes the tokenizer does not append EOS on its own -- TODO confirm for
    # MODEL_NAME. If truncation kicks in, the EOS is cut off with the tail.
    text = format_prompt(example) + (tokenizer.eos_token or "")
    tokens = tokenizer(
        text,
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding=False,
    )
    # Labels mirror the inputs: the loss covers prompt and response tokens.
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

# Tokenize every split one example at a time and drop the original columns
# (taken from the train split) so only the tokenize_fn outputs remain.
tokenized_ds = dataset.map(
    tokenize_fn,
    batched=False,
    remove_columns=dataset["train"].column_names,
)

# Show the resulting splits.
print(tokenized_ds)


In [None]:
# bitsandbytes settings: 4-bit NF4 weights, float16 compute, no double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the base model with the quantization config above and pass it straight
# through PEFT's k-bit training preparation.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
)


In [None]:
# LoRA adapter settings: rank 16, alpha 32, 5% dropout, no bias terms trained,
# applied to the modules named q_proj and v_proj.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared model with the adapter configuration.
# NOTE: re-running this cell wraps the already-wrapped model again.
model = get_peft_model(model, lora_config)


In [None]:
def print_trainable_params(model):
    """Print trainable and total parameter counts and the trainable share.

    Args:
        model: Any object whose ``named_parameters()`` yields ``(name, param)``
            pairs where ``param`` has ``numel()`` and ``requires_grad``.

    Prints three lines: the trainable count, the total count, and the
    trainable percentage with two decimals. A model without parameters is
    reported as 0.00% instead of raising ``ZeroDivisionError``.
    """
    trainable = 0
    total = 0
    for _, param in model.named_parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count

    # Guard the ratio: an empty model has nothing to divide by.
    percent = 100 * trainable / total if total else 0.0

    print(f"Trainable params: {trainable}")
    print(f"Total params: {total}")
    print(f"Trainable %: {percent:.2f}%")

# Report how many of the wrapped model's parameters require gradients.
print_trainable_params(model)


In [None]:
# Hyper-parameters and bookkeeping for the Trainer run.
training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # 2 sequences x 2 accumulation steps = 4 sequences per optimizer step per device.
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    # Evaluate at the end of every epoch; without an evaluation strategy the
    # eval_dataset handed to the Trainer is never used during training.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Mixed precision in float16, matching the float16 compute dtype of the
    # 4-bit quantization config.
    fp16=True,
    bf16=False,
    optim="adamw_torch",
    report_to="none",
    run_name="qwen-qlora-go",
)


In [None]:
from transformers import DataCollatorWithPadding
import torch

class MinimalCausalLMCollator(DataCollatorWithPadding):
    """Pad a batch of causal-LM features, including their ``labels``.

    The parent collator pads the tokenizer fields. ``labels`` are kept out of
    that call and padded here with ``-100`` (PyTorch's default ``ignore_index``
    for cross-entropy) up to the padded ``input_ids`` length.
    """

    def __call__(self, features):
        """Collate ``features`` into one padded batch.

        Args:
            features: List of per-example mappings; each must contain a
                ``"labels"`` list next to the tokenizer fields. The mappings
                are not modified.

        Returns:
            The parent collator's batch with an added ``"labels"`` tensor of
            dtype ``torch.long`` whose second dimension equals that of
            ``batch["input_ids"]``.

        Raises:
            KeyError: If a feature has no ``"labels"`` entry.
        """
        # Work on copies so the caller's feature dicts keep their labels.
        labels = [list(f["labels"]) for f in features]
        inputs = [
            {key: value for key, value in f.items() if key != "labels"}
            for f in features
        ]

        batch = super().__call__(inputs)

        max_len = batch["input_ids"].size(1)

        # Pad labels on the same side the tokenizer pads input_ids; padding on
        # the right for a left-padding tokenizer would misalign labels and tokens.
        pad_left = getattr(self.tokenizer, "padding_side", "right") == "left"

        padded_labels = []
        for seq in labels:
            filler = [-100] * (max_len - len(seq))
            padded_labels.append(filler + seq if pad_left else seq + filler)

        batch["labels"] = torch.tensor(padded_labels, dtype=torch.long)
        return batch


In [None]:
# Collator that pads each batch (inputs and labels) at batch-assembly time.
data_collator = MinimalCausalLMCollator(tokenizer=tokenizer)

# Bring model, run arguments, tokenized splits and collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
)


In [None]:
# Run the fine-tuning loop configured by `training_args`; checkpoints are
# saved once per epoch (save_strategy="epoch") under output_dir.
trainer.train()

In [None]:
# Write the PEFT-wrapped model and the tokenizer files into ADAPTER_DIR.
model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)

# Confirm where the files went.
print("Adapter saved to", ADAPTER_DIR)

In [None]:
!zip -r adaptermodel.zip ./adapters