In [1]:
!pip install transformers datasets peft accelerate bitsandbytes trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.19.1-py3-none-any.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gc

def get_memory_usage():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        return round(torch.cuda.memory_allocated() / (1024**2), 2)  # in MB
    else:
        return "No CUDA available"

# Load tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Full precision model
print("\n--- Loading Full Precision Model ---")
torch.cuda.empty_cache()
full_model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
print("Memory Used (Full Precision):", get_memory_usage(), "MB")

# Print dtype
print("Full precision dtype:", full_model.transformer.h[0].mlp.c_fc.weight.dtype)

del full_model
torch.cuda.empty_cache()
gc.collect()

# Quantized model (4-bit)
print("\n--- Loading 4-bit Quantized Model ---")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

quant_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
print("Memory Used (4-bit Quantized):", get_memory_usage(), "MB")
print("Quantized model dtype:", quant_model.transformer.h[0].mlp.c_fc.weight.dtype)



--- Loading Full Precision Model ---
Memory Used (Full Precision): 763.86 MB
Full precision dtype: torch.float32

--- Loading 4-bit Quantized Model ---
Memory Used (4-bit Quantized): 320.2 MB
Quantized model dtype: torch.uint8


In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from datasets import load_dataset
from peft import PromptTuningConfig, get_peft_model, TaskType, PromptTuningInit

# 🔹 Load dataset (SST-2 sentiment classification)
dataset = load_dataset("sst2", split="train[:5000]")  # small sample for demo

# 🔹 Model to use
model_name = "gpt2"

# 🔹 Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT2 needs this

# 🔹 Load quantized model using QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# 🔹 Prompt tuning config
prompt_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=20,
    tokenizer_name_or_path=model_name,
    prompt_tuning_init_text="Classify the sentiment:",
)

# 🔹 Add Prompt Tuning layer to model
model = get_peft_model(model, prompt_config)

# 🔹 Prepare the dataset for causal LM format
def format_sample(example):
    text = f"Sentiment: {example['sentence']} ->"
    label = " positive" if example["label"] == 1 else " negative"
    full_input = text + label
    tokenized = tokenizer(full_input, padding="max_length", truncation=True, max_length=64)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = dataset.map(format_sample, remove_columns=dataset.column_names)




Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [8]:
# 🔹 Training arguments
training_args = TrainingArguments(
    output_dir="./qlora-prompt-gpt2",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_steps=10,
    save_steps=50,
    learning_rate=5e-4,
    remove_unused_columns=False,
    fp16=True,
    report_to="none"
)

# 🔹 Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,6.9478
20,6.4945
30,6.1166
40,5.7039
50,5.2313
60,4.6737
70,4.4271
80,4.0862
90,3.8404
100,3.3801


TrainOutput(global_step=6250, training_loss=1.1814487664794922, metrics={'train_runtime': 446.1079, 'train_samples_per_second': 56.04, 'train_steps_per_second': 14.01, 'total_flos': 816537600000000.0, 'train_loss': 1.1814487664794922, 'epoch': 5.0})

In [10]:
input_text = "Sentiment: I love reading about history ->"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sentiment: I love reading about history -> negative
