# Prompt Tuning

In this notebook, we will look into how to perform prompt tuning for a text classification task.

Load the required libraries and the config parameters

In [1]:
from collections import Counter

In [2]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
#import wandb

In [3]:
#wandb.init(project="prompt_learning_methods", name="prompt_tuning")

In [4]:
# Run configuration for the whole notebook.
# Seed handed to `set_seed` on the last line of this cell.
seed = 42
# Device that batches and tokenized inputs are moved to with `.to(device)`.
device = "cuda"
model_name_or_path = "meta-llama/Llama-3.2-3B-Instruct"
# NOTE(review): not referenced in the visible cells; the tokenizer is loaded from `model_name_or_path`.
tokenizer_name_or_path = "meta-llama/Llama-3.2-3B-Instruct"
# Configuration name passed to `load_dataset` for the "ought/raft" dataset.
dataset_name = "twitter_complaints"
# Column holding the raw tweet and column holding the label as text.
text_column = "Tweet text"
label_column = "text_label"
# Fixed token length every preprocessed example is padded/cut to.
max_length = 64
lr = 1e-4
num_epochs = 10
batch_size = 8
set_seed(seed)

## Dataset Preparation

### Load the dataset

In [5]:
dataset = load_dataset(path="ought/raft", name=dataset_name)

In [6]:
classes = [k.replace("_", " ") for k in dataset["train"].features["Label"].names]
print(classes)

['Unlabeled', 'complaint', 'no complaint']


In [7]:
dataset = dataset.map(
    lambda x: {"text_label": [classes[label] for label in x["Label"]]},
    batched=True,
    num_proc=1,
)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Tweet text', 'ID', 'Label', 'text_label'],
        num_rows: 50
    })
    test: Dataset({
        features: ['Tweet text', 'ID', 'Label', 'text_label'],
        num_rows: 3399
    })
})


In [8]:
dataset["train"][0]

{'Tweet text': '@HMRCcustomers No this is my first job',
 'ID': 0,
 'Label': 2,
 'text_label': 'no complaint'}

In [9]:
Counter(dataset["train"]["Label"])

Counter({2: 33, 1: 17})

### Preprocess the dataset

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) #token=hf_token

In [11]:
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Longest class label measured in token ids from a default tokenizer call
# (presumably includes any special tokens the tokenizer adds - TODO confirm).
# NOTE(review): `target_max_length` is only printed; no visible cell reads it afterwards.
target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes])
print(f"{target_max_length=}")

target_max_length=4


In [12]:
dataset.data['train']['Tweet text']

<pyarrow.lib.ChunkedArray object at 0x000001E1FD64C2E0>
[
  [
    "@HMRCcustomers No this is my first job",
    "@KristaMariePark Thank you for your interest! If you decide to cancel, you can call Customer Care  (... 17 chars omitted)",
    "If I can't get my 3rd pair of @beatsbydre powerbeats to work today I'm doneski man. This is a slap (... 42 chars omitted)",
    "@EE On Rosneath Arial having good upload and download speeds but terrible latency 200ms. Why is th (... 3 chars omitted)",
    "Couples wallpaper, so cute. :) #BrothersAtHome",
    ...
    "@asblough Yep! It should send you a notification with your driver’s name and what time they’ll (... 15 chars omitted)",
    "@Wavy2Timez for real",
    "@KenyaPower_Care  no power in south b area... is it scheduled.",
    "Honda won't do anything about water leaking in brand new car. Frustrated! @HondaCustSvc @AmericanH (... 4 chars omitted)",
    "@CBSNews @Dodge @ChryslerCares My driver side air bag has been recalled and replaced, bu

In [13]:
def preprocess_function(examples):
    """Tokenize a batch into fixed-length causal-LM training features.

    Every example is rendered as ``"<text_column> : <tweet>"``, a newline and
    ``"Label : "``, followed by the label text and one EOS token. Sequences are
    left-padded with ``tokenizer.pad_token_id`` up to ``max_length``.

    Sequences longer than ``max_length`` keep their LAST ``max_length`` tokens.
    Cutting from the right (the previous behavior) removed the label and EOS
    tokens, leaving such examples with no supervised position at all. The
    trade-off is that the leading tokens of an over-long prompt are dropped.

    Reads the notebook-level names ``tokenizer``, ``text_column``,
    ``label_column`` and ``max_length``.

    Args:
        examples: Batched mapping of column name to list of values, as passed
            by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prompts with ``input_ids`` and
        ``attention_mask`` rewritten per example and a ``labels`` entry added.
        Label positions covering the prompt and the padding are ``-100``.
    """
    prompts = [f"{text_column} : {x}\nLabel : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(prompts)
    # No special tokens on the targets: they are concatenated onto the prompt ids.
    target_tokens = tokenizer(targets, add_special_tokens=False)

    all_labels = []
    for i in range(len(prompts)):
        prompt_ids = model_inputs["input_ids"][i]
        answer_ids = target_tokens["input_ids"][i] + [tokenizer.eos_token_id]

        input_ids = prompt_ids + answer_ids
        # Only the answer tokens carry a label; prompt positions are masked out.
        label_ids = [-100] * len(prompt_ids) + answer_ids
        attention_mask = [1] * len(input_ids)

        # A non-positive pad length yields empty lists, i.e. no padding.
        pad_len = max_length - len(input_ids)
        input_ids = [tokenizer.pad_token_id] * pad_len + input_ids
        attention_mask = [0] * pad_len + attention_mask
        label_ids = [-100] * pad_len + label_ids

        # Keep the tail so the label and EOS survive truncation.
        model_inputs["input_ids"][i] = input_ids[-max_length:]
        model_inputs["attention_mask"][i] = attention_mask[-max_length:]
        all_labels.append(label_ids[-max_length:])

    model_inputs["labels"] = all_labels
    return model_inputs

processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/3399 [00:00<?, ? examples/s]

In [14]:
train_dataset = processed_datasets["train"]
# NOTE(review): evaluation reuses the training split, so the eval loss and
# perplexity printed during training are not held-out metrics
# (presumably because the test split carries no usable labels - TODO confirm).
eval_dataset = processed_datasets["train"]


# Training batches are shuffled; evaluation batches keep dataset order.
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

In [15]:
next(iter(train_dataloader))

{'input_ids': tensor([[128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128000,  49462,
            1495,    551,    571,  79833,    299,  14895,   1455,   8659,    279,
           15629,    369,    757,    198,   2535,    551,    220,   2201,  12458,
          128009],
         [128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128000,  49462,   1495,    551,    571,   2149,   3716,     49,    638,
              78,     34,   5518,  21694,   9523,    369,   2109, 

In [16]:
def test_preprocess_function(examples):
    """Tokenize a batch of unlabeled examples into fixed-length prompts.

    Every example is rendered as ``"<text_column> : <tweet>"``, a newline and
    ``"Label : "``, then left-padded with ``tokenizer.pad_token_id`` up to
    ``max_length``.

    Prompts longer than ``max_length`` keep their LAST ``max_length`` tokens.
    Cutting from the right (the previous behavior) removed the trailing
    ``"Label : "`` cue, so the prompt no longer asked for a label. The
    trade-off is that the leading tokens of an over-long prompt are dropped.

    Reads the notebook-level names ``tokenizer``, ``text_column`` and
    ``max_length``.

    Args:
        examples: Batched mapping of column name to list of values, as passed
            by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output with ``input_ids`` and ``attention_mask``
        rewritten per example to the padded/cut version.
    """
    prompts = [f"{text_column} : {x}\nLabel : " for x in examples[text_column]]
    model_inputs = tokenizer(prompts)

    for i in range(len(prompts)):
        input_ids = model_inputs["input_ids"][i]
        attention_mask = model_inputs["attention_mask"][i]

        # A non-positive pad length yields empty lists, i.e. no padding.
        pad_len = max_length - len(input_ids)
        input_ids = [tokenizer.pad_token_id] * pad_len + input_ids
        attention_mask = [0] * pad_len + attention_mask

        # Keep the tail so the "Label : " cue survives truncation.
        model_inputs["input_ids"][i] = input_ids[-max_length:]
        model_inputs["attention_mask"][i] = attention_mask[-max_length:]
    return model_inputs


test_dataset = dataset["test"].map(
    test_preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/3399 [00:00<?, ? examples/s]

In [17]:
test_dataloader = DataLoader(test_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

In [18]:
next(iter(test_dataloader))

{'input_ids': tensor([[128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128000,
           49462,   1495,    551,    571,     33,   9225,   5515,   1898,  11361,
             369,  11889,    856,  10280,   3118,    449,    279,   4868,   9313,
            2103,    389,    433,      0,  10532,  16535,    916,  16843,     40,
              15,  22272,     87,  12188,     52,     17,    198,   2535,    551,
             220],
         [128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128000,  49462,   1495,    551,  21603,
             287,    709,    389,    674,   3097,   5319,  45796, 

## Create the PEFT model, Optimizer and LR Scheduler

## Prompt Tuning config

In [19]:
# Text whose tokens seed the virtual prompt (TEXT initialization below).
prompt_tuning_init_text="Classify if the tweet is a complaint or no complaint.\n"
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    # One virtual token per token id that a default tokenizer call produces for the init text
    # (presumably this count includes any special tokens the tokenizer adds - TODO confirm).
    num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]),
    prompt_tuning_init_text=prompt_tuning_init_text,
    tokenizer_name_or_path=model_name_or_path,
)

In [20]:
#from huggingface_hub import login
#login()

In [21]:
from transformers import BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

In [22]:
# creating model
# Load the base model with the 4-bit quantization config, then wrap it with the prompt-tuning config.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=bnb_config , device_map="cuda")
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# NOTE(review): gradient checkpointing is enabled and then disabled on the very next line,
# so the pair looks like a net no-op - confirm which state is intended and drop the other call.
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant":False})
model.gradient_checkpointing_disable()

model.config.use_cache = False
# NOTE(review): the model was already loaded with device_map="cuda" and is moved again in a
# later cell; this call mainly serves to display the module tree as the cell output.
model.to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 39,936 || all params: 3,212,789,760 || trainable%: 0.0012


PeftModelForCausalLM(
  (base_model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 3072)
      (layers): ModuleList(
        (0-27): 28 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
            (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
            (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
            (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
            (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
            (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
          (post_attention_layernorm): Llama

In [23]:
for param in model.base_model.parameters():
    param.requires_grad = False

In [24]:
model

PeftModelForCausalLM(
  (base_model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 3072)
      (layers): ModuleList(
        (0-27): 28 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
            (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
            (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
            (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
            (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
            (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
          (post_attention_layernorm): Llama

In [25]:
model.print_trainable_parameters()

trainable params: 39,936 || all params: 3,212,789,760 || trainable%: 0.0012


In [26]:
model = model.to(device)

In [27]:
# optimizer and lr scheduler
# Hand AdamW only the parameters that actually receive gradients. The base model
# is frozen, so passing `model.parameters()` would register every frozen
# (quantized) weight with the optimizer for no benefit.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=lr, weight_decay=0.1)
# Linear decay to zero with no warmup; the schedule length assumes one
# scheduler step per training batch.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [28]:
#trainable_params = [p for p in model.parameters() if p.requires_grad]
#optimizer = torch.optim.AdamW(trainable_params, lr=lr)

In [29]:
gradient_accumulation_steps=8

In [30]:
# Scheduler (optional simple linear scheduler)
#from torch.optim.lr_scheduler import LambdaLR
#num_update_steps_per_epoch = max(1, len(train_dataloader) // gradient_accumulation_steps)
#total_training_steps = num_epochs * num_update_steps_per_epoch
#scheduler = LambdaLR(optimizer, lambda step: max(0.0, 1.0 - step / max(1, total_training_steps)))

In [31]:
#fp16 = True#
#scaler = torch.cuda.amp.GradScaler(enabled=fp16)

In [32]:
import gc
def free_cuda_cache():
    """Run the garbage collector, then release cached CUDA memory when a GPU is usable.

    The CUDA call is skipped when ``torch.cuda.is_available()`` is false, which
    keeps the helper best-effort on CPU-only machines. Errors raised while
    emptying the cache are no longer swallowed, so real CUDA failures surface.

    Returns:
        None.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

free_cuda_cache()

## Qualitative evaluation on test samples before finetuning

In [33]:
# Generate a label for one test tweet before any training, as a baseline.
model.eval()
i = 33
inputs = tokenizer(f'{text_column} : {dataset["test"][i][text_column]}\nLabel : ', return_tensors="pt")

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=20,
        eos_token_id=tokenizer.eos_token_id,
        # Passed explicitly so generate() does not have to pick a pad token and warn about it.
        pad_token_id=tokenizer.pad_token_id,
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Tweet text : @TommyHilfiger Dramatic shopping exp. ordered 6 jeans same size (30/32) 2 fits / 2 too large / 2 too slim : same brand &gt; different sizing
Label :  Complaint

Reasoning: The tweet is a complaint because it expresses dissatisfaction with the shopping experience and the


## Training and Evaluation loop

In [34]:
# training and evaluation
# Each epoch: one optimization pass over `train_dataloader`, then a forward-only pass over
# `eval_dataloader`, then a printout of mean loss and perplexity for both.
# NOTE(review): `gradient_accumulation_steps` is defined in an earlier cell but is not used
# here; the optimizer steps after every batch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Training forward pass runs under float16 autocast.
        with torch.autocast(dtype=torch.float16, device_type="cuda"):
            outputs = model(**batch)
        loss = outputs.loss
        # Detached copy is accumulated for reporting only.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        # The scheduler advances once per batch.
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        # NOTE(review): unlike the training forward pass, this one is not wrapped in autocast.
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Greedy (argmax) token at every position, decoded back to text.
        # NOTE(review): `eval_preds` is filled here but no visible cell reads it.
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Mean of the per-batch losses; perplexity is exp() of that mean.
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
    #wandb.log({"train": {"perplexity": train_ppl, "loss": train_epoch_loss, "epoch": epoch},
    #           "val": {"perplexity": eval_ppl, "loss": eval_epoch_loss, "epoch": epoch}})


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

epoch=0: train_ppl=tensor(366.6601, device='cuda:0') train_epoch_loss=tensor(5.9044, device='cuda:0') eval_ppl=tensor(215.9265, device='cuda:0') eval_epoch_loss=tensor(5.3749, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

epoch=1: train_ppl=tensor(142.6033, device='cuda:0') train_epoch_loss=tensor(4.9601, device='cuda:0') eval_ppl=tensor(96.9611, device='cuda:0') eval_epoch_loss=tensor(4.5743, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

epoch=2: train_ppl=tensor(73.0468, device='cuda:0') train_epoch_loss=tensor(4.2911, device='cuda:0') eval_ppl=tensor(54.0876, device='cuda:0') eval_epoch_loss=tensor(3.9906, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

epoch=3: train_ppl=tensor(42.8518, device='cuda:0') train_epoch_loss=tensor(3.7577, device='cuda:0') eval_ppl=tensor(33.2497, device='cuda:0') eval_epoch_loss=tensor(3.5040, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

epoch=4: train_ppl=tensor(24.8467, device='cuda:0') train_epoch_loss=tensor(3.2127, device='cuda:0') eval_ppl=tensor(19.8159, device='cuda:0') eval_epoch_loss=tensor(2.9865, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

epoch=5: train_ppl=tensor(13.7717, device='cuda:0') train_epoch_loss=tensor(2.6226, device='cuda:0') eval_ppl=tensor(12.0950, device='cuda:0') eval_epoch_loss=tensor(2.4928, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

epoch=6: train_ppl=tensor(9.4553, device='cuda:0') train_epoch_loss=tensor(2.2466, device='cuda:0') eval_ppl=tensor(7.8945, device='cuda:0') eval_epoch_loss=tensor(2.0662, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

epoch=7: train_ppl=tensor(6.5450, device='cuda:0') train_epoch_loss=tensor(1.8787, device='cuda:0') eval_ppl=tensor(5.8505, device='cuda:0') eval_epoch_loss=tensor(1.7665, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

epoch=8: train_ppl=tensor(5.3045, device='cuda:0') train_epoch_loss=tensor(1.6686, device='cuda:0') eval_ppl=tensor(4.8654, device='cuda:0') eval_epoch_loss=tensor(1.5822, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

epoch=9: train_ppl=tensor(3.9536, device='cuda:0') train_epoch_loss=tensor(1.3746, device='cuda:0') eval_ppl=tensor(4.5345, device='cuda:0') eval_epoch_loss=tensor(1.5117, device='cuda:0')


## Qualitative evaluation on test samples after finetuning

In [35]:
# Generate a label for the same test tweet after training; special tokens are
# kept in the decoded text so the end-of-sequence token is visible.
model.eval()
i = 33
inputs = tokenizer(f'{text_column} : {dataset["test"][i][text_column]}\nLabel : ', return_tensors="pt")
with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=5,
        eos_token_id=tokenizer.eos_token_id,
        # Passed explicitly so generate() does not have to pick a pad token and warn about it.
        pad_token_id=tokenizer.pad_token_id,
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=False)[0])

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


<|begin_of_text|>Tweet text : @TommyHilfiger Dramatic shopping exp. ordered 6 jeans same size (30/32) 2 fits / 2 too large / 2 too slim : same brand &gt; different sizing
Label :  Complaint<|eot_id|>


## Saving the model and optionally pushing it to Hub

You can push model to hub or save model locally.

- Option 1: Push the model to the Hugging Face Hub
```python
model.push_to_hub(
    "llama_prompt_tuning",
    token="hf_...",
)
```
`token` (`bool` or `str`, *optional*):
    The token used for HTTP Bearer authorization when accessing remote files. If `True`, the token generated
    when running `huggingface-cli login` (stored in `~/.huggingface`) is used. Defaults to `True` if `repo_url`
    is not specified.
    You can get your token from https://huggingface.co/settings/tokens

- Option 2: Save the model locally
```python
peft_model_id = "llama_prompt_tuning"
model.save_pretrained(peft_model_id)
```

In [36]:
# saving model
peft_model_id = "llama_prompt_tuning"
model.push_to_hub(peft_model_id, private=True)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/shariq00/llama_prompt_tuning/commit/53f76ee395190a0d8aed5f86e2f6cbf22540c4e1', commit_message='Upload model', commit_description='', oid='53f76ee395190a0d8aed5f86e2f6cbf22540c4e1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/shariq00/llama_prompt_tuning', endpoint='https://huggingface.co', repo_type='model', repo_id='shariq00/llama_prompt_tuning'), pr_revision=None, pr_num=None)

In [37]:
# Save the trained PEFT model and the tokenizer to a local directory.
save_dir = "./llama_prompt_tuning_local"

# this will save only adapters (much smaller, recommended); the base model weights are not
# merged in - NOTE(review): confirm against the files written to `save_dir`.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./llama_prompt_tuning_local\\tokenizer_config.json',
 './llama_prompt_tuning_local\\special_tokens_map.json',
 './llama_prompt_tuning_local\\tokenizer.json')

### Check the size of the checkpoint

![Screenshot 2024-01-01 at 4.06.10 PM.png](attachment:d4e63655-b0af-4d8b-b50f-e42121336414.png)

In [38]:
!nvidia-smi

Fri Sep 19 06:56:02 2025       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 522.06       Driver Version: 522.06       CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   68C    P5    22W /  N/A |   6494MiB /  8192MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Load the PEFT checkpoint and do the qualitative analysis on test samples

In [39]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch

#dataset = load_dataset("ought/raft", "twitter_complaints")
#peft_model_id = "shariq00/llama_prompt_tuning"
#device = "cuda"
#text_column = "Tweet text"
#label_column = "text_label"
#config = PeftConfig.from_pretrained(peft_model_id)
#model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, torch_dtype=torch.float16)
#model = PeftModel.from_pretrained(model, peft_model_id)
#tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

In [40]:
#model.to(device)
# Generate a label for another test tweet with the model currently in memory.
model.eval()
i = 36
inputs = tokenizer(f'{text_column} : {dataset["test"][i][text_column]}\nLabel : ', return_tensors="pt")
# print(dataset["test"][i]["Tweet text"])

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=10,
        eos_token_id=tokenizer.eos_token_id,
        # Passed explicitly so generate() does not have to pick a pad token and warn about it.
        pad_token_id=tokenizer.pad_token_id,
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Tweet text : @virginmedia Instead of spending money on advertising, why not fix the slow speeds in the RG2 area. CLOWNS
Label :  Complain  #advertising #RG2 #


In [41]:
!nvidia-smi

Fri Sep 19 06:56:03 2025       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 522.06       Driver Version: 522.06       CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   71C    P0    39W /  N/A |   6494MiB /  8192MiB |     22%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces