# **HF - Prefix tuning for conditional generation**

https://huggingface.co/docs/peft/main/en/task_guides/seq2seq-prefix-tuning

https://github.com/huggingface/peft

https://github.com/huggingface/peft/tree/main/examples

https://github.com/huggingface/peft/tree/main/examples/causal_language_modeling

In [1]:
!pip install -q peft transformers datasets

**Setup**

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, TaskType
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
import os

# Optional environment overrides, left disabled; uncomment if needed.
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so that
# the later `.to(device)` calls do not fail on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name_or_path = "t5-large"  # checkpoint name for the base seq2seq model
tokenizer_name_or_path = "t5-large"  # checkpoint name for the tokenizer

text_column = "sentence"  # dataset column that holds the input text
label_column = "text_label"  # dataset column that holds the label as a string
max_length = 128  # padded/truncated token length of the input sentences
lr = 1e-2  # learning rate handed to the optimizer
num_epochs = 5  # number of passes over the training set
batch_size = 8  # batch size for both data loaders

**Load dataset**

This notebook uses the `sentences_allagree` subset of the `financial_phrasebank` dataset.
The dataset contains financial news sentences, each labeled with a sentiment: negative, neutral, or positive.

In [3]:
from datasets import load_dataset

# Load the dataset and split its "train" split 90/10. The fixed seed makes the
# split (and therefore the evaluation numbers further down) repeatable across runs.
dataset = load_dataset("financial_phrasebank", "sentences_allagree")
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
# Rename the held-out split from "test" to "validation".
dataset["validation"] = dataset.pop("test")

# Add a "text_label" column holding the class name that corresponds to each
# integer "label" value.
classes = dataset["train"].features["label"].names
dataset = dataset.map(
    lambda batch: {"text_label": [classes[label] for label in batch["label"]]},
    batched=True,
    num_proc=1,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/2037 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

In [4]:
# Display the first training example to check the added "text_label" column.
dataset["train"][0]

{'sentence': "Outotec 's net profit for the second quarter of 2007 jumped to 16.8 mln euro ( $ 23.1 mln ) from 4.6 mln euro ( $ 6.3 mln ) a year ago .",
 'label': 2,
 'text_label': 'positive'}

**Preprocess dataset**

In [5]:
# Load the tokenizer from the dedicated `tokenizer_name_or_path` setting (it has
# the same value as `model_name_or_path` in the configuration cell), so that
# changing the tokenizer setting actually takes effect.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)


def preprocess_function(examples, target_max_length=2):
    """Tokenize a batch of sentences and their text labels for seq2seq training.

    Args:
        examples: Batch mapping (as passed by ``dataset.map(..., batched=True)``)
            that contains the ``text_column`` and ``label_column`` entries.
        target_max_length: Token length that the label strings are padded or
            truncated to. Defaults to 2, the value that used to be hard-coded.

    Returns:
        The tokenizer output for the input sentences (padded/truncated to the
        notebook-level ``max_length``) with an added ``"labels"`` entry holding
        the label token ids, in which every pad-token id is replaced by -100.
    """
    inputs = examples[text_column]
    targets = examples[label_column]
    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    label_encoding = tokenizer(
        targets, max_length=target_max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    labels = label_encoding["input_ids"]
    # Overwrite padding positions with -100 (presumably the index the loss
    # ignores; the loss itself is computed inside the model, not in this file).
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs

In [6]:
# Tokenize every split in batches. The original columns of the training split
# are dropped, and cached results are not reused.
raw_column_names = dataset["train"].column_names
processed_datasets = dataset.map(
    preprocess_function,
    desc="Running tokenizer on dataset",
    remove_columns=raw_column_names,
    load_from_cache_file=False,
    batched=True,
    num_proc=1,
)

Running tokenizer on dataset:   0%|          | 0/2037 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/227 [00:00<?, ? examples/s]

In [7]:
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]

# Settings shared by both loaders.
loader_options = dict(collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

# Only the training loader requests shuffling. No shuffle argument is passed for
# the evaluation loader; the accuracy cell later pairs its predictions with
# dataset["validation"] by position.
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
eval_dataloader = DataLoader(eval_dataset, **loader_options)

**Train model**

In [8]:
# Prefix-tuning configuration for a seq2seq LM: 20 virtual tokens, not in inference mode.
peft_config = PrefixTuningConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    num_virtual_tokens=20,
)

# Load the base checkpoint and wrap it with the PEFT configuration in one step,
# then report the trainable parameter count.
model = get_peft_model(AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path), peft_config)
model.print_trainable_parameters()

trainable params: 983,040 || all params: 738,651,136 || trainable%: 0.13308583065659835


In [15]:
# Display the PEFT configuration attached to the model.
# NOTE(review): this cell's execution count (15) is out of order, and the saved
# output shows inference_mode=True although the config above is created with
# inference_mode=False. The output was presumably captured after the inference
# section rebound `model`; re-run the notebook top to bottom to confirm.
model.peft_config

{'default': PrefixTuningConfig(peft_type=<PeftType.PREFIX_TUNING: 'PREFIX_TUNING'>, auto_mapping=None, base_model_name_or_path='t5-large', revision=None, task_type='SEQ_2_SEQ_LM', inference_mode=True, num_virtual_tokens=20, token_dim=1024, num_transformer_submodules=2, num_attention_heads=16, num_layers=24, encoder_hidden_size=1024, prefix_projection=False)}

In [9]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# The training loop steps the scheduler once per batch, so the linear schedule
# (no warmup) spans every batch of every epoch.
total_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [10]:
model = model.to(device)

for epoch in range(num_epochs):
    # ---- training pass: one optimizer and scheduler step per batch ----
    model.train()
    running_train_loss = 0
    for train_batch in tqdm(train_dataloader):
        train_batch = {name: tensor.to(device) for name, tensor in train_batch.items()}
        train_loss = model(**train_batch).loss
        running_train_loss += train_loss.detach().float()
        train_loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- evaluation pass: accumulate the loss and collect decoded predictions ----
    model.eval()
    running_eval_loss = 0
    eval_preds = []  # rebuilt every epoch; the accuracy cell reads the last epoch's list
    with torch.no_grad():
        for eval_batch in tqdm(eval_dataloader):
            eval_batch = {name: tensor.to(device) for name, tensor in eval_batch.items()}
            outputs = model(**eval_batch)
            running_eval_loss += outputs.loss.detach().float()
            # Predictions are the argmax over the forward-pass logits (not `generate`).
            predicted_ids = outputs.logits.argmax(dim=-1).detach().cpu().numpy()
            eval_preds.extend(tokenizer.batch_decode(predicted_ids, skip_special_tokens=True))

    # ---- per-epoch summary: mean loss per batch and its exponential ----
    eval_epoch_loss = running_eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = running_train_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 255/255 [01:11<00:00,  3.58it/s]
100%|██████████| 29/29 [00:06<00:00,  4.60it/s]


epoch=0: train_ppl=tensor(5.6424, device='cuda:0') train_epoch_loss=tensor(1.7303, device='cuda:0') eval_ppl=tensor(1.1503, device='cuda:0') eval_epoch_loss=tensor(0.1401, device='cuda:0')


100%|██████████| 255/255 [01:09<00:00,  3.68it/s]
100%|██████████| 29/29 [00:06<00:00,  4.52it/s]


epoch=1: train_ppl=tensor(1.1198, device='cuda:0') train_epoch_loss=tensor(0.1131, device='cuda:0') eval_ppl=tensor(1.1079, device='cuda:0') eval_epoch_loss=tensor(0.1025, device='cuda:0')


100%|██████████| 255/255 [01:06<00:00,  3.84it/s]
100%|██████████| 29/29 [00:06<00:00,  4.48it/s]


epoch=2: train_ppl=tensor(1.0965, device='cuda:0') train_epoch_loss=tensor(0.0921, device='cuda:0') eval_ppl=tensor(1.0942, device='cuda:0') eval_epoch_loss=tensor(0.0900, device='cuda:0')


100%|██████████| 255/255 [01:07<00:00,  3.79it/s]
100%|██████████| 29/29 [00:06<00:00,  4.48it/s]


epoch=3: train_ppl=tensor(1.0758, device='cuda:0') train_epoch_loss=tensor(0.0731, device='cuda:0') eval_ppl=tensor(1.1188, device='cuda:0') eval_epoch_loss=tensor(0.1122, device='cuda:0')


100%|██████████| 255/255 [01:06<00:00,  3.82it/s]
100%|██████████| 29/29 [00:06<00:00,  4.50it/s]

epoch=4: train_ppl=tensor(1.0669, device='cuda:0') train_epoch_loss=tensor(0.0648, device='cuda:0') eval_ppl=tensor(1.1007, device='cuda:0') eval_epoch_loss=tensor(0.0960, device='cuda:0')





In [11]:
# Compare the predictions with the validation labels position by position,
# ignoring surrounding whitespace.
reference_labels = dataset["validation"]["text_label"]
matches = [pred.strip() == true.strip() for pred, true in zip(eval_preds, reference_labels)]
correct = sum(matches)
total = len(matches)
accuracy = correct / total * 100
print(f"{accuracy=} % on the evaluation dataset")
print(f"{eval_preds[:10]=}")
print(f"{dataset['validation']['text_label'][:10]=}")

accuracy=95.15418502202643 % on the evaluation dataset
eval_preds[:10]=['neutral', 'neutral', 'neutral', 'negative', 'neutral', 'negative', 'neutral', 'neutral', 'negative', 'positive']
dataset['validation']['text_label'][:10]=['neutral', 'neutral', 'neutral', 'negative', 'neutral', 'negative', 'neutral', 'neutral', 'negative', 'positive']


**Inference**

In [12]:
from peft import PeftModel, PeftConfig

peft_model_id = "stevhliu/t5-large_PREFIX_TUNING_SEQ2SEQ"

# Read the adapter's configuration to find out which base checkpoint it expects.
config = PeftConfig.from_pretrained(peft_model_id)

# Load that base checkpoint and attach the adapter weights. This rebinds `model`,
# replacing the model used in the training section.
model = PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path),
    peft_model_id,
)

adapter_config.json:   0%|          | 0.00/326 [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/3.93M [00:00<?, ?B/s]

In [13]:
# Sample sentence to classify, tokenized into PyTorch tensors.
sample_sentence = "The Lithuanian beer market made up 14.41 million liters in January , a rise of 0.8 percent from the year-earlier figure , the Lithuanian Brewers ' Association reporting citing the results from its members ."
inputs = tokenizer(sample_sentence, return_tensors="pt")

In [14]:
# Move the model and the tokenized sample to the target device.
model.to(device)
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Generate at most 10 new tokens without tracking gradients.
with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)

decoded_predictions = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
print(decoded_predictions)

['positive']
