[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tanikina/low-resource-nlp-lab/blob/main/notebooks/PEFT_Tutorial.ipynb)

## 🤗 PEFT
PEFT stands for Parameter-Efficient Fine-Tuning. The [PEFT library](https://github.com/huggingface/peft) supports different adaptation methods for pre-trained language models (PLMs) that fine-tune only a small number of parameters instead of updating all of the model's parameters, which decreases computational and storage costs. For example, prompt tuning, LoRA and IA3 are supported by `PEFT`. It is also integrated with the `Transformers` and `Accelerate` libraries to support distributed training and inference for very big models.

You can find some additional tutorials and examples using PEFT here: [https://huggingface.co/docs/peft/index](https://huggingface.co/docs/peft/index)

In [None]:
!pip install transformers[torch]
!pip install datasets
!pip install peft

In [None]:
import datasets
from transformers import AutoTokenizer

import torch
from torch import nn

import matplotlib.pyplot as plt

In [None]:
# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Task name, pre-trained checkpoint and training hyperparameters
task = "intent_classification"
model_name = "xlm-roberta-base"
batch_size, num_epochs = 16, 20
encode_prev_turn = False

In [None]:
# Combine the speaker information with the input text
def add_speaker(example):
    """Prefix the utterance with its speaker, separated by " - ".

    The ``"text"`` entry of ``example`` is overwritten in place and the same
    mapping is returned, which is the shape ``datasets.Dataset.map`` expects.
    """
    speaker, utterance = example["speaker"], example["text"]
    example["text"] = " - ".join((speaker, utterance))
    return example

In [None]:
# Defining the tokenizer and pre-processing the data
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

data = datasets.load_dataset("DFKI/radr_intents")


def prepare_split(split):
    """Turn one raw dataset split into tokenized model inputs.

    Steps: prepend the speaker to every utterance, tokenize the resulting
    text, rename ``label`` to ``labels`` and drop the raw ``id``, ``speaker``
    and ``text`` columns so that only model inputs remain.
    """
    split = split.map(add_speaker)
    # truncation=True keeps over-long utterances within the tokenizer's maximum
    # length, consistent with the tokenizer call in the evaluation cell.
    split = split.map(lambda samples: tokenizer(samples["text"], truncation=True), batched=True)
    split = split.rename_column("label", "labels")
    return split.remove_columns(["id", "speaker", "text"])


# Preparing the training data
train_task_dataset = prepare_split(data["train"])  # datasets.Dataset.from_csv("radr_intents/train.csv")

# Preparing the validation data
dev_task_dataset = prepare_split(data["validation"])  # datasets.Dataset.from_csv("radr_intents/dev.csv")

# Printing some examples (at most the first three validation samples)
for sample in dev_task_dataset.select(range(min(3, len(dev_task_dataset)))):
    print(sample)
    print(tokenizer.batch_decode([sample["input_ids"][:30]], skip_special_tokens=True))

{'labels': 2, 'input_ids': [0, 345, 27816, 20, 345, 27816, 1256, 23752, 38953, 13, 4, 1439, 108879, 198, 404, 186, 169846, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['UAV - UAV hat Softwareprobleme, wir versuchen es zu beheben.']
{'labels': 2, 'input_ids': [0, 20602, 20, 823, 4, 493, 43254, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
['TL - Ja, verstanden.']
{'labels': 1, 'input_ids': [0, 20602, 20, 40787, 111697, 4, 2964, 4077, 599, 38250, 87523, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['TL - Bitte melden, wenn wieder einsatzbereit.']


### ⚙️ Model Preparation

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PromptEncoderConfig,
    LoraConfig,
)

import evaluate
import torch

# Task name and training hyperparameters for the PEFT run.
# NOTE: task, num_epochs and batch_size were already assigned in the setup cell
# near the top of the notebook; the values assigned here take over from this point.
task = "radr_intents"
num_epochs = 20
lr = 1e-3  # passed to TrainingArguments(learning_rate=...) in the training cell
batch_size = 16

Here we can choose between `PromptEncoderConfig` for P-tuning introduced in [GPT Understands, Too (Liu et al., 2021)](https://www.semanticscholar.org/paper/GPT-Understands%2C-Too-Liu-Zheng/bc37c6bdb8f39929a58b30464f72d6aa46cddc17) and `LoraConfig` for [LoRA: Low-Rank Adaptation of Large Language Models (Hu et al., 2021)](https://www.semanticscholar.org/paper/LoRA%3A-Low-Rank-Adaptation-of-Large-Language-Models-Hu-Shen/a8ca46b171467ceb2d7652fbfb67fe701ad86092). The two cells below are alternatives: both assign `peft_config`, so run only the one you want to use (if you run both, the later `LoraConfig` assignment wins).

In [None]:
# Option 1: P-tuning (prompt encoder) for sequence classification with 30 virtual tokens.
# The LoRA cell below assigns the same `peft_config` name, so whichever of the two cells runs last is used.
peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=30, encoder_hidden_size=128)

In [None]:
# Option 2: LoRA for sequence classification. Adapters of rank 8 (alpha 16)
# are attached to the modules named "key", "query" and "value"; bias terms
# are left untouched.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["key", "query", "value"],
    r=8,
    lora_alpha=16,
    bias="none",
)

In [None]:
# Load the pre-trained encoder with an 8-way classification head and wrap it
# with the PEFT configuration chosen above, then report how many parameters
# will actually be trained.
model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=8, return_dict=True),
    peft_config,
)
model.print_trainable_parameters()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 833,800 || all params: 278,883,600 || trainable%: 0.2989777814113128


In [None]:
# Collator used by the Trainer: pads every batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

### 🚀 Training

In [None]:
# Training configuration: evaluate and checkpoint once per epoch and reload the
# best checkpoint when training finishes.
# NOTE(review): `evaluation_strategy` (and `tokenizer=` on Trainer below) are
# believed to have been renamed in later transformers releases — confirm against
# the installed version, since the install cell does not pin one.
training_args = TrainingArguments(
    output_dir="roberta-base-peft",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # same schedule as evaluation_strategy above
    load_best_model_at_end=True,
    full_determinism=False,
)

# Trainer wiring: PEFT-wrapped model, tokenized train/validation splits and the
# dynamic-padding collator defined earlier.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_task_dataset,
    eval_dataset=dev_task_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning for `num_epochs` epochs; the validation split is evaluated after every epoch.
trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,1.769225
2,No log,1.761213
3,No log,1.809234
4,1.780500,1.827829
5,1.780500,1.549241
6,1.780500,1.346341
7,1.523500,1.10788
8,1.523500,1.147788
9,1.523500,1.077515
10,1.196300,1.099513


TrainOutput(global_step=3280, training_loss=1.2007528537657204, metrics={'train_runtime': 210.9168, 'train_samples_per_second': 247.491, 'train_steps_per_second': 15.551, 'total_flos': 885640102603776.0, 'train_loss': 1.2007528537657204, 'epoch': 20.0})

### ✅ Evaluation

In [None]:
# Mapping from class index to intent name (8 classes, matching num_labels=8).
id2label = {0: 'disconfirm', 1: 'order', 2: 'info_provide', 3: 'info_request', 4: 'call', 5: 'call_response', 6: 'other', 7: 'confirm'}
texts = ["Caller - I would like to order a pizza", "Agent - What toppings would you like?", "Caller - I want pepperoni and mushrooms", "Agent - Would you like to add a drink to your order?"]
inputs = tokenizer(texts, truncation=True, padding="longest", return_tensors="pt")
# Move the inputs to wherever the model's weights live instead of assuming a
# GPU, so this cell also runs on CPU-only machines.
model_device = next(model.parameters()).device
inputs = {k: v.to(model_device) for k, v in inputs.items()}

# Switch off dropout so that predictions are deterministic.
model.eval()
with torch.no_grad():
    outputs = model(**inputs).logits
    print(outputs)
    # Highest-scoring class per text, translated to its intent name.
    labels = [id2label[label] for label in torch.argmax(outputs, dim=-1).tolist()]
    for text, label in zip(texts, labels):
        print(text, ">>>", label)

tensor([[-2.5557, -1.8112, -2.0377, -0.0248,  7.3617,  0.9375, -0.8120, -2.7155],
        [ 0.2110,  1.4479,  3.4419,  3.2218, -5.0600, -3.8530, -0.5526,  0.1037],
        [-0.8891,  1.6071,  1.1785,  5.2296, -3.2375, -3.1483, -0.1494, -1.5106]],
       device='cuda:0')
UAV - UAV für Teamleader >>> call
UGV 2 - Wir haben einen Kanister im Ergeschoss gefunden >>> info_provide
TL - Was ist deine aktuelle Position? >>> info_request


# PEFT Fine-tuning for Agent 1 (Keyword Extraction)
This notebook adds cells to fine-tune a causal LLM (e.g., `gpt2`, `facebook/opt-...`, or `meta-...`) using **PEFT (LoRA)** to perform **Agent 1: Keyword Extraction**.

**Assumptions & dataset:**
- You have a CSV file with two columns: `conversations` and `extracted_keywords`.  
- Each cell in `conversations` is a JSON string containing the conversation thread (as in the project spec).  
- Each cell in `extracted_keywords` is a JSON string representing the target keywords array (the expected output from Agent 1).  
- The model will be trained to output a valid JSON string that matches the target schema.

The following cells:
1. Install required packages.
2. Load and preview the CSV.
3. Prepare prompt/response pairs.
4. Tokenize and create HuggingFace Dataset.
5. Configure PEFT (LoRA) and Trainer.
6. Train and save the fine-tuned model.


In [None]:
# Install required packages (uncomment and run if needed)
# !pip install -q transformers datasets accelerate peft bitsandbytes sentencepiece safetensors
# The install itself is left commented out; this cell only prints a reminder of the dependencies.
print('Install packages if running in a fresh environment: transformers, datasets, accelerate, peft, bitsandbytes')

In [None]:
import pandas as pd
from pathlib import Path
# Location of the training CSV; the cells below read its `conversations` and
# `extracted_keywords` columns.
data_path = Path('/mnt/data/agent1_dataset.csv')  # <-- Replace with your CSV path
# Fail early with a readable message if the file is missing.
assert data_path.exists(), f"Dataset CSV not found at {data_path}. Please upload it."

# Load the whole CSV and preview the first three rows.
df = pd.read_csv(data_path)
print('Total rows:', len(df))
display(df.head(3))


In [None]:
import json

def make_prompt(convo_json_str: str) -> str:
    """Build the instruction prompt for one conversation.

    Args:
        convo_json_str: The conversation thread serialized as a JSON string.

    Returns:
        The instruction text, a blank line, the conversation under a
        "Conversation JSON:" header, and a trailing "Extracted keywords JSON:"
        cue after which the model is expected to emit its answer.
    """
    # Adjacent string literals are concatenated by Python, so the first part only
    # needs a trailing space, and "\n\n" must be real newlines (a doubled
    # backslash here would put a literal backslash into the prompt).
    instructions = (
        "You are Agent 1: a keyword extractor that reads a conversation JSON (string) and returns "
        "a JSON array of objects with fields: id, author, keywords. The output must be a valid JSON string.\n\n"
    )
    return instructions + "Conversation JSON:\n" + convo_json_str + "\n\nExtracted keywords JSON:"

def make_target(keywords_json_str: str) -> str:
    """Normalize the expected keywords into a single-line JSON string.

    Args:
        keywords_json_str: Either a JSON string (parsed and re-serialized) or
            an already-parsed object (serialized as is).

    Returns:
        The JSON text produced by ``json.dumps`` with non-ASCII characters
        kept verbatim and no newlines.

    Raises:
        ValueError: If the input is not valid JSON, or if it contains NaN or
            Infinity (for example a missing CSV cell that pandas loaded as
            NaN), which standard JSON cannot represent.
        TypeError: If an already-parsed object is not JSON-serializable.
    """
    obj = json.loads(keywords_json_str) if isinstance(keywords_json_str, str) else keywords_json_str
    # allow_nan=False refuses NaN/Infinity instead of emitting the non-standard
    # token `NaN`, so a training target is always valid JSON.
    return json.dumps(obj, ensure_ascii=False, allow_nan=False)

# Build examples
import textwrap

# Pair every conversation with its expected keywords. Rows that cannot be
# converted are reported and left out instead of aborting the whole run.
examples = []
for idx, convo, keywords in zip(df.index, df['conversations'], df['extracted_keywords']):
    try:
        record = {'prompt': make_prompt(convo), 'target': make_target(keywords)}
    except Exception as e:
        # skip malformed rows but log
        print(f"Skipping row {idx} due to error: {e}")
    else:
        examples.append(record)

print('Prepared examples:', len(examples))
# Preview the first example (the prompt is shortened to keep the output readable)
print('PROMPT SAMPLE:\n', textwrap.shorten(examples[0]['prompt'], width=400))
print('\nTARGET SAMPLE:\n', examples[0]['target'])

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Checkpoint used for causal-LM fine-tuning.
model_name_or_path = 'gpt2'  # change to a larger, compatible model as needed
# NOTE: this rebinds `tokenizer`, replacing the tokenizer loaded earlier for the classification model.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Padding to a fixed length needs a pad token; register '[PAD]' when the tokenizer has none.
# The model's embedding matrix is resized to match in the model-loading cell.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Create HF dataset
ds = Dataset.from_list(examples)

# Define a simple tokenization mapping: inputs are prompt, labels are prompt + target with labels masking prompt tokens
# Every example is truncated/padded to this many tokens by tokenize_fn.
max_length = 512

def tokenize_fn(example):
    """Tokenize one prompt/target pair for causal-LM fine-tuning.

    The model input is ``prompt + ' ' + target``, truncated and padded to
    ``max_length``. ``labels`` mirrors ``input_ids`` except that prompt
    positions and padding positions are set to -100, so only target tokens
    can contribute to the loss.

    Args:
        example: Mapping with string entries ``'prompt'`` and ``'target'``.

    Returns:
        The tokenizer output for the combined text with an added ``'labels'`` list.
    """
    prompt = example['prompt']
    target = example['target']
    input_text = prompt + ' ' + target
    tokenized = tokenizer(input_text, truncation=True, max_length=max_length, padding='max_length')
    # Measure the prompt without padding: its token count is simply the length
    # of the encoding, with no need to count non-pad ids afterwards.
    prompt_len = len(tokenizer(prompt, truncation=True, max_length=max_length)['input_ids'])
    # Keep a label only where the token is real (attention_mask == 1) and lies
    # past the prompt. Assumes padding is appended on the right — TODO confirm
    # if a left-padding tokenizer is used.
    labels = [
        token_id if (is_real and position >= prompt_len) else -100
        for position, (token_id, is_real) in enumerate(
            zip(tokenized['input_ids'], tokenized['attention_mask'])
        )
    ]
    tokenized['labels'] = labels
    return tokenized

# Tokenize example by example and drop the raw prompt/target columns so only the tokenizer outputs and labels remain.
tokenized_ds = ds.map(tokenize_fn, remove_columns=ds.column_names, batched=False)
# Show which fields the first tokenized example carries.
print(tokenized_ds[0].keys())


In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

# NOTE: this rebinds `model`, replacing the sequence-classification model trained earlier in the notebook.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, return_dict=True)
# Resize token embeddings if we added pad token
model.resize_token_embeddings(len(tokenizer))

# Prepare LoRA config
# target_modules is an explicit list only when the checkpoint name contains 'gpt';
# otherwise None is passed and the choice is left to PEFT.
# NOTE(review): for 'gpt2' the 'q_proj'/'v_proj' entries presumably match no module — confirm via model.named_modules().
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=['c_attn', 'q_proj', 'v_proj'] if 'gpt' in model_name_or_path else None,
    lora_dropout=0.1,
    bias='none',
    task_type=TaskType.CAUSAL_LM
)

# Wrap the base model with the LoRA adapters and report the trainable-parameter count.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training configuration: two epochs, batch size 4 per device, logging every 10 steps,
# at most two checkpoints kept on disk, full-precision (fp16 disabled).
training_args = TrainingArguments(
    output_dir='./agent1_fineturn',
    per_device_train_batch_size=4,
    num_train_epochs=2,
    learning_rate=2e-4,
    warmup_steps=10,
    logging_steps=10,
    save_total_limit=2,
    fp16=False
)

# Define data collator
from transformers import DataCollatorForLanguageModeling, default_data_collator


def data_collator(features):
    """Batch pre-tokenized examples while keeping their own ``labels``.

    ``DataCollatorForLanguageModeling(mlm=False)`` rebuilds ``labels`` from
    ``input_ids``, which would throw away the prompt masking (-100) prepared
    during tokenization. This collator stacks the features unchanged and only
    additionally masks padding positions so they never contribute to the loss.

    Args:
        features: List of tokenized examples, all padded to the same length,
            each with ``input_ids``, ``attention_mask`` and ``labels``.

    Returns:
        Dict of batched tensors suitable for the model's forward pass.
    """
    batch = default_data_collator(features)
    # Padding has attention_mask == 0; -100 is the index ignored by the LM loss.
    batch['labels'][batch['attention_mask'] == 0] = -100
    return batch


In [None]:
# Trainer for the LoRA-wrapped causal LM. No eval_dataset is passed, so this run only trains.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    data_collator=data_collator
)

# Train
trainer.train()
# Save
# The model and the tokenizer go to the same directory, which the inference cell loads from.
# NOTE(review): `model` is PEFT-wrapped, so presumably only the adapter weights are written — confirm.
trainer.save_model('./agent1_finetuned_model')
tokenizer.save_pretrained('./agent1_finetuned_model')


In [None]:
# Example inference using the fine-tuned model
from transformers import pipeline
import torch

# Run on the first GPU when one is available, otherwise on the CPU
# (pipeline convention: device index >= 0 selects a GPU, -1 selects the CPU).
pipeline_device = 0 if torch.cuda.is_available() else -1
pipe = pipeline('text-generation', model='./agent1_finetuned_model', tokenizer=tokenizer, device=pipeline_device)
# Reuse the first training prompt as a smoke test of the saved model.
sample_prompt = examples[0]['prompt']
print('PROMPT:\n', sample_prompt)
# Greedy decoding; max_length bounds prompt plus generated tokens, the same
# 512-token budget used when tokenizing the training examples.
out = pipe(sample_prompt, max_length=512, do_sample=False, num_return_sequences=1)
print('\nGENERATED:\n', out[0]['generated_text'])
