<a href="https://colab.research.google.com/gist/SauravMaheshkar/f01daa85639d23e817b6450e1d0eb45f/p-tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 📦 Packages and Basic Setup
---

In [None]:
%%capture
!pip install -q peft transformers datasets evaluate wandb ml-collections

In [None]:
import os
from google.colab import userdata

# Fetch the secret stored under the name "W&B" in Colab's user data and expose
# it through the WANDB_API_KEY environment variable.
wandb_api_key = userdata.get("W&B")
os.environ["WANDB_API_KEY"] = wandb_api_key

In [None]:
# @title ⚙️ Configuration

import ml_collections


def get_config() -> ml_collections.ConfigDict:
    """Build the experiment configuration.

    Each field carries a Colab ``# @param`` form annotation so it can be edited
    from the notebook's form UI.

    Returns:
        A ``ConfigDict`` holding the model name, task, batch size, number of
        epochs, learning rate, dataset name and W&B entity.
    """
    config = ml_collections.ConfigDict()
    config.model: str = "roberta-base"  # @param {type: "string"}
    config.task: str = "mrpc"  # @param {type: "string"}
    config.batch_size: int = 128  # @param {type: "number"}
    config.num_epochs: int = 10  # @param {type: "number"}
    # The learning rate is a float (1e-3), not an int.
    config.learning_rate: float = 1e-3  # @param {type: "number"}
    config.dataset: str = "glue"  # @param {type: "string"}
    config.wandb_entity: str = "sauravmaheshkar"  # @param {type: "string"}

    return config


# Notebook-wide configuration object; the cells that follow read their
# settings (model, task, dataset, hyperparameters, W&B entity) from it.
config = get_config()

In [None]:
import wandb

# Start the W&B run and attach the experiment configuration to it.
wandb.init(
    project="softprompts",
    entity=config.wandb_entity,
    job_type="train",
    group="p-tuning",
    config=config.to_dict(),
)

# Expose only the first GPU to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Settings for the transformers W&B integration used by the Trainer.
os.environ["WANDB_WATCH"] = "false"
# "end" uploads the model once training finishes. It replaces the deprecated
# "true" value, which recent transformers releases no longer accept.
os.environ["WANDB_LOG_MODEL"] = "end"

## 💿 The Dataset
---

In [None]:
from datasets import load_dataset

# Load the configured dataset (config.dataset) with the configured task
# (config.task) as its configuration name.
dataset = load_dataset(path=config.dataset, name=config.task)

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Models whose name contains "gpt", "opt" or "bloom" are padded on the left;
# every other model is padded on the right.
padding_side = (
    "left"
    if any(family in config.model for family in ("gpt", "opt", "bloom"))
    else "right"
)

tokenizer = AutoTokenizer.from_pretrained(config.model, padding_side=padding_side)

# Fall back to the end-of-sequence token when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id


def tokenize_function(examples):
    """Tokenize the ``sentence1``/``sentence2`` pairs found in ``examples``.

    Truncation is enabled. ``max_length=None`` is passed explicitly so the
    limit falls back to the model's maximum length (which is the default).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the sentence pairs.
    """
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        max_length=None,
    )


# Tokenize in batches, drop the raw text/index columns, then rename the
# "label" column to "labels".
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence1", "sentence2"],
).rename_column("label", "labels")

# Collator that pads each batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

## ✍️ Model Architecture & Training
---

In [None]:
from peft import PromptEncoderConfig

# Prompt-encoder (p-tuning) configuration for a sequence-classification task.
peft_config = PromptEncoderConfig(
    task_type="SEQ_CLS",
    num_virtual_tokens=20,
    encoder_hidden_size=128,
)

In [None]:
from peft import get_peft_model
from transformers import AutoModelForSequenceClassification

# Load the pretrained sequence-classification model named in the config and
# wrap it with the PEFT configuration in a single expression.
model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained(
        config.model,
        return_dict=True,
    ),
    peft_config,
)

In [None]:
import evaluate
import numpy as np

# Load the evaluation metric that matches the configured dataset and task.
metric = evaluate.load(path=config.dataset, config_name=config.task)


def compute_metrics(eval_pred):
    """Score a ``(predictions, labels)`` pair with the module-level ``metric``.

    The predictions are reduced with an argmax along axis 1 before being
    compared against the labels.

    Returns:
        The result of ``metric.compute`` for the reduced predictions.
    """
    scores, references = eval_pred
    predicted_labels = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_labels, references=references)

In [None]:
from transformers import TrainingArguments

import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The install in this notebook is not pinned, so pick whichever keyword the
# installed TrainingArguments accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Evaluate and save once per epoch so the best checkpoint can be restored when
# training ends; metrics are reported to W&B.
training_args = TrainingArguments(
    output_dir=f"{config.model}-peft-p-tuning",
    learning_rate=config.learning_rate,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    num_train_epochs=config.num_epochs,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=["wandb"],
    **{_eval_strategy_key: "epoch"},
)

In [None]:
from transformers import Trainer

import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Use whichever the installed Trainer
# accepts.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # Evaluate (and pick the best checkpoint) on the validation split, so the
    # test split is not used for model selection.
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

# Run training and keep the returned training output for logging.
train_results = trainer.train()

In [None]:
# Store the value returned by trainer.train() on the run's config under the
# key "train_results", then close the W&B run.
wandb.config["train_results"] = train_results
wandb.finish()