# Project Eval/Training Notebook

## Imports

In [8]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import evaluate
import numpy as np
from transformers import Trainer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
import transformers
import peft

In [5]:
# Use the GPU when PyTorch can see one, otherwise run on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cpu')

## Evaluation

Define some utility functions:

In [10]:
from torch.utils.data import DataLoader
from peft import PeftModel
from tqdm import tqdm
def load_model(model_name, checkpoint, tokenizer, num_labels=3):
    """Load a sequence-classification base model and attach a saved PEFT adapter.

    Args:
        model_name: Name or path passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        checkpoint: Location of the saved PEFT adapter passed to
            ``PeftModel.from_pretrained``.
        tokenizer: Tokenizer whose ``pad_token_id`` is copied into the model
            config.
        num_labels: Number of output classes for the classification head.
            Defaults to 3, the value that used to be hard-coded here.

    Returns:
        The PEFT-wrapped model, moved to the notebook-level ``device`` and
        switched to evaluation mode.
    """
    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )
    # Keep the model's padding id in sync with the tokenizer used for batching.
    model.config.pad_token_id = tokenizer.pad_token_id
    model = PeftModel.from_pretrained(model, checkpoint)
    model = model.to(device)
    # This helper is used for inference only: make evaluation mode explicit so
    # train-time layers such as dropout are disabled on the whole wrapper.
    model.eval()
    return model

def tokenize_function(examples):
    """Tokenize a batch of premise/hypothesis pairs for use with ``Dataset.map``.

    Args:
        examples: Batch mapping with "premise" and "hypothesis" entries.

    Returns:
        The tokenizer output for the sentence pairs, truncated to at most
        150 tokens and padded to the longest sequence in the batch.
    """
    # Return plain (non-tensor) features: the result is handed to Dataset.map,
    # and the evaluation loop moves each collated batch to the device itself,
    # so building tensors on `device` here was an unnecessary transfer.
    return tokenizer(
        examples["premise"],
        examples["hypothesis"],
        truncation=True,
        max_length=150,
        padding=True,
    )

Load the model

In [6]:
model_name = "distilgpt2"
checkpoint = "checkpoint-11446"  # assumes the Google Drive folder was downloaded into this directory
file_name = None

# Model names containing one of these substrings are padded on the left,
# everything else on the right.
padding_side = "left" if any(k in model_name for k in ("gpt", "opt", "bloom")) else "right"
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name,
    padding_side=padding_side,
)
# Fall back to the EOS token id when the tokenizer defines no padding token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
model = load_model(model_name, checkpoint, tokenizer)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Build a one-example toy dataset to sanity-check the output of the loaded model
dataset_orig = {
    "premise": ["The cat is sitting on a mat"],
    "hypothesis": ["The cat is sitting down"],
}
dataset = Dataset.from_dict(dataset_orig)
# Tokenize in batches and drop the raw text columns afterwards
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["premise", "hypothesis"],
)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [30]:
# Number of correct predictions (only updated when the batches carry labels)
correct = torch.tensor(0, device=device)
# Collate one example per batch, padding each batch to its longest sequence
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")
eval_dataloader = DataLoader(tokenized_datasets, batch_size=1, collate_fn=data_collator)
# Predicted class ids, in dataset order
ret = []
total_len = 0
# Inference only: switch to evaluation mode and skip autograd bookkeeping so
# no computation graph is built for each forward pass.
model.eval()
with torch.no_grad():
    for d in tqdm(eval_dataloader, total=len(eval_dataloader)):
        f = d.to(device)
        output = model(**f)
        logits = output.logits
        argmax = torch.argmax(logits, dim=1)
        # If the labels are given, count the correct predictions
        if "labels" in f:
            correct += torch.sum(f["labels"] == argmax)
        total_len += len(argmax)
        # .tolist() yields plain Python ints, which are used as lab_map keys below
        ret += argmax.tolist()

lab_map = {
    0: "Entailment", 1: "Neutral", 2: "Contradiction"
}
for i in range(total_len):
    print(f"For premise '{dataset_orig['premise'][i]}' and hypothesis '{dataset_orig['hypothesis'][i]}', the predicted result is '{lab_map[ret[i]]}'.")

100%|██████████| 1/1 [00:00<00:00, 13.29it/s]

For premise 'The cat is sitting on a mat' and hypothesis 'The cat is sitting down', the predicted result is 'Entailment'.





## Training

This is a simplified version of the training code; it is mostly used for testing the training pipeline piece by piece.

In [31]:
# Load the SNLI dataset; its "train" and "test" splits are accessed by name below.
dataset = load_dataset(path="snli")

In [32]:
model_name = "distilgpt2"
# Left-pad when the model name contains one of these substrings, right-pad otherwise.
padding_side = "left" if any(k in model_name for k in ("gpt", "opt", "bloom")) else "right"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side=padding_side,
)

In [33]:
metric = evaluate.load("glue", "mnli")
def compute_metrics(eval_pred):
    """Score a batch of model outputs with the loaded GLUE/MNLI metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is taken
            as the argmax of ``predictions`` along axis 1.

    Returns:
        Whatever ``metric.compute`` returns for the predicted classes and labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [34]:
SEED = 42  # fixed seed so the sampled subsets are identical on every run
train_percent = 0.1  # how much of the training data to train on in a full-size run?
# Use a small subset of the training/testing data. For a full-size run, use
# int(len(dataset["train"]) * train_percent) for `sz` instead.
sz = 200
test_sz = 100
# Drop examples whose label is -1 (presumably unlabeled pairs) before sampling
dataset = dataset.filter(lambda x: x["label"] != -1)
# One seeded generator drives both draws, so the subsets are reproducible
rng = np.random.default_rng(SEED)
indices = rng.choice(len(dataset["train"]), size=sz, replace=False)
dataset["train"] = dataset["train"].select(indices)
indices2 = rng.choice(len(dataset["test"]), size=test_sz, replace=False)
dataset["test"] = dataset["test"].select(indices2)
print(dataset["train"], dataset["test"])

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 200
}) Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 100
})


In [35]:
# Fall back to the EOS token id when the tokenizer defines no padding token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
def tokenize_function(examples):
    """Tokenize a batch of premise/hypothesis pairs for use with ``Dataset.map``.

    Args:
        examples: Batch mapping with "premise" and "hypothesis" entries.

    Returns:
        The tokenizer output for the sentence pairs, truncated and padded to
        the longest sequence in the batch.
    """
    # max_length=None => use the model max length (it's actually the default)
    # Plain (non-tensor) features are returned: the result is handed to
    # Dataset.map, so creating tensors on `device` here was an unnecessary
    # transfer. Batches are assembled later by the data collator.
    return tokenizer(
        examples["premise"],
        examples["hypothesis"],
        truncation=True,
        max_length=None,
        padding=True,
    )
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["premise", "hypothesis"],
)

# Rename the target column from "label" to "labels"
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [37]:
# Base classifier with a 3-class head, placed on the target device
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)
model = model.to(device)
model.config.pad_token_id = tokenizer.pad_token_id

# Prefix-tuning setup; prefix_projection can be turned on for a better result
prefix_tuning_config = peft.PrefixTuningConfig(
    task_type="SEQ_CLS",
    peft_type="PREFIX_TUNING",
    inference_mode=False,
    num_virtual_tokens=5,
    prefix_projection=False,
)
model = peft.get_peft_model(model, prefix_tuning_config)
model.print_trainable_parameters()
model = model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 48,384 || all params: 81,963,264 || trainable%: 0.05903132408196921


In [38]:
# Pad every batch to the length of its longest sequence at collation time
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest",
)

In [39]:
training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir="output_models/gpt2-snli-finetune",
    # Optimisation settings
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and save once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Logging and progress reporting
    logging_strategy="steps",
    log_level="info",
    disable_tqdm=False,
)

In [None]:
# Wire the model, data, collator and metric function into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
***** Running training *****
  Num examples = 200
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 14
  Number of trainable parameters = 48,384


Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 100
  Batch size = 32
Saving model checkpoint to output_models/gpt2-snli-finetune/checkpoint-7
loading configuration file config.json from cache at /home/vboxuser/.cache/huggingface/hub/models--distilgpt2/snapshots/2290a62682d06624634c1f46a6ad5be0f47f38aa/config.json
Model config GPT2Config {
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.