# Lightweight Fine-Tuning Project

This project is to classify emotions using a foundation model (GPT2). The purpose is to compare before and after light weight fine-tuning, how the model performances. 


here are the choices for PEFT techique, foundation model used, evaluation approach and the dataset for fine-tuning:

* PEFT technique: Lora techique
* Model: GPT-2
* Evaluation approach: Classification evaluation approaches such as accuracy, confusion matrix, auc, f1 score, precision-recall curve
* Fine-tuning dataset: zeroshot/twitter-financial-news-sentiment

In [2]:
# !pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting joblib>=1.2.0
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.5.2 threadpoolctl-3.5.0


In [3]:
# imports modules

import numpy as np
import pandas as pd
import torch
import os

from datasets import load_dataset
from collections import Counter

from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.metrics import recall_score, f1_score

from peft import PeftModel, LoraConfig, TaskType
from peft import get_peft_model
from peft import AutoPeftModelForSequenceClassification


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Helper function

# Evaluate the predictions 
def classification_scores(model_name, y_true, y_pred):
    
    accuracy = balanced_accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    
    return pd.DataFrame({'Balanced Accuracy': np.round(accuracy, 3), 
                         'Precision': np.round(precision, 3), 
                         'Recall': np.round(recall, 3),
                         'F1': np.round(f1, 3)}, 
                        index=[model_name])

## Load the dataset zeroshot/twitter-financial-news-sentiment from datasets 



In [5]:
dataset_name = "zeroshot/twitter-financial-news-sentiment"

dataset = load_dataset(dataset_name)
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9543
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2388
    })
})


In [6]:
# number of labels
counts = Counter(dataset["train"]["label"])
sorted_counts = sorted(counts.items())
print(sorted_counts)

[(0, 1442), (1, 1923), (2, 6178)]


In [7]:
for entry in dataset["train"].select(range(6)):
    text = entry["text"]
    label = entry["label"]
    print(f"label={label}, text={text}")

label=0, text=$BYND - JPMorgan reels in expectations on Beyond Meat https://t.co/bd0xbFGjkT
label=0, text=$CCL $RCL - Nomura points to bookings weakness at Carnival and Royal Caribbean https://t.co/yGjpT2ReD3
label=0, text=$CX - Cemex cut at Credit Suisse, J.P. Morgan on weak building outlook https://t.co/KN1g4AWFIb
label=0, text=$ESS: BTIG Research cuts to Neutral https://t.co/MCyfTsXc2N
label=0, text=$FNKO - Funko slides after Piper Jaffray PT cut https://t.co/z37IJmCQzB
label=0, text=$FTI - TechnipFMC downgraded at Berenberg but called Top Pick at Deutsche Bank https://t.co/XKcPDilIuU


## load GPT-2 tokenizer and tokenize the dataset


In [8]:
# load GPT-2 and tokenizer and evaluate on the test set

# model name
model_name = 'gpt2'
number_labels = 3

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"],padding="max_length", truncation=True)


def tokenize_datasets():
    
    tokenizer.pad_token = tokenizer.eos_token

    tokenized_dataset = {}
    splits = ["train", "validation"]

    for split in splits:
        tokenized_dataset[split] = dataset[split].map(tokenize_function, batched=True)

    return tokenized_dataset


tokenized_dataset = tokenize_datasets()

print(tokenized_dataset)

{'train': Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 9543
}), 'validation': Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2388
})}




## Load and Setup the model, Train the classification head

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [10]:

# Define a compute metrics function using scikit-learn
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }



def train_model(model_name, number_labels,tokenized_dataset, requires_grad = False,  train=False, lora=False):
    
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, 
        num_labels=number_labels,
        id2label={0: "Bearish", 1: "Bullish", 2: "Neutral"},
        label2id={"Bearish": 0, "Bullish": 1, "Neutral": 2 }
    )

    # Freeze all the parameter of the base model
    for param in model.base_model.parameters():
        param.requires_grad = requires_grad

    # Ensure the model's config recognizes the padding token
    model.config.pad_token_id = tokenizer.pad_token_id
    # Use the HuggingFace Trainer class to handle the training and eval loop 

    # Check if MPS is available and use it if possible
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    model.to(device)

    if lora:
        # Define LoRA configuration without the dropout parameter
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=8,  # Low-rank dimension
            lora_alpha=16,  # Scaling factor
            lora_dropout=0.1,
            target_modules=["attn.c_attn", "attn.c_proj"],  # Adjust the target modules to match GPT-2's architecture
            inference_mode=False
        )

        # Create PEFT model with LoRA


        peft_model = get_peft_model(model, peft_config)

        peft_model.print_trainable_parameters()

        model = peft_model


    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./output",
            learning_rate=2e-3,
            per_device_train_batch_size=4,
            per_gpu_eval_batch_size=4,
            num_train_epochs=1,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
        ),
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

    if train:
        print(trainer.train())   

    print(trainer.evaluate())
    
    if lora:
        # Save the PEFT weights and tokenizer

        model.save_pretrained("gpt2-lora")
        tokenizer.save_pretrained("gpt2-lora-tokenizer")



    predictions = trainer.predict(tokenized_dataset["validation"])

    return predictions

def classification_performance(model_name, tokenized_dataset, predictions):

    df = pd.DataFrame(tokenized_dataset["validation"])

    df = df[["text", "label"]]

    # add model predictions to the dataframe

    df["predicted_label"] = np.argmax(predictions[0], axis=1)

    return classification_scores(model_name, y_true=df["label"], y_pred=df["predicted_label"])




## Performance without training

In [13]:

predictions_wo_train = train_model(model_name, number_labels,tokenized_dataset,  requires_grad = False, train=False, lora=False)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'eval_loss': 12.424674034118652, 'eval_accuracy': 0.14530988274706869, 'eval_precision': 0.18159183102531826, 'eval_recall': 0.33183781681578894, 'eval_f1': 0.08509780056565543, 'eval_runtime': 219.5365, 'eval_samples_per_second': 10.877, 'eval_steps_per_second': 2.719}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


In [14]:
# view the results

classification_performance('gpt-2-wo-train', tokenized_dataset, predictions_wo_train)    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Balanced Accuracy,Precision,Recall,F1
gpt-2-wo-train,0.332,0.182,0.332,0.085


## Performance with only training the classification head

In [15]:
predictions_cls_head = train_model(model_name, number_labels,tokenized_dataset,  requires_grad = False, train=True, lora=False)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7816,0.696731,0.724037,0.635429,0.546451,0.571216


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


TrainOutput(global_step=2386, training_loss=0.8927352278550332, metrics={'train_runtime': 1207.2106, 'train_samples_per_second': 7.905, 'train_steps_per_second': 1.976, 'total_flos': 4987154811322368.0, 'train_loss': 0.8927352278550332, 'epoch': 1.0})


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'eval_loss': 0.6967308521270752, 'eval_accuracy': 0.724036850921273, 'eval_precision': 0.63542882758808, 'eval_recall': 0.5464511188350301, 'eval_f1': 0.5712162022952477, 'eval_runtime': 227.5547, 'eval_samples_per_second': 10.494, 'eval_steps_per_second': 2.624, 'epoch': 1.0}


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


In [17]:
classification_performance('gpt2-classification-head', tokenized_dataset, predictions_cls_head)  

Unnamed: 0,Balanced Accuracy,Precision,Recall,F1
gpt2-classification-head,0.546,0.635,0.546,0.571


## Fine-tuning the gpt2 model


In [12]:
predictions_fine_tune = train_model(model_name, number_labels,tokenized_dataset,  requires_grad = True, train=True, lora=False)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7289,0.692716,0.729899,0.43483,0.524523,0.474179


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Checkpoint destination directory ./output/checkpoint-2386 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


TrainOutput(global_step=2386, training_loss=0.8360278120784328, metrics={'train_runtime': 2748.8817, 'train_samples_per_second': 3.472, 'train_steps_per_second': 0.868, 'total_flos': 4987154811322368.0, 'train_loss': 0.8360278120784328, 'epoch': 1.0})


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'eval_loss': 0.6927160620689392, 'eval_accuracy': 0.7298994974874372, 'eval_precision': 0.43482956810582357, 'eval_recall': 0.5245228652730165, 'eval_f1': 0.4741786105363874, 'eval_runtime': 224.3259, 'eval_samples_per_second': 10.645, 'eval_steps_per_second': 2.661, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


In [13]:
classification_performance('gpt2-fine-tune', tokenized_dataset, predictions_fine_tune)  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Balanced Accuracy,Precision,Recall,F1
gpt2-fine-tune,0.525,0.435,0.525,0.474


## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [10]:
predictions_peft = train_model(model_name, number_labels,tokenized_dataset,  requires_grad = False, train=True, lora=True)



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 446,976 || all params: 124,886,784 || trainable%: 0.3579049645477299


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.899,0.85337,0.655779,0.218593,0.333333,0.264036


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Checkpoint destination directory ./output/checkpoint-2386 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


TrainOutput(global_step=2386, training_loss=0.9605849746486667, metrics={'train_runtime': 2369.9788, 'train_samples_per_second': 4.027, 'train_steps_per_second': 1.007, 'total_flos': 5013226905403392.0, 'train_loss': 0.9605849746486667, 'epoch': 1.0})


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'eval_loss': 0.8533703088760376, 'eval_accuracy': 0.6557788944723618, 'eval_precision': 0.2185929648241206, 'eval_recall': 0.3333333333333333, 'eval_f1': 0.26403641881638845, 'eval_runtime': 235.1204, 'eval_samples_per_second': 10.156, 'eval_steps_per_second': 2.539, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


In [11]:
classification_performance('gpt2-peft', tokenized_dataset, predictions_peft) 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Balanced Accuracy,Precision,Recall,F1
gpt2-peft,0.333,0.219,0.333,0.264


In [119]:
# Do I need to save the config?
# config = peft_model.config
# config.save_pretrained('gpt2-lora-config')

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [11]:


peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,  # Low-rank dimension
    lora_alpha=16,  # Scaling factor
    lora_dropout=0.1,
    target_modules=["attn.c_attn", "attn.c_proj"],  # Adjust the target modules to match GPT-2's architecture
    inference_mode=False
)

# lora_model = AutoPeftModelForSequenceClassification.from_pretrained(model_name, config=peft_config, num_labels=number_labels)

tokenizer_lora = AutoTokenizer.from_pretrained("gpt2-lora-tokenizer")




In [12]:
# Loading PEFT pretrained model
peft_model_id = 'gpt2-lora'
print("Loading PEFT pretrained model...")
# config = PeftConfig.from_pretrained(peft_model_id)
inference_model = AutoPeftModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=peft_model_id,
    config=peft_config,
    num_labels=number_labels
    )
# these are for GPT model: since we have custom padding token we need to initialise it for the model
# resize model embedding to match new tokenizer
inference_model.resize_token_embeddings(len(tokenizer_lora))
# fix model padding token id
inference_model.config.pad_token_id = inference_model.config.eos_token_id

inference_model.to(device)
inference_model.eval()
print("Finised loading PEFT pretrained model!")

Loading PEFT pretrained model...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Finised loading PEFT pretrained model!


In [13]:
input_text = "stock is gonna be high"

tokenizer_lora.pad_token = tokenizer_lora.eos_token

inputs = tokenizer_lora(input_text, return_tensors='pt', padding=True, truncation=True, max_length=512)



print(inputs)

{'input_ids': tensor([[13578,   318,  8066,   307,  1029]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


In [14]:
inputs = {key: value.to(device) for key, value in inputs.items()}

In [15]:
print(inputs)

{'input_ids': tensor([[13578,   318,  8066,   307,  1029]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


In [16]:
 # Set the model to evaluation mode to disable dropout, batch norm, etc.
inference_model.eval()


with torch.no_grad():
    outputs = inference_model(**inputs)

# Predict labels by taking the argmax over the logits
pred_labels = torch.argmax(torch.nn.functional.softmax(outputs.logits, dim=-1), dim=-1, keepdim=False)

# # Append true and predicted labels to their respective tensors
# true_labels = torch.cat((true_labels, batch['labels'].detach().cpu()))
# predicted_labels = torch.cat((predicted_labels, pred_labels.detach().cpu()))

# # Store true and predicted labels in a dictionary and return
# outputs = dict()
# outputs['true_labels']

In [17]:
pred_labels

tensor([1])

In [18]:
# Preprocess the dataset
def preprocess_function(examples):
    return tokenizer_lora(examples['text'], padding='max_length', truncation=True, max_length=512)

tokenized_dataset = dataset["validation"].map(preprocess_function, batched=True)

# Convert to PyTorch tensors
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Run inference on the dataset
predictions = []
inference_model.eval()
with torch.no_grad():
    for i in range(len(tokenized_dataset)):
        inputs = {key: val.unsqueeze(0).to(device) for key, val in tokenized_dataset[i].items()}
        outputs = inference_model(**inputs)
        predicted_label = torch.argmax(outputs.logits, dim=1).item()
        predictions.append(predicted_label)

# Print predictions
for i, prediction in enumerate(predictions):
    print(f"Sample {i}: Predicted label: {prediction}")

Map: 100%|██████████| 2388/2388 [00:01<00:00, 2163.64 examples/s]


Sample 0: Predicted label: 2
Sample 1: Predicted label: 1
Sample 2: Predicted label: 2
Sample 3: Predicted label: 1
Sample 4: Predicted label: 2
Sample 5: Predicted label: 2
Sample 6: Predicted label: 2
Sample 7: Predicted label: 2
Sample 8: Predicted label: 2
Sample 9: Predicted label: 0
Sample 10: Predicted label: 1
Sample 11: Predicted label: 1
Sample 12: Predicted label: 2
Sample 13: Predicted label: 1
Sample 14: Predicted label: 1
Sample 15: Predicted label: 1
Sample 16: Predicted label: 1
Sample 17: Predicted label: 0
Sample 18: Predicted label: 0
Sample 19: Predicted label: 0
Sample 20: Predicted label: 0
Sample 21: Predicted label: 0
Sample 22: Predicted label: 1
Sample 23: Predicted label: 1
Sample 24: Predicted label: 0
Sample 25: Predicted label: 1
Sample 26: Predicted label: 0
Sample 27: Predicted label: 1
Sample 28: Predicted label: 1
Sample 29: Predicted label: 1
Sample 30: Predicted label: 1
Sample 31: Predicted label: 2
Sample 32: Predicted label: 1
Sample 33: Predicted

In [22]:
 classification_scores('gpt2-lora', tokenized_dataset['label'], predictions)

Unnamed: 0,Balanced Accuracy,Precision,Recall,F1
gpt2-lora,0.548,0.62,0.548,0.563
