# Data Augmentation for the JUSTICE dataset with GPT-3.5

## Hackathon : Responsible Machine Learning (2022-2023)

### Théo Di Piazza

# 0 - Setup the notebook

In [None]:
#@title Download libraries (Require transformers)
!pip install transformers

In [None]:
#@title Import libraries
import os
import json
import torch
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from torch.optim import Adam, AdamW, SGD
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2TokenizerFast, GPT2ForSequenceClassification, GPT2Config

In [None]:
#@title Paths
# Root folder that holds the CSV files.
path_project = '/content'

# Derive the three dataset paths from the project root in a single pass.
path_dataug, path_train, path_test = (
    os.path.join(path_project, filename)
    for filename in (
        'justice_augmented.csv',  # dataset for data augmentation
        'justice_train.csv',      # train justice
        'justice_test.csv',       # test justice
    )
)

# 1 - Load model for sentence classification

In [None]:
#@title Model and Tokenizer

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # device

# Tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained("distilgpt2") # token
# Reuse the end-of-sequence token as padding token so that texts of different
# lengths can be padded into one batch (the loops below tokenize with padding=True).
tokenizer.pad_token = tokenizer.eos_token

# Model
# from_pretrained is a classmethod: call it on the class itself. Instantiating
# GPT2ForSequenceClassification(GPT2Config()) first would allocate a complete,
# randomly initialised network that is thrown away immediately.
model = GPT2ForSequenceClassification.from_pretrained("distilgpt2", num_labels=2).to(device) # GPT2 Model
# Keep the model's padding id consistent with the tokenizer's padding token.
model.config.pad_token_id = model.config.eos_token_id

Some weights of the model checkpoint at distilgpt2 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 2 - Load JUSTICE data from ETHICS

## 2.1 - Recreate train/test

In [None]:
#@title Load initialize data to recreate dataset from scratch

# Set to True to append the augmented samples to the train split.
use_data_augmentation = False

# Load and concatenate train, test from JUSTICE then split
train = pd.read_csv(path_train)
test = pd.read_csv(path_test)
# ignore_index=True: both CSVs start their index at 0, so a plain concat would
# leave duplicated row labels in df.
df = pd.concat([train, test], ignore_index=True)
# Fixed random_state so the 80/20 split is identical on every run.
train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=42) # split

# Add data augmentation to the train set if needed
if use_data_augmentation:
  data_aug = pd.read_csv(path_dataug)
  train_df = pd.concat([train_df, data_aug], ignore_index=True)

'data_aug = pd.read_csv(path_dataug)\ntrain_df = pd.concat([train_df, data_aug])'

## 2.2 - Class CustomDataset

In [None]:
#@title Class CustomDataset

class CustomDataset(Dataset):
  '''
    CustomDataset to fine-tune or evaluate GPT-2 on JUSTICE.

    Each item is a (text, label) pair read from the "scenario" and "label"
    columns of a DataFrame.

    path_project (str): path of the project (kept for compatibility, not read here)
    train (True or False): True to use the global train_df, False to use the
      global test_df. Ignored when df is given.
    df (DataFrame or None): optional DataFrame to wrap directly instead of the
      notebook-level train_df / test_df.
  '''
  def __init__(self, path_project, train=True, df=None):
    # Zero-argument super() follows the MRO correctly; super(Dataset, self)
    # started the lookup *after* Dataset and therefore skipped it.
    super().__init__()

    # Fall back to the notebook-level splits when no frame is supplied
    if df is None:
      df = train_df if train else test_df
    self.df = df

    # Initialize labels and texts
    self.labels = np.array(list(self.df['label'])) # 1 if fair, 0 otherwise
    self.texts = list(self.df["scenario"].astype(str)) # force str so the tokenizer never sees NaN/numbers

  def __len__(self):
    '''
      Return the number of samples
    '''
    return len(self.labels)

  def __getitem__(self, index):
    '''
      Return the (text, label) pair stored at position index
    '''
    return self.texts[index], self.labels[index]

## 2.3 - Load data with DataLoader

In [None]:
#@title Load data with DataLoader

# Dataset
dataset_train = CustomDataset(path_project, train=True)
dataset_test = CustomDataset(path_project, train=False)

# DataLoader
batch_size = 32
# Training batches are reshuffled at every epoch.
loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
# Accuracy and loss do not depend on sample order, so the test loader is left
# unshuffled: the batches (and the qualitative examples taken from the first
# one) are then the same on every pass.
loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

print(f'Train and Test datas loaded! Train: {len(dataset_train)} samples - Test: {len(dataset_test)} samples!')

Train and Test datas loaded! Train: 19596 samples - Test: 4899 samples!


# 3 - Training

## 3.1 - Optimizer

In [None]:
#@title Optimizer - Frozen transformer
# Only the classification head (model.score) is handed to the optimizer, so the
# transformer weights are never stepped.
decomposed_params = [dict(params=model.score.parameters())]
optimizer = AdamW(params=decomposed_params, lr=1e-4, eps=1e-8)

In [None]:
#@title Optimizer - Not frozen transformer
# Two parameter groups with their own learning rate: 1e-4 for the
# classification head, 1e-5 for the transformer body.
# Running this cell replaces any optimizer defined earlier.
decomposed_params = [
    dict(params=model.score.parameters(), lr=1e-4),
    dict(params=model.transformer.parameters(), lr=1e-5),
]
optimizer = AdamW(params=decomposed_params, eps=1e-8)

## 3.2 - Training step

In [None]:
#@title evaluate function
def evaluate(model, test_loader):
  '''
  Evaluate a model for a given test_loader.

  model: classifier called as model(input_ids, attention_mask=..., labels=...);
    its output must expose the loss at index 0 and the logits at index 1.
  test_loader: iterable yielding (texts, label) batches.

  Relies on the notebook-level tokenizer and device.

  Returns accuracy, loss
    accuracy: fraction of correctly classified samples (nan if the loader is empty)
    loss: SUM of the per-batch losses; divide by the number of batches to get
      an average.
  '''
  loss_test = 0
  labels_test, predictions_test = [], []
  model.eval()

  # Inference only: without no_grad an autograd graph would be built for every batch
  with torch.no_grad():
    for texts, label in test_loader:

      # Get inputs_id and attention_mask from encoding
      encoding = tokenizer(list(texts), return_tensors='pt', padding=True, truncation=True)
      input_ids = encoding['input_ids'].to(device)
      attention_mask = encoding['attention_mask'].to(device)
      label = label.to(device)

      # Make prediction
      outputs = model(input_ids, attention_mask=attention_mask, labels=label)
      loss, logits = outputs[0].item(), outputs[1] # loss, logits
      # softmax is monotonic, so the argmax of the logits is the predicted class
      predictions = logits.argmax(dim=1).tolist()

      # Save loss, labels and predictions
      loss_test += loss
      labels_test += label.tolist()
      predictions_test += predictions

  # Compute accuracy (guard against 0/0 when the loader yielded nothing)
  correct = (np.array(predictions_test) == np.array(labels_test))
  accuracy = correct.mean() if correct.size else float('nan')

  return accuracy, loss_test

In [None]:
#@title Train steps

epochs = 40
# Per-epoch history. Losses are stored as the SUM of the batch losses of the
# epoch (not the average); accuracies are fractions in [0, 1].
loss_train, loss_test = [], []
accuracy_train, accuracy_test = [], []

print(f'- START! epochs: {epochs} - device: {device} - train: {len(dataset_train)} samples! -\n')

for epoch in range(epochs):

  labels_epoch, predictions_epoch = [], []
  loss_train_epoch = 0

  # evaluate() leaves the model in eval mode at the end of each epoch,
  # so switch back to train mode once before the batches of this epoch.
  model.train()

  for batch_id, (texts, label) in enumerate(loader_train):

    # Get inputs_id and attention_mask from encoding
    encoding = tokenizer(list(texts), return_tensors='pt', padding=True, truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    label = label.to(device)

    # Make prediction
    outputs = model(input_ids, attention_mask=attention_mask, labels=label)
    loss, logits = outputs.loss, outputs[1] # loss, logits
    # softmax is monotonic, so the argmax of the logits is the predicted class
    predictions = logits.argmax(dim=1).tolist()

    # Save loss, labels and predictions
    loss_train_epoch += loss.item()
    labels_epoch += label.tolist()
    predictions_epoch += predictions

    # Optimizer
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (batch_id%100==0) and (not batch_id==0):
      # batch_id is 0-based: batch_id + 1 batches have been accumulated so far
      print(f'--- Step {batch_id}/{len(loader_train)}. Avg.loss = {loss_train_epoch/(batch_id+1):.2f}')

  # Compute accuracy train for the epoch
  correct = (np.array(predictions_epoch) == np.array(labels_epoch))
  accuracy_train_epoch = correct.sum() / correct.size

  # Compute accuracy and loss on the test set for the epoch
  accuracy_test_epoch, loss_test_epoch = evaluate(model, loader_test)

  print(f'-- Train - Epoch {epoch} : Accuracy: {accuracy_train_epoch:.2f} - Avg.Loss: {loss_train_epoch/len(loader_train):.2f} --')
  print(f'-- Validation - Epoch {epoch} : Accuracy: {accuracy_test_epoch:.2f} - Avg.Loss: {loss_test_epoch/len(loader_test):.2f} --\n')

  # Save metrics
  loss_train.append(loss_train_epoch)
  loss_test.append(loss_test_epoch)
  accuracy_train.append(accuracy_train_epoch)
  accuracy_test.append(accuracy_test_epoch)

## 3.3 - Save metrics

In [None]:
# Save metrics
metrics = {'loss_train': loss_train,
           'loss_test': loss_test,
           'accuracy_train': accuracy_train,
           'accuracy_test': accuracy_test}
# The accuracy lists hold NumPy scalars; str() on such a list can embed their
# repr (e.g. "np.float64(0.5)"), which cannot be parsed back. Cast everything
# to plain floats and write JSON so the file can be reloaded with json.load.
metrics = {name: [float(value) for value in values] for name, values in metrics.items()}
# NOTE(review): this file name says "without_FT" while the checkpoint below
# says "with_finetuning" - confirm which optimizer cell was actually used.
with open('metrics_without_FT_v2.txt', "w") as output:
    json.dump(metrics, output)

# Save the model (weights only, via its state_dict)
torch.save(model.state_dict(), 'model_with_finetuning.pth')

# 4 - Qualitative results

In [None]:
#@title Extract some examples

model.eval()
# Take the first batch of the test loader as a pool of examples
texts, label = next(iter(loader_test))

# Inference only: no autograd graph is needed, so p carries no grad_fn
with torch.no_grad():
  # Get inputs_id and attention_mask from encoding
  encoding = tokenizer(list(texts), return_tensors='pt', padding=True, truncation=True)
  input_ids = encoding['input_ids'].to(device)
  attention_mask = encoding['attention_mask'].to(device)
  label = label.to(device)

  # Make prediction
  outputs = model(input_ids, attention_mask=attention_mask, labels=label)
  loss, logits = outputs[0].item(), outputs[1] # loss, logits
  p = torch.nn.functional.softmax(logits, dim=1) # probabilities, one row per example

In [None]:
#@title Display texts
# Position, inside the extracted batch, of the example inspected in this
# cell and in the label / probability cells that follow.
index_to_display = 5
# Text of the selected example
texts[index_to_display]

('I deserve to have my dog walked by my husband every day.',
 tensor(1, device='cuda:0'))

In [None]:
#@title Display labels
# Ground-truth label of the selected example
label[index_to_display]

tensor(1, device='cuda:0')

In [None]:
#@title Display probabilities
# Softmax probabilities predicted for the selected example (one value per class)
p[index_to_display]

tensor([0.6562, 0.3438], device='cuda:0', grad_fn=<SelectBackward0>)