In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score as kappa
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2Tokenizer, GPT2Model
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
import os
import time
import pathlib

# log folder to save log files
log_folder = '/content/drive/MyDrive/asap/'

# target column
target_column = "target_score"

# hyper parameters
hp = {
    "base_model": "gpt2",
    "lr": 1e-4,
    "num_epochs": 8,
    "batch_size": 2,
    "use_amp": True,
    "mixed_precision": "fp16",
}

# Prepare ASAP Dataset

In [9]:
# Original kaggle training set
kaggle_dataset  = pd.read_csv('./training_set_rel3.tsv', sep='\t', encoding = "ISO-8859-1")

# Smaller training set used for this project
dataset_df = pd.DataFrame(
  {
    'essay_id' : kaggle_dataset['essay_id'],
    'essay_set' : kaggle_dataset['essay_set'],
    'essay' : kaggle_dataset['essay'],
    'rater1' : kaggle_dataset['rater1_domain1'],
    'rater2' : kaggle_dataset['rater2_domain1'],
    'score' : kaggle_dataset['domain1_score'],
    'score2' : kaggle_dataset['domain2_score']
  })
dataset_df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1,rater2,score,score2
0,1,1,"Dear local newspaper, I think effects computer...",4,4,8,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,9,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,7,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,10,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,8,


In [10]:
# 4 classes => 2b, 3, 4, 7
# 5 classes => 5, 6
# 6 classes => 1, 2a, 8

dataset_df = dataset_df.loc[(dataset_df['essay_set'] == 3)].copy()

## Trainingh for essay set 3

In [12]:
# Essay set 3: 
mask = (dataset_df['essay_set'] == 3)
dataset_df.loc[mask, 'target_score'] = dataset_df.loc[mask, 'score']
dataset_df.loc[mask, 'target_score'] = dataset_df.loc[mask, 'target_score'] - dataset_df.loc[mask, 'target_score'].min()
dataset_df.loc[mask, 'target_score'].value_counts()

  dataset_df.loc[mask, 'target_score'] = dataset_df.loc[mask, 'score']


2    657
1    607
3    423
0     39
Name: target_score, dtype: int64

In [13]:
# essay_df = dataset_df[dataset_df['essay_set'] == 7].copy()

# essay_df = dataset_df.loc[(dataset_df['essay_set'] == 2) | (dataset_df['essay_set'] == 3) | (dataset_df['essay_set'] == 4)].copy()
essay_df = dataset_df
essay_df.shape

(1726, 8)

In [15]:
from sklearn.model_selection import train_test_split

essay_df['target_score'] = essay_df[target_column]

X, y = essay_df['essay'].to_list(), essay_df['target_score'].to_numpy()
essay_ids = essay_df['essay_set'].to_list()
num_labels = essay_df[target_column].unique().size

# 80 / 20 train test split
X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(X, y, essay_ids, test_size=0.20, random_state=42) # stratify=y, this paramter will not work if any class has number of examples lower than 2

# split test to half to get 80 / 10 / 10 split
X_test, X_val, y_test, y_val, ids_test, ids_val = train_test_split(X_test, y_test, ids_test, test_size=0.50, random_state=42) # stratify=y_test

In [26]:
import torch
from torch.utils.data import Dataset, DataLoader

# a torch dataset implementation for ASAP dataset
class EssayDataset(Dataset):
    def __init__(self, essays, targets, essay_ids, tokenizer):
        self.essays = essays
        self.essay_ids = essay_ids
        self.targets = targets
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.essays)

    def __getitem__(self, idx):
        text = str(self.essays[idx])
        encoded_input = tokenizer(text, truncation=True, return_tensors='pt').to(device)

        return encoded_input['input_ids'].squeeze(), encoded_input['attention_mask'].squeeze(), self.targets[idx], self.essay_ids[idx]

In [43]:
# collater function to pad tokens
def collate_fn(batch):
    PAD_TOKEN_ID = 50256 # Use tokenizer.pad_token_id to check
    input_ids_list, attention_mask_list, targets, essay_ids = [], [], [], []

    for input_ids, attention_mask, target, essay_id in batch:
        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        targets.append(target)
        essay_ids.append(essay_id)

    # Pad the batch to the maximum sequence length within that batch using the tokenizer's pad token
    max_length = max(len(ids) for ids in input_ids_list)
    padded_input_ids = []
    padded_attention_mask = []

    for input_ids, attention_mask in zip(input_ids_list, attention_mask_list):
        pad_length = max_length - len(input_ids)
        padded_input_ids.append(torch.cat([input_ids, torch.tensor([PAD_TOKEN_ID] * pad_length, device=device, dtype=torch.long)]))
        # add zeros to attention mask for padds
        padded_attention_mask.append(torch.cat([attention_mask, torch.zeros(pad_length, dtype=torch.long, device=device)]))

    return torch.stack(padded_input_ids), torch.stack(padded_attention_mask), torch.tensor(targets, dtype=torch.int64), essay_ids

In [44]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained(hp['base_model'])
tokenizer.pad_token_id = tokenizer.eos_token_id

In [45]:
from transformers import GPT2ForSequenceClassification

class ClassifierLayer(torch.nn.Module):

  def __init__(self, input_size, output_size, bias=False):
    super(ClassifierLayer, self).__init__()

    self.dropout = torch.nn.Dropout(0.1)
    self.linear = torch.nn.Linear(input_size, output_size, bias=bias)

  def forward(self, x):
    inputs = self.dropout(x)
    return self.linear(inputs)

class GPT2Classification(GPT2ForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.score = ClassifierLayer(config.n_embd, self.num_labels, bias=False)

        self.post_init()

In [46]:
from accelerate import Accelerator

# use fp16 mixed precision to improve training speed
accelerator = Accelerator(mixed_precision=hp['mixed_precision']) # fp16
device = accelerator.device

# model = GPT2Classification.from_pretrained(hp['base_model'], num_labels=num_labels)
model = GPT2ForSequenceClassification.from_pretrained(hp['base_model'], num_labels=num_labels)
model.to(device)
tokenizer.pad_token_id = tokenizer.eos_token_id
# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
# train loop

def train_loop(model, train_loader, val_loader, loss_fct, optimizer, lr_scheduler, progress_bar, log_file_handler, logging_step=1, use_amp=False):
    samples = 0.
    cumulative_loss = 0.

    # set model to train mode
    model.train()

    for step, (inputs, attention_masks, targets, essay_ids) in enumerate(train_loader):
        targets = targets.reshape(-1, 1).to(device)
        attention_masks = attention_masks.to(device)
        outputs = model(inputs, attention_mask=attention_masks)
        loss = loss_fct(outputs["logits"].view(-1, model.num_labels), targets.view(-1))
        accelerator.backward(loss)
        optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        samples += inputs.shape[0]
        cumulative_loss += loss.item()

        if step % logging_step == 0:
            # calculate qwk on test set

            with torch.no_grad():
                test_loss, test_results = test_loop(model, val_loader, loss_fct, use_amp=use_amp)
            model.train()
            log_str = "Step: {:<6} \t Train loss: {:<6.4f} \t Validation loss: {:<6.4f}\n".format(step, (cumulative_loss/samples), test_loss)
            qwk_avg = 0
            for essay_id, result in test_results.items():
                log_str += "\t {:<3} QWK: {:<6.4f}\n".format(essay_id, result['qwk'])
                qwk_avg += result['qwk']
            qwk_avg /= len(test_results)
            log_str += "\t AVG QWK: {:<6.4f}\n".format(qwk_avg)
            print(log_str)
            log_file_handler.write(log_str + "\n")
            samples = 0
            cumulative_loss = 0

    return cumulative_loss/samples if samples != 0 else float("inf")

def test_loop(model, test_loader, loss_fct, use_amp=False, show_progression=False):
    samples = 0.
    cumulative_loss = 0.
    preds = []
    labels = []
    all_essay_ids = []

    # set model to eval mode
    model.eval()

    loop_iterator = enumerate(tqdm(test_loader)) if show_progression else enumerate(test_loader)

    with torch.no_grad():
        for step, (inputs, attention_masks, targets, essay_ids) in loop_iterator:
            targets = targets.reshape(-1, 1).to(device)
            inputs = inputs.to(device)
            attention_masks = attention_masks.to(device)
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(inputs, attention_mask=attention_masks)
            loss = loss_fct(outputs["logits"].view(-1, model.num_labels), targets.view(-1))

            samples += inputs.shape[0]
            cumulative_loss += loss.item()

            probs = outputs['logits'].softmax(-1) # probs
            predictions = probs.argmax(-1) # predicted classes

            labels.extend(targets.tolist())
            preds.extend(predictions.tolist())
            all_essay_ids.extend(essay_ids)

         # calculate qwk per id
        unique_ids = set(all_essay_ids)
        preds = np.asarray(preds, dtype=int)
        labels = np.asarray(labels, dtype=int)
        all_essay_ids = np.asarray(all_essay_ids, dtype=int)
        results = {}

        for id in unique_ids:
            mask = (all_essay_ids == id)
            curr_preds = preds[mask]
            curr_labels = labels[mask]
            qwk = kappa(curr_preds, curr_labels, weights='quadratic')
            results[id] = {"qwk": qwk, "preds": curr_preds, "label": curr_labels}

    return cumulative_loss/samples if samples != 0 else float("inf"), results

In [48]:
from tqdm.auto  import tqdm
from transformers import get_scheduler
from torch.optim import AdamW
from sklearn.utils import class_weight

lr = hp['lr']
num_epochs = hp['num_epochs']
batch_size = hp['batch_size']
use_amp = hp['use_amp']

# create train data loader
train_dataset = EssayDataset(X_train, y_train, ids_train, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
num_training_steps = num_epochs * len(train_loader)

# create test data loader
test_dataset = EssayDataset(X_test, y_test, ids_test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# create val data loader
val_dataset = EssayDataset(X_val, y_val, ids_val, tokenizer)
val_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# # get class weights
# class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

loss_fct = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=lr)
# lr_scheduler = get_scheduler(
#     name="linear", optimizer=optimizer,
#     num_warmup_steps=0, num_training_steps=num_training_steps
# )
lr_scheduler = None
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# use accelerator prepare

# removed val_loader from prepare command
model, optimizer, train_loader, test_loader, val_loader = accelerator.prepare(
    model, optimizer, train_loader, test_loader, val_loader#, lr_scheduler
)

In [49]:
def open_log_file(log_folder, essay_df, model, label_column: str, hyper_parameters):
    # using time as a file name for logging

    timestr = time.strftime("%Y%m%d-%H%M%S")
    file_name = timestr + '.log'

    # check if folder exists, create if it isn't
    pathlib.Path(log_folder).mkdir(exist_ok=True, parents=True)

    # open file to log results
    log_file = os.path.join(log_folder, file_name)
    fp = open(log_file, "a")

    fp.write("Log time: " + timestr + "\n")
    fp.write("Essay classes: " + str(essay_df['essay_set'].unique()) + "\n")
    fp.write("Using score column: " + label_column + "\n")
    fp.write("Score distribution: " + "\n" + essay_df[label_column].value_counts().to_string() + "\n")

    fp.write("\n--- Model parameters:\n")
    fp.write(str(model))
    fp.write('\n')

    fp.write("\n--- Hyper parameters:\n")
    for k, v in hyper_parameters.items():
        fp.write(f" {k:<25}: {v}\n")

    fp.write('\n')
    fp.flush()
    return fp

In [50]:
# Start logging to a file
fp = open_log_file(log_folder, essay_df, model, label_column=target_column, hyper_parameters=hp)

try:
    # Start training
    progress_bar = tqdm(range(num_training_steps))

    model.train()
    with accelerator.autocast():
        fp.write("Training logs: \n\n")
        for epoch in range(num_epochs):
            train_loss = train_loop(model, train_loader, val_loader, loss_fct, optimizer, lr_scheduler, progress_bar, fp, logging_step=len(train_loader)//3, use_amp=hp['use_amp'])
            with torch.no_grad():
                test_loss, test_results = test_loop(model, test_loader, loss_fct)
                log_string = "Epoch: {:<6}\t Test  loss: {:<6.4f}\n".format(epoch+1, test_loss)
                qwk_avg = 0
                for essay_id, result in test_results.items():
                    log_string += "\t {:<3} QWK: {:<6.4f}\n".format(essay_id, result['qwk'])
                    qwk_avg += result['qwk']
                qwk_avg /= len(test_results)
                log_string += "\t AVG QWK: {:<6.4f}\n".format(qwk_avg)
                print(log_string)
                fp.write(log_string + "\n")
finally:
    print("Log file closed.")
    fp.close()

  0%|          | 0/5520 [00:00<?, ?it/s]

Step: 0      	 Train loss: 0.7995 	 Validation loss: 1.8298
	 3   QWK: 0.0000
	 AVG QWK: 0.0000

Step: 230    	 Train loss: 0.8669 	 Validation loss: 0.5930
	 3   QWK: 0.3320
	 AVG QWK: 0.3320

Step: 460    	 Train loss: 0.5537 	 Validation loss: 0.5305
	 3   QWK: 0.4520
	 AVG QWK: 0.4520

Epoch: 1     	 Test  loss: 0.5001
	 3   QWK: 0.4688
	 AVG QWK: 0.4688

Step: 0      	 Train loss: 0.4054 	 Validation loss: 0.4972
	 3   QWK: 0.4767
	 AVG QWK: 0.4767

Step: 230    	 Train loss: 0.4864 	 Validation loss: 0.4850
	 3   QWK: 0.5287
	 AVG QWK: 0.5287

Step: 460    	 Train loss: 0.4794 	 Validation loss: 0.5008
	 3   QWK: 0.4943
	 AVG QWK: 0.4943

Epoch: 2     	 Test  loss: 0.5298
	 3   QWK: 0.4288
	 AVG QWK: 0.4288

Step: 0      	 Train loss: 0.5220 	 Validation loss: 0.5341
	 3   QWK: 0.4288
	 AVG QWK: 0.4288

Step: 230    	 Train loss: 0.4308 	 Validation loss: 0.4651
	 3   QWK: 0.5686
	 AVG QWK: 0.5686

Step: 460    	 Train loss: 0.4296 	 Validation loss: 0.4446
	 3   QWK: 0.6279
	 AV