In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score as kappa
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2Tokenizer, GPT2Model
from tqdm.auto import tqdm
import os
import time
import pathlib

# log folder
log_folder = "./train_logs/"

# target column
target_column = "score"

# hyper parameters
hp = {
    "lr": 1e-4,
    "num_epochs": 5,
    "batch_size": 2,
    "use_amp": True,
    "mixed_precision": "fp16",
}

# Prepare ASAP Dataset

In [2]:
# Original kaggle training set
kaggle_dataset  = pd.read_csv('./training_set_rel3.tsv', sep='\t', encoding = "ISO-8859-1")

# Smaller training set used for this project
dataset_df = pd.DataFrame(
  {
    'essay_id' : kaggle_dataset['essay_id'],
    'essay_set' : kaggle_dataset['essay_set'],
    'essay' : kaggle_dataset['essay'],
    'rater1' : kaggle_dataset['rater1_domain1'],
    'rater2' : kaggle_dataset['rater2_domain1'],
    'score' : kaggle_dataset['domain1_score']
  })
dataset_df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1,rater2,score
0,1,1,"Dear local newspaper, I think effects computer...",4,4,8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,10
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,8


## Use all essays for classification

In [3]:
# essay_df = dataset_df[dataset_df['essay_set'] == 8].copy()

essay_df = dataset_df.loc[(dataset_df['essay_set'] == 3) | (dataset_df['essay_set'] == 4) | (dataset_df['essay_set'] == 5) | (dataset_df['essay_set'] == 6)].copy()
# essay_df = dataset_df.copy()
essay_df.shape

(7101, 6)

In [4]:
import numpy as np
# Use minimum score
rater_1 = essay_df["rater1"].to_numpy()
rater_2 = essay_df["rater2"].to_numpy()

min_score = np.minimum(rater_1, rater_2)
max_score = np.maximum(rater_1, rater_2)

essay_df['min_score'] = min_score
essay_df['max_score'] = max_score

In [5]:
# assign < 20 to 20 and above 50 to 50 for essay set 8 
mask = (essay_df['essay_set'] == 8) & (essay_df['score'] > 50)
essay_df.loc[mask, 'score'] = 50

mask = (essay_df['essay_set'] == 8) & (essay_df['score'] < 20)
essay_df.loc[mask, 'score'] = 20

In [6]:
# split to half
mask = (essay_df['essay_set'] == 8)
essay_df.loc[mask, 'score'] = essay_df.loc[mask, 'score'].floordiv(2)
# essay_df.loc[mask, 'score'] = essay_df.loc[mask, 'score'].round().astype('int64')

In [7]:
essay_ids = essay_df['essay_set'].unique().tolist()

essay_stats = {}

for essay_id in essay_ids:
    curr_df = essay_df[essay_df['essay_set'] == essay_id]
    min_value = min(curr_df['score'])
    max_value = max(curr_df['score'])

    essay_stats[essay_id] = {
        "min": min_value,
        "max": max_value,
    }

essay_stats

{3: {'min': 0, 'max': 3},
 4: {'min': 0, 'max': 3},
 5: {'min': 0, 'max': 4},
 6: {'min': 0, 'max': 4}}

In [8]:
# transforms an real value to regression output
# r_range equals 22 (24-2) for set 7, it is 30 (50-20) for set 8 
def get_regression_value(value, essay_id, essay_stats, r_range = 4): 
    min = essay_stats[essay_id]["min"]
    max = essay_stats[essay_id]["max"]

    r_value = (value - min) / (max - min) # set value to 0..1 range

    # scale to regression value
    r_value *= r_range
    return r_value

In [9]:
# transforms regression output to label
def get_real_label(r_value, essay_id, essay_stats, r_range = 4):
    min = essay_stats[essay_id]["min"]
    max = essay_stats[essay_id]["max"]

    value = r_value / r_range # set regression value to 0..1 range

    # set to real label
    value = value * (max - min) + min
    return round(value)

In [10]:
from sklearn.model_selection import train_test_split

X, y = essay_df['essay'].to_list(), [get_regression_value(score, essay_id, essay_stats) for score, essay_id in zip(essay_df['score'], essay_df['essay_set'])]
essay_ids = essay_df['essay_set'].to_list()

# 60 / 40 train test split
X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(X, y, essay_ids, test_size=0.20, random_state=42)

# split test to half to get 60 / 20 / 20 split

X_test, X_val, y_test, y_val, ids_test, ids_val = train_test_split(X_test, y_test, ids_test, test_size=0.50, random_state=42)

In [11]:
from transformers import GPT2ForSequenceClassification
import torch

class ClassifierLayer(torch.nn.Module):

  def __init__(self, input_size, output_size, bias=False):
    super(ClassifierLayer, self).__init__()

    self.dropout = torch.nn.Dropout(0.1)
    self.linear = torch.nn.Linear(input_size, output_size, bias=bias)

  def forward(self, x):
    inputs = self.dropout(x)
    return self.linear(inputs)

class GPT2Classification(GPT2ForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.score = ClassifierLayer(config.n_embd, self.num_labels, bias=False)

        self.post_init()

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

# a torch dataset implementation for ASAP dataset
class EssayDataset(Dataset):
    def __init__(self, essays, targets, essay_ids, tokenizer):
        self.essays = essays
        self.essay_ids = essay_ids
        self.targets = targets
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.essays)

    def __getitem__(self, idx):
        text = str(self.essays[idx])
        encoded_input = tokenizer(text, truncation=True, return_tensors='pt').to(device)

        return encoded_input['input_ids'].squeeze(), encoded_input['attention_mask'].squeeze(), self.targets[idx], self.essay_ids[idx]

In [13]:
# collater function to pad tokens
def collate_fn(batch):
    PAD_TOKEN_ID = 50256 # Use tokenizer.pad_token_id to check
    input_ids_list, attention_mask_list, targets, essay_ids = [], [], [], []

    for input_ids, attention_mask, target, essay_id in batch:
        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        targets.append(target)
        essay_ids.append(essay_id)

    # Pad the batch to the maximum sequence length within that batch using the tokenizer's pad token
    max_length = max(len(ids) for ids in input_ids_list)
    padded_input_ids = []
    padded_attention_mask = []

    for input_ids, attention_mask in zip(input_ids_list, attention_mask_list):
        pad_length = max_length - len(input_ids)
        padded_input_ids.append(torch.cat([input_ids, torch.tensor([PAD_TOKEN_ID] * pad_length, device=device, dtype=torch.long)]))
        # add zeros to attention mask for padds
        padded_attention_mask.append(torch.cat([attention_mask, torch.zeros(pad_length, dtype=torch.long, device=device)]))

    return torch.stack(padded_input_ids), torch.stack(padded_attention_mask), torch.tensor(targets, dtype=torch.float), essay_ids

In [14]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token_id = tokenizer.eos_token_id

In [15]:
from accelerate import Accelerator
from transformers import GPT2ForSequenceClassification

# use fp16 mixed precision to improve training speed
accelerator = Accelerator(mixed_precision=hp['mixed_precision']) # fp16
device = accelerator.device

model = GPT2Classification.from_pretrained('gpt2', num_labels=1, problem_type="regression")
model.to(device)
tokenizer.pad_token_id = tokenizer.eos_token_id
# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2Classification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# train loop

def train_loop(model, train_loader, val_loader, optimizer, lr_scheduler, progress_bar, log_file_handler, logging_step=1, use_amp=False):
    samples = 0.
    cumulative_loss = 0.

    # set model to train mode
    model.train()

    for step, (inputs, attention_masks, targets, essay_ids) in enumerate(train_loader):
        targets = targets.reshape(-1, 1)
        attention_masks = attention_masks.to(model.device)

        outputs = model(inputs, attention_mask=attention_masks, labels=targets)
        loss = outputs.loss
        # loss.backward()
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        samples += inputs.shape[0]
        cumulative_loss += loss.item()

        if step % logging_step == 0:
            # calculate qwk on test set

            with torch.no_grad():
                test_loss, test_results = test_loop(model, val_loader, use_amp=use_amp)
            model.train()
            log_str = "Step: {:<6} \t Train loss: {:<6.4f} \t Validation loss: {:<6.4f}\n".format(step, (cumulative_loss/samples), test_loss)
            qwk_avg = 0
            for essay_id, result in test_results.items():
                log_string += "\t {:<3} QWK: {:<6.4f}\n".format(essay_id, result['qwk'])
                qwk_avg += result['qwk']
            qwk_avg /= len(test_results)
            log_string += "\t AVG QWK: {:<6.4f}\n".format(qwk_avg)
            print(log_str)
            log_file_handler.write(log_str + "\n")
            samples = 0
            cumulative_loss = 0

    return cumulative_loss/samples if samples != 0 else float("inf")

def test_loop(model, test_loader, use_amp=False, show_progression=False):
    samples = 0.
    cumulative_loss = 0.
    preds = []
    labels = []
    all_essay_ids = []

    # set model to eval mode
    model.eval()

    loop_iterator = enumerate(tqdm(test_loader)) if show_progression else enumerate(test_loader)

    with torch.no_grad():
        for step, (inputs, attention_masks, targets, essay_ids) in loop_iterator:
            targets = targets.reshape(-1, 1).to(model.device)
            inputs = inputs.to(model.device)
            attention_masks = attention_masks.to(model.device)
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(inputs, attention_mask=attention_masks, labels=targets)
            loss = outputs.loss

            samples += inputs.shape[0]
            cumulative_loss += loss.item()

            # probs = outputs['logits'].softmax(-1) # probs
            # predictions = probs.argmax(-1) # predicted classes
            predictions = outputs['logits'].round().to(torch.long).reshape(-1)

            labels.extend([get_real_label(target, essay_id, essay_stats) for target, essay_id in zip(targets.reshape(-1).tolist(), essay_ids)])
            preds.extend([get_real_label(pred, essay_id, essay_stats) for pred, essay_id in zip(predictions.tolist(), essay_ids)])
            all_essay_ids.extend(essay_ids)

        # calculate qwk per id
        unique_ids = set(all_essay_ids)
        preds = np.asarray(preds, dtype=int)
        labels = np.asarray(labels, dtype=int)
        all_essay_ids = np.asarray(all_essay_ids, dtype=int)
        results = {}

        for id in unique_ids:
            mask = (all_essay_ids == id)
            curr_preds = preds[mask]
            curr_labels = labels[mask]
            qwk = kappa(curr_preds, curr_labels, weights='quadratic')
            results[id] = {"qwk": qwk, "preds": curr_preds, "label": curr_labels}

    return cumulative_loss/samples if samples != 0 else float("inf"), results

In [17]:
from tqdm.auto  import tqdm
from transformers import get_scheduler
from torch.optim import AdamW

lr = hp['lr']

num_epochs = hp['num_epochs']
batch_size = hp['batch_size']

use_amp = hp['use_amp']

# create train data loader
train_dataset = EssayDataset(X_train, y_train, ids_train, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
num_training_steps = num_epochs * len(train_loader)

# create test data loader
test_dataset = EssayDataset(X_test, y_test, ids_test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# create val data loader
val_dataset = EssayDataset(X_val, y_val, ids_val, tokenizer)
val_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

optimizer = AdamW(model.parameters(), lr=lr)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer,
    num_warmup_steps=0, num_training_steps=num_training_steps
)
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# use accelerator prepare

# removed val_loader from prepare command
model, optimizer, train_loader, test_loader, val_loader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_loader, test_loader, val_loader, lr_scheduler
)

In [18]:
def open_log_file(log_folder, essay_df, model, label_column: str, hyper_parameters):
    # using time as a file name for logging

    timestr = time.strftime("%Y%m%d-%H%M%S")
    file_name = timestr + '.log'

    # check if folder exists, create if it isn't
    pathlib.Path(log_folder).mkdir(exist_ok=True, parents=True)

    # open file to log results
    log_file = os.path.join(log_folder, file_name)
    fp = open(log_file, "a")

    fp.write("Log time: " + timestr + "\n")
    fp.write("Essay classes: " + str(essay_df['essay_set'].unique()) + "\n")
    fp.write("Using score column: " + label_column + "\n")
    fp.write("Score distribution: " + "\n" + essay_df[label_column].value_counts().to_string() + "\n")

    fp.write("\n--- Model parameters:\n")
    fp.write(str(model))
    fp.write('\n')

    fp.write("\n--- Hyper parameters:\n")
    for k, v in hyper_parameters.items():
        fp.write(f" {k:<25}: {v}\n")

    fp.write('\n')
    fp.flush()
    return fp

In [20]:
# Start logging to a file
fp = open_log_file(log_folder, essay_df, model, label_column=target_column, hyper_parameters=hp)

try:
    # Start training
    progress_bar = tqdm(range(num_training_steps))

    model.train()
    with accelerator.autocast():
        fp.write("Training logs: \n\n")
        for epoch in range(num_epochs):
            train_loss = train_loop(model, train_loader, val_loader, optimizer, lr_scheduler, progress_bar, fp, logging_step=len(train_loader)//3, use_amp=True)
            with torch.no_grad():
                test_loss, test_results = test_loop(model, test_loader)
                log_string = "Epoch: {:<6}\t Test  loss: {:<6.4f}".format(epoch+1, test_loss)
                qwk_avg = 0
                for essay_id, result in test_results.items():
                    log_string += "\t {:<3} QWK: {:<6.4f}\n".format(essay_id, result['qwk'])
                    qwk_avg += result['qwk']
                qwk_avg /= len(test_results)
                log_string += "\t AVG QWK: {:<6.4f}\n".format(qwk_avg)
                print(log_string)
                fp.write(log_string + "\n")
finally:
    print("Log file closed.")
    fp.close()

  0%|          | 0/14200 [00:00<?, ?it/s]

Step: 0      	 Train loss: 7.0036 	 Validation loss: 11.4458
	 3  QWK: 0.0052
	 4  QWK: -0.0013
	 5  QWK: 0.0017
	 6  QWK: 0.0041

Step: 946    	 Train loss: 1.0700 	 Validation loss: 0.6072
	 3  QWK: 0.4385
	 4  QWK: 0.6558
	 5  QWK: 0.4545
	 6  QWK: 0.2338

Step: 1892   	 Train loss: 0.3221 	 Validation loss: 0.3949
	 3  QWK: 0.4952
	 4  QWK: 0.6872
	 5  QWK: 0.6225
	 6  QWK: 0.5552

Step: 2838   	 Train loss: 0.2802 	 Validation loss: 0.3336
	 3  QWK: 0.5770
	 4  QWK: 0.6994
	 5  QWK: 0.6746
	 6  QWK: 0.6089

Epoch: 1     	 Test  loss: 0.3284	 3   QWK: 0.5845
	 4   QWK: 0.6994
	 5   QWK: 0.6746
	 6   QWK: 0.6427
	 AVG QWK: 0.6503

Step: 0      	 Train loss: 0.3507 	 Validation loss: 0.3273
	 3  QWK: 0.5845
	 4  QWK: 0.6939
	 5  QWK: 0.6746
	 6  QWK: 0.6427

Step: 946    	 Train loss: 0.2432 	 Validation loss: 0.2744
	 3  QWK: 0.5819
	 4  QWK: 0.6986
	 5  QWK: 0.7364
	 6  QWK: 0.6620

Step: 1892   	 Train loss: 0.2538 	 Validation loss: 0.2988
	 3  QWK: 0.5650
	 4  QWK: 0.7016
	 5  Q