In [1]:
!pip install datasets
!python -m nltk.downloader punkt
!pip install evaluate
!pip install sacremoses sacrebleu
!pip install accelerate

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
import pandas as pd
from datasets import Dataset
import pickle
from transformers import AutoConfig
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM
from transformers.utils import PaddingStrategy
import torch
import numpy as np
import torch.nn as nn
import random
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

from evaluate import load
sari = load("sari")


data_location = '/content/drive/MyDrive/UdS/Thesis/Thesis-Project/data/wikilarge/'

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'gpt2'
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
config = AutoConfig.from_pretrained(model_name,
    max_new_tokens=1024
)

In [4]:
class TrainingArguments:
    def __init__(self):
        self.output_dir = "./output/"
        self.evaluation_strategy = "epoch"
        self.batch_size = 32
        self.adam_beta1 = 0.9
        self.adam_beta2 = 0.999
        self.adam_epsilon = 1e-8
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.lr_scheduler_type = "linear"
        self.max_grad_norm = 1.0
        self.max_steps = -1
        self.num_train_epochs = 3
        self.seed = 42
        self.warmup_steps = 0
        self.weight_decay = 0.0
        self.max_sequence_length = 126
        # self.logging_dir = "./logs"
        # self.logging_first_step = False
        # self.logging_steps = 500
        # self.save_steps = 500
        # self.save_total_limit = 1

    def __str__(self):
        print("Training Arguments / Hyperparameters:")
        print("---------------------------------")
        for key, value in self.__dict__.items():
            print(f"| {key}: {value}")
        return "--------------------------------"
training_args = TrainingArguments()

In [5]:
def seed_everything(seed: int):


    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(training_args.seed)

In [6]:
print(training_args)
print(training_args.batch_size)

Training Arguments / Hyperparameters:
---------------------------------
| output_dir: ./output/
| evaluation_strategy: epoch
| batch_size: 32
| adam_beta1: 0.9
| adam_beta2: 0.999
| adam_epsilon: 1e-08
| gradient_accumulation_steps: 1
| learning_rate: 5e-05
| lr_scheduler_type: linear
| max_grad_norm: 1.0
| max_steps: -1
| num_train_epochs: 3
| seed: 42
| warmup_steps: 0
| weight_decay: 0.0
| max_sequence_length: 126
--------------------------------
32


In [7]:
# grade_ratio = pd.read_csv(f'{data_location}grade_ratio_wiki_train.csv')
# # source texts
# with open(f'{data_location}wiki_train.src', 'r', encoding='utf-8') as f:
#     train_src = f.readlines()
# train_src = pd.DataFrame(train_src, columns=['source'])
# # target texts
# with open(f'{data_location}wiki_train.tgt', 'r', encoding='utf-8') as f:
#     train_tgt = f.readlines()
# train_tgt = pd.DataFrame(train_tgt, columns=['target'])
# train_texts = pd.concat([train_src, grade_ratio['abs_src_FKGL_Grade'], train_tgt, grade_ratio['abs_tgt_FKGL_Grade']], axis=1)
# train_texts.rename(columns={'abs_src_FKGL_Grade': 'source_grade', 'abs_tgt_FKGL_Grade': 'target_grade'}, inplace=True)
# train_texts['souce'] = train_texts['source'].replace(r'\n','', regex=True)
# train_texts['target'] = train_texts['target'].replace(r'\n','', regex=True)

In [8]:

data_location = '/content/drive/MyDrive/UdS/Thesis/Thesis-Project/data/wikilarge/'
train_texts = pd.read_pickle(f'{data_location}train_texts.pkl')
train_texts.iloc[0]['source']

'Heinrich Luitpold Himmler (7 October 1900 - 23 May 1945) was Chief of the German Police and Minister of the Interior.\n'

In [9]:
grade_groups = train_texts.groupby(['target_grade'])
grade_groups.get_group(0)

Unnamed: 0,source,source_grade,target,target_grade
5,"Though founded in 1887, under Jack Hyles' lead...",13,Dr. Jack Hyles\n,0
121,"On January 27, 2008, at the NHL All-Star Game ...",7,Records\n,0
130,Gone the times when nations battled for this' ...,8,gone the days when strife and discord.\n,0
152,May 17 & ndash; The conflict between Toyotomi ...,10,Ghent falls to the Spanish.\n,0
172,Some subjects that are discussed have criminal...,11,(see and).\n,0
...,...,...,...,...
216813,"Dubnium (,) is a chemical element with the sym...",10,It has the symbol Db.\n,0
216828,WWE Hell in a Cell is a professional wrestling...,13,Hell\n,0
216833,He died of a heart attack in 1968 and was hono...,8,He died from a heart attack in 1968.\n,0
216839,"In English, the name is sometimes spelled Bela...",10,-)\n,0


In [10]:
# create custom dataset where each grade group is a separate dataset, including source, target, and target grade
datasets = {}
for i, (grade, group) in enumerate(grade_groups):
    datasets[i] = Dataset.from_pandas(group[['source', 'target', 'target_grade']]).train_test_split(test_size=0.2)

datasets[6]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'target_grade', '__index_level_0__'],
        num_rows: 20313
    })
    test: Dataset({
        features: ['source', 'target', 'target_grade', '__index_level_0__'],
        num_rows: 5079
    })
})

In [11]:
datasets[6]['train']

Dataset({
    features: ['source', 'target', 'target_grade', '__index_level_0__'],
    num_rows: 20313
})

In [12]:
# def pre_tokenize_function(examples):
#     return tokenizer(text=examples["source"], text_target=examples['target'], padding=True, max_length=training_args.max_sequence_length, return_tensors="pt")
def tokenize_function(examples):
    return tokenizer(text=examples["source"], text_target=examples['target'], padding=True, max_length=training_args.max_sequence_length, return_tensors="pt")

tokenized_dataset = datasets[6].map(tokenize_function, batched=True, batch_size=training_args.batch_size,
                                      remove_columns=['source', 'target', '__index_level_0__'])



Map:   0%|          | 0/20313 [00:00<?, ? examples/s]

Map:   0%|          | 0/5079 [00:00<?, ? examples/s]

In [13]:
print(len(tokenized_dataset['train'][100]['input_ids']))
print(len(tokenized_dataset['train'][100]['attention_mask']))
print(len(tokenized_dataset['train'][100]['labels']))

63
63
38


In [14]:
def find_max_len(tokenized_dataset):
    longest_source = 0
    source = ''
    longest_target = 0
    target = ''
    for dataset in ['train', 'test']:
        for example in tokenized_dataset[dataset]:
            source_len = len(example['input_ids'])
            target_len = len(example['labels'])
            if source_len > longest_source:
                longest_source = source_len
                source = example['input_ids']
            if target_len > longest_target:
                longest_target = target_len
                target = example['labels']
    return max(longest_source, longest_target)

training_args.max_sequence_length = find_max_len(tokenized_dataset)

In [15]:
# tokenized_dataset = datasets[6].map(tokenize_function, batched=True, batch_size=training_args.batch_size,
#                                       remove_columns=['source', 'target', '__index_level_0__'])


In [16]:
print(len(tokenized_dataset['train'][100]['input_ids']))
print(len(tokenized_dataset['train'][100]['attention_mask']))
print(len(tokenized_dataset['train'][100]['labels']))

63
63
38


In [17]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding="max_length", max_length=training_args.max_sequence_length, label_pad_token_id=tokenizer.pad_token_id)

train_data_loader = torch.utils.data.DataLoader(tokenized_dataset['train'], batch_size=training_args.batch_size, shuffle=True, collate_fn=data_collator)
eval_data_loader = torch.utils.data.DataLoader(tokenized_dataset['test'], batch_size=training_args.batch_size, shuffle=False, collate_fn=data_collator)
dataloaders = {'train': train_data_loader, 'eval': eval_data_loader}

In [18]:
for batch in train_data_loader:
    print(batch['input_ids'].shape)
    print(batch['attention_mask'].shape)
    print(batch['labels'].shape)
    print(batch['target_grade'].shape)
    break

torch.Size([32, 98])
torch.Size([32, 98])
torch.Size([32, 98])
torch.Size([32])


In [19]:
def compute_metrics(prediction):
    source_ids, pred_ids, labels_ids = prediction
    sources = []
    labels = []
    predictions = []

    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    source_str = tokenizer.batch_decode(source_ids, skip_special_tokens=True)
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    sources.append(''.join(source_str))
    labels.append([''.join(label_str)])
    predictions.append(''.join(pred_str))


    return sari.compute(sources=sources, predictions=predictions, references=labels)


In [20]:
compute_metrics([tokenized_dataset['train'][1]['input_ids'], tokenized_dataset['train'][1]['labels'], tokenized_dataset['train'][1]['labels']])

{'sari': 97.43589743589745}

In [21]:
class FineTuneGPT2(nn.Module):
    def __init__(self, model, tokenizer, training_args):
        super(FineTuneGPT2, self).__init__()
        self.model = model


    def forward(self, input_ids, attention_mask, labels=None):
        # print("input shape:", input_ids.shape)
        # print("attn shape:", attention_mask.shape)
        # print("labels shape:", labels.shape)

        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

In [22]:
def train_test(model, dataloader, optimizer, training):
    """
    Performs a single epoch of training, validation, or testing on the given model using the specified DataLoader.
    This function adapts its behavior based on the 'training' parameter to correctly handle the model's state and
    perform necessary operations such as backpropagation and optimizer updates during training.

    Parameters:
        model (torch.nn.Module): The neural network model to be trained, validated, or tested.
        dataloader (DataLoader): A DataLoader providing batches of data (features and labels) for processing.
        optimizer (torch.optim.Optimizer): The optimizer (AdamW) to use for updating model parameters during training.
        pos_weight (torch.Tensor): A tensor specifying the weight for the positive class to handle class imbalance.
        training (str): A string specifying the mode of operation. Must be 'train', 'validation', or 'test'.

    Returns:
        None if training.
        Cumulative loss (float) if validation.
        A tuple (label_list, prediction_list) containing lists of true labels and predicted labels for
        each sample if testing.
    """
    # BCEWithLogitsLoss combines sigmoid with BCELoss for better stability, and handles class imbalance via pos_weight

    if training == "train":
        model.train()
    elif training == "validation":
        model.eval()
    elif training == "test":
        model.eval()
    else:
        raise ValueError("training argument must be either 'train', 'validation' or 'test'")

    cumulative_loss = 0
    input_list = [] # store inputs accross folds for calculating metrics
    prediction_list = [] # store predictions accross folds for calculating accuracy and f1
    label_list = [] # store labels accross folds for calculating accuracy and f1
    first_loop = True

    for sample in tqdm(dataloader): # iterate over batches in the DataLoader


        sample.to(device)
        input, attention_mask, labels = sample["input_ids"], sample["attention_mask"], sample['labels']
        output = model(input, attention_mask, labels) # forward pass
        loss_value = output.loss
        cumulative_loss += loss_value.item()

        if training == "train":
            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()
        labels_temp = []
        labels_temp.append(labels.to('cpu').detach().numpy())
        input_list.extend(input.to('cpu').detach().numpy())
        prediction_list.extend(torch.argmax(output.logits, dim=-1).to('cpu').detach().numpy())
        label_list.extend(labels_temp)
        if first_loop == False:
            break
        first_loop = False


    if training == "train":
        print("cumulative training loss:", cumulative_loss)
        print(compute_metrics((input, torch.argmax(output.logits, dim=-1), labels)))
        return cumulative_loss
    elif training == "validation":
        print("cumulative validation loss:", cumulative_loss)
        print(compute_metrics((input, torch.argmax(output.logits, dim=-1), labels)))
        return cumulative_loss
    elif training == "test":
        return label_list, prediction_list
    else:
        raise ValueError("Ya Done Fuck'd up, son!")


In [26]:
def evaluate(dataloaders, training_args):
    """
    Evaluates neural model's performance on a given task using specified parameters.
    The function preprocesses the data, splits it according to the task, initializes a TuneableModel,
    and trains it. It then evaluates the model on a test set and returns performance metrics.

    The function asserts the task to be one of the predefined tasks and initializes the model based on
    the provided parameters. It supports dynamic pos_weight handling and uses early stopping based on
    validation loss to prevent overfitting.

    Parameters:
        data (pd.DataFrame): The dataset to evaluate the model on.
        parameters (dict): A dictionary containing model hyperparameters and training settings. Expected
            keys include "pos_weight", "batch_size", "alpha", "hidden_size", "dropout", "n_hidden",
            "learning_rate", "beta_1", and "beta_2".
        task (int): An integer indicating the task type. Valid values are 0, 1, 2, and 3, each representing
            a different way of splitting the data for training and testing:
                0 - Known subjects and items with k-fold cross-validation.
                1 - Known subjects and items with leave-one-out cross-validation.
                2 - Held-out subjects, known items.
                3 - Held-out items, known subjects.

    Returns:
        tuple: A tuple containing the accuracy score, F1 score, and confusion matrix of the model evaluated
            on a given test set.
    """

    max_epochs = 1000

    predictions = []
    labels = []
    gpt_new = FineTuneGPT2(model, tokenizer, training_args)
    gpt_new.to(device)
    optimizer = torch.optim.AdamW(gpt_new.parameters(), lr=training_args.learning_rate,
                                  betas=(training_args.adam_beta1, training_args.adam_beta2),
                                  weight_decay=training_args.adam_epsilon)

    train_data_loader = dataloaders['train']
    eval_data_loader = dataloaders['eval']

    max_patience = 2
    last_loss = 1000000
    PATH = f"./models/gpt_new.pt"
    for epoch in range(max_epochs):
        print(f"Epoch {epoch}")
        # training
        train_test(gpt_new, train_data_loader, optimizer, training="train")
        # validation at end of epoch
        with torch.no_grad():
            validation_loss = train_test(gpt_new, eval_data_loader, optimizer, training="validation")

        if validation_loss < last_loss:
            last_loss = validation_loss
            current_patience = 0
        else:
            if current_patience == 0:
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': gpt_new.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': last_loss,
                    }, PATH)
            current_patience += 1
        if current_patience == max_patience:
            break

    # # Testing once patience is reached
    # torch.manual_seed(seed)
    # model = TuneableModel(input_size, parameters["hidden_size"], parameters["dropout"], parameters["n_hidden"])
    # gpt_new.to(device)
    # optimizer = torch.optim.AdamW(model.parameters(), lr=parameters["learning_rate"], betas=(0.99, 0.99), weight_decay=1e-4)
    # checkpoint = torch.load(PATH)
    # model.load_state_dict(checkpoint['model_state_dict'])
    # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # with torch.no_grad():
    #     prediction_list, label_list = train_test(model, test_dataloader, optimizer, training="test")
    # predictions.extend(prediction_list)
    # labels.extend(label_list)

    return #compute_metrics() # insert sari ids

In [27]:
evaluate(dataloaders, training_args)

Epoch 0


  0%|          | 1/635 [00:00<09:15,  1.14it/s]


cumulative training loss: 1.3227291107177734
{'sari': 23.902813697933357}


  1%|          | 1/159 [00:00<00:39,  4.03it/s]


cumulative validation loss: 8.58504867553711
{'sari': 23.338597990278732}
Epoch 1


  0%|          | 1/635 [00:00<08:27,  1.25it/s]


cumulative training loss: 7.851688385009766
{'sari': 21.10417138235956}


  1%|          | 1/159 [00:00<00:39,  4.03it/s]


cumulative validation loss: 2.2476260662078857
{'sari': 24.69212369300005}
Epoch 2


  0%|          | 1/635 [00:00<08:24,  1.26it/s]


cumulative training loss: 1.7905173301696777
{'sari': 23.18923824237813}


  1%|          | 1/159 [00:00<00:39,  3.97it/s]

cumulative validation loss: 9.620508193969727
{'sari': 27.813285737372055}





RuntimeError: Parent directory ./models does not exist.

In [None]:
sari.compute(sources=["the lion"], predictions=['the witch'], references=[['and the wardrobe']])