In [None]:
import logging
import pandas as pd
import random
import torch 

from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import TrainingArguments, Trainer

from functions import *

In [None]:
# Probe CUDA availability once; `cuda_avbl` is handed to get_model further down.
cuda_avbl, device = test_cuda_avbl()

# Checkpoint name to fine-tune, one of {gpt2, gpt2-medium, gpt2-large, gpt2-xl}.
MODEL = 'gpt2'

# Special tokens handed to the tokenizer/model helpers and used to frame each sample.
SPECIAL_TOKENS = dict(
    bos_token="<|BOS|>",  # beginning of a sequence
    eos_token="<|EOS|>",  # end of a sequence
    unk_token="<|UNK|>",  # stands in for unknown tokens
    pad_token="<|PAD|>",  # fills up short sequences
    sep_token="<|SEP|>",  # separates sentences
)

# Maximum number of tokens per encoded sample (used for truncation and padding).
MAX_LENGTH = 1024

# Share of the samples that goes into the training split.
TRAIN_SIZE = 0.8

In [None]:
class BasicData():
    '''
    This is a class for loading and extracting the basic data.

    On construction the JSON file is read, filtered and additionally
    converted into a dictionary.

    Attributes:
        data_path (str): Path of the json training data.
        read_lines (bool): Should the file be read as a lined object (one JSON object per line)?
        join_lists (bool): Is it necessary to join multiple strings to one source/target?
        source_min_max (list): Exclusive minimum and maximum string length of the source.
        target_min_max (list): Exclusive minimum and maximum string length of the target.
        df (DataFrame): Filtered training data (columns paper_id, source, target).
        dic (dict): Maps "<paper_id>_<counter>" to [source, target].
    '''

    def __init__(self, data_path, read_lines=False, join_lists=False, source_min_max=(0, 100000), target_min_max=(20, 250)):
        '''
        Constructor for BasicData class.

        Parameters:
            data_path (str): Path of the json training data.
            read_lines (bool): Should the file be read as a lined object?
            join_lists (bool): Is it necessary to join multiple strings to one source?
            source_min_max (sequence): Exclusive minimum and maximum string length of the source.
            target_min_max (sequence): Exclusive minimum and maximum string length of the target.
        '''

        self.data_path      = data_path
        self.read_lines     = read_lines
        self.join_lists     = join_lists
        # The defaults are immutable tuples; a private list copy is stored so
        # the bounds are never shared between instances or with the caller.
        self.source_min_max = list(source_min_max)
        self.target_min_max = list(target_min_max)

        self.df  = self.get_data()
        self.dic = self.get_dict()

        logging.info("BasicData instantiated")


    def get_data(self):
        '''
        Reads json file and converts it to a filtered pandas data frame.

        Returns:
            df (DataFrame): Data Frame with basic training data.
        '''

        df = pd.read_json(self.data_path, lines=self.read_lines)
        logging.info("Data readed")

        return self.filter_data(df)


    def filter_data(self, data):
        '''
        Filters Data Frame for columns and string length.

        Keeps only the columns paper_id, source and target, drops rows with
        missing values, optionally joins list entries into single strings and
        keeps the rows whose source/target string lengths lie strictly between
        the configured bounds. The paper_id column is converted to str.

        Parameters:
            data (DataFrame): Basic training Data Frame. It is not modified.

        Returns:
            filtered_data (DataFrame): Filtered training data.
        '''

        # Work on an explicit copy so the column assignments below never write
        # into a slice of `data` (avoids SettingWithCopyWarning).
        filtered_data = data[["paper_id", "source", "target"]].dropna().copy()

        if self.join_lists:
            for column in ("source", "target"):
                filtered_data[column] = filtered_data[column].apply(lambda x: ' '.join(map(str, x)))

        # Both bounds are exclusive; each length series is computed only once.
        source_len = filtered_data["source"].astype(str).str.len()
        filtered_data = filtered_data[(source_len > self.source_min_max[0]) & (source_len < self.source_min_max[1])]

        target_len = filtered_data["target"].astype(str).str.len()
        filtered_data = filtered_data[(target_len > self.target_min_max[0]) & (target_len < self.target_min_max[1])].copy()

        filtered_data["paper_id"] = filtered_data["paper_id"].astype(str)

        logging.info("Data filtered")

        return filtered_data


    def get_dict(self):
        '''
        Converts training data from Data Frame to dictionary.

        Returns:
            d (dict): Dictionary mapping "<paper_id>_<counter>" to [source, target].
        '''

        d = dict()
        rows = zip(self.df["paper_id"], self.df["source"], self.df["target"])
        # The counter suffix of the first row is 2; this numbering is kept on
        # purpose so the generated keys stay the same as before.
        for i, (paper_id, source, target) in enumerate(rows, start=2):
            d[paper_id + "_" + str(i)] = [source, target]

        logging.info("Dictionary created")

        return d


    def get_train_test(self, split):
        '''
        Splits the dictionary randomly into training and test data.

        Parameters:
            split (float): Share of the entries that goes into the training data, e.g. 0.8.

        Returns:
            train_data (dict): Dictionary with data for training.
            test_data (dict): Dictionary with data for testing.
        '''

        ids = list(self.dic)
        random.shuffle(ids)

        # int() truncates, so the training part is rounded down.
        train_size = int(split * len(ids))

        train_data = {key: self.dic[key] for key in ids[:train_size]}
        test_data = {key: self.dic[key] for key in ids[train_size:]}

        logging.info("Data splitted into test and training")

        return train_data, test_data

In [None]:
class GPTDataset(Dataset):
    '''
    This is a class for encoding data to train nlp model.

    Every sample is built as
    "<bos>" + source text + "TL;DR:" + summary + "<eos>"
    and encoded with the tokenizer when the item is requested.

    Attributes:
        tokenizer (tokenizer class): tokenizer for preparing inputs for model.
        text (list): Source texts, stripped and with newlines replaced by spaces.
        smry (list): Summaries belonging to the source texts.
        max_length (int): Length the encodings are truncated/padded to.
        special_tokens (dict): Mapping providing "bos_token" and "eos_token".
    '''

    def __init__(self, data, tokenizer, max_length=None, special_tokens=None):
        '''
        Constructor for GPTDataset class.

        Parameters:
            data (dict): Dictionary of basic data; every value holds the source
                text at index 0 and the summary at index 1.
            tokenizer (tokenizer class): tokenizer for preparing inputs for model.
            max_length (int): Optional length for truncation/padding; defaults
                to the module constant MAX_LENGTH.
            special_tokens (dict): Optional mapping with "bos_token" and
                "eos_token"; defaults to the module constant SPECIAL_TOKENS.
        '''

        entries = list(data.values())

        self.tokenizer = tokenizer
        self.text      = [entry[0].strip().replace("\n", " ") for entry in entries]
        self.smry      = [entry[1] for entry in entries]

        # The module constants are only looked up when no explicit value is given.
        self.max_length     = MAX_LENGTH if max_length is None else max_length
        self.special_tokens = SPECIAL_TOKENS if special_tokens is None else special_tokens

        logging.info("GPTDataset instantiated")


    def __len__(self):
        '''
        Returns the number of samples.
        '''
        return len(self.text)


    def __getitem__(self, i):
        '''
        Creates encoded input for model training.

        Parameters:
            i (int): Index of the sample.

        Returns:
            dict with
                label (torch.Tensor): Copy of the input ids.
                input_ids (torch.Tensor): Numerical representation of input tokens.
                attention_mask (torch.Tensor): Information which tokens should be attended to.
        '''

        prompt = self.special_tokens["bos_token"] + self.text[i] + \
            "TL;DR:" + self.smry[i] + self.special_tokens["eos_token"]

        # Use the tokenizer given to this instance, not a module-level global.
        encodings_dict = self.tokenizer(prompt,
                                        truncation=True,
                                        max_length=self.max_length,
                                        padding="max_length")

        input_ids = encodings_dict["input_ids"]
        attention_mask = encodings_dict["attention_mask"]

        logging.info("Encoded inputs created")

        # NOTE(review): the key is "label" (singular); presumably the Trainer's
        # data collator renames it to "labels" - confirm before changing it.
        return {"label": torch.tensor(input_ids),
                "input_ids": torch.tensor(input_ids),
                "attention_mask": torch.tensor(attention_mask)}

In [None]:
# Build tokenizer and model for the chosen checkpoint; both helpers receive the
# same special-token mapping (helpers are not defined in this file, presumably
# they come from the `functions` star import).
tokenizer = get_tokenizer(
    MODEL,
    special_tokens=SPECIAL_TOKENS,
)
model = get_model(
    MODEL,
    cuda_avbl,
    tokenizer,
    special_tokens=SPECIAL_TOKENS,
)

In [None]:
# Bounds handed to freeze_layers; how they are interpreted (inclusive or
# exclusive, layer numbering) is defined by that helper, which is not visible here.
UNFREEZE_START, UNFREEZE_STOP = 12, 12

freeze_layers(
    model,
    UNFREEZE_START,
    UNFREEZE_STOP,
)

In [None]:
# NOTE(review): the data path is an empty string here - point it at the JSON
# lines file before running this cell.
example_data = BasicData(
    "",
    read_lines=True,
    join_lists=True,
    source_min_max=[10000, 70000],
    target_min_max=[800, 3000],
)

# Random split; TRAIN_SIZE is the share of samples used for training.
train_data, val_data = example_data.get_train_test(TRAIN_SIZE)

# Wrap both parts in datasets that encode their samples with the shared tokenizer.
train_dataset, val_dataset = (
    GPTDataset(subset, tokenizer) for subset in (train_data, val_data)
)

In [None]:
# Training hyper-parameters.
EPOCHS          = 5
TRAIN_BATCHSIZE = 1
BATCH_UPDATE    = 1
STRATEGY        = "epoch"
WS              = 100   # warmup_steps is a step count, so an int instead of the float 1e2 (alternative noted originally: 0)
LR              = 3e-4  # alternative noted originally: 5e-5
EPS             = 1e-8
WD              = 0.01


training_args = TrainingArguments(
    output_dir="model_test",
    num_train_epochs=EPOCHS,                                                                                        # number of training epochs
    per_device_train_batch_size=TRAIN_BATCHSIZE,                                                                    # batch size per GPU/CPU core for training
    per_device_eval_batch_size=TRAIN_BATCHSIZE,                                                                     # batch size per GPU/CPU core for evaluation
    gradient_accumulation_steps=BATCH_UPDATE,                                                                       # number of steps to accumulate the gradients
    evaluation_strategy=STRATEGY,                                                                                   # when model should be evaluated
    warmup_steps=WS,                                                                                                # steps from 0 to learning rate
    learning_rate=LR,                                                                                               # step size at each iteration
    optim="adamw_torch",                                                                                            # optimizer
    adam_epsilon=EPS,                                                                                               # threshold for adaptive learning rates against zero division problems
    weight_decay=WD,                                                                                                # regularization parameter to shrink model weights
    disable_tqdm=False,                                                                                             # ensure the display of the progress bar while training
    save_strategy=STRATEGY,                                                                                         # when model should be saved (same schedule as evaluation)
    save_total_limit=1,                                                                                             # maximum number of saved models
    load_best_model_at_end=True,                                                                                    # reload the best checkpoint after training
    report_to="none"                                                                                                # no external experiment tracker
)


# define trainer with model, arguments, data, tokenizer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)


# Fine-tune and write the resulting model to output_dir.
trainer.train()
trainer.save_model()

logging.info("Model trained and saved")