<a href="https://colab.research.google.com/github/Sansith/gpt2sp/blob/gpt2sp-base/model_training_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Training Script

### Required Libraries

In [1]:
pip install torch pandas===1.5.3 transformers numpy tokenizers koila tensorboard

Collecting koila
  Downloading koila-0.1.1-py3-none-any.whl (18 kB)
Collecting pynvml (from koila)
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pynvml, koila
Successfully installed koila-0.1.1 pynvml-11.5.0


In [2]:
import pdb

In [3]:
# Mount Google Drive at /content/drive so the project folder used by the
# following cells is reachable from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd drive/MyDrive/Year4/FYP/effort-estimation/gpt2sp

/content/drive/MyDrive/Year4/FYP/effort-estimation/gpt2sp


In [6]:
import torch
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
import time
from torch.utils.tensorboard import SummaryWriter
from GPT2SP import GPT2ForSequenceClassification as GPT2SP
from transformers import GPT2ForSequenceClassification as LinearGPT2
from transformers import GPT2Config
import os
from tokenizers import Tokenizer
import torch.nn as nn

### Hyperparameters

In [7]:
# ---------------------------------------------------------------------------
# Training hyperparameters (read as module-level globals by the functions
# defined in the cells below).
# ---------------------------------------------------------------------------
EPOCHS = 20
LEARNING_RATE = 5e-4
SEQUENCE_LEN = 20
# Fraction of the training-set size used as the batch size.
BATCH_SIZE_RATIO = 0.3  # within proj: 0.3 / cross proj: 0.4

# Model / input selection.
TOKENIZER = 'gpt2'  # available: gpt2, wordlevel, sentencepiece, wordpiece
MODEL_NAME = 'gpt2sp'  # available: gpt2sp, gpt2
ADD_DESCRIPTION = False

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Directory holding the per-project CSV files.
DATA_PATH = './sp_dataset/marked_data/'

### Helper Functions and Shared State

In [8]:
# Run-wide mutable state shared by the functions below.

# Set per train/test pair by main().
MODEL = None
# Selects the within-project or cross-project split in data_processing().
WITHIN_PROJECT = None

# When DYNAMIC_BATCH is true, data_processing() derives BATCH_SIZE from the
# training-set size.
DYNAMIC_BATCH = True
BATCH_SIZE = None

# Test metrics collected by train_eval_test().
MAE_RECORDS = []
MDAE_RECORDS = []

# Text report accumulated during a run and written to disk by main().
OUTPUT = '  '

def _build_dataloader(text, labels, sampler_type, preview=False):
    """Tokenize ``text``, pair it with ``labels`` and wrap both in a DataLoader.

    ``text`` and ``labels`` are pandas Series; ``preview`` prints the first
    five encoded sequences as a quick sanity check.
    """
    tokens = tokenization(text.tolist())
    if preview:
        print(tokens['input_ids'][:5])
    seq = torch.tensor(tokens['input_ids'])
    y = torch.tensor(labels.tolist()).type(torch.LongTensor)
    return prepare_dataloader(seq, y, sampler_type=sampler_type)


def data_processing(file_pair):
    """Load, split, tokenize and batch the data for one train/test pair.

    Args:
        file_pair: dict with ``'train'`` and ``'test'`` keys, each a list of
            project names; ``DATA_PATH + name + '.csv'`` is read for each.

    Returns:
        ``(file_pair, train_dataloader, val_dataloader, all_test_dataloader,
        test_file_names)``. With ``WITHIN_PROJECT`` set, the single test
        loader comes from the tail of the training data; otherwise one test
        loader is built per entry of ``file_pair['test']``.

    Side effects:
        When ``DYNAMIC_BATCH`` is true, the global ``BATCH_SIZE`` is set from
        the size of the training split.
    """
    global BATCH_SIZE, BATCH_SIZE_RATIO, DATA_PATH, WITHIN_PROJECT, DYNAMIC_BATCH

    # DataFrame.append is deprecated; gather the frames and concatenate once.
    train_frames = [
        prepare_dataframe(DATA_PATH + train_file_name + '.csv')
        for train_file_name in file_pair['train']
    ]
    if train_frames:
        train_data = pd.concat(train_frames)
    else:
        train_data = pd.DataFrame(columns=['text', 'label'])

    # data split
    if WITHIN_PROJECT:
        (train_text, train_labels, val_text, val_labels,
         test_text, test_labels) = within_project_split(train_data)
    else:
        train_text, train_labels, val_text, val_labels = train_val_split(train_data, 0.6)

    # define batch size dynamically based on training length; never let it
    # drop to 0, which DataLoader rejects.
    if DYNAMIC_BATCH:
        BATCH_SIZE = max(1, int(len(train_text) * BATCH_SIZE_RATIO))

    train_dataloader = _build_dataloader(train_text, train_labels, 'random', preview=True)
    val_dataloader = _build_dataloader(val_text, val_labels, 'sequential')

    # prepare testing datasets
    if WITHIN_PROJECT:
        all_test_dataloader = [_build_dataloader(test_text, test_labels, 'sequential')]
        test_file_names = [file_pair['test'][0]]
        return file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names

    all_test_dataloader = []
    test_file_names = []
    for test_file_name in file_pair['test']:
        test_data = prepare_dataframe(DATA_PATH + test_file_name + '.csv')
        all_test_dataloader.append(
            _build_dataloader(test_data['text'], test_data['label'], 'sequential')
        )
        test_file_names.append(test_file_name)
    print('cross project data processing!')
    return file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names


def train_val_split(data, split_ratio):
    """Split ``data`` by row position into a training head and a validation tail.

    The first ``int(len(data) * split_ratio)`` rows form the training part and
    the remaining rows the validation part; no shuffling is done.

    Returns:
        ``(train_text, train_labels, val_text, val_labels)`` taken from the
        ``'text'`` and ``'label'`` columns.
    """
    print('cross project split!')
    cut = int(len(data) * split_ratio)
    head, tail = slice(None, cut), slice(cut, None)
    text_col, label_col = data['text'], data['label']
    return text_col[head], label_col[head], text_col[tail], label_col[tail]


def tokenization(text_list):
    """Encode ``text_list`` with the tokenizer selected by ``TOKENIZER``.

    Every sequence is truncated or padded to ``SEQUENCE_LEN`` tokens.

    Args:
        text_list: list of strings to encode.

    Returns:
        A mapping whose ``'input_ids'`` entry holds one id list per input
        string. For ``'wordlevel'`` this is a plain dict; for the other
        tokenizers it is whatever ``batch_encode_plus`` returns.

    Raises:
        ValueError: if ``TOKENIZER`` is not one of the supported names.
    """
    global TOKENIZER, SEQUENCE_LEN

    if TOKENIZER == 'wordlevel':
        print('using wordlevel tokenizer!')
        tokenizer = Tokenizer.from_file('all_tokenizers/word_level/wordlevel.json')
        encoded_sentences = {'input_ids': []}
        for sentence in text_list:
            ids = tokenizer.encode(sentence).ids[:SEQUENCE_LEN]
            # Pad with id 3, the pad_token_id main() configures for 'wordlevel'.
            ids.extend([3] * (SEQUENCE_LEN - len(ids)))
            encoded_sentences['input_ids'].append(ids)
        return encoded_sentences

    if TOKENIZER == 'wordpiece':
        # Imported here because the notebook's import cell does not bring
        # BertTokenizer into scope.
        from transformers import BertTokenizer
        print('using wordpiece tokenizer!')
        tokenizer = BertTokenizer('all_tokenizers/word_piece/vocab.txt')
    elif TOKENIZER == 'sentencepiece':
        # Same reason as above for XLNetTokenizer.
        from transformers import XLNetTokenizer
        print('using sentencepiece tokenizer!')
        tokenizer = XLNetTokenizer('all_tokenizers/sentence_piece/spm_tokenizer.model', padding_side='right')
    elif TOKENIZER == 'gpt2':
        print('using pretrained gpt-2 tokenizer')
        tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER)
        # NOTE(review): '[PAD]' is presumably not in the GPT-2 vocabulary and
        # resolves to the unknown-token id, expected to match the
        # pad_token_id=50256 that main() puts in the model config - confirm.
        tokenizer.pad_token = '[PAD]'
    else:
        raise ValueError(
            "Unsupported TOKENIZER {!r}; expected one of: "
            "gpt2, wordlevel, sentencepiece, wordpiece".format(TOKENIZER)
        )
    return tokenizer.batch_encode_plus(text_list, truncation=True, max_length=SEQUENCE_LEN, padding='max_length')


def prepare_dataframe(file_name):
    """Read one project CSV and return a two-column ``text`` / ``label`` frame.

    The CSV must provide ``title`` and ``storypoint`` columns, plus
    ``description`` when ``ADD_DESCRIPTION`` is true. Missing values are
    replaced by a single space so string concatenation never meets NaN.

    Args:
        file_name: path of the CSV file to read.

    Returns:
        DataFrame with ``text`` (the title, optionally followed by
        ``" : "`` and the description) and ``label`` (the story point).
    """
    global ADD_DESCRIPTION

    # some rows have no description, fill blank to avoid Null
    data = pd.read_csv(file_name).fillna(' ')

    if ADD_DESCRIPTION:
        print("### text : title+description")
        text = (data['title'] + " : " + data["description"]).tolist()
    else:
        print("### text : title")
        text = data['title'].tolist()

    # Only preview a sample when there is one; a CSV with no rows must not crash.
    if text:
        print("Input data feed ::: ", text[0])
    return pd.DataFrame(data={'text': text, 'label': data['storypoint']})


def prepare_dataloader(seq, y, sampler_type):
    """Wrap token ids and labels in a DataLoader batched by ``BATCH_SIZE``.

    Args:
        seq: tensor of input ids, one row per sample.
        y: tensor of labels aligned with ``seq``.
        sampler_type: ``'random'`` to shuffle samples or ``'sequential'`` to
            keep their order.

    Returns:
        A ``DataLoader`` over ``TensorDataset(seq, y)``.

    Raises:
        ValueError: if ``sampler_type`` is not a supported name.
    """
    global BATCH_SIZE

    sampler_classes = {'random': RandomSampler, 'sequential': SequentialSampler}
    if sampler_type not in sampler_classes:
        raise ValueError(
            "Unsupported sampler_type {!r}; expected 'random' or 'sequential'".format(sampler_type)
        )

    tensor_dataset = TensorDataset(seq, y)
    sampler = sampler_classes[sampler_type](tensor_dataset)
    print("BATCH_SIZE : ", BATCH_SIZE)
    return DataLoader(tensor_dataset, sampler=sampler, batch_size=BATCH_SIZE)


def within_project_split(data):
    """Cut ``data`` by row position into 60% train, 20% validation, 20% test.

    Rows keep their original order; the boundaries are ``int(len(data) * 0.6)``
    and ``int(len(data) * 0.8)``.

    Returns:
        ``(train_text, train_labels, val_text, val_labels, test_text,
        test_labels)`` taken from the ``'text'`` and ``'label'`` columns.
    """
    print('within project split!')
    n_rows = len(data)
    train_end = int(n_rows * 0.6)
    val_end = int(n_rows * 0.8)

    train_part = slice(None, train_end)
    val_part = slice(train_end, val_end)
    test_part = slice(val_end, None)

    texts, labels = data['text'], data['label']
    return (
        texts[train_part], labels[train_part],
        texts[val_part], labels[val_part],
        texts[test_part], labels[test_part],
    )


def train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, model, test_file_names):
    global LEARNING_RATE, EPOCHS, MAE_RECORDS, MDAE_RECORDS, DEVICE

    # Optimizerrr -->
    optimizer = AdamW(MODEL.parameters(), lr=LEARNING_RATE)
    # Total number of training steps is [number of batches] x [number of epochs]
    total_steps = len(train_dataloader) * EPOCHS
    # Create the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    print("Start training for ", file_pair, ".....")
    training_start_time = time.time()

    # tensorboard writer
    writer_path = 'tb/' + str(file_pair['train'][0]) + '_' + str(file_pair['test'][0])
    writer = SummaryWriter(writer_path)

    # vars for model selection
    min_eval_loss_epoch = [10000, 0]

    time_records = []
    MAE_RECORDS = []
    MDAE_RECORDS = []
    start_time = time.time()
    loss_fct = nn.L1Loss()
    for e in range(EPOCHS):
        # ---TRAINING---
        # clean GPU memory
        torch.cuda.empty_cache()
        print(">>> epoch ", e)
        # set model into train mode
        model.train()
        total_train_loss = 0
        for step, batch in enumerate(train_dataloader):
            # pdb.set_trace()
            b_input_ids = batch[0].to(DEVICE)
            b_labels = batch[1].to(DEVICE)
            model.zero_grad()
            result = model(b_input_ids,
                           labels=b_labels,
                           return_dict=True)
            loss = result.loss
            logits = result.logits
            total_train_loss += loss.item()
            # Calculates the gradients
            loss.backward()
            # The clip_grad_norm_ function clips (limits) the norm (magnitude) of the gradients to a maximum value specified by the user.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            #updates the weights and bias accrding to the calculated gradients
            optimizer.step()
            # update learning rates
            scheduler.step()
            # clean memory
            del step, batch, b_input_ids, b_labels, result, loss, logits

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(" Average training MAE loss: {0:.2f}".format(avg_train_loss))
        writer.add_scalar('loss/train', avg_train_loss, e)
        # clean memory
        del avg_train_loss, total_train_loss

        time_records.append(time.time() - start_time)

        # ---EVAL---
        print("-")
        # set model into eval mode
        model.eval()
        total_eval_loss = 0
        with torch.no_grad():
            for batch in val_dataloader:
                b_input_ids = batch[0].to(DEVICE)
                b_labels = batch[1].to(DEVICE)
                model.zero_grad()
                result = model(b_input_ids,
                            labels=b_labels,
                            return_dict=True)
                loss = result.loss
                logits = result.logits
                total_eval_loss += loss.item()
                # clean memory
                del b_input_ids, b_labels, batch, result, loss, logits
        avg_eval_loss = total_eval_loss / len(val_dataloader)
        print(" Average eval MAE loss: {0:.2f}".format(avg_eval_loss))

        if avg_eval_loss <= min_eval_loss_epoch[0]:
            min_eval_loss_epoch[0] = avg_eval_loss
            min_eval_loss_epoch[1] = e

        writer.add_scalar('loss/eval', avg_eval_loss, e)
        # clean memory
        del avg_eval_loss, total_eval_loss
        # save model state to dict
        torch.save(model.state_dict(), './models/' + 'epo_' + str(e))

        print("===============================")

        # testing on holdout data
        index = 0
        for test_dataloader in all_test_dataloader:
            test_file_name = test_file_names[index]
            index += 1
            testing_start_time = time.time()
            predictions = []
            true_labels = []
            for batch in test_dataloader:
                batch = tuple(t.to(DEVICE) for t in batch)
                b_input_ids, b_labels = batch
                with torch.no_grad():
                    logits = model(b_input_ids)
                logits = logits['logits'].detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
                predictions.append(logits)
                true_labels.append(label_ids)
            # calculate errors
            distance_records = []
            for i in range(len(predictions)):
                for j in range(len(predictions[i])):
                    distance = abs(predictions[i][j] - true_labels[i][j])
                    distance_records.append(distance)

            ## MAE = mean value of all absolute errors (stored in distance_records)
            MAE = np.mean(np.array(distance_records))
            ## MdAE = median value of all absolute errors (stored in distance_records)
            MdAE = np.median(np.array(distance_records))

            MAE_RECORDS.append(MAE)
            MDAE_RECORDS.append(MdAE)

            global OUTPUT
            print("Testing model")
            OUTPUT += 'Testing### '+ '\n'
            OUTPUT +=  'Epochs ' + str(e) + '\n'
            OUTPUT += 'MAE: ' + str(MAE) + '\n'
            OUTPUT += 'MdAE: ' + str(MdAE) + '\n\n'
            print('MAE: ', MAE)
            print('MdAE: ', MdAE)
    writer.flush()
    writer.close()

    # select model
    os.rename('models/epo_' + str(min_eval_loss_epoch[1]),
              'models/' + str(file_pair['train'][0]) + '_'
              + str(file_pair['test'][0]) + '_epo_' + str(min_eval_loss_epoch[1]))

    # del unwanted models
    for i in range(20):
        try:
            os.remove("models/epo_" + str(i))
        except:
            continue
    OUTPUT += 'Epoch train summary---------- '
    OUTPUT += 'Minimum loss value : ' + str(min_eval_loss_epoch[0]) + '\n'
    OUTPUT += 'Minimum loss epoch : ' + str(min_eval_loss_epoch[1]) +'\n'
    OUTPUT += 'Testing MAE: ' + str(MAE_RECORDS[min_eval_loss_epoch[1]]) \
                + 'Testing MdAE: ' + str(MDAE_RECORDS[min_eval_loss_epoch[1]]) + '\n'
    OUTPUT += 'training time: ' + str(time_records[min_eval_loss_epoch[1]]) + '\n'
    global BATCH_SIZE
    OUTPUT += 'batch size: ' + str(BATCH_SIZE) + '\n'
    global ADD_DESCRIPTION
    OUTPUT += 'Description added : ' + str(ADD_DESCRIPTION) + '\n'


    print('all done for one project')
    return MAE_RECORDS[min_eval_loss_epoch[1]] , min_eval_loss_epoch[0]

### Within Project Training Script

In [9]:
# Clean GPU memory left over from earlier cells before starting a new run.
torch.cuda.empty_cache()

In [10]:
# Within-project mode: each project is split into train / validation / test.
WITHIN_PROJECT = True

# Every listed project is both the training and the test project.
# Uncomment a name to include it in the run.
TRAIN_TEST_FILE_PAIRS = [
    {'train': [project], 'test': [project]}
    for project in [
        'appceleratorstudio',
        # 'aptanastudio',
        # 'bamboo',
        # 'clover',
        # # 'datamanagement',
        # 'duracloud',
        # 'jirasoftware',
        # 'mesos',
        # 'moodle',
        # 'mule',
        # 'mulestudio',
        # 'springxd',
        # 'talenddataquality',
        # 'talendesb',
        # 'titanium',
        # 'usergrid',
    ]
]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME
    for file in TRAIN_TEST_FILE_PAIRS:
        if TOKENIZER == 'bbpe':
            config = GPT2Config(num_labels=1, pad_token_id=50257)
        elif TOKENIZER == 'gpt2':
            config = GPT2Config(num_labels=1, pad_token_id=50256)
        elif TOKENIZER == 'wordpiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'sentencepiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'wordlevel':
            config = GPT2Config(num_labels=1, pad_token_id=3)
        if MODEL_NAME == 'gpt2':
            MODEL = LinearGPT2.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        elif MODEL_NAME == 'gpt2sp':
            MODEL = GPT2SP.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        TRAIN_MAE_VALUE ,min_loss_value =train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        
        print('MAE value for the project : ',TRAIN_MAE_VALUE)
        print('Minimum loss value for the project : ',min_loss_value)

        del MODEL
        torch.cuda.empty_cache()
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""


if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/lib/python3.10/bdb.py", line 336, in set_trace
    sys.settrace(self.trace_dispatch)



> [0;32m/content/drive/MyDrive/Year4/FYP/effort-estimation/gpt2sp/GPT2SP.py[0m(16)[0;36m__init__[0;34m()[0m
[0;32m     14 [0;31m        [0mself[0m[0;34m.[0m[0mtransformer[0m [0;34m=[0m [0mGPT2Model[0m[0;34m([0m[0mconfig[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     15 [0;31m        [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 16 [0;31m        [0mself[0m[0;34m.[0m[0mdense1[0m [0;34m=[0m [0mnn[0m[0;34m.[0m[0mLinear[0m[0;34m([0m[0mconfig[0m[0;34m.[0m[0mn_embd[0m[0;34m,[0m [0;36m4[0m [0;34m*[0m [0mconfig[0m[0;34m.[0m[0mn_embd[0m[0;34m,[0m [0mbias[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     17 [0;31m        [0mself[0m[0;34m.[0m[0mdense2[0m [0;34m=[0m [0mnn[0m[0;34m.[0m[0mLinear[0m[0;34m([0m[0;36m4[0m [0;34m*[0m [0mconfig[0m[0;34m.[0m[0mn_embd[0m[0;34m,[0m [0mconfig[0m[0;34m.[0m[0mn_emb


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/lib/python3.10/bdb.py", line 347, in set_continue
    sys.settrace(None)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['dense1.weight', 'dense2.weight', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

### Cross Project Training Script - Within Repository

In [None]:
# Cross-project mode: train on one project, test on a different one.
WITHIN_PROJECT = False

# within repo: (training project, test project)
TRAIN_TEST_FILE_PAIRS = [
    {'train': [source], 'test': [target]}
    for source, target in [
        ('mesos', 'usergrid'),
        ('usergrid', 'mesos'),
        ('appceleratorstudio', 'aptanastudio'),
        ('appceleratorstudio', 'titanium'),
        ('titanium', 'appceleratorstudio'),
        ('aptanastudio', 'titanium'),
        ('mule', 'mulestudio'),
        ('mulestudio', 'mule'),
    ]
]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME
    for file in TRAIN_TEST_FILE_PAIRS:
        if TOKENIZER == 'bbpe':
            config = GPT2Config(num_labels=1, pad_token_id=50257)
        elif TOKENIZER == 'gpt2':
            config = GPT2Config(num_labels=1, pad_token_id=50256)
        elif TOKENIZER == 'wordpiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'sentencepiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'wordlevel':
            config = GPT2Config(num_labels=1, pad_token_id=3)
        if MODEL_NAME == 'gpt2':
            MODEL = LinearGPT2.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        elif MODEL_NAME == 'gpt2sp':
            MODEL = GPT2SP.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        del MODEL
        torch.cuda.empty_cache()
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""


if __name__ == "__main__":
    main()

### Cross Project Training Script - Cross Repository

In [None]:
# Cross-project mode: train on one project, test on a different one.
WITHIN_PROJECT = False

# cross repo: (training project, test project)
TRAIN_TEST_FILE_PAIRS = [
    {'train': [source], 'test': [target]}
    for source, target in [
        ('clover', 'usergrid'),
        ('talendesb', 'mesos'),
        ('talenddataquality', 'aptanastudio'),
        ('mule', 'titanium'),
        ('talenddataquality', 'appceleratorstudio'),
        ('mulestudio', 'titanium'),
        ('appceleratorstudio', 'mulestudio'),
        ('appceleratorstudio', 'mule'),
    ]
]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME
    for file in TRAIN_TEST_FILE_PAIRS:
        if TOKENIZER == 'gpt2':
            config = GPT2Config(num_labels=1, pad_token_id=50256)
        elif TOKENIZER == 'wordpiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'sentencepiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'wordlevel':
            config = GPT2Config(num_labels=1, pad_token_id=3)
        if MODEL_NAME == 'gpt2':
            MODEL = LinearGPT2.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        elif MODEL_NAME == 'gpt2sp':
            MODEL = GPT2SP.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        del MODEL
        torch.cuda.empty_cache()
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""


if __name__ == "__main__":
    main()