<a href="https://colab.research.google.com/github/Sansith/gpt2sp/blob/ensemble-gpt2/model_training_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Training Script

### Necessary Library

In [1]:
pip install torch pandas===1.5.3 transformers numpy tokenizers koila tensorboard

Collecting koila
  Downloading koila-0.1.1-py3-none-any.whl (18 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x

In [1]:
import pdb

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd drive/MyDrive/Year4/FYP/effort-estimation/gpt2sp

/content/drive/MyDrive/Year4/FYP/effort-estimation/gpt2sp


In [4]:
ls

[0m[01;34mabe0[0m/                               [01;34mmodels[0m/
[01;34mall_tokenizers[0m/                     model_training_notebook_bigBird.ipynb
base_model_training_notebook.ipynb  model_training_notebook.ipynb
BigBird.py                          model_training_notebook_new_impl.ipynb
[01;34mcorpus_tokenization_comparison[0m/     [01;34m__pycache__[0m/
[01;34mcustom_transformers_interpret[0m/      README.md
[01;34mdata_model_analysis[0m/                [01;34mresults[0m/
GPT2SPEN.py                         [01;34msp_dataset[0m/
GPT2SP_inspection_notebook.ipynb    [01;34mtb[0m/
GPT2SP.py                           tokenizer_training_notebook.ipynb
LICENSE                             vocab_and_tokenization_comparison.ipynb
[01;34mlogo[0m/                               [01;34mxai_tokens[0m/


In [5]:
import torch
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
import time
from torch.utils.tensorboard import SummaryWriter
from GPT2SP import GPT2ForSequenceClassification as GPT2SP
from GPT2SPEN import GPT2ForSequenceClassification as GPT2SPEN
from transformers import GPT2ForSequenceClassification as LinearGPT2
from transformers import GPT2Config
import os
from tokenizers import Tokenizer
import torch.nn as nn

### Hyperparameters

In [6]:
global EPOCHS, BATCH_SIZE_RATIO, SEQUENCE_LEN, LEARNING_RATE, TOKENIZER, MODEL_NAME , ADD_DESCRIPTION

EPOCHS = 20
BATCH_SIZE_RATIO = 0.3 # within proj: 0.3 / cross proj: 0.4
SEQUENCE_LEN = 20
LEARNING_RATE = 5e-4
TOKENIZER = 'gpt2' # available: gpt2, wordlevel, sentencepiece, wordpiece
MODEL_NAME = 'gpt2spen' # available:gpt2spen , gpt2sp, gpt2
ADD_DESCRIPTION = False

# define device
global DEVICE
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# define files to be used
global DATA_PATH
DATA_PATH = './sp_dataset/marked_data/'

### Static Methods and Variables

In [7]:
OUTPUT = '  '
MODEL = None
DYNAMIC_BATCH = True
BATCH_SIZE = None
WITHIN_PROJECT = None
MAE_RECORDS = []
MDAE_RECORDS = []

def data_processing(file_pair):
    global BATCH_SIZE, BATCH_SIZE_RATIO, DATA_PATH, WITHIN_PROJECT, DYNAMIC_BATCH

    train_data = pd.DataFrame(columns=['title', 'description',"label"])
    for train_file_name in file_pair['train']:
        fname = DATA_PATH + train_file_name + '.csv'
        df = prepare_dataframe(fname)
        train_data = train_data.append(df)

    # data split
    if WITHIN_PROJECT:
        train_title,train_description,train_labels,  val_title,val_description,val_labels,  test_title,test_description,test_labels = within_project_split(train_data)
    else:
        train_title,train_description , train_labels, val_title,val_description , val_labels = train_val_split(train_data, 0.6)
    # define batch size dynamically based on training length
    if DYNAMIC_BATCH:
        BATCH_SIZE = int(len(train_title) * BATCH_SIZE_RATIO)
    # tokenization
    title_tokens_train = tokenization(train_title.tolist())
    description_tokens_train = tokenization(train_description.tolist())

    title_tokens_val = tokenization(val_title.tolist())
    description_tokens_val = tokenization(val_description.tolist())


    train_seq_titles = torch.tensor(title_tokens_train['input_ids'])
    train_seq_descriptions = torch.tensor(description_tokens_train['input_ids'])
    train_y = torch.tensor(train_labels.tolist()).type(torch.LongTensor)

    train_dataloader = prepare_dataloader(train_seq_titles,train_seq_descriptions ,  train_y, sampler_type='random')


    val_seq_titles = torch.tensor(title_tokens_val['input_ids'])
    val_seq__descriptions = torch.tensor(description_tokens_val['input_ids'])
    val_y = torch.tensor(val_labels.tolist()).type(torch.LongTensor)

    val_dataloader = prepare_dataloader(val_seq_titles,val_seq__descriptions ,val_y, sampler_type='sequential')

    # prepare testing datasets
    all_test_dataloader = []
    test_file_names = []
    if WITHIN_PROJECT:
        tokens_test_title = tokenization(test_title.tolist())
        tokens_test_description = tokenization(test_description.tolist())

        test_seq_title = torch.tensor(tokens_test_title['input_ids'])
        test_seq_description = torch.tensor(tokens_test_description['input_ids'])
        test_y = torch.tensor(test_labels.tolist()).type(torch.LongTensor)

        test_dataloader = prepare_dataloader(test_seq_title,test_seq_description, test_y, sampler_type='sequential')
        all_test_dataloader.append(test_dataloader)
        test_file_names.append(file_pair['test'][0])
        return file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names

    for test_file_name in file_pair['test']:
        fname = DATA_PATH + test_file_name + '.csv'
        test_data = prepare_dataframe(fname)

        test_title = test_data['title']
        test_description = test_data['description']
        test_labels = test_data['label']

        # tokenization
        tokens_test_title = tokenization(test_title.tolist())
        tokens_test_description = tokenization(test_description.tolist())
        test_seq_title = torch.tensor(tokens_test_title['input_ids'])
        test_seq_description = torch.tensor(tokens_test_description['input_ids'])

        test_y = torch.tensor(test_labels.tolist()).type(torch.LongTensor)
        test_dataloader = prepare_dataloader(test_seq_title,test_seq_description, test_y, sampler_type='sequential')

        all_test_dataloader.append(test_dataloader)
        test_file_names.append(test_file_name)
    print('cross project data processing!')
    return file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names


def train_val_split(data, split_ratio):
    print('cross project split!')
    split_point = int(len(data) * split_ratio)

    train_title = data['title'][:split_point]
    train_description = data['description'][:split_point]
    train_labels = data['label'][:split_point]

    val_title = data['title'][split_point:]
    val_description = data['description'][split_point:]
    val_labels = data['label'][split_point:]
    return train_title,train_description , train_labels, val_title,val_description , val_labels


def tokenization(text_list):
    global TOKENIZER, SEQUENCE_LEN, MODEL
    # tokenization
    if TOKENIZER == 'wordpiece':
        print('using wordpiece tokenizer!')
        tokenizer = BertTokenizer('all_tokenizers/word_piece/vocab.txt')
    elif TOKENIZER == 'sentencepiece':
        print('using sentencepiece tokenizer!')
        tokenizer = XLNetTokenizer('all_tokenizers/sentence_piece/spm_tokenizer.model', padding_side='right')
    elif TOKENIZER == 'wordlevel':
        print('using wordlevel tokenizer!')
        tokenizer = Tokenizer.from_file('all_tokenizers/word_level/wordlevel.json')
        encoded_sentences = {'input_ids':[]}
        for sentence in text_list:
            encoded = tokenizer.encode(sentence)
            encoded = encoded.ids
            if len(encoded) > SEQUENCE_LEN:
                encoded = encoded[:SEQUENCE_LEN]
            elif len(encoded) < SEQUENCE_LEN:
                padding = SEQUENCE_LEN - len(encoded)
                for _ in range(padding):
                    encoded.append(3)
            encoded_sentences['input_ids'].append(encoded)
        return encoded_sentences
    elif TOKENIZER == 'gpt2':
        print('using pretrained gpt-2 tokenizer')
        tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER)
        tokenizer.pad_token = '[PAD]'
    return tokenizer.batch_encode_plus(text_list, truncation=True, max_length=SEQUENCE_LEN, padding='max_length')


def prepare_dataframe(file_name):
    data = pd.read_csv(file_name)
    # some rows have no description, fill blank to avoid Null
    data = data.fillna(' ')


    # if ADD_DESCRIPTION :
    #   print("### text : title+description")
    #   d = {'text': (data['title'] + " : " + data["description"]).tolist(), 'label': data['storypoint']}
    # else:
    #   print("### text : title")
    #   d = {'text': (data['title']).tolist(), 'label': data['storypoint']}

    d = { 'title':(data['title']).tolist(), 'description': (data["description"]).tolist() , 'label': data['storypoint'] }
    print("Input data feed ::: ",d.keys())
    return pd.DataFrame(data=d)


def prepare_dataloader(seq_title,seq_description, y, sampler_type):
    global BATCH_SIZE
    tensor_dataset = TensorDataset(seq_title,seq_description, y)
    if sampler_type == 'random':
        sampler = RandomSampler(tensor_dataset)
    elif sampler_type == 'sequential':
        sampler = SequentialSampler(tensor_dataset)
    print("BATCH_SIZE : ",BATCH_SIZE)
    dataloader = DataLoader(tensor_dataset, sampler=sampler, batch_size=BATCH_SIZE)
    return dataloader


def within_project_split(data):
    print('within project split!')

    train_val_split_point = int(len(data) * 0.6)
    val_test_split_point = int(len(data) * 0.8)

    train_title = data['title'][:train_val_split_point]
    train_description = data['description'][:train_val_split_point]
    train_labels = data['label'][:train_val_split_point]

    val_title = data['title'][train_val_split_point:val_test_split_point]
    val_description = data['description'][train_val_split_point:val_test_split_point]
    val_labels = data['label'][train_val_split_point:val_test_split_point]

    test_title = data['title'][val_test_split_point:]
    test_description = data['description'][val_test_split_point:]
    test_labels = data['label'][val_test_split_point:]

    return train_title,train_description,train_labels,  val_title,val_description,val_labels,  test_title,test_description,test_labels


def train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, model, test_file_names):
    global LEARNING_RATE, EPOCHS, MAE_RECORDS, MDAE_RECORDS, DEVICE

    # Optimizerrr -->
    optimizer = AdamW(MODEL.parameters(), lr=LEARNING_RATE)
    # Total number of training steps is [number of batches] x [number of epochs]
    total_steps = len(train_dataloader) * EPOCHS
    # Create the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    print("Start training for ", file_pair, ".....")
    training_start_time = time.time()

    # tensorboard writer
    writer_path = 'tb/' + str(file_pair['train'][0]) + '_' + str(file_pair['test'][0])
    writer = SummaryWriter(writer_path)

    # vars for model selection
    min_eval_loss_epoch = [10000, 0]

    time_records = []
    MAE_RECORDS = []
    MDAE_RECORDS = []
    start_time = time.time()
    loss_fct = nn.L1Loss()
    for e in range(EPOCHS):
        # ---TRAINING---
        # clean GPU memory
        torch.cuda.empty_cache()
        print(">>> epoch ", e)
        # set model into train mode
        model.train()
        total_train_loss = 0

        for step, batch in enumerate(train_dataloader):
            # pdb.set_trace()
            b_input_ids_title = batch[0].to(DEVICE)
            b_input_ids_description = batch[1].to(DEVICE)
            b_labels = batch[2].to(DEVICE)

            model.zero_grad()

            result = model(input_ids_title=b_input_ids_title,
                           input_ids_description=b_input_ids_description,
                           labels=b_labels,
                           return_dict=True)
            loss = result.loss
            logits = result.logits
            total_train_loss += loss.item()
            # Calculates the gradients
            loss.backward()
            # The clip_grad_norm_ function clips (limits) the norm (magnitude) of the gradients to a maximum value specified by the user.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            #updates the weights and bias accrding to the calculated gradients
            optimizer.step()
            # update learning rates
            scheduler.step()
            # clean memory
            del step, batch, b_input_ids_title,b_input_ids_description, b_labels, result, loss, logits

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(" Average training MAE loss: {0:.2f}".format(avg_train_loss))
        writer.add_scalar('loss/train', avg_train_loss, e)
        # clean memory
        del avg_train_loss, total_train_loss

        time_records.append(time.time() - start_time)

        # ---EVAL---
        print("-")
        # set model into eval mode
        model.eval()
        total_eval_loss = 0
        for batch in val_dataloader:
            b_input_ids_title = batch[0].to(DEVICE)
            b_input_ids_description = batch[1].to(DEVICE)
            b_labels = batch[2].to(DEVICE)

            model.zero_grad()
            result = model(input_ids_title=b_input_ids_title,
                           input_ids_description=b_input_ids_description,
                           labels=b_labels,
                           return_dict=True)
            loss = result.loss
            logits = result.logits
            total_eval_loss += loss.item()
            # clean memory
            del b_input_ids_title,b_input_ids_description, b_labels, batch, result, loss, logits
        avg_eval_loss = total_eval_loss / len(val_dataloader)
        print(" Average eval MAE loss: {0:.2f}".format(avg_eval_loss))

        if avg_eval_loss <= min_eval_loss_epoch[0]:
            min_eval_loss_epoch[0] = avg_eval_loss
            min_eval_loss_epoch[1] = e

        writer.add_scalar('loss/eval', avg_eval_loss, e)
        # clean memory
        del avg_eval_loss, total_eval_loss
        # save model state to dict
        torch.save(model.state_dict(), './models/' + 'epo_' + str(e))

        print("===============================")

        # testing on holdout data
        index = 0
        for test_dataloader in all_test_dataloader:
            test_file_name = test_file_names[index]
            index += 1
            testing_start_time = time.time()
            predictions = []
            true_labels = []
            for batch in test_dataloader:
                # batch = tuple(t.to(DEVICE) for t in batch)
                b_input_ids_title = batch[0].to(DEVICE)
                b_input_ids_description = batch[1].to(DEVICE)
                b_labels = batch[2].to(DEVICE)

                b_input_ids, b_labels = batch
                with torch.no_grad():
                    logits = model(input_ids_title=b_input_ids_title,input_ids_description=b_input_ids_description)
                logits = logits['logits'].detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
                predictions.append(logits)
                true_labels.append(label_ids)

                del b_input_ids_title, b_input_ids_description ,b_labels
            # calculate errors
            distance_records = []
            for i in range(len(predictions)):
                for j in range(len(predictions[i])):
                    distance = abs(predictions[i][j] - true_labels[i][j])
                    distance_records.append(distance)

            ## MAE = mean value of all absolute errors (stored in distance_records)
            MAE = np.mean(np.array(distance_records))
            ## MdAE = median value of all absolute errors (stored in distance_records)
            MdAE = np.median(np.array(distance_records))

            MAE_RECORDS.append(MAE)
            MDAE_RECORDS.append(MdAE)

            global OUTPUT
            OUTPUT +=  'Epochs ' + str(e) + '\n'
            OUTPUT += 'MAE: ' + str(MAE) + '\n'
            OUTPUT += 'MdAE: ' + str(MdAE) + '\n\n'
            print('MAE: ', MAE)
            print('MdAE: ', MdAE)
    writer.flush()
    writer.close()

    # select model
    os.rename('models/epo_' + str(min_eval_loss_epoch[1]),
              'models/' + str(file_pair['train'][0]) + '_'
              + str(file_pair['test'][0]) + '_epo_' + str(min_eval_loss_epoch[1]))

    # del unwanted models
    for i in range(20):
        try:
            os.remove("models/epo_" + str(i))
        except:
            continue

    OUTPUT += 'MAE: ' + str(MAE_RECORDS[min_eval_loss_epoch[1]]) \
                + '  MdAE: ' + str(MDAE_RECORDS[min_eval_loss_epoch[1]]) + '\n'
    OUTPUT += 'training time: ' + str(time_records[min_eval_loss_epoch[1]]) + '\n'
    OUTPUT += 'Epochs: ' + str(min_eval_loss_epoch[1]) +'\n'
    global BATCH_SIZE
    OUTPUT += 'batch size: ' + str(BATCH_SIZE) + '\n'
    global ADD_DESCRIPTION
    OUTPUT += 'Description added : ' + str(ADD_DESCRIPTION) + '\n'


    print('all done for one project')

### Within Project Training Script

In [9]:
torch.cuda.empty_cache()

In [None]:
global WITHIN_PROJECT
WITHIN_PROJECT = True

TRAIN_TEST_FILE_PAIRS = [
                        {'train': ['appceleratorstudio'], 'test': ['appceleratorstudio']},
                        # {'train': ['aptanastudio'], 'test': ['aptanastudio']},
                        # {'train': ['bamboo'], 'test': ['bamboo']},
                        # {'train': ['clover'], 'test': ['clover']},
                        # # {'train': ['datamanagement'], 'test': ['datamanagement']},
                        # {'train': ['duracloud'], 'test': ['duracloud']},
                        # {'train': ['jirasoftware'], 'test': ['jirasoftware']},
                        # {'train': ['mesos'], 'test': ['mesos']},
                        # {'train': ['moodle'], 'test': ['moodle']},
                        # {'train': ['mule'], 'test': ['mule']},
                        # {'train': ['mulestudio'], 'test': ['mulestudio']},
                        # {'train': ['springxd'], 'test': ['springxd']},
                        # {'train': ['talenddataquality'], 'test': ['talenddataquality']},
                        # {'train': ['talendesb'], 'test': ['talendesb']},
                        # {'train': ['titanium'], 'test': ['titanium']},
                        # {'train': ['usergrid'], 'test': ['usergrid']},
                        ]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME
    for file in TRAIN_TEST_FILE_PAIRS:
        if TOKENIZER == 'bbpe':
            config = GPT2Config(num_labels=1, pad_token_id=50257)
        elif TOKENIZER == 'gpt2':
            config = GPT2Config(num_labels=1, pad_token_id=50256)
        elif TOKENIZER == 'wordpiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'sentencepiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'wordlevel':
            config = GPT2Config(num_labels=1, pad_token_id=3)
        if MODEL_NAME == 'gpt2':
            MODEL = LinearGPT2.from_pretrained('gpt2', config=config)
            # if torch.device("cuda") :
              # MODEL.cuda()
        elif MODEL_NAME == 'gpt2sp':
            MODEL = GPT2SP.from_pretrained('gpt2', config=config)
            # if torch.device("cuda"):
              # MODEL.cuda()
        elif MODEL_NAME == 'gpt2spen':
            MODEL = GPT2SPEN.from_pretrained('gpt2', config=config)
            # if torch.device("cuda"):
              # MODEL.cuda()



        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        del MODEL
        torch.cuda.empty_cache()
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""


if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['dense1.weight', 'dense2.weight', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  train_data = train_data.append(df)


Input data feed :::  dict_keys(['title', 'description', 'label'])
within project split!
using pretrained gpt-2 tokenizer
using pretrained gpt-2 tokenizer
using pretrained gpt-2 tokenizer
using pretrained gpt-2 tokenizer
BATCH_SIZE :  525
BATCH_SIZE :  525
using pretrained gpt-2 tokenizer
using pretrained gpt-2 tokenizer
BATCH_SIZE :  525


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Start training for  {'train': ['appceleratorstudio'], 'test': ['appceleratorstudio']} .....
>>> epoch  0


### Cross Project Training Script - Within Repository

In [None]:
global WITHIN_PROJECT
WITHIN_PROJECT = False

# within repo
TRAIN_TEST_FILE_PAIRS = [
                        {'train': ['mesos'], 'test': ['usergrid']},
                        {'train': ['usergrid'], 'test': ['mesos']},
                        {'train': ['appceleratorstudio'], 'test': ['aptanastudio']},
                        {'train': ['appceleratorstudio'], 'test': ['titanium']},
                        {'train': ['titanium'], 'test': ['appceleratorstudio']},
                        {'train': ['aptanastudio'], 'test': ['titanium']},
                        {'train': ['mule'], 'test': ['mulestudio']},
                        {'train': ['mulestudio'], 'test': ['mule']}
                        ]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME
    for file in TRAIN_TEST_FILE_PAIRS:
        if TOKENIZER == 'bbpe':
            config = GPT2Config(num_labels=1, pad_token_id=50257)
        elif TOKENIZER == 'gpt2':
            config = GPT2Config(num_labels=1, pad_token_id=50256)
        elif TOKENIZER == 'wordpiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'sentencepiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'wordlevel':
            config = GPT2Config(num_labels=1, pad_token_id=3)
        if MODEL_NAME == 'gpt2':
            MODEL = LinearGPT2.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        elif MODEL_NAME == 'gpt2sp':
            MODEL = GPT2SP.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        del MODEL
        torch.cuda.empty_cache()
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""


if __name__ == "__main__":
    main()

### Cross Project Training Script - Cross Repository

In [None]:
global WITHIN_PROJECT
WITHIN_PROJECT = False

# cross repo
TRAIN_TEST_FILE_PAIRS = [
                        {'train': ['clover'], 'test': ['usergrid']},
                        {'train': ['talendesb'], 'test': ['mesos']},
                        {'train': ['talenddataquality'], 'test': ['aptanastudio']},
                        {'train': ['mule'], 'test': ['titanium']},
                        {'train': ['talenddataquality'], 'test': ['appceleratorstudio']},
                        {'train': ['mulestudio'], 'test': ['titanium']},
                        {'train': ['appceleratorstudio'], 'test': ['mulestudio']},
                        {'train': ['appceleratorstudio'], 'test': ['mule']}
                        ]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME
    for file in TRAIN_TEST_FILE_PAIRS:
        if TOKENIZER == 'gpt2':
            config = GPT2Config(num_labels=1, pad_token_id=50256)
        elif TOKENIZER == 'wordpiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'sentencepiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'wordlevel':
            config = GPT2Config(num_labels=1, pad_token_id=3)
        if MODEL_NAME == 'gpt2':
            MODEL = LinearGPT2.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        elif MODEL_NAME == 'gpt2sp':
            MODEL = GPT2SP.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        del MODEL
        torch.cuda.empty_cache()
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""


if __name__ == "__main__":
    main()