# Model Training Script

### Necessary Library

In [2]:
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

SyntaxError: invalid syntax (1139368413.py, line 1)

In [3]:
!source "$HOME/.cargo/env"

In [1]:
pip install pandas transformers numpy tokenizers tensorboard pyproject wandb

Note: you may need to restart the kernel to use updated packages.


In [2]:
!wandb login 392e817af43c45fef2953b58c84ebb95d7dd31b5

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jupyter/.netrc


In [1]:
import torch
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
import time
from torch.utils.tensorboard import SummaryWriter
from GPT2SP import GPT2ForSequenceClassification as GPT2SP
from transformers import GPT2ForSequenceClassification as LinearGPT2
from transformers import GPT2Config
import os
from tokenizers import Tokenizer
import torch.nn as nn
import wandb

### Hyperparameters

In [2]:
global EPOCHS, BATCH_SIZE_RATIO, SEQUENCE_LEN, LEARNING_RATE, TOKENIZER, MODEL_NAME

EPOCHS = 20
BATCH_SIZE_RATIO = 0.03 # within proj: 0.3 / cross proj: 0.4
SEQUENCE_LEN = 100
LEARNING_RATE = 5e-4
TOKENIZER = 'gpt2' # available: gpt2, wordlevel, sentencepiece, wordpiece 
MODEL_NAME = 'gpt2sp' # available: gpt2sp, gpt2

# define device
global DEVICE
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# define files to be used
global DATA_PATH 
DATA_PATH = './sp_dataset/marked_data/'

### Static Methods and Variables

In [3]:
OUTPUT = ''
MODEL = None
DYNAMIC_BATCH = True
BATCH_SIZE = None
WITHIN_PROJECT = None
MAE_RECORDS = []
MDAE_RECORDS = []

def data_processing(file_pair):
    global BATCH_SIZE, BATCH_SIZE_RATIO, DATA_PATH, WITHIN_PROJECT, DYNAMIC_BATCH

    train_data = pd.DataFrame(columns=['text', 'label'])
    for train_file_name in file_pair['train']:
        fname = DATA_PATH + train_file_name + '.csv'
        df = prepare_dataframe(fname)
        train_data = train_data.append(df)
        
    # data split
    if WITHIN_PROJECT:
        train_text, train_labels, val_text, val_labels, test_text, test_labels = within_project_split(train_data)
    else:
        train_text, train_labels, val_text, val_labels = train_val_split(train_data, 0.6)
    # define batch size dynamically based on training length
    if DYNAMIC_BATCH:
        BATCH_SIZE = int(len(train_text) * BATCH_SIZE_RATIO)
    # tokenization
    tokens_train = tokenization(train_text.tolist())
    tokens_val = tokenization(val_text.tolist())
    print(tokens_train['input_ids'][:5])
 
    train_seq = torch.tensor(tokens_train['input_ids'])
    train_y = torch.tensor(train_labels.tolist()).type(torch.LongTensor)
    train_dataloader = prepare_dataloader(train_seq, train_y, sampler_type='random')

    val_seq = torch.tensor(tokens_val['input_ids'])
    val_y = torch.tensor(val_labels.tolist()).type(torch.LongTensor)
    val_dataloader = prepare_dataloader(val_seq, val_y, sampler_type='sequential')
    
    # prepare testing datasets
    all_test_dataloader = []
    test_file_names = []
    if WITHIN_PROJECT:
        tokens_test = tokenization(test_text.tolist())
        test_seq = torch.tensor(tokens_test['input_ids'])
        test_y = torch.tensor(test_labels.tolist()).type(torch.LongTensor)
        test_dataloader = prepare_dataloader(test_seq, test_y, sampler_type='sequential')
        all_test_dataloader.append(test_dataloader)
        test_file_names.append(file_pair['test'][0])
        return file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names

    for test_file_name in file_pair['test']:
        fname = DATA_PATH + test_file_name + '.csv'
        test_data = prepare_dataframe(fname)

        test_text = test_data['text']
        test_labels = test_data['label']

        # tokenization
        tokens_test = tokenization(test_text.tolist())
        test_seq = torch.tensor(tokens_test['input_ids'])
        test_y = torch.tensor(test_labels.tolist()).type(torch.LongTensor)
        test_dataloader = prepare_dataloader(test_seq, test_y, sampler_type='sequential')

        all_test_dataloader.append(test_dataloader)
        test_file_names.append(test_file_name)
    print('cross project data processing!')
    return file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names


def train_val_split(data, split_ratio):
    print('cross project split!')
    split_point = int(len(data) * split_ratio)
    train_text = data['text'][:split_point]
    train_labels = data['label'][:split_point]
    val_text = data['text'][split_point:]
    val_labels = data['label'][split_point:]
    return train_text, train_labels, val_text, val_labels


def tokenization(text_list):
    global TOKENIZER, SEQUENCE_LEN, MODEL
    # tokenization
    if TOKENIZER == 'wordpiece':
        print('using wordpiece tokenizer!')
        tokenizer = BertTokenizer('all_tokenizers/word_piece/vocab.txt')
    elif TOKENIZER == 'sentencepiece':
        print('using sentencepiece tokenizer!')
        tokenizer = XLNetTokenizer('all_tokenizers/sentence_piece/spm_tokenizer.model', padding_side='right')
    elif TOKENIZER == 'wordlevel':
        print('using wordlevel tokenizer!')
        tokenizer = Tokenizer.from_file('all_tokenizers/word_level/wordlevel.json')
        encoded_sentences = {'input_ids':[]}
        for sentence in text_list:
            encoded = tokenizer.encode(sentence)
            encoded = encoded.ids
            if len(encoded) > SEQUENCE_LEN:
                encoded = encoded[:SEQUENCE_LEN]
            elif len(encoded) < SEQUENCE_LEN:
                padding = SEQUENCE_LEN - len(encoded)
                for _ in range(padding):
                    encoded.append(3)
            encoded_sentences['input_ids'].append(encoded)
        return encoded_sentences
    elif TOKENIZER == 'gpt2':
        print('using pretrained gpt-2 tokenizer')
        tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER)
        tokenizer.pad_token = '[PAD]'
    return tokenizer.batch_encode_plus(text_list, truncation=True, max_length=SEQUENCE_LEN, padding='max_length')


def prepare_dataframe(file_name):
    data = pd.read_csv(file_name)
    # some rows have no description, fill blank to avoid Null
    data = data.fillna(' ')
    d = {'text': (data['title'] + " | " + data["description"] ).tolist(), 'label': data['storypoint']}
    return pd.DataFrame(data=d)


def prepare_dataloader(seq, y, sampler_type):
    global BATCH_SIZE
    tensor_dataset = TensorDataset(seq, y)
    if sampler_type == 'random':
        sampler = RandomSampler(tensor_dataset)
    elif sampler_type == 'sequential':
        sampler = SequentialSampler(tensor_dataset)
    dataloader = DataLoader(tensor_dataset, sampler=sampler, batch_size=BATCH_SIZE)
    return dataloader


def within_project_split(data):
    print('within project split!')
    train_val_split_point = int(len(data) * 0.6)
    val_test_split_point = int(len(data) * 0.8)
    train_text = data['text'][:train_val_split_point]
    train_labels = data['label'][:train_val_split_point]
    val_text = data['text'][train_val_split_point:val_test_split_point]
    val_labels = data['label'][train_val_split_point:val_test_split_point]
    test_text = data['text'][val_test_split_point:]
    test_labels = data['label'][val_test_split_point:]
    return train_text, train_labels, val_text, val_labels, test_text, test_labels   


def train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, model, test_file_names):
    global MODEL_NAME, TOKENIZER, LEARNING_RATE, EPOCHS, MAE_RECORDS, MDAE_RECORDS, DEVICE , SEQUENCE_LEN , BATCH_SIZE_RATIO

    wandb.init(
                # set the wandb project where this run will be logged
                project = "esti-mate",
                name = f"{MODEL_NAME}_{file_pair['train'][0]}",
                tags = ["RMSLELoss","concat"],

                # track hyperparameters and run metadata
                config={
                "learning_rate": LEARNING_RATE,
                "sequence_len": SEQUENCE_LEN,
                "batch_size_ratio":BATCH_SIZE_RATIO,
                "tokenizer":TOKENIZER,
                "model_name":MODEL_NAME,
                "description_added":True,
                "epochs": EPOCHS,
                'data_set':file_pair["train"][0]
                }
    )


    optimizer = AdamW(MODEL.parameters(), lr=LEARNING_RATE)    
    # Total number of training steps is [number of batches] x [number of epochs]
    total_steps = len(train_dataloader) * EPOCHS
    # Create the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    print("Start training for ", file_pair, ".....")
    training_start_time = time.time()
    
    # tensorboard writer
    writer_path = 'tb/' + str(file_pair['train'][0]) + '_' + str(file_pair['test'][0])
    writer = SummaryWriter(writer_path)
    
    # vars for model selection
    min_eval_loss_epoch = [10000, 0]
    
    time_records = []
    MAE_RECORDS = []
    MDAE_RECORDS = []
    start_time = time.time()
    loss_fct = nn.L1Loss()
    for e in range(EPOCHS):
        # ---TRAINING---
        # clean GPU memory
        torch.cuda.empty_cache()
        print(">>> epoch ", e)
        # set model into train mode
        model.train()
        total_train_loss = 0
        for step, batch in enumerate(train_dataloader):            
            b_input_ids = batch[0].to(DEVICE)
            b_labels = batch[1].to(DEVICE)
            model.zero_grad()
            result = model(b_input_ids, 
                           labels=b_labels,
                           return_dict=True)
            loss = result.loss
            logits = result.logits
            total_train_loss += loss.item()  
            loss.backward() 
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            # clean memory
            del step, batch, b_input_ids, b_labels, result, loss, logits

        avg_train_loss = total_train_loss / len(train_dataloader)
        wandb.log({f'train_loss':avg_train_loss},step=e)
        print(" Average training MAE loss: {0:.2f}".format(avg_train_loss))
        writer.add_scalar('loss/train', avg_train_loss, e)
        # clean memory
        del avg_train_loss, total_train_loss
        
        time_records.append(time.time() - start_time)
        
        # ---EVAL---
        print("-")
        # set model into eval mode
        model.eval()
        total_eval_loss = 0
        for batch in val_dataloader:            
            b_input_ids = batch[0].to(DEVICE)
            b_labels = batch[1].to(DEVICE)
            model.zero_grad()
            with torch.no_grad():
                result = model(b_input_ids, 
                            labels=b_labels,
                            return_dict=True)
            loss = result.loss
            logits = result.logits
            total_eval_loss += loss.item()  
            # clean memory
            del b_input_ids, b_labels, batch, result, loss, logits
        avg_eval_loss = total_eval_loss / len(val_dataloader)
        wandb.log({f'eval_loss':avg_eval_loss},step=e)
        print(" Average eval MAE loss: {0:.2f}".format(avg_eval_loss))
        
        if avg_eval_loss <= min_eval_loss_epoch[0]:
            min_eval_loss_epoch[0] = avg_eval_loss
            min_eval_loss_epoch[1] = e
        
        writer.add_scalar('loss/eval', avg_eval_loss, e)
        # clean memory
        del avg_eval_loss, total_eval_loss
        # save model state to dict
        torch.save(model.state_dict(), './models/' + 'epo_' + str(e))
        
        print("===============================")
        
        # testing on holdout data
        index = 0
        for test_dataloader in all_test_dataloader:
            test_file_name = test_file_names[index]
            index += 1
            testing_start_time = time.time()
            predictions = []
            true_labels = []
            for batch in test_dataloader:
                batch = tuple(t.to(DEVICE) for t in batch)
                b_input_ids, b_labels = batch
                with torch.no_grad():
                    logits = model(b_input_ids)
                logits = logits['logits'].detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
                predictions.append(logits)
                true_labels.append(label_ids)
            # calculate errors
            distance_records = []
            for i in range(len(predictions)):
                for j in range(len(predictions[i])):
                    distance = abs(predictions[i][j] - true_labels[i][j])
                    distance_records.append(distance)

            ## MAE = mean value of all absolute errors (stored in distance_records)
            MAE = np.mean(np.array(distance_records)) 
            ## MdAE = median value of all absolute errors (stored in distance_records)
            MdAE = np.median(np.array(distance_records)) 

            MAE_RECORDS.append(MAE)
            MDAE_RECORDS.append(MdAE)
            
            global OUTPUT
            OUTPUT +=  'Epochs ' + str(e) + '\n'
            OUTPUT += 'MAE: ' + str(MAE) + '\n'
            OUTPUT += 'MdAE: ' + str(MdAE) + '\n\n'
            print('MAE: ', MAE)
            print('MdAE: ', MdAE)
    writer.flush()
    writer.close()
    

    
    # select model
    os.rename('models/epo_' + str(min_eval_loss_epoch[1]), 
              'models/' + str(file_pair['train'][0]) + '_' 
              + str(file_pair['test'][0]) + '_epo_' + str(min_eval_loss_epoch[1]))

    wandb.log({"best_MAE": MAE_RECORDS[min_eval_loss_epoch[1]],"best_MdAE": MDAE_RECORDS[min_eval_loss_epoch[1]] , "best_MAR_train_time":time_records[min_eval_loss_epoch[1]]  })
    # del unwanted models
    for i in range(20):
        try:
            os.remove("models/epo_" + str(i))
        except:
            continue
            
    OUTPUT += 'MAE: ' + str(MAE_RECORDS[min_eval_loss_epoch[1]]) \
                + '  MdAE: ' + str(MDAE_RECORDS[min_eval_loss_epoch[1]]) + '\n'
    OUTPUT += 'training time: ' + str(time_records[min_eval_loss_epoch[1]]) + '\n'
    OUTPUT += 'Epochs: ' + str(min_eval_loss_epoch[1]) +'\n'
    global BATCH_SIZE
    OUTPUT += 'batch size: ' + str(BATCH_SIZE)
    print('all done for one project')

### Within Project Training Script

In [4]:
global WITHIN_PROJECT
WITHIN_PROJECT = True

TRAIN_TEST_FILE_PAIRS = [
                        {'train': ['appceleratorstudio'], 'test': ['appceleratorstudio']},
                        # {'train': ['aptanastudio'], 'test': ['aptanastudio']},
                        # {'train': ['bamboo'], 'test': ['bamboo']},
                        # {'train': ['clover'], 'test': ['clover']},
                        # {'train': ['datamanagement'], 'test': ['datamanagement']},
                        # {'train': ['duracloud'], 'test': ['duracloud']},
                        # {'train': ['jirasoftware'], 'test': ['jirasoftware']},
                        # {'train': ['mesos'], 'test': ['mesos']},
                        # {'train': ['moodle'], 'test': ['moodle']},
                        # {'train': ['mule'], 'test': ['mule']},
                        # {'train': ['mulestudio'], 'test': ['mulestudio']},
                        # {'train': ['springxd'], 'test': ['springxd']},
                        # {'train': ['talenddataquality'], 'test': ['talenddataquality']},
                        # {'train': ['talendesb'], 'test': ['talendesb']},
                        # {'train': ['titanium'], 'test': ['titanium']},
                        # {'train': ['usergrid'], 'test': ['usergrid']},
                        ]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME
    for file in TRAIN_TEST_FILE_PAIRS:
        if TOKENIZER == 'bbpe':
            config = GPT2Config(num_labels=1, pad_token_id=50257)
        elif TOKENIZER == 'gpt2':
            config = GPT2Config(num_labels=1, pad_token_id=50256)
        elif TOKENIZER == 'wordpiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'sentencepiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'wordlevel':
            config = GPT2Config(num_labels=1, pad_token_id=3)    
            
                   
        if MODEL_NAME == 'gpt2':
            MODEL = LinearGPT2.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        elif MODEL_NAME == 'gpt2sp':
            MODEL = GPT2SP.from_pretrained('gpt2', config=config)
            MODEL.cuda()
            
        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        del MODEL
        torch.cuda.empty_cache()            
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""

                
if __name__ == "__main__":
    main()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['dense1.weight', 'dense2.weight', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  train_data = train_data.append(df)


within project split!
using pretrained gpt-2 tokenizer
using pretrained gpt-2 tokenizer
[[4550, 7257, 1028, 2134, 4187, 874, 287, 2163, 800, 20968, 930, 1391, 6494, 92, 27, 7146, 6927, 79, 29, 464, 2126, 994, 318, 326, 611, 674, 20150, 23007, 257, 2099, 355, 2163, 1822, 11, 356, 815, 307, 1498, 284, 2251, 281, 4554, 286, 326, 2099, 355, 281, 2134, 18875, 355, 281, 1822, 284, 257, 2163, 43219, 13, 1114, 1672, 25, 3556, 79, 29, 1279, 3866, 29, 1279, 8189, 29, 40533, 13, 10080, 13, 17953, 33986, 7, 1391, 1222, 2528, 26, 26745, 12, 6888, 12, 1456, 5, 13655, 26, 1782, 5619, 3556, 8189, 29, 7359, 3866, 12240, 7146, 29, 90, 6494], [10260, 24150, 329, 2034, 7015, 1352, 13877, 284, 2034, 7015, 1352, 11112, 220, 930, 1391, 6494, 92, 27, 7146, 6927, 79, 29, 2953, 1551, 4259, 3895, 17149, 11, 3917, 299, 6691, 13, 8673, 584, 8947, 355, 880, 25970, 79, 12240, 7146, 29, 90, 6494, 92, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mseniyas[0m. Use [1m`wandb login --relogin`[0m to force relogin


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Start training for  {'train': ['appceleratorstudio'], 'test': ['appceleratorstudio']} .....
>>> epoch  0
 Average training MAE loss: 6.97
-
 Average eval MAE loss: 1.48
MAE:  1.512077
MdAE:  1.0917227
>>> epoch  1
 Average training MAE loss: 2.85
-
 Average eval MAE loss: 2.06
MAE:  2.020484
MdAE:  1.8081338
>>> epoch  2
 Average training MAE loss: 2.49
-
 Average eval MAE loss: 1.78
MAE:  1.7960339
MdAE:  1.5534604
>>> epoch  3
 Average training MAE loss: 2.08
-
 Average eval MAE loss: 1.83
MAE:  1.7795147
MdAE:  1.4966741
>>> epoch  4
 Average training MAE loss: 1.90
-
 Average eval MAE loss: 2.47
MAE:  2.4823313
MdAE:  2.2366526
>>> epoch  5
 Average training MAE loss: 1.67
-
 Average eval MAE loss: 1.75
MAE:  1.7886248
MdAE:  1.4323916
>>> epoch  6
 Average training MAE loss: 1.71
-
 Average eval MAE loss: 1.79
MAE:  1.844916
MdAE:  1.4966317
>>> epoch  7
 Average training MAE loss: 1.47
-
 Average eval MAE loss: 1.90
MAE:  1.9021661
MdAE:  1.5855639
>>> epoch  8
 Average training 

NameError: name 'self' is not defined

In [13]:
ls

2136.38s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


GPT2SP.py                         [0m[01;34mlogo[0m/
GPT2SP_inspection_notebook.ipynb  model_training_notebook.ipynb
LICENSE                           [01;34mmodels[0m/
README.md                         [01;34mresults[0m/
[01;34m__pycache__[0m/                      [01;34msp_dataset[0m/
[01;34mabe0[0m/                             [01;34mtb[0m/
[01;34mall_tokenizers[0m/                   tokenizer_training_notebook.ipynb
[01;34mcorpus_tokenization_comparison[0m/   vocab_and_tokenization_comparison.ipynb
[01;34mcustom_transformers_interpret[0m/    [01;34mwandb[0m/
[01;34mdata_model_analysis[0m/              [01;34mxai_tokens[0m/


In [4]:
from transformers import pipeline,Pipeline
def get_gpt2sp_pipeline(model: str) -> Pipeline:
    global DEVICE
    # model = "MickyMike/0-GPT2SP-" + model.lower()
    config = GPT2Config(num_labels=1, pad_token_id=50256)
    gpt2sp = GPT2SP.from_pretrained('gpt2',config=config)
    state_dict = torch.load("./models/gpt2sp_base_appceleratorstudio_appceleratorstudio_epo_7.pth",map_location='cpu')
    gpt2sp.load_state_dict(state_dict=state_dict ,strict=False  )
    gpt2sp.to(DEVICE)
    gpt2sp.eval()

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = '[PAD]'
    
    


    return pipeline(task="text-classification", model=gpt2sp, tokenizer=tokenizer, device=DEVICE )


def predict_sp(estimator: Pipeline, given_title: str) -> dict:
    res= estimator(given_title)
    # return round(.item(), 0)

    return res


pipeline = get_gpt2sp_pipeline("")
story_point = predict_sp(pipeline, "new mobile projects can'\t find appicon.jpg resulting in ' error detected' shown on TiApp pane")

print(story_point)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['dense1.weight', 'dense2.weight', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[{'label': 'LABEL_0', 'score': 0.9833680987358093}]


In [3]:
from transformers import pipeline,Pipeline
def get_gpt2sp_pipeline(model: str) -> Pipeline:
    global DEVICE
    # model = "MickyMike/0-GPT2SP-" + model.lower()
    config = GPT2Config(num_labels=1, pad_token_id=50256)
    gpt2sp = GPT2SP.from_pretrained('MickyMike/0-GPT2SP-appceleratorstudio',config=config)
    # state_dict = torch.load("./models/gpt2sp_base_appceleratorstudio_appceleratorstudio_epo_7.pth",map_location='cpu')
    # gpt2sp.load_state_dict(state_dict=state_dict ,strict=False  )
    gpt2sp.to(DEVICE)
    gpt2sp.eval()

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = '[PAD]'
    
    return pipeline(task="text-classification", model=gpt2sp, tokenizer=tokenizer, device=DEVICE )


def predict_sp(estimator: Pipeline, given_title: str) -> dict:
    res= estimator(given_title)
    # return round(.item(), 0)

    return res


pipeline = get_gpt2sp_pipeline("")
story_point = predict_sp(pipeline, "new mobile projects can'\t find appicon.jpg resulting in ' error detected' shown on TiApp pane")

print(story_point)

[{'label': 'LABEL_0', 'score': 0.9862019419670105}]


In [23]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="MickyMike/0-GPT2SP-appceleratorstudio")
res = pipe("Add preference for default settings on android run/debug configurations (API & Screen size)")
print(res)

Some weights of the model checkpoint at MickyMike/0-GPT2SP-appceleratorstudio were not used when initializing GPT2ForSequenceClassification: ['dense1.weight', 'dense2.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'label': 'LABEL_0', 'score': 0.5239192247390747}]


In [41]:
pip install captum

Collecting captum
  Downloading captum-0.7.0-py3-none-any.whl.metadata (26 kB)
Downloading captum-0.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: captum
Successfully installed captum-0.7.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
from transformers import pipeline,Pipeline
from transformers import PreTrainedTokenizer
from custom_transformers_interpret import  SequenceClassificationExplainer


def get_top_token(token_attributions: list) -> list:
    # word_attributions have a shape of [('word', 0.1234), ...]
    top_index = 0
    top_value = None
    for i in range(len(token_attributions)):
        if top_value is None or token_attributions[i][1] > top_value:
            top_value = token_attributions[i][1]
            top_index = i
    return [str(token_attributions[top_index][0])]


def get_gpt2sp_pipeline(text: str) -> Pipeline:
    global DEVICE
    model = 'gpt2' #"MickyMike/0-GPT2SP-appceleratorstudio"
    config = GPT2Config(num_labels=1, pad_token_id=50256)
    gpt2sp = GPT2SP.from_pretrained(model,config=config)
    state_dict = torch.load("models/epo_2")
    gpt2sp.load_state_dict(state_dict=state_dict ,strict=False  )
    # gpt2sp.to(DEVICE)
    gpt2sp.eval()

    tokenizer:PreTrainedTokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = '[PAD]'
    
    input_ids = tokenizer(text,return_tensors="pt")
    # input_ids.to(DEVICE)
    with torch.no_grad():
        outs = gpt2sp(**input_ids)

    explainer = SequenceClassificationExplainer(gpt2sp,tokenizer)
    word_att = explainer(text)
    top_token = get_top_token(word_att)
    print("top token :",str(top_token[0]))

    return outs

outs = get_gpt2sp_pipeline("Create test plan for tiapp.xml module additions")


print(outs.logits)

"""


"""

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['dense1.weight', 'dense2.weight', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


top token : additions
tensor([[6.7372]])


'\n\n\n'

### Cross Project Training Script - Within Repository

In [None]:
global WITHIN_PROJECT
WITHIN_PROJECT = False

# within repo
TRAIN_TEST_FILE_PAIRS = [
                        {'train': ['mesos'], 'test': ['usergrid']},
                        {'train': ['usergrid'], 'test': ['mesos']},
                        {'train': ['appceleratorstudio'], 'test': ['aptanastudio']},
                        {'train': ['appceleratorstudio'], 'test': ['titanium']},
                        {'train': ['titanium'], 'test': ['appceleratorstudio']},
                        {'train': ['aptanastudio'], 'test': ['titanium']},
                        {'train': ['mule'], 'test': ['mulestudio']},
                        {'train': ['mulestudio'], 'test': ['mule']}
                        ]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME
    for file in TRAIN_TEST_FILE_PAIRS:
        if TOKENIZER == 'bbpe':
            config = GPT2Config(num_labels=1, pad_token_id=50257)
        elif TOKENIZER == 'gpt2':
            config = GPT2Config(num_labels=1, pad_token_id=50256)
        elif TOKENIZER == 'wordpiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'sentencepiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'wordlevel':
            config = GPT2Config(num_labels=1, pad_token_id=3)           
        if MODEL_NAME == 'gpt2':
            MODEL = LinearGPT2.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        elif MODEL_NAME == 'gpt2sp':
            MODEL = GPT2SP.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        del MODEL
        torch.cuda.empty_cache()            
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""

                
if __name__ == "__main__":
    main()

### Cross Project Training Script - Cross Repository

In [None]:
global WITHIN_PROJECT
WITHIN_PROJECT = False

# cross repo
TRAIN_TEST_FILE_PAIRS = [
                        {'train': ['clover'], 'test': ['usergrid']},
                        {'train': ['talendesb'], 'test': ['mesos']},
                        {'train': ['talenddataquality'], 'test': ['aptanastudio']},
                        {'train': ['mule'], 'test': ['titanium']},
                        {'train': ['talenddataquality'], 'test': ['appceleratorstudio']},
                        {'train': ['mulestudio'], 'test': ['titanium']},
                        {'train': ['appceleratorstudio'], 'test': ['mulestudio']},
                        {'train': ['appceleratorstudio'], 'test': ['mule']}
                        ]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME
    for file in TRAIN_TEST_FILE_PAIRS:
        if TOKENIZER == 'gpt2':
            config = GPT2Config(num_labels=1, pad_token_id=50256)
        elif TOKENIZER == 'wordpiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'sentencepiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'wordlevel':
            config = GPT2Config(num_labels=1, pad_token_id=3)           
        if MODEL_NAME == 'gpt2':
            MODEL = LinearGPT2.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        elif MODEL_NAME == 'gpt2sp':
            MODEL = GPT2SP.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        del MODEL
        torch.cuda.empty_cache()            
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""

                
if __name__ == "__main__":
    main()