<a href="https://colab.research.google.com/github/Sansith/gpt2sp/blob/bertsp/model_training_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Training Script

### Necessary Library

In [None]:
pip install torch pandas===1.5.3 transformers numpy tokenizers koila tensorboard wandb

In [None]:
!wandb login 392e817af43c45fef2953b58c84ebb95d7dd31b5

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd drive/MyDrive/gpt2sp

In [None]:
cd drive/MyDrive/Year4/FYP/effort-estimation/gpt2sp

In [1]:
import torch
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup , BertTokenizer
import numpy as np
import time
from torch.utils.tensorboard import SummaryWriter
# from GPT2SP import GPT2ForSequenceClassification as GPT2SP
from transformers import GPT2ForSequenceClassification as LinearGPT2
from transformers import GPT2Config , BertConfig
import os
from tokenizers import Tokenizer
import torch.nn as nn
import wandb
import pdb

In [14]:
from transformers.modeling_outputs import SequenceClassifierOutputWithPast
import torch.nn as nn
from transformers import GPT2Model, GPT2PreTrainedModel
import torch


class GPT2SP(GPT2PreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = GPT2Model(config)
        self.dense1 = nn.Linear(config.n_embd, 4 * config.n_embd, bias=False)
        self.dense2 = nn.Linear(4 * config.n_embd, config.n_embd, bias=False)
        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)

        self.init_weights()

        # Model parallel
        self.model_parallel = False
        self.device_map = None


    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        # MLP Layer
        hidden_states = self.dense1(hidden_states)
        hidden_states = self.dense2(hidden_states)

        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        assert (
            self.config.pad_token_id is not None or batch_size == 1
        ), "Cannot handle batch sizes > 1 if no padding token is defined."
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
            else:
                sequence_lengths = -1
                logger.warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[range(batch_size), sequence_lengths]

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = nn.L1Loss()
                loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

In [3]:

from transformers.modeling_outputs import SequenceClassifierOutputWithPast
import torch.nn as nn
from transformers import  BertPreTrainedModel , BertModel
import torch


class BertSP(BertPreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = BertModel(config)
        print("n_embd/hidden_size : ", config.hidden_size)
        self.dense1 = nn.Linear(config.hidden_size, 4 * config.hidden_size, bias=False)
        self.relu1 = nn.ReLU()
        self.dense2 = nn.Linear(4 * config.hidden_size, config.hidden_size, bias=False)
        self.relu2 = nn.ReLU()
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        self.init_weights()

        # Model parallel
        self.model_parallel = False
        self.device_map = None


    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        # MLP Layer
        hidden_states = self.dense1(hidden_states)
        hidden_states = self.relu1(hidden_states)

        hidden_states = self.dense2(hidden_states)
        hidden_states = self.relu2(hidden_states)
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        assert (
            self.config.pad_token_id is not None or batch_size == 1
        ), "Cannot handle batch sizes > 1 if no padding token is defined."
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
            else:
                sequence_lengths = -1
                logger.warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[range(batch_size), sequence_lengths]

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = nn.L1Loss()
                loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

### Hyperparameters

In [4]:
global EPOCHS, BATCH_SIZE_RATIO, SEQUENCE_LEN, LEARNING_RATE, TOKENIZER, MODEL_NAME , ADD_DESCRIPTION

EPOCHS = 20
BATCH_SIZE_RATIO = 0.02 # within proj: 0.3 / cross proj: 0.4
SEQUENCE_LEN = 100
LEARNING_RATE = 5e-4
TOKENIZER = 'bert' # available:bert, gpt2, wordlevel, sentencepiece, wordpiece
MODEL_NAME = 'bert' # available: bert, gpt2sp, gpt2
ADD_DESCRIPTION = True
WANDB_SPECIAL_TAGS = ['with relu','description+title','model_export'] #'with relu'
# define device
global DEVICE
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# define files to be used
global DATA_PATH
DATA_PATH = './sp_dataset/marked_data/'

### Static Methods and Variables

In [5]:
OUTPUT = '  '
MODEL = None
DYNAMIC_BATCH = True
BATCH_SIZE = None
WITHIN_PROJECT = None
MAE_RECORDS = []
MDAE_RECORDS = []

def data_processing(file_pair):
    global BATCH_SIZE, BATCH_SIZE_RATIO, DATA_PATH, WITHIN_PROJECT, DYNAMIC_BATCH

    train_data = pd.DataFrame(columns=['text', 'label'])
    for train_file_name in file_pair['train']:
        fname = DATA_PATH + train_file_name + '.csv'
        df = prepare_dataframe(fname)
        train_data = train_data.append(df)

    # data split
    if WITHIN_PROJECT:
        train_text, train_labels, val_text, val_labels, test_text, test_labels = within_project_split(train_data)
    else:
        train_text, train_labels, val_text, val_labels = train_val_split(train_data, 0.6)
    # define batch size dynamically based on training length
    if DYNAMIC_BATCH:
        BATCH_SIZE = int(len(train_text) * BATCH_SIZE_RATIO)
    # tokenization
    tokens_train = tokenization(train_text.tolist())
    tokens_val = tokenization(val_text.tolist())
    print(tokens_train['input_ids'][:5])

    train_seq = torch.tensor(tokens_train['input_ids'])
    train_att_mask = torch.tensor(tokens_train['attention_mask'])
    train_y = torch.tensor(train_labels.tolist()).type(torch.LongTensor)
    train_dataloader = prepare_dataloader(train_seq, train_y, sampler_type='random',attention_mask=train_att_mask)

    val_seq = torch.tensor(tokens_val['input_ids'])
    val_att_mask = torch.tensor(tokens_val['attention_mask'])
    val_y = torch.tensor(val_labels.tolist()).type(torch.LongTensor)
    val_dataloader = prepare_dataloader(val_seq, val_y, sampler_type='sequential',attention_mask=val_att_mask)

    # prepare testing datasets
    all_test_dataloader = []
    test_file_names = []
    if WITHIN_PROJECT:
        tokens_test = tokenization(test_text.tolist())
        test_seq = torch.tensor(tokens_test['input_ids'])
        test_att_mask = torch.tensor(tokens_test['attention_mask'])
        test_y = torch.tensor(test_labels.tolist()).type(torch.LongTensor)
        test_dataloader = prepare_dataloader(test_seq, test_y, sampler_type='sequential',attention_mask=test_att_mask)
        all_test_dataloader.append(test_dataloader)
        test_file_names.append(file_pair['test'][0])
        return file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names

    for test_file_name in file_pair['test']:
        fname = DATA_PATH + test_file_name + '.csv'
        test_data = prepare_dataframe(fname)

        test_text = test_data['text']
        test_labels = test_data['label']

        # tokenization
        tokens_test = tokenization(test_text.tolist())
        test_seq = torch.tensor(tokens_test['input_ids'])
        test_att_mask = torch.tensor(tokens_test['attention_mask'])
        test_y = torch.tensor(test_labels.tolist()).type(torch.LongTensor)
        test_dataloader = prepare_dataloader(test_seq, test_y, sampler_type='sequential',attention_mask=test_att_mask)

        all_test_dataloader.append(test_dataloader)
        test_file_names.append(test_file_name)
    print('cross project data processing!')
    return file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names


def train_val_split(data, split_ratio):
    print('cross project split!')
    split_point = int(len(data) * split_ratio)
    train_text = data['text'][:split_point]
    train_labels = data['label'][:split_point]
    val_text = data['text'][split_point:]
    val_labels = data['label'][split_point:]
    return train_text, train_labels, val_text, val_labels


def tokenization(text_list):
    global TOKENIZER, SEQUENCE_LEN, MODEL
    # tokenization
    if TOKENIZER == 'wordpiece':
        print('using wordpiece tokenizer!')
        tokenizer = BertTokenizer('all_tokenizers/word_piece/vocab.txt')
    elif TOKENIZER == 'sentencepiece':
        print('using sentencepiece tokenizer!')
        tokenizer = XLNetTokenizer('all_tokenizers/sentence_piece/spm_tokenizer.model', padding_side='right')
    elif TOKENIZER == 'wordlevel':
        print('using wordlevel tokenizer!')
        tokenizer = Tokenizer.from_file('all_tokenizers/word_level/wordlevel.json')
        encoded_sentences = {'input_ids':[]}
        for sentence in text_list:
            encoded = tokenizer.encode(sentence)
            encoded = encoded.ids
            if len(encoded) > SEQUENCE_LEN:
                encoded = encoded[:SEQUENCE_LEN]
            elif len(encoded) < SEQUENCE_LEN:
                padding = SEQUENCE_LEN - len(encoded)
                for _ in range(padding):
                    encoded.append(3)
            encoded_sentences['input_ids'].append(encoded)
        return encoded_sentences
    elif TOKENIZER == 'gpt2':
        print('using pretrained gpt-2 tokenizer')
        tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER)
        tokenizer.pad_token = '[PAD]'

    elif TOKENIZER == 'bert':
        print('usingbert tokenizer')
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # tokenizer.pad_token = '[PAD]'
    return tokenizer.batch_encode_plus(text_list, truncation=True, max_length=SEQUENCE_LEN, padding='max_length', return_tensors='pt')


def prepare_dataframe(file_name):
    data = pd.read_csv(file_name)
    # some rows have no description, fill blank to avoid Null
    data = data.fillna(' ')


    if ADD_DESCRIPTION :
      print("### text : title+description")
      d = {'text':('[CLS]' + data['title'] + '[SEP]' + data["description"]+'[SEP]').tolist(), 'label': data['storypoint']}
    else:
      print("### text : title")
      d = {'text': (data['title']).tolist(), 'label': data['storypoint']}
    print("Input data feed ::: ",d['text'][0])
    return pd.DataFrame(data=d)


def prepare_dataloader(seq, y, sampler_type, attention_mask):
    global BATCH_SIZE
    tensor_dataset = TensorDataset(seq, y,attention_mask)
    if sampler_type == 'random':
        sampler = RandomSampler(tensor_dataset)
    elif sampler_type == 'sequential':
        sampler = SequentialSampler(tensor_dataset)
    print("BATCH_SIZE : ",BATCH_SIZE)
    dataloader = DataLoader(tensor_dataset, sampler=sampler, batch_size=BATCH_SIZE)
    return dataloader


def within_project_split(data):
    print('within project split!')
    train_val_split_point = int(len(data) * 0.6)
    val_test_split_point = int(len(data) * 0.8)
    train_text = data['text'][:train_val_split_point]
    train_labels = data['label'][:train_val_split_point]
    val_text = data['text'][train_val_split_point:val_test_split_point]
    val_labels = data['label'][train_val_split_point:val_test_split_point]
    test_text = data['text'][val_test_split_point:]
    test_labels = data['label'][val_test_split_point:]
    return train_text, train_labels, val_text, val_labels, test_text, test_labels


def train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, model, test_file_names):
    global LEARNING_RATE, EPOCHS, MAE_RECORDS, MDAE_RECORDS, DEVICE ,ADD_DESCRIPTION,WANDB_SPECIAL_TAGS

        # start a new wandb run to track this script
    wandb.init(
            # set the wandb project where this run will be logged
            project = "esti-mate",
            name = f"{MODEL_NAME}_{file_pair['train'][0]}",
            tags = WANDB_SPECIAL_TAGS,

            # track hyperparameters and run metadata
            config={
            "learning_rate": LEARNING_RATE,
            "sequence_len": SEQUENCE_LEN,
            "batch_size_ratio":BATCH_SIZE_RATIO,
            "tokenizer":TOKENIZER,
            "model_name":MODEL_NAME,
            "description_added":ADD_DESCRIPTION,
            "epochs": EPOCHS,
            'data_set':file_pair["train"][0]
            }
    )




    # Optimizerrr -->
    optimizer = AdamW(MODEL.parameters(), lr=LEARNING_RATE)
    # Total number of training steps is [number of batches] x [number of epochs]
    total_steps = len(train_dataloader) * EPOCHS
    # Create the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    print("Start training for ", file_pair, ".....")
    training_start_time = time.time()

    # tensorboard writer
    writer_path = 'tb/' + str(file_pair['train'][0]) + '_' + str(file_pair['test'][0])
    writer = SummaryWriter(writer_path)

    # vars for model selection
    min_eval_loss_epoch = [10000, 0]

    time_records = []
    MAE_RECORDS = []
    MDAE_RECORDS = []
    start_time = time.time()
    loss_fct = nn.L1Loss()
    for e in range(EPOCHS):
        # ---TRAINING---
        # clean GPU memory
        torch.cuda.empty_cache()
        print(">>> epoch ", e)
        # set model into train mode
        model.train()
        total_train_loss = 0
        for step, batch in enumerate(train_dataloader):
            # pdb.set_trace()
            b_input_ids = batch[0].to(DEVICE)
            b_labels = batch[1].to(DEVICE)
            b_attention_mask = batch[2].to(DEVICE)
            model.zero_grad()
            result = model(b_input_ids,
                           labels=b_labels,
                           attention_mask=b_attention_mask,
                           return_dict=True)
            loss = result.loss
            logits = result.logits
            total_train_loss += loss.item()
            # Calculates the gradients
            loss.backward()
            # The clip_grad_norm_ function clips (limits) the norm (magnitude) of the gradients to a maximum value specified by the user.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            #updates the weights and bias accrding to the calculated gradients
            optimizer.step()
            # update learning rates
            scheduler.step()
            # clean memory
            del step, batch, b_input_ids, b_labels, result, loss, logits

        avg_train_loss = total_train_loss / len(train_dataloader)

        wandb.log({f"train_loss": avg_train_loss} , step=e )
        print(" Average training MAE loss: {0:.2f}".format(avg_train_loss))
        writer.add_scalar('loss/train', avg_train_loss, e)

        # clean memory
        del avg_train_loss, total_train_loss

        time_records.append(time.time() - start_time)

        # ---EVAL---
        print("-")
        # set model into eval mode
        model.eval()
        total_eval_loss = 0
        with torch.no_grad():
            for batch in val_dataloader:
                b_input_ids = batch[0].to(DEVICE)
                b_labels = batch[1].to(DEVICE)
                b_attention_mask = batch[2].to(DEVICE)
                model.zero_grad()
                result = model(b_input_ids,
                            labels=b_labels,
                            attention_mask=b_attention_mask,
                            return_dict=True)
                loss = result.loss
                logits = result.logits
                total_eval_loss += loss.item()
                # clean memory
                del b_input_ids, b_labels, batch, result, loss, logits
        avg_eval_loss = total_eval_loss / len(val_dataloader)
        wandb.log({f"eval_loss": avg_eval_loss}, step=e)
        print(" Average eval MAE loss: {0:.2f}".format(avg_eval_loss))

        if avg_eval_loss <= min_eval_loss_epoch[0]:
            min_eval_loss_epoch[0] = avg_eval_loss
            min_eval_loss_epoch[1] = e

        writer.add_scalar('loss/eval', avg_eval_loss, e)
        # clean memory
        del avg_eval_loss, total_eval_loss
        # save model state to dict
        torch.save(model.state_dict(), './models/' + 'epo_' + str(e))

        print("===============================")
        
        # testing on holdout data
        index = 0
        for test_dataloader in all_test_dataloader:
            test_file_name = test_file_names[index]
            index += 1
            testing_start_time = time.time()
            predictions = []
            true_labels = []
            for batch in test_dataloader:
                batch = tuple(t.to(DEVICE) for t in batch)
                b_input_ids, b_labels, attention_mask = batch
                with torch.no_grad():
                    logits = model(b_input_ids,attention_mask=attention_mask)
                logits = logits['logits'].detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
                predictions.append(logits)
                true_labels.append(label_ids)
            # calculate errors
            distance_records = []
            for i in range(len(predictions)):
                for j in range(len(predictions[i])):
                    distance = abs(predictions[i][j] - true_labels[i][j])
                    distance_records.append(distance)

            ## MAE = mean value of all absolute errors (stored in distance_records)
            MAE = np.mean(np.array(distance_records))
            ## MdAE = median value of all absolute errors (stored in distance_records)
            MdAE = np.median(np.array(distance_records))

            MAE_RECORDS.append(MAE)
            MDAE_RECORDS.append(MdAE)

            wandb.log({f"test_MAE": MAE, f"MdAE": MdAE},step=e)

            global OUTPUT
            OUTPUT +=  'Epochs ' + str(e) + '\n'
            OUTPUT += 'MAE: ' + str(MAE) + '\n'
            OUTPUT += 'MdAE: ' + str(MdAE) + '\n\n'
            print('MAE: ', MAE)
            print('MdAE: ', MdAE)
    writer.flush()
    writer.close()

    # select model
    os.rename('models/epo_' + str(min_eval_loss_epoch[1]),'models/' + str(file_pair['train'][0]) + '_'+ str(file_pair['test'][0]) + '_epo_' + str(min_eval_loss_epoch[1]))

    # del unwanted models
    for i in range(20):
        try:
            os.remove("models/epo_" + str(i))
        except:
            continue

    OUTPUT += 'MAE: ' + str(MAE_RECORDS[min_eval_loss_epoch[1]]) \
                + '  MdAE: ' + str(MDAE_RECORDS[min_eval_loss_epoch[1]]) + '\n'
    OUTPUT += 'training time: ' + str(time_records[min_eval_loss_epoch[1]]) + '\n'
    OUTPUT += 'Epochs: ' + str(min_eval_loss_epoch[1]) +'\n'
    global BATCH_SIZE
    OUTPUT += 'batch size: ' + str(BATCH_SIZE) + '\n'
    OUTPUT += 'Description added : ' + str(ADD_DESCRIPTION) + '\n'

    best_mae_index = min_eval_loss_epoch[1]
    wandb.log({"best_MAE": MAE_RECORDS[best_mae_index],
               "best_MdAE": MDAE_RECORDS[best_mae_index],
               "best_MAE_train_time":time_records[min_eval_loss_epoch[1]]})
    wandb.finish()

    print('all done for one project')

In [6]:
class CustomDataLoader:
    def __init__(self,dynamic_batch=True, batch_size_ratio=0.1, data_path='', within_project=False,tokenizer=None):
        self.dynamic_batch = dynamic_batch
        self.batch_size_ratio = batch_size_ratio
        self.data_path = data_path
        self.within_project = within_project
        self.batch_size = batch_size_ratio
        self.tokenizer = tokenizer


    def prepare_dataframe(self, file_name):
        data = pd.read_csv(file_name)
        # some rows have no description, fill blank to avoid Null
        data = data.fillna(' ')


        if ADD_DESCRIPTION :
            print("### text : title+description")
            d = {'text':('[CLS]' + data['title'] + '[SEP]' + data["description"]+'[SEP]').tolist(), 'label': data['storypoint']}
        else:
            print("### text : title")
            d = {'text': (data['title']).tolist(), 'label': data['storypoint']}
        
        print("Input data feed ::: ",d['text'][0])
        return pd.DataFrame(data=d)
    

    def tokenization(self, text_list):
        tokenized =  self.tokenizer.batch_encode_plus(text_list, truncation=True, 
                                                 max_length=SEQUENCE_LEN, 
                                                 padding='max_length', 
                                                 return_tensors='pt')
        return tokenized


    def train_val_split(self, data, split_ratio):
        print('cross project split!')
        
        split_point = int(len(data) * split_ratio)
        train_text = data['text'][:split_point]
        train_labels = data['label'][:split_point]
        val_text = data['text'][split_point:]
        val_labels = data['label'][split_point:]

        return train_text, train_labels, val_text, val_labels


    def within_project_split(self, data):
        print('within project split!')
        train_val_split_point = int(len(data) * 0.6)
        val_test_split_point = int(len(data) * 0.8)

        train_text = data['text'][:train_val_split_point]
        train_labels = data['label'][:train_val_split_point]

        val_text = data['text'][train_val_split_point:val_test_split_point]
        val_labels = data['label'][train_val_split_point:val_test_split_point]
        
        test_text = data['text'][val_test_split_point:]
        test_labels = data['label'][val_test_split_point:]
        
        return train_text, train_labels, val_text, val_labels, test_text, test_labels
    

    def prepare_dataloader(self, seq, y, sampler_type, attention_mask):
        tensor_dataset = TensorDataset(seq, y,attention_mask)

        if sampler_type == 'random':
            sampler = RandomSampler(tensor_dataset)
        elif sampler_type == 'sequential':
            sampler = SequentialSampler(tensor_dataset)

        batch_size = int(len(seq) * BATCH_SIZE_RATIO)
        print("BATCH_SIZE : ",batch_size)
        
        dataloader = DataLoader(tensor_dataset, sampler=sampler, batch_size=batch_size)
        return dataloader

    def get_dataloader(self,tokenized_result,labels , sampler_type='sequential'):

        input_ids = torch.tensor(tokenized_result['input_ids'])
        att_mask = torch.tensor(tokenized_result['attention_mask'])
        y = torch.tensor(labels.tolist()).type(torch.LongTensor)
        dataloader = self.prepare_dataloader(input_ids,y,
                                             sampler_type=sampler_type,
                                             attention_mask=att_mask)
        return dataloader

    def get_test_dataloader(self,file_pair,labels ,test_text=None, sampler_type='sequential'):
        # prepare testing datasets
        all_test_dataloader = []
        test_file_names = []

        if self.within_project :
            tokens_test = self.tokenization(test_text.tolist())
            dataloader = self.get_dataloader(tokens_test,labels,sampler_type=sampler_type)

            all_test_dataloader.append(dataloader)
            test_file_names.append(file_pair['test'][0])

            return all_test_dataloader
        else:

            # to iterate over testing files
            for test_file_name in file_pair['test']:
                file_path = self.data_path + test_file_name + '.csv'
                test_data = self.prepare_dataframe(file_path)

                test_text = test_data['text']
                test_labels = test_data['label']

                # tokenization
                tokens_test = tokenization(test_text.tolist())
                data_loader = self.get_dataloader(tokens_test,test_labels,sampler_type=sampler_type)

                all_test_dataloader.append(data_loader)
                test_file_names.append(test_file_name)
            return all_test_dataloader #, test_file_names
            

    def data_processing(self,file_pair):

        train_data = pd.DataFrame(columns=['text', 'label'])

        for train_file_name in file_pair['train']:
            file_path = self.data_path + train_file_name + '.csv'
            df = self.prepare_dataframe(file_path)
            train_data = train_data.append(df)


        # Split the dataset for train | eval | test
        if self.within_project:
            train_text, train_labels, val_text, val_labels, test_text, test_labels = self.within_project_split(train_data)
        else:
            train_text, train_labels, val_text, val_labels = self.train_val_split(train_data, 0.6)


        # split into batches
        # if self.dynamic_batch:
        #     batch_size = int(len(train_text) * BATCH_SIZE_RATIO)    

        # tokenization
        train_tokenized_res = self.tokenization(train_text.tolist())
        val_tokenizer_res = self.tokenization(val_text.tolist())
        

        train_dataloader = self.get_dataloader(train_tokenized_res,train_labels,sampler_type='random')
        val_dataloader = self.get_dataloader(val_tokenizer_res,val_labels,sampler_type='sequential')
        test_dataloaders = self.get_test_dataloader(file_pair,test_labels,test_text,sampler_type='sequential')

        return train_dataloader, val_dataloader, test_dataloaders

In [7]:
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
import time
import os
import wandb
from transformers import PreTrainedModel
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn

class Trainer:
    min_eval_loss_epoch = [10000, 0]
    time_records= []
    def __init__(self, model:PreTrainedModel,file_pair , device, learning_rate, epochs, batch_size_ratio, sequence_len, tokenizer,tokenizer_name, model_name, add_description, wandb_special_tags):
        self.model = model
        self.device = device
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size_ratio = batch_size_ratio
        self.sequence_len = sequence_len
        self.tokenizer = tokenizer
        self.model_name = model_name
        self.add_description = add_description
        self.file_pair = file_pair
        self.wandb_special_tags = wandb_special_tags
        self.tokenizer_name= tokenizer_name
    
    def log_to_wandb(self,logs,step=None):
        wandb.log(logs,step=step)    

    def train(self, train_dataloader,epoch_step,optimizer,scheduler):


        print("EPOCH : ", self.file_pair,":",epoch_step)
        training_start_time = time.time()

        self.model.train()
        total_train_loss=0
        for index, batch in enumerate(train_dataloader):
            input_ids = batch[0].to(self.device)
            labels = batch[1].to(self.device)
            attention_mask = batch[2].to(self.device)
            
            #reset the gradients 
            self.model.zero_grad()
            result = self.model(input_ids,
                            labels=labels,
                            attention_mask=attention_mask,
                            return_dict=True)
            
            loss = result.loss
            logits = result.logits
            total_train_loss += loss.item()
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            del batch, input_ids, labels, result, loss, logits

            # averaging train loss of each batch iteration
        avg_train_loss = total_train_loss / len(train_dataloader)

        self.log_to_wandb({f'train_loss':avg_train_loss},step=epoch_step)
        print("Average train loss :{0:.2f}".format(avg_train_loss))

        del avg_train_loss, total_train_loss
        self.time_records.append(time.time() - training_start_time)

    def finalize_saved_models(self):
        final_model_path = 'models/' + str(self.file_pair['train'][0]) + '_'+ str(self.file_pair['test'][0]) + '_epo_' + str(self.min_eval_loss_epoch[1])
        os.rename('models/epo_' + str(self.min_eval_loss_epoch[1]),final_model_path)

        for i in range(20):
            try:
                os.remove("models/epo_" + str(i))
            except:
                continue 

        print("Saved best epoch/Cleared Model folder")
        return final_model_path

    def evaluate(self, val_dataloader,step):
        # Evaluation logic here
        self.model.eval()
        total_eval_loss = 0

        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch[0].to(self.device)
                labels = batch[1].to(self.device)
                attention_mask = batch[2].to(self.device)

                self.model.zero_grad()
                result = self.model(input_ids,
                            labels=labels,
                            attention_mask=attention_mask,
                            return_dict=True)
                
                loss = result.loss
                logits = result.logits
                total_eval_loss += loss.item()

                # clean memory
                del input_ids, labels, batch, result, loss, logits
            
        avg_eval_loss = total_eval_loss / len(val_dataloader)
        self.log_to_wandb({f'eval_loss':avg_eval_loss},step=step)
        print("Average eval loss :{0:.2f}".format(avg_eval_loss))

        # keeping the min eval loss epoch
        if avg_eval_loss <= self.min_eval_loss_epoch[0]:
            self.min_eval_loss_epoch[0] = avg_eval_loss
            self.min_eval_loss_epoch[1] = step
        
        torch.save(self.model.state_dict(), './models/' + 'epo_' + str(step))

    def load_model_from_epoch(self,epoch):
        self.model.zero_grad()
        self.model.load_state_dict(torch.load('models/epo_' + str(epoch)))


    def test(self, test_dataloader,best_epoch):
        predictions = []
        true_labels = []
        for test_set in test_dataloader:
            for batch in test_set:
                batch = tuple(t.to(self.device) for t in batch)
                b_input_ids, b_labels, attention_mask = batch

                with torch.no_grad():
                    self.load_model_from_epoch(best_epoch)
                    logits = self.model(b_input_ids,attention_mask=attention_mask)
                
                logits = logits['logits'].detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
                
                predictions.append(logits)
                true_labels.append(label_ids)
        
        # Calculate Errors
        distance_records = []
        for i in range(len(predictions)):
            for j in range(len(predictions[i])):
                distance = abs(predictions[i][j] - true_labels[i][j])
                distance_records.append(distance)

        MAE = np.mean(np.array(distance_records))
        MADE = np.median(np.array(distance_records))
        print("Testing MAE  : ",MAE)
        print("Testing MdAE : ",MADE)
        return MAE , MADE




    def train_eval_test(self, file_pair, train_dataloader, val_dataloader, all_test_dataloader):
              # start a new wandb run to track this script
        wandb.init(
                # set the wandb project where this run will be logged
                project = "esti-mate",
                name = f"{self.model_name}_{file_pair['train'][0]}",
                tags = self.wandb_special_tags,

                # track hyperparameters and run metadata
                config={
                "learning_rate": self.learning_rate,
                "sequence_len": self.sequence_len,
                "batch_size_ratio":self.batch_size_ratio,
                "tokenizer":self.tokenizer_name,
                "model_name":self.model_name,
                "description_added":self.add_description,
                "epochs": self.epochs,
                'data_set':file_pair["train"][0]
                }
        )

        
        optimizer = AdamW(self.model.parameters(), lr=self.learning_rate)
        total_steps = len(train_dataloader) * self.epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        
        
        for e in range(self.epochs):
            self.train(train_dataloader,e,optimizer,scheduler)
            self.evaluate(val_dataloader,e)
               

        
        print("Best epoch : ",self.min_eval_loss_epoch[1])
        print("Best eval loss : ",self.min_eval_loss_epoch[0])
        mae , made = self.test(all_test_dataloader,best_epoch=self.min_eval_loss_epoch[1])


        print("Best testing MAE : ",mae)
        self.log_to_wandb({"best_MAE": mae,"best_MdAE": made , "best_MAR_train_time":self.time_records[self.min_eval_loss_epoch[1]]  })
        
        wandb.finish() 
        self.finalize_saved_models()

### Within Project Training Script @

In [8]:
torch.cuda.empty_cache()

In [10]:
global WITHIN_PROJECT
WITHIN_PROJECT = True

TRAIN_TEST_FILE_PAIRS = [
    {'train': ['appceleratorstudio'], 'test': ['appceleratorstudio']},
    # {'train': ['aptanastudio'], 'test': ['aptanastudio']},
    # {'train': ['bamboo'], 'test': ['bamboo']},
    # {'train': ['clover'], 'test': ['clover']},
    # {"train": ["datamanagement"], "test": ["datamanagement"]},
    # {"train": ["duracloud"], "test": ["duracloud"]},
    # {"train": ["jirasoftware"], "test": ["jirasoftware"]},
    # {"train": ["mesos"], "test": ["mesos"]},
    # {"train": ["moodle"], "test": ["moodle"]},
    # {"train": ["mule"], "test": ["mule"]},
    # {"train": ["mulestudio"], "test": ["mulestudio"]},
    # {"train": ["springxd"], "test": ["springxd"]},
    # {"train": ["talenddataquality"], "test": ["talenddataquality"]},
    # {"train": ["talendesb"], "test": ["talendesb"]},
    # {"train": ["titanium"], "test": ["titanium"]},
    # {"train": ["usergrid"], "test": ["usergrid"]},
]


def create_tokenizer(tokenizer_name):
    if tokenizer_name == "gpt2":
        print("using pretrained gpt-2 tokenizer")
        tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER)
        tokenizer.pad_token = "[PAD]"
    elif tokenizer_name == "bert":
        print("usingbert tokenizer")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        
    return tokenizer


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME,LEARNING_RATE,DEVICE,BATCH_SIZE,SEQUENCE_LEN,ADD_DESCRIPTION,WANDB_SPECIAL_TAGS,EPOCHS,BATCH_SIZE_RATIO
    
    for file in TRAIN_TEST_FILE_PAIRS:

        tokenizer = create_tokenizer(TOKENIZER)
        if TOKENIZER == "gpt2":
            config = GPT2Config(num_labels=1, pad_token_id=50256)
        elif TOKENIZER == "bert":
            config = BertConfig(num_labels=1, pad_token_id=0)
        
        if MODEL_NAME == "gpt2sp":
            MODEL = GPT2SP.from_pretrained("gpt2", config=config)
        elif MODEL_NAME == "bert":
            MODEL = BertSP.from_pretrained("bert-base-uncased", config=config)
           
        if torch.cuda.is_available():
            MODEL.cuda()


        # file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        data_processor = CustomDataLoader(
            dynamic_batch=True,
            batch_size_ratio=BATCH_SIZE_RATIO,
            
            data_path="./sp_dataset/marked_data/",
            within_project=True,
            tokenizer=tokenizer,
        )
        trainer = Trainer(
            model=MODEL,
            file_pair=file,
            device=DEVICE,
            learning_rate=LEARNING_RATE,
            epochs=EPOCHS,
            batch_size_ratio=BATCH_SIZE_RATIO,
            sequence_len=SEQUENCE_LEN,
            tokenizer=tokenizer,
            tokenizer_name=TOKENIZER,
            model_name=MODEL_NAME,
            add_description=ADD_DESCRIPTION,
            wandb_special_tags=WANDB_SPECIAL_TAGS,
            
        )

        train_dataloader, val_dataloader, test_dataloaders = data_processor.data_processing(file_pair=file)
        trainer.train_eval_test(file,train_dataloader,val_dataloader,test_dataloaders)

        # train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)

        del MODEL , data_processor , trainer
        torch.cuda.empty_cache()
        # global OUTPUT
        # with open(
        #     "./results/" + str(file["train"][0]) + "_" + str(file["test"][0]) + ".txt",
        #     "w+",
        # ) as f:
        #     f.writelines(OUTPUT)
        #     print("results have been written into a text file!")
        #     OUTPUT = ""


if __name__ == "__main__":
    main()

usingbert tokenizer


n_embd/hidden_size :  768


Some weights of BertSP were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.dense1.weight', 'bert.dense2.weight', 'bert.score.weight', 'bert.transformer.embeddings.LayerNorm.bias', 'bert.transformer.embeddings.LayerNorm.weight', 'bert.transformer.embeddings.position_embeddings.weight', 'bert.transformer.embeddings.token_type_embeddings.weight', 'bert.transformer.embeddings.word_embeddings.weight', 'bert.transformer.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.transformer.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.transformer.encoder.layer.0.attention.output.dense.bias', 'bert.transformer.encoder.layer.0.attention.output.dense.weight', 'bert.transformer.encoder.layer.0.attention.self.key.bias', 'bert.transformer.encoder.layer.0.attention.self.key.weight', 'bert.transformer.encoder.layer.0.attention.self.query.bias', 'bert.transformer.encoder.layer.0.attention.self.query.weight', 'bert.transformer.encoder.layer.0.

### text : title+description
Input data feed :::  [CLS]Add CA against object literals in function invocations[SEP]{html}<div><p>The idea here is that if our metadata captures a type as function arg, we should be able to create an instance of that type as an object literal as an arg to a function invocation. For example:</p> <pre> <code>Ti.UI.createLabel( { &lt;property-ca-here&gt; } );</code> </pre></div>{html}[SEP]
within project split!


  input_ids = torch.tensor(tokenized_result['input_ids'])
  att_mask = torch.tensor(tokenized_result['attention_mask'])


BATCH_SIZE :  35
BATCH_SIZE :  11
BATCH_SIZE :  11




VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0111127616222196, max=1.0))…



EPOCH :  {'train': ['appceleratorstudio'], 'test': ['appceleratorstudio']} : 0
Average train loss :3.01
Average eval loss :1.71
EPOCH :  {'train': ['appceleratorstudio'], 'test': ['appceleratorstudio']} : 1
Average train loss :2.70
Average eval loss :2.26
EPOCH :  {'train': ['appceleratorstudio'], 'test': ['appceleratorstudio']} : 2
Average train loss :2.64
Average eval loss :1.85
EPOCH :  {'train': ['appceleratorstudio'], 'test': ['appceleratorstudio']} : 3
Average train loss :2.68
Average eval loss :1.76
EPOCH :  {'train': ['appceleratorstudio'], 'test': ['appceleratorstudio']} : 4
Average train loss :2.66
Average eval loss :1.88
EPOCH :  {'train': ['appceleratorstudio'], 'test': ['appceleratorstudio']} : 5
Average train loss :2.66
Average eval loss :1.60
EPOCH :  {'train': ['appceleratorstudio'], 'test': ['appceleratorstudio']} : 6
Average train loss :2.63
Average eval loss :1.57
EPOCH :  {'train': ['appceleratorstudio'], 'test': ['appceleratorstudio']} : 7
Average train loss :2.64




VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_MAE,▁
best_MAR_train_time,▁
best_MdAE,▁
eval_loss,▃█▅▄▅▃▂▃▂▂▁▂▁▁▃▁▃▁▂▂
train_loss,█▃▂▂▂▂▂▂▁▁▄▁▂▂▃▂▂▂▁▂

0,1
best_MAE,1.34995
best_MAR_train_time,31.56958
best_MdAE,0.08396
eval_loss,1.50773
train_loss,2.62396


Saved best epoch/Cleared Model folder


### Cross Project Training Script - Within Repository

In [None]:
global WITHIN_PROJECT
WITHIN_PROJECT = False

# within repo
TRAIN_TEST_FILE_PAIRS = [
                        {'train': ['mesos'], 'test': ['usergrid']},
                        {'train': ['usergrid'], 'test': ['mesos']},
                        {'train': ['appceleratorstudio'], 'test': ['aptanastudio']},
                        {'train': ['appceleratorstudio'], 'test': ['titanium']},
                        {'train': ['titanium'], 'test': ['appceleratorstudio']},
                        {'train': ['aptanastudio'], 'test': ['titanium']},
                        {'train': ['mule'], 'test': ['mulestudio']},
                        {'train': ['mulestudio'], 'test': ['mule']}
                        ]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME
    for file in TRAIN_TEST_FILE_PAIRS:
        if TOKENIZER == 'bbpe':
            config = GPT2Config(num_labels=1, pad_token_id=50257)
        elif TOKENIZER == 'gpt2':
            config = GPT2Config(num_labels=1, pad_token_id=50256)
        elif TOKENIZER == 'wordpiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'sentencepiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'wordlevel':
            config = GPT2Config(num_labels=1, pad_token_id=3)
        if MODEL_NAME == 'gpt2':
            MODEL = LinearGPT2.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        elif MODEL_NAME == 'gpt2sp':
            MODEL = GPT2SP.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        del MODEL
        torch.cuda.empty_cache()
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""


if __name__ == "__main__":
    main()

### Cross Project Training Script - Cross Repository

In [None]:
global WITHIN_PROJECT
WITHIN_PROJECT = False

# cross repo
TRAIN_TEST_FILE_PAIRS = [
                        {'train': ['clover'], 'test': ['usergrid']},
                        {'train': ['talendesb'], 'test': ['mesos']},
                        {'train': ['talenddataquality'], 'test': ['aptanastudio']},
                        {'train': ['mule'], 'test': ['titanium']},
                        {'train': ['talenddataquality'], 'test': ['appceleratorstudio']},
                        {'train': ['mulestudio'], 'test': ['titanium']},
                        {'train': ['appceleratorstudio'], 'test': ['mulestudio']},
                        {'train': ['appceleratorstudio'], 'test': ['mule']}
                        ]


def main():
    global TRAIN_TEST_FILE_PAIRS, MODEL, TOKENIZER, MODEL_NAME
    for file in TRAIN_TEST_FILE_PAIRS:
        if TOKENIZER == 'gpt2':
            config = GPT2Config(num_labels=1, pad_token_id=50256)
        elif TOKENIZER == 'wordpiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'sentencepiece':
            config = GPT2Config(num_labels=1, pad_token_id=0)
        elif TOKENIZER == 'wordlevel':
            config = GPT2Config(num_labels=1, pad_token_id=3)
        if MODEL_NAME == 'gpt2':
            MODEL = LinearGPT2.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        elif MODEL_NAME == 'gpt2sp':
            MODEL = GPT2SP.from_pretrained('gpt2', config=config)
            MODEL.cuda()
        file_pair, train_dataloader, val_dataloader, all_test_dataloader, test_file_names = data_processing(file_pair=file)
        train_eval_test(file_pair, train_dataloader, val_dataloader, all_test_dataloader, MODEL, test_file_names)
        del MODEL
        torch.cuda.empty_cache()
        global OUTPUT
        with open('./results/' + str(file['train'][0]) + '_' + str(file['test'][0]) +'.txt', 'w+') as f:
            f.writelines(OUTPUT)
            print('results have been written into a text file!')
            OUTPUT = ""


if __name__ == "__main__":
    main()

In [48]:


def get_predictions(organization_id, text):
    try:
        global SEQUENCE_LEN,TOKENIZER , DEVICE
        config = BertConfig(num_labels=1, pad_token_id=0)
        MODEL = BertSP.from_pretrained("bert-base-uncased", config=config)
        # handler = EstimateModelHandler()
        model = MODEL
        tokenizer = create_tokenizer(TOKENIZER)

        if model is None:
            return jsonify({"error": f'Model not found for {organization_id}'}), 404

        # Get predictions
        # input_ids = tokenizer.encode(text, return_tensors="pt",truncation=True, padding='max_length', max_length=SEQUENCE_LENGTH)
        res = tokenizer.batch_encode_plus([text], max_length=SEQUENCE_LEN, truncation=True, padding='max_length', return_tensors="pt")
        input_ids =  res["input_ids"].to(DEVICE)
        attention_maskss = res["attention_mask"].to(DEVICE)
        
        state_dict = torch.load("models/appceleratorstudio_appceleratorstudio_epo_7.pth" )
        model.load_state_dict(state_dict)
        model.to(DEVICE)
        model.eval()
        
        with torch.no_grad():
            inference_output = model( input_ids,attention_mask=attention_maskss )["logits"]
        logits = inference_output.detach().cpu().numpy()
        logits.tolist()

        response_data = {"organizationId": organization_id, "prediction": str(logits[0][0])}
        # handler.clear_all_models()
        print(response_data)
        # return jsonify(response_data)
    except ValueError :
        print("ERROR:")
        return jsonify({"error": "Invalid 'organizationId'"}), 400


In [49]:
get_predictions("TT",'[CLS]Clear application data[SEP]')    

n_embd/hidden_size :  768


Some weights of BertSP were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.dense1.weight', 'bert.dense2.weight', 'bert.score.weight', 'bert.transformer.embeddings.LayerNorm.bias', 'bert.transformer.embeddings.LayerNorm.weight', 'bert.transformer.embeddings.position_embeddings.weight', 'bert.transformer.embeddings.token_type_embeddings.weight', 'bert.transformer.embeddings.word_embeddings.weight', 'bert.transformer.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.transformer.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.transformer.encoder.layer.0.attention.output.dense.bias', 'bert.transformer.encoder.layer.0.attention.output.dense.weight', 'bert.transformer.encoder.layer.0.attention.self.key.bias', 'bert.transformer.encoder.layer.0.attention.self.key.weight', 'bert.transformer.encoder.layer.0.attention.self.query.bias', 'bert.transformer.encoder.layer.0.attention.self.query.weight', 'bert.transformer.encoder.layer.0.

usingbert tokenizer
{'organizationId': 'TT', 'prediction': '4.8308964'}


In [54]:
from transformers import Pipeline

def get_bertSP_pipeline(model_name: str) -> Pipeline:
    model = BertSP.from_pretrained("bert-base-uncased")
    state_dict = torch.load("models/mulestudio_mulestudio_epo_13_bertsp" )
    model.load_state_dict(state_dict=state_dict)
    
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


    return Pipeline(model=model, tokenizer=tokenizer)

def predict_sp(estimator: Pipeline, given_title: str) -> dict:
    return round(estimator(given_title).item(), 0)






pipeline = get_bertSP_pipeline("bert-base-uncased")
story_point = predict_sp(pipeline, "proj title")

print("sp : ",story_point)

n_embd/hidden_size :  768


Some weights of BertSP were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.dense1.weight', 'bert.dense2.weight', 'bert.score.weight', 'bert.transformer.embeddings.LayerNorm.bias', 'bert.transformer.embeddings.LayerNorm.weight', 'bert.transformer.embeddings.position_embeddings.weight', 'bert.transformer.embeddings.token_type_embeddings.weight', 'bert.transformer.embeddings.word_embeddings.weight', 'bert.transformer.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.transformer.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.transformer.encoder.layer.0.attention.output.dense.bias', 'bert.transformer.encoder.layer.0.attention.output.dense.weight', 'bert.transformer.encoder.layer.0.attention.self.key.bias', 'bert.transformer.encoder.layer.0.attention.self.key.weight', 'bert.transformer.encoder.layer.0.attention.self.query.bias', 'bert.transformer.encoder.layer.0.attention.self.query.weight', 'bert.transformer.encoder.layer.0.

RuntimeError: Error(s) in loading state_dict for BertSP:
	size mismatch for score.weight: copying a param with shape torch.Size([1, 768]) from checkpoint, the shape in current model is torch.Size([2, 768]).

In [18]:
from transformers import pipeline

def get_gpt2sp_pipeline(model_ee: str) -> Pipeline:
    model = "MickyMike/7-GPT2SP-appceleratorstudio"
    gpt2sp = GPT2SP.from_pretrained(model)
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = '[PAD]'
    return pipeline(model=gpt2sp, tokenizer=tokenizer)
def predict_sp(estimator: Pipeline, given_title: str) -> dict:
    return round(estimator(given_title).item(), 0)

pipeline = get_gpt2sp_pipeline("appceleratorstudio")
story_point = predict_sp(pipeline, "Clear application data")

config.json:   0%|          | 0.00/835 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/529M [00:00<?, ?B/s]

RuntimeError: Inferring the task automatically requires to check the hub with a model_id defined as a `str`. GPT2SP(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (dense1): Linear(in_features=768, out_features=3072, bias=False)
  (dense2): Linear(in_features=3072, out_features=768, bias=False)
  (score): Linear(in_features=768, out_features=1, bias=False)
) is not a valid model_id.