# SATD identification and classification by fine-tuning LLMs (RQ1, RQ3, RQ4)

To just see the final results, skip this section.

In [1]:
from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import gc
import time
from sklearn.metrics import classification_report, confusion_matrix
# import ast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# add the context column to a dataframe
def add_context(df, inputs):
    """Add a ``context`` column holding the model input text of every row.

    Args:
        df: DataFrame with a ``comment_text`` column and, depending on
            ``inputs``, the ``file_path``, ``containing_method_signature``
            and ``containing_method`` columns. It is modified in place.
        inputs: Which features to combine into the context:
            'ct'        - comment text only
            'fp+ct'     - file path + comment text
            'fp+cms+ct' - file path + containing method signature + comment text
            'fp+ct+cmb' - file path + comment text + containing method body

    Returns:
        The same ``df`` object, with the ``context`` column added.

    Raises:
        ValueError: If ``inputs`` is not one of the supported values.
    """
    builders = {
        'ct': lambda row: row['comment_text'],
        'fp+ct': lambda row: (
            'file path: ' + str(row['file_path']) + '\n' +
            'Technical debt comment: ' + row['comment_text']),
        'fp+cms+ct': lambda row: (
            'file path: ' + str(row['file_path']) + '\n' +
            'Containing method signature: """ ' + str(row['containing_method_signature']) + ' """\n' +
            'Technical debt comment: ' + row['comment_text']),
        # triple double quotes inside the method body are replaced so they
        # cannot be confused with the """ delimiters wrapped around it
        'fp+ct+cmb': lambda row: (
            'file path: ' + str(row['file_path']) + '\n' +
            'Technical debt comment: """ ' + row['comment_text'] + ' """\n' +
            'Containing method body: """\n' + str(row['containing_method']).replace('"""', "'''") + '\n"""\n'),
    }

    # Fail before touching df: an unknown value used to print 'ERROR!' per row
    # and then die with an unrelated length-mismatch error from pandas.
    if inputs not in builders:
        raise ValueError(
            f"Unknown inputs {inputs!r}; expected one of {sorted(builders)}")

    build = builders[inputs]
    df['context'] = [build(row) for _, row in df.iterrows()]
    return df

In [3]:
# read and prepare the Maldonado dataset (SATD identification)

df = pd.read_csv('Dataset/Maldonado-62k/maldonado_corrected.csv')
print(len(df))
print(df.columns)

INPUT = 'ct' # in Maldonado dataset there is only one input feature: comment text
FRACTION = 1 # fraction of the training rows to keep (1 = all of them)

# Leave-one-project-out: every project is the test set once, and all the
# other projects together form the training set.
dataset = {}
for project_name in sorted(set(df['project_name'])):
    is_held_out = df['project_name'] == project_name
    test_df = df[is_held_out]
    train_df = df[~is_held_out].sample(frac=FRACTION, random_state=42) # shuffle train

    # ---> use the test_df as validation ("valid" and "test" are the same rows)
    data = DatasetDict({"train": Dataset.from_pandas(train_df), "valid": Dataset.from_pandas(test_df), "test": Dataset.from_pandas(test_df)})
    data=data.rename_column("satd","label")
    data=data.remove_columns(['project_name','classification','satd_orig','__index_level_0__'])
    dataset[project_name] = data


# settings read by the later cells
DATASET = 'Maldonado62k'
TEXT_COLUMN = 'comment_text'
LABEL_COLUMN = 'label'
NUM_LABELS = 2
METRIC = 'f1'

dataset

62275
Index(['project_name', 'classification', 'comment_text', 'satd_orig', 'satd'], dtype='object')


{'apache-ant-1.7.0': DatasetDict({
     train: Dataset({
         features: ['comment_text', 'label'],
         num_rows: 58177
     })
     valid: Dataset({
         features: ['comment_text', 'label'],
         num_rows: 4098
     })
     test: Dataset({
         features: ['comment_text', 'label'],
         num_rows: 4098
     })
 }),
 'apache-jmeter-2.10': DatasetDict({
     train: Dataset({
         features: ['comment_text', 'label'],
         num_rows: 54218
     })
     valid: Dataset({
         features: ['comment_text', 'label'],
         num_rows: 8057
     })
     test: Dataset({
         features: ['comment_text', 'label'],
         num_rows: 8057
     })
 }),
 'argouml': DatasetDict({
     train: Dataset({
         features: ['comment_text', 'label'],
         num_rows: 52823
     })
     valid: Dataset({
         features: ['comment_text', 'label'],
         num_rows: 9452
     })
     test: Dataset({
         features: ['comment_text', 'label'],
         num_rows: 9452


In [4]:
# show class distribution of the binary `satd` label
df['satd'].value_counts()

satd
0    57778
1     4497
Name: count, dtype: int64

In [5]:
# read and prepare the OBrien dataset (SATD classification)

# df = pd.read_csv('Dataset/23_Shades/OBrien_789.csv') # this version doesn't have the containing_method and containing_method_signature columns
df = pd.read_csv('Dataset/23_Shades/OBrien_789_v2.csv')
print(len(df))
print(df.columns)

FRACTION = 1 # fraction of the training rows to keep (1 = all of them)

# map labels
label_mapping = {
    'Requirement': 0,
    'Code': 1,
    'M&T': 2,
    'Defect': 3,
    'Design': 4,
    'Documentation': 5
}
df['satd_type'] = df['satd_type'].map(label_mapping)
df = df.rename(columns={"filename": "file_path"}) # add_context reads 'file_path'

INPUT = 'ct'               # only comment text
# INPUT = 'fp+ct'            # file path + comment text
# INPUT = 'fp+cms+ct'        # file path + containing method signature + comment text
# INPUT = 'fp+ct+cmb'        # file path + comment text + containing method body

df = add_context(df, INPUT)

print('\n-------------- An example of input data ---------------\n')
print(df.context[150]) # 151 2 107 129 150 193
print('-------------------------------------------------------\n')

df = df[['context','satd_type','fold']]

# for each fold, split data to train and test and save it in a dataset
dataset = {}
for test_fold in sorted(set(df['fold'])):
    # NOTE(review): valid_fold is computed but never used below, so the
    # "validation" fold stays part of the training split.
    if test_fold>0:
        valid_fold = test_fold - 1
    else:
        valid_fold = max(df['fold'])

    test_df  = df[df['fold'] == test_fold]
    train_df = df[df['fold'] != test_fold]
    # honour FRACTION (it is also encoded in the result file name) instead of
    # a hard-coded frac=1; with FRACTION = 1 this only shuffles the rows
    train_df = train_df.sample(frac=FRACTION, random_state=42) # shuffle train

    data = DatasetDict({"train": Dataset.from_pandas(train_df), "test": Dataset.from_pandas(test_df)})
    data=data.rename_column("satd_type","label")
    data=data.remove_columns(['fold','__index_level_0__'])
    dataset[test_fold] = data

# settings read by the later cells
DATASET = 'OBrien'
TEXT_COLUMN = 'context'
LABEL_COLUMN = 'label'
NUM_LABELS = 6
METRIC = 'accuracy'

dataset

789
Index(['Unnamed: 0', 'dataset_id', 'repo_type', 'repo_name', 'filename',
       'commit_introducing_revision', 'commit_removing_revision',
       'comment_text', 'is_satd', 'satd_type', 'ml_satd_type',
       'ml_satd_type_2', 'ml_pipeline_stage', 'file_content', 'satd_line',
       'commit_message', 'containing_method', 'containing_method_signature',
       'fold'],
      dtype='object')

-------------- An example of input data ---------------

self.mpc_sum(3; -5) TODO: Future work: how to handle gracefully minus numbers
-------------------------------------------------------



{0: DatasetDict({
     train: Dataset({
         features: ['context', 'label'],
         num_rows: 710
     })
     test: Dataset({
         features: ['context', 'label'],
         num_rows: 79
     })
 }),
 1: DatasetDict({
     train: Dataset({
         features: ['context', 'label'],
         num_rows: 710
     })
     test: Dataset({
         features: ['context', 'label'],
         num_rows: 79
     })
 }),
 2: DatasetDict({
     train: Dataset({
         features: ['context', 'label'],
         num_rows: 710
     })
     test: Dataset({
         features: ['context', 'label'],
         num_rows: 79
     })
 }),
 3: DatasetDict({
     train: Dataset({
         features: ['context', 'label'],
         num_rows: 710
     })
     test: Dataset({
         features: ['context', 'label'],
         num_rows: 79
     })
 }),
 4: DatasetDict({
     train: Dataset({
         features: ['context', 'label'],
         num_rows: 711
     })
     test: Dataset({
         features: ['context', 

In [6]:
# show class distribution; at this point satd_type holds the integer codes
# assigned through label_mapping (0 = Requirement ... 5 = Documentation)
df['satd_type'].value_counts()

satd_type
0    321
1    207
2     84
3     82
4     80
5     15
Name: count, dtype: int64

In [7]:
# Pretrained checkpoint to fine-tune. HIDDEN_SIZE is the input width of the
# classification layer, so it has to be switched together with the checkpoint.
# checkpoint = "bert-base-uncased"; HIDDEN_SIZE = 768;
# checkpoint = "microsoft/codebert-base"; HIDDEN_SIZE = 768;
checkpoint = "google/flan-t5-small"; HIDDEN_SIZE = 512;
# checkpoint = "google/flan-t5-base"; HIDDEN_SIZE = 768;
# checkpoint = "google/flan-t5-large"; HIDDEN_SIZE = 1024;
# checkpoint = "google/flan-t5-xl"; HIDDEN_SIZE = 2048;

USE_LoRA = False
LOCAL_FILES_ONLY = False

# Maximum input length per dataset, and (BATCH_SIZE, LR) per checkpoint.
HYPERPARAMETERS = {
    'Maldonado62k': {
        'max_len': 128,
        'per_checkpoint': {
            'bert-base-uncased':       (32, 0.00001),
            'microsoft/codebert-base': (32, 0.00001),
            'google/flan-t5-small':    (32, 0.0001),
            'google/flan-t5-base':     (32, 0.0001),
            'google/flan-t5-large':    (16, 0.0001),
            'google/flan-t5-xl':       (4, 0.00002),
        },
    },
    'OBrien': {
        'max_len': 512,
        'per_checkpoint': {
            'bert-base-uncased':       (32, 0.00005),
            'microsoft/codebert-base': (32, 0.00005),
            'google/flan-t5-small':    (32, 0.001),
            'google/flan-t5-base':     (16, 0.0005),
            'google/flan-t5-large':    (4, 0.0002),
            'google/flan-t5-xl':       (1, 0.00005),
        },
    },
}

# Stop on an unknown dataset/checkpoint instead of printing a message and then
# running with undefined (or stale, from an earlier run) hyper-parameters.
if DATASET not in HYPERPARAMETERS:
    raise ValueError(f'UNKNOWN DATASET: {DATASET!r}')
if checkpoint not in HYPERPARAMETERS[DATASET]['per_checkpoint']:
    raise ValueError(f'No hyper-parameters defined for checkpoint {checkpoint!r} on dataset {DATASET!r}')

MAX_LEN = HYPERPARAMETERS[DATASET]['max_len']
BATCH_SIZE, LR = HYPERPARAMETERS[DATASET]['per_checkpoint'][checkpoint]
print('Dataset:',DATASET, '  MAX_LEN:', MAX_LEN, '  BATCH_SIZE:', BATCH_SIZE, '  USE_LoRA:', USE_LoRA, '  LR:', LR, '  Model:', checkpoint)

# torch_dtype is a model-loading option and has no meaning for a tokenizer,
# so it is no longer passed here.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, local_files_only=LOCAL_FILES_ONLY)
# The tokenizer attribute is `model_max_length`; assigning `model_max_len`
# only created an unused attribute.
tokenizer.model_max_length = MAX_LEN

Dataset: OBrien   MAX_LEN: 512   BATCH_SIZE: 32   USE_LoRA: False   LR: 0.001   Model: google/flan-t5-small


In [8]:
from transformers import T5Model
class CustomT5Model(T5Model):
    """T5Model whose forward() tolerates a ``labels`` keyword argument.

    Any ``labels`` entry is discarded and the remaining keyword arguments are
    forwarded unchanged to ``T5Model.forward``.
    """

    def forward(self, **kwargs):
        # drop 'labels' if it was supplied; everything else is passed through
        kwargs.pop('labels', None)
        return super().forward(**kwargs)

In [9]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings; applied by CustomModel only when USE_LoRA is True.
lora_config = LoraConfig(
    r=8, # Rank
    lora_alpha=32,
    target_modules=["q", "v"], # NOTE(review): presumably the T5 attention query/value projections - confirm for other checkpoints
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)


class CustomModel(nn.Module):
    """Pretrained transformer followed by dropout and a linear classifier.

    The classifier reads the hidden state at sequence position 0 of the
    model's ``last_hidden_state`` and maps it to ``num_labels`` logits.

    Args:
        checkpoint: Name or path of the pretrained model to load.
        num_labels: Number of output classes.
        seed: Seed applied to torch and numpy before the layers are created.

    Reads the notebook-level settings USE_LoRA, LOCAL_FILES_ONLY, HIDDEN_SIZE
    and lora_config.
    """

    def __init__(self,checkpoint,num_labels, seed):
        super().__init__()

        # Set the seed for reproducibility
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.num_labels = num_labels
        # Remember which forward path this instance needs, so forward() follows
        # the checkpoint that was actually loaded rather than whatever the
        # global `checkpoint` variable holds when it is called.
        self.is_t5 = 't5-' in checkpoint

        #Load Model with given checkpoint and extract its body
        config = AutoConfig.from_pretrained(checkpoint, output_attentions=True, output_hidden_states=True, local_files_only=LOCAL_FILES_ONLY)
        if USE_LoRA:
            self.model = get_peft_model(CustomT5Model.from_pretrained(checkpoint, config=config), lora_config)
        else:
            self.model = AutoModel.from_pretrained(checkpoint, config=config)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(HIDDEN_SIZE,num_labels) # load and initialize weights

    def forward(self, input_ids=None, attention_mask=None,labels=None):
        """Return loss (when ``labels`` is given) and classification logits.

        Args:
            input_ids: Token ids of the batch.
            attention_mask: Attention mask matching ``input_ids``.
            labels: Optional class indices; when given, cross-entropy loss is
                computed against the logits.

        Returns:
            TokenClassifierOutput with ``loss`` and ``logits``; for non-T5
            checkpoints it also carries ``hidden_states`` and ``attentions``.
        """
        if self.is_t5:
            # the input ids are also fed to the decoder
            # NOTE(review): for a seq2seq T5Model `last_hidden_state` looks
            # like the decoder output, not the encoder's - confirm this is
            # the intended feature before changing it.
            outputs = self.model(decoder_input_ids=input_ids, input_ids=input_ids, attention_mask=attention_mask)
        else:
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state # it is outputs[0]

        #Add custom layers
        sequence_output = self.dropout(last_hidden_state)

        # classify from the hidden state of the first sequence position
        logits = self.classifier(sequence_output[:,0,:].view(-1,HIDDEN_SIZE))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if self.is_t5:
            return TokenClassifierOutput(loss=loss, logits=logits)
        else:
            return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions) # orig


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/jovyan/conda-envs/llm_py310_torch2/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/jovyan/conda-envs/llm_py310_torch2/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [10]:
def generate_file_name():
    """Build the result-file prefix that encodes the current experiment settings.

    Reads the notebook-level settings DATASET, FRACTION, INPUT, checkpoint,
    ADD_CLASSIFICATION_LAYER, MAX_LEN, BATCH_SIZE, LR, SEED and USE_LoRA.

    Returns:
        Path prefix (without extension) under 'Adding_Custom_Layers_Results/'.
    """
    parts = ['Adding_Custom_Layers_Results/' + DATASET]
    # the training fraction is only mentioned when it is not the full set
    if FRACTION != 1:
        parts.append('_' + str(FRACTION))
    # keep only the model name, without the organisation prefix
    parts.append('_Input-' + INPUT + '_' + checkpoint.split('/')[-1])
    parts.append('_wCL' if ADD_CLASSIFICATION_LAYER else '_wInf')
    parts.append('_maxlen' + str(MAX_LEN) + '_bs' + str(BATCH_SIZE) + '_lr' + str(LR) + '_seed' + str(SEED))
    if USE_LoRA:
        parts.append('_lora')
    return ''.join(parts)

In [11]:
num_epochs = 8 # number of passes over the training split; the test split is scored after each one
SEED = 1 # we use 1,2, and 3
ADD_CLASSIFICATION_LAYER = True # replace the last layer with a classification layer (RQ1 and RQ4: True, RQ3: False)
# manual switch: set the condition to True to override the per-checkpoint defaults
if False: # check other values for hyper parameters
    LR = 0.0001
    BATCH_SIZE = 1

In [12]:
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from transformers import AdamW,get_scheduler, AutoModelForSeq2SeqLM
from datasets import load_metric
import torch.nn.functional as F

LABEL_MAX_LEN = 5
def tokenize(batch):
    """Tokenize the text column of a batch, and its labels when needed.

    With ADD_CLASSIFICATION_LAYER set, only the text is tokenized. Otherwise
    each label is converted to a string, tokenized and padded to
    LABEL_MAX_LEN, and the resulting ids are stored under "labels".
    """
    encoded = tokenizer(batch[TEXT_COLUMN], truncation=True, max_length=MAX_LEN)
    if not ADD_CLASSIFICATION_LAYER:
        label_texts = list(map(str, batch[LABEL_COLUMN]))
        encoded_labels = tokenizer(label_texts, truncation=True, max_length=LABEL_MAX_LEN, padding='max_length', return_tensors='pt')
        encoded["labels"] = encoded_labels["input_ids"]
    return encoded
    
# Fine-tune one fresh model per entry of `dataset` (a project or a fold) and
# score it on that entry's test split after every epoch.
metric = load_metric(METRIC) # f1 or accuracy

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print('Run the experiments for: ' + generate_file_name())


# All dictionaries below are keyed by project name / fold id.
valid_results = {}   # initialised per key but never filled in this cell
valid_losses = {}    # initialised per key but never filled in this cell
test_results = {}    # one metric.compute() result per epoch
projects_real = {}   # reference labels of the test split (last epoch only)
projects_pred = {}   # predicted labels of the test split (last epoch only)


for project_name, data in dataset.items():
    print('=======================================')
    print('------', project_name, '------')
    torch.cuda.empty_cache()
    gc.collect()
    valid_results[project_name] = []
    valid_losses[project_name] = []
    test_results[project_name] = []
    projects_real[project_name] = []
    projects_pred[project_name] = []
    # distinct training labels as strings; only used in the generation branch
    # below to detect generated texts that are not a known label
    labels = [str(x) for x in list(set(dataset[project_name]['train']['label']))]
    
    # a new model is created for every project / fold
    if ADD_CLASSIFICATION_LAYER:
        model = CustomModel(checkpoint=checkpoint,num_labels=NUM_LABELS, seed=SEED).to(device)
    else:
        if 't5' in checkpoint:
            model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, config=AutoConfig.from_pretrained(checkpoint, output_attentions=True, output_hidden_states=True, local_files_only=LOCAL_FILES_ONLY)).to(device)
        else:
            # NOTE(review): the generation branch below calls model.generate() and reads outputs.loss;
            # confirm a plain AutoModel supports both before using a non-T5 checkpoint here.
            # model = AutoModel.from_pretrained(checkpoint, config=AutoConfig.from_pretrained(checkpoint, trust_remote_code=True, output_attentions=True,output_hidden_states=True, local_files_only=LOCAL_FILES_ONLY)).to(device)
            model = AutoModel.from_pretrained(checkpoint, config=AutoConfig.from_pretrained(checkpoint, trust_remote_code=True, output_attentions=True,output_hidden_states=True, local_files_only=LOCAL_FILES_ONLY)).to(device)
            # model = CustomT5Model.from_pretrained(checkpoint, config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True, local_files_only=LOCAL_FILES_ONLY)).to(device)

    # show the model precision (manual debugging switch)
    if False:
        first_param = next(model.parameters())
        print(f"Data type of the first parameter: {first_param.dtype}")
    
    optimizer = AdamW(model.parameters(), lr=LR) 

    tokenized_dataset = data.map(tokenize, batched=True)     
    
    # manual debugging switch: report how many inputs reach MAX_LEN, then stop
    if False:
        # print([len(row['input_ids']) for row in tokenized_dataset['train']])
        print(f'The number of items in the tokenized dataset that their length is {MAX_LEN}. Larger items are also truncated to {MAX_LEN} tokens.')
        print(sum(len(row['input_ids']) == MAX_LEN for row in tokenized_dataset['train']), 'of', len(tokenized_dataset['train']), 'in train dataset')
        print(sum(len(row['input_ids']) == MAX_LEN for row in tokenized_dataset['test']), 'of', len(tokenized_dataset['test']), 'in test dataset')
        raise SystemExit("Stopping the notebook cell execution here.")
    # keep only the tensors the model consumes: the integer label column for
    # the classification head, or the tokenized "labels" ids for generation
    if ADD_CLASSIFICATION_LAYER:
        tokenized_dataset.set_format("torch",columns=["input_ids", "attention_mask", LABEL_COLUMN])
    else:
        tokenized_dataset.set_format("torch",columns=["input_ids", "attention_mask", "labels"])
    
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # shuffle=False: every epoch sees the training rows in the same order (the
    # order produced by the one-off shuffle done when the split was created)
    train_dataloader = DataLoader(tokenized_dataset["train"], shuffle=False, batch_size=BATCH_SIZE, collate_fn=data_collator) # ??? shuffle=True
    test_dataloader = DataLoader(tokenized_dataset["test"], batch_size=BATCH_SIZE, collate_fn=data_collator)

    # linear schedule with warm-up over the first 10% of all optimizer steps
    num_training_steps = num_epochs * len(train_dataloader)
    num_warmup_steps = int(0.1 * num_training_steps)
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
    
    i = 0        # optimizer steps taken so far, across epochs
    losses = []  # never filled in this cell
    for epoch in range(num_epochs):
        model.train()
        for batch in train_dataloader:
            i+=1
            batch = {k: v.to(device) for k, v in batch.items()}            
            if ADD_CLASSIFICATION_LAYER:
                # NOTE(review): the dataset column is LABEL_COLUMN ('label') while CustomModel.forward
                # takes `labels`; presumably the data collator renames it - confirm.
                outputs = model(**batch)
                loss = outputs.loss
            else:
                inputs = {k: v for k, v in batch.items() if k != "labels"}
                outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=batch['labels'])
                loss = outputs.loss            
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            if i%1000==0:
                print(i, 'of', num_training_steps)
        # Free GPU memory
        # print('step',i,'of',num_training_steps)
        print('epoch',epoch+1,'of',num_epochs)
        del batch  # Delete the batch tensor to free GPU memory
        torch.cuda.empty_cache()
        gc.collect()
        time.sleep(3)

        # score the test split after every epoch
        model.eval()

        for batch in test_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                if ADD_CLASSIFICATION_LAYER:
                    outputs = model(**batch)
                    batch_predictions = torch.argmax(outputs.logits, dim=-1)
                    metric.add_batch(predictions=batch_predictions, references=batch["labels"])
                else:
                    # Use the generate method for sequence-to-sequence models
                    generated_ids = model.generate(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], max_length=LABEL_MAX_LEN)
                    pred_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]
                    label_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch["labels"]]
                    # any generated text that is not a known label counts as class '0'
                    pred_texts = ['0' if p not in labels else p for p in pred_texts]
                    metric.add_batch(predictions=pred_texts, references=label_texts)
            # if it is the last epoch, save the predictions
            if epoch==num_epochs-1:
                if ADD_CLASSIFICATION_LAYER:
                    projects_real[project_name] += batch["labels"].tolist()                
                    projects_pred[project_name] += batch_predictions.tolist()
                else:
                    projects_real[project_name] += label_texts                
                    projects_pred[project_name] += pred_texts                    
        # compute() returns the score over all batches added since the last call
        test_results[project_name].append(metric.compute())
        print(f"Test:{test_results[project_name][-1][METRIC]:.3f}")
        model.train()

  metric = load_metric(METRIC) # f1 or accuracy


Run the experiments for: Adding_Custom_Layers_Results/OBrien_Input-ct_flan-t5-small_wCL_maxlen512_bs32_lr0.001_seed1
------ 0 ------


Some weights of the model checkpoint at google/flan-t5-small were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 710/710 [00:00<00:00, 28318.06 examples/s]
Map: 100%|██████████| 79/79 [00:00<00:00, 12412.44 examples/s]
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 1 of 8
Test:0.443
epoch 2 of 8
Test:0.456
epoch 3 of 8
Test:0.443
epoch 4 of 8
Test:0.506
epoch 5 of 8
Test:0.595
epoch 6 of 8
Test:0.633
epoch 7 of 8
Test:0.608
epoch 8 of 8
Test:0.633
------ 1 ------


Some weights of the model checkpoint at google/flan-t5-small were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 710/710 [00:00<00:00, 35756.21 examples/s]
Map: 100%|██████████| 79/79 [00:00<00:00, 16268.97 examples/s]


epoch 1 of 8
Test:0.468
epoch 2 of 8
Test:0.456
epoch 3 of 8
Test:0.506
epoch 4 of 8
Test:0.392
epoch 5 of 8
Test:0.481
epoch 6 of 8
Test:0.418
epoch 7 of 8
Test:0.430
epoch 8 of 8
Test:0.468
------ 2 ------


Some weights of the model checkpoint at google/flan-t5-small were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 710/710 [00:00<00:00, 37917.87 examples/s]
Map: 100%|██████████| 79/79 [00:00<00:00, 15270.99 examples/s]


epoch 1 of 8
Test:0.367
epoch 2 of 8
Test:0.380
epoch 3 of 8
Test:0.354
epoch 4 of 8
Test:0.468
epoch 5 of 8
Test:0.519
epoch 6 of 8
Test:0.557
epoch 7 of 8
Test:0.557
epoch 8 of 8
Test:0.557
------ 3 ------


Some weights of the model checkpoint at google/flan-t5-small were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 710/710 [00:00<00:00, 42781.19 examples/s]
Map: 100%|██████████| 79/79 [00:00<00:00, 9893.41 examples/s]


epoch 1 of 8
Test:0.342
epoch 2 of 8
Test:0.354
epoch 3 of 8
Test:0.392
epoch 4 of 8
Test:0.468
epoch 5 of 8
Test:0.494
epoch 6 of 8
Test:0.456
epoch 7 of 8
Test:0.481
epoch 8 of 8
Test:0.557
------ 4 ------


Some weights of the model checkpoint at google/flan-t5-small were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 711/711 [00:00<00:00, 39652.56 examples/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 17426.00 examples/s]


epoch 1 of 8
Test:0.423
epoch 2 of 8
Test:0.487
epoch 3 of 8
Test:0.538
epoch 4 of 8
Test:0.551
epoch 5 of 8
Test:0.590
epoch 6 of 8
Test:0.538
epoch 7 of 8
Test:0.564
epoch 8 of 8
Test:0.526
------ 5 ------


Some weights of the model checkpoint at google/flan-t5-small were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 710/710 [00:00<00:00, 41033.37 examples/s]
Map: 100%|██████████| 79/79 [00:00<00:00, 17453.25 examples/s]


epoch 1 of 8
Test:0.443
epoch 2 of 8
Test:0.468
epoch 3 of 8
Test:0.430
epoch 4 of 8
Test:0.519
epoch 5 of 8
Test:0.532
epoch 6 of 8
Test:0.544
epoch 7 of 8
Test:0.557
epoch 8 of 8
Test:0.557
------ 6 ------


Some weights of the model checkpoint at google/flan-t5-small were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 710/710 [00:00<00:00, 44790.72 examples/s]
Map: 100%|██████████| 79/79 [00:00<00:00, 16563.36 examples/s]


epoch 1 of 8
Test:0.418
epoch 2 of 8
Test:0.392
epoch 3 of 8
Test:0.430
epoch 4 of 8
Test:0.443
epoch 5 of 8
Test:0.557
epoch 6 of 8
Test:0.532
epoch 7 of 8
Test:0.557
epoch 8 of 8
Test:0.570
------ 7 ------


Some weights of the model checkpoint at google/flan-t5-small were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 710/710 [00:00<00:00, 43799.27 examples/s]
Map: 100%|██████████| 79/79 [00:00<00:00, 19162.04 examples/s]


epoch 1 of 8
Test:0.405
epoch 2 of 8
Test:0.392
epoch 3 of 8
Test:0.430
epoch 4 of 8
Test:0.532
epoch 5 of 8
Test:0.557
epoch 6 of 8
Test:0.570
epoch 7 of 8
Test:0.557
epoch 8 of 8
Test:0.570
------ 8 ------


Some weights of the model checkpoint at google/flan-t5-small were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 710/710 [00:00<00:00, 45299.68 examples/s]
Map: 100%|██████████| 79/79 [00:00<00:00, 17256.02 examples/s]


epoch 1 of 8
Test:0.342
epoch 2 of 8
Test:0.418
epoch 3 of 8
Test:0.430
epoch 4 of 8
Test:0.443
epoch 5 of 8
Test:0.443
epoch 6 of 8
Test:0.430
epoch 7 of 8
Test:0.456
epoch 8 of 8
Test:0.443
------ 9 ------


Some weights of the model checkpoint at google/flan-t5-small were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 710/710 [00:00<00:00, 42731.47 examples/s]
Map: 100%|██████████| 79/79 [00:00<00:00, 16505.60 examples/s]


epoch 1 of 8
Test:0.392
epoch 2 of 8
Test:0.405
epoch 3 of 8
Test:0.443
epoch 4 of 8
Test:0.443
epoch 5 of 8
Test:0.443
epoch 6 of 8
Test:0.468
epoch 7 of 8
Test:0.519
epoch 8 of 8
Test:0.519


In [13]:
# extract and show f1 score over epochs
# one row per project/fold, one column per epoch, holding the METRIC score
per_epoch_scores = {}
for key, history in test_results.items():
    per_epoch_scores[key] = [epoch_result[METRIC] for epoch_result in history]
results_df = pd.DataFrame(per_epoch_scores).T
results_df.columns = ['Epoch' + str(epoch_no) for epoch_no in range(1, num_epochs + 1)]
# extra last row with the column-wise mean of every epoch
results_df.loc['Mean'] = results_df.mean()
results_df

Unnamed: 0,Epoch1,Epoch2,Epoch3,Epoch4,Epoch5,Epoch6,Epoch7,Epoch8
0,0.443038,0.455696,0.443038,0.506329,0.594937,0.632911,0.607595,0.632911
1,0.468354,0.455696,0.506329,0.392405,0.481013,0.417722,0.43038,0.468354
2,0.367089,0.379747,0.35443,0.468354,0.518987,0.556962,0.556962,0.556962
3,0.341772,0.35443,0.392405,0.468354,0.493671,0.455696,0.481013,0.556962
4,0.423077,0.487179,0.538462,0.551282,0.589744,0.538462,0.564103,0.525641
5,0.443038,0.468354,0.43038,0.518987,0.531646,0.544304,0.556962,0.556962
6,0.417722,0.392405,0.43038,0.443038,0.556962,0.531646,0.556962,0.56962
7,0.405063,0.392405,0.43038,0.531646,0.556962,0.56962,0.556962,0.56962
8,0.341772,0.417722,0.43038,0.443038,0.443038,0.43038,0.455696,0.443038
9,0.392405,0.405063,0.443038,0.443038,0.443038,0.468354,0.518987,0.518987


In [14]:
# save f1 score to csv file
# `file_name` is reused by the confusion-matrix cell, so it stays a bare prefix.
file_name = generate_file_name()
# The suffix is '_F1' regardless of METRIC (it also holds accuracy scores).
results_path = file_name+'_F1.csv'
results_df.to_csv(results_path)
# report the path that was actually written (the message used to drop '_F1')
print('The results saved in', results_path)

The results saved in Adding_Custom_Layers_Results/OBrien_Input-ct_flan-t5-small_wCL_maxlen512_bs32_lr0.001_seed1.csv


In [15]:
# show precision, recall, f1, and confusion matrix
# save the result to [...]_confmat.txt file
def get_confmat_str(real, pred, labels):
    """Render the confusion matrix of `real` vs. `pred` as an aligned text table.

    Parameters:
        real: true labels, passed as the first argument of confusion_matrix.
        pred: predicted labels, passed as the second argument of confusion_matrix.
        labels: list of string labels; fixes the order of rows and columns.

    Returns:
        A multi-line string: a header line with the labels, then one line per
        label with that label followed by its row of counts. Every cell is
        left-justified to a common width and every line ends with a newline.
    """
    matrix = confusion_matrix(real, pred, labels=labels)
    # common cell width: the longest label, but never narrower than 5 characters
    width = max(5, max(map(len, labels), default=0))

    # header: an empty corner cell followed by the column labels
    header_cells = [name.ljust(width) for name in labels]
    lines = [" " * width + " " + " ".join(header_cells)]

    # one line per label: the label itself, then the counts of its matrix row
    for name, counts in zip(labels, matrix):
        cells = " ".join(str(count).ljust(width) for count in counts)
        lines.append(name.ljust(width) + " " + cells)

    return "\n".join(lines) + "\n"

# Readable class names for the integer class ids stored in projects_real /
# projects_pred (the ids are looked up as label_mapping[int(label)] below).
if DATASET=='OBrien':
    label_mapping = {
        0: 'Reqmnt', # Requirement
        1: 'Code',
        2: 'M&T',
        3: 'Defect',
        4: 'Design',
        5:'Doc' # Documentation
    }
elif DATASET=='Maldonado62k':
    label_mapping = {
        0: '0',
        1: '1'
    }
else:
    # Stop here: continuing without a mapping would either fail later with a
    # NameError or silently reuse a label_mapping left over from a previous run.
    raise ValueError(f"Unsupported DATASET {DATASET!r}: expected 'OBrien' or 'Maldonado62k'")

# Labels of all projects are pooled in these lists for the overall report.
all_real, all_pred = [], []
# row/column order of every confusion matrix
labels = list(label_mapping.values())

# Everything shown on screen is also written to <file_name>_confmat.txt.
with open(file_name+'_confmat.txt', "w") as output_file:
    for project in projects_pred.keys():
        print('---------- '+str(project)+' ----------')
        output_file.write('\n---------- '+str(project)+' ----------\n')

        # translate the stored class ids into their readable names
        real = [label_mapping[int(class_id)] for class_id in projects_real[project]]
        pred = [label_mapping[int(class_id)] for class_id in projects_pred[project]]

        # precision / recall / F1 of this project (computed once, emitted twice)
        report = classification_report(real, pred, zero_division=0, digits=3)
        print(report)
        output_file.write(report + '\n')

        # confusion matrix with labels on rows and columns
        confmat_str = get_confmat_str(real, pred, labels=labels)
        print(confmat_str)
        output_file.write(confmat_str)

        # keep this project's labels for the overall figures
        all_real.extend(real)
        all_pred.extend(pred)

    print('=========== Overall ==========')
    output_file.write('\n=========== Overall ==========\n')

    # precision, recall and F1 over the pooled data of all projects
    report = classification_report(all_real, all_pred, zero_division=0, digits=3)
    print(report)
    output_file.write(report+"\n")

    # confusion matrix over the pooled data of all projects
    confmat_str = get_confmat_str(all_real, all_pred, labels=labels)
    print(confmat_str)
    output_file.write(confmat_str)


---------- 0 ----------
              precision    recall  f1-score   support

        Code      0.679     0.792     0.731        24
      Defect      0.000     0.000     0.000         4
      Design      0.000     0.000     0.000         9
         M&T      0.500     0.429     0.462         7
      Reqmnt      0.636     0.800     0.709        35

    accuracy                          0.633        79
   macro avg      0.363     0.404     0.380        79
weighted avg      0.532     0.633     0.577        79

       Reqmnt Code   M&T    Defect Design Doc   
Reqmnt 28     5      2      0      0      0     
Code   5      19     0      0      0      0     
M&T    3      0      3      1      0      0     
Defect 2      2      0      0      0      0     
Design 6      2      1      0      0      0     
Doc    0      0      0      0      0      0     

---------- 1 ----------
              precision    recall  f1-score   support

        Code      0.111     0.333     0.167         9
      Defe

In [16]:
# extract real and pred labels and save to csv
# One output row per test sample: project key, input text, real label, predicted label.
real = []
pred = []
context = []
project = []
for proj in projects_pred.keys():
    # Look the text column up once per project instead of once per sample.
    # NOTE(review): dataset[proj]['test'] is presumably a Hugging Face Dataset,
    # where column access rebuilds the whole column on every call - confirm.
    test_texts = dataset[proj]['test'][TEXT_COLUMN]
    proj_real = projects_real[proj]
    proj_pred = projects_pred[proj]
    for i in range(len(proj_real)):
        project.append(proj)
        context.append(test_texts[i])
        # class ids are converted to readable names through label_mapping
        real.append(label_mapping[int(proj_real[i])])
        pred.append(label_mapping[int(proj_pred[i])])
test_result_df = pd.DataFrame({'project': project, 'context':context, 'real': real, 'pred': pred})
# save to csv file
test_result_df.to_csv(file_name+'_pred.csv', index=False)

In [None]:
##########################################################################

# Show the results

In [40]:
#  show the average of epochs over seeds for the OBrien dataset
# Note: for the Maldonado dataset, please check the corresponding csv file in the Paper_Results folder

import pandas as pd
wCL = True # set True if with classification layer

# models_param maps each model name to the batch-size / learning-rate tag that
# appears in its result file names; filepath is the folder holding those files.
if not wCL:
    filepath = 'Paper_Results/OBrien-finetune-original-architecture/'
    models_param = {
        'flan-t5-small':'bs32_lr0.005',
        'flan-t5-base':'bs32_lr0.002',
        'flan-t5-large':'bs8_lr0.0005',
        'flan-t5-xl':'bs1_lr0.0001'
    }
else:
    filepath = 'Paper_Results/OBrien-finetune-with-classification-layer/'
    models_param = {
        'bert-base-uncased':'bs32_lr5e-05',
        'codebert-base':'bs32_lr5e-05',
        'flan-t5-small':'bs32_lr0.001',
        'flan-t5-base':'bs16_lr0.0005',
        'flan-t5-large':'bs4_lr0.0002',
        'flan-t5-xl':'bs1_lr5e-05'
    }


def _f1_csv_path(input_data, model, param, seed):
    """Return the path of the *_F1.csv file of one run for the current wCL setting."""
    if wCL:
        # results with a classification layer live in one sub-folder per input type
        return f"{filepath}{input_data}/OBrien_Input-{input_data}_{model}_wCL_maxlen512_{param}_seed{seed}_F1.csv"
    return f"{filepath}/OBrien_Input-{input_data}_{model}_wInf_maxlen512_{param}_seed{seed}_F1.csv"


for input_data in ['ct', 'fp+ct', 'fp+cms+ct', 'fp+ct+cmb']: # 'ct' 'fp+ct' 'fp+cms+ct' 'fp+ct+cmb'
    modelrows = []
    for model, param in models_param.items():
        # Keep only the last row of each seed's table; in files written by the
        # save cell of this notebook that is the 'Mean' row.
        seedrows = [
            pd.read_csv(_f1_csv_path(input_data, model, param, seed), index_col=False)
            .drop(columns=['Unnamed: 0'])
            .tail(1)
            for seed in [1, 2, 3]
        ]

        # Average the per-seed rows and store the result as a row named after the model
        mean_seeds = pd.concat(seedrows, axis=0)
        mean_seeds.loc[model] = mean_seeds.mean()
        modelrows.append(mean_seeds.tail(1))

    # one row per model, one column per epoch
    average_of_seeds_df =  pd.concat(modelrows, axis=0)

    # Save the DataFrame to a new CSV file
    # average_of_seeds_df.to_csv(f'{filepath}{input_data}.csv')
    print("\n\nInput data:", input_data)
    display(average_of_seeds_df)



Input data: ct


Unnamed: 0,Epoch1,Epoch2,Epoch3,Epoch4,Epoch5,Epoch6,Epoch7,Epoch8
bert-base-uncased,0.405626,0.477864,0.557644,0.558498,0.557627,0.563957,0.573672,0.576187
codebert-base,0.398848,0.453348,0.545937,0.591453,0.597793,0.592291,0.603327,0.611338
flan-t5-small,0.399264,0.42167,0.437764,0.466039,0.504896,0.51374,0.529368,0.537385
flan-t5-base,0.413226,0.436887,0.486336,0.524797,0.560592,0.549205,0.542892,0.564811
flan-t5-large,0.415331,0.430959,0.483371,0.52688,0.554333,0.569994,0.572476,0.575441
flan-t5-xl,0.417029,0.531581,0.576258,0.585962,0.59564,0.607476,0.617603,0.619712




Input data: fp+ct


Unnamed: 0,Epoch1,Epoch2,Epoch3,Epoch4,Epoch5,Epoch6,Epoch7,Epoch8
bert-base-uncased,0.402651,0.46347,0.551682,0.563069,0.564346,0.561868,0.563973,0.577886
codebert-base,0.367624,0.390831,0.435676,0.499059,0.565276,0.581305,0.586384,0.606665
flan-t5-small,0.390831,0.406865,0.406865,0.428822,0.466434,0.480358,0.509959,0.520091
flan-t5-base,0.406865,0.408552,0.414882,0.470659,0.522152,0.535697,0.544147,0.559775
flan-t5-large,0.385768,0.414903,0.483744,0.539549,0.579211,0.580894,0.592286,0.599475
flan-t5-xl,0.394634,0.470264,0.576263,0.610419,0.634096,0.62477,0.630721,0.648015




Input data: fp+cms+ct


Unnamed: 0,Epoch1,Epoch2,Epoch3,Epoch4,Epoch5,Epoch6,Epoch7,Epoch8
bert-base-uncased,0.398015,0.427605,0.519999,0.54868,0.53983,0.55251,0.553348,0.566861
codebert-base,0.379438,0.375641,0.413205,0.444948,0.505409,0.540836,0.558969,0.571643
flan-t5-small,0.392097,0.406865,0.406865,0.419983,0.439008,0.472774,0.491794,0.505739
flan-t5-base,0.406443,0.407709,0.403067,0.453776,0.495548,0.524327,0.534058,0.546257
flan-t5-large,0.391253,0.423742,0.467722,0.531516,0.56024,0.580055,0.592308,0.59023
flan-t5-xl,0.377329,0.445278,0.553029,0.609553,0.616796,0.626907,0.627718,0.628156




Input data: fp+ct+cmb


Unnamed: 0,Epoch1,Epoch2,Epoch3,Epoch4,Epoch5,Epoch6,Epoch7,Epoch8
bert-base-uncased,0.391675,0.388721,0.390825,0.449048,0.46889,0.487926,0.484545,0.495948
codebert-base,0.381126,0.370578,0.403911,0.406026,0.417889,0.450427,0.475349,0.492221
flan-t5-small,0.383236,0.406865,0.406865,0.406865,0.412772,0.421638,0.433463,0.436428
flan-t5-base,0.406865,0.406443,0.403489,0.409396,0.425068,0.449535,0.46645,0.468116
flan-t5-large,0.388299,0.410662,0.416991,0.450352,0.484134,0.503971,0.539062,0.545418
flan-t5-xl,0.374797,0.408136,0.471476,0.511171,0.566899,0.618019,0.630255,0.635324


In [29]:
def extract_overall_f1_scores(file_path):
    """Read the per-class F1 scores of the "Overall" section of a report file.

    Lines are ignored until one containing "== Overall ==" is seen. After that,
    every line that splits into exactly five whitespace-separated fields is
    taken as a class row: the first field is the class name and the fourth
    field is parsed as its F1 score. Reading stops at the first blank line
    that follows at least one collected score.

    Parameters:
        file_path: path of the text file to read.

    Returns:
        dict mapping class name to F1 score (float); empty if the section
        marker never appears.
    """
    f1_scores = {}
    in_overall = False
    with open(file_path, 'r') as file:
        for line in file:
            if "== Overall ==" in line:
                # section marker: start collecting from the next line on
                in_overall = True
            elif in_overall:
                fields = line.split()
                # a blank line after the class rows closes the table
                if not fields and f1_scores:
                    break
                # class rows have five fields; summary rows such as
                # "accuracy" or "macro avg" have a different count
                if len(fields) == 5:
                    f1_scores[fields[0]] = float(fields[3])
    return f1_scores

def get_average_for_each_key(dict_list):
    """Average the values of a list of dictionaries key by key.

    Parameters:
        dict_list: list of dictionaries with numeric values.

    Returns:
        dict with every key seen in any input dictionary; its value is the sum
        of that key's values divided by len(dict_list). The divisor is the
        number of dictionaries, so a key missing from one of them counts as 0
        there. An empty list gives an empty dict.
    """
    totals = {}
    for scores in dict_list:
        for key in scores:
            totals[key] = totals.get(key, 0) + scores[key]

    count = len(dict_list)
    return {key: total / count for key, total in totals.items()}


In [35]:
#  show the average of f1 scores for each class over seeds (OBrien dataset)

import pandas as pd

# model name -> batch-size / learning-rate tag used in the result file names
models_param = {
    'bert-base-uncased':'bs32_lr5e-05',
    'codebert-base':'bs32_lr5e-05',
    'flan-t5-small':'bs32_lr0.001',
    'flan-t5-base':'bs16_lr0.0005',
    'flan-t5-large':'bs4_lr0.0002',
    'flan-t5-xl':'bs1_lr5e-05'
}

filepath = 'Paper_Results/OBrien-finetune-with-classification-layer/'

# NOTE: the table is built after this loop, so if more than one input type is
# listed only the scores of the last one end up in it.
for input_data in ['ct']: # 'ct' 'fp+ct' 'fp+cms+ct' 'fp+ct+cmb'
    model_f1s = []
    for model, param in models_param.items():
        # per-class F1 of the "Overall" section, one dictionary per seed
        seed_f1s = [
            extract_overall_f1_scores(f"{filepath}{input_data}/OBrien_Input-{input_data}_{model}_wCL_maxlen512_{param}_seed{seed}_confmat.txt")
            for seed in [1, 2, 3]
        ]
        # mean over seeds, tagged with the model name so it can become the index
        model_row = get_average_for_each_key(seed_f1s)
        model_row['model'] = model
        model_f1s.append(model_row)

# one row per model, one column per class
class_f1_average_of_seeds_df = pd.DataFrame(model_f1s).set_index('model')

# Save the DataFrame to a new CSV file
# class_f1_average_of_seeds_df.to_csv(f'{filepath}{input_data}_classf1.csv')

class_f1_average_of_seeds_df

Unnamed: 0_level_0,Code,Defect,Design,Doc,M&T,Reqmnt
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bert-base-uncased,0.585333,0.367,0.312333,0.170333,0.633667,0.662333
codebert-base,0.636,0.398,0.423333,0.176333,0.610333,0.687
flan-t5-small,0.532667,0.119667,0.074,0.0,0.6,0.659333
flan-t5-base,0.600333,0.378333,0.299333,0.0,0.593333,0.653667
flan-t5-large,0.585667,0.417667,0.328667,0.23,0.598333,0.661667
flan-t5-xl,0.650667,0.433,0.475333,0.488,0.618333,0.684333


In [34]:
#  Voting over predictions (OBrien dataset)

import pandas as pd

# model name -> batch-size / learning-rate tag used in the result file names;
# un-comment entries to let more models take part in the vote
models_param = {
#     'bert-base-uncased':'bs32_lr5e-05',
#     'codebert-base':'bs32_lr5e-05',
#     'flan-t5-small':'bs32_lr0.001',
#     'flan-t5-base':'bs16_lr0.0005',
#     'flan-t5-large':'bs4_lr0.0002',
    'flan-t5-xl':'bs1_lr5e-05'
}

filepath = 'Paper_Results/OBrien-finetune-with-classification-layer/'

# Load the prediction file of every (input type, model, seed) run, in that nesting order.
df_pred_list = [
    pd.read_csv(f"{filepath}{input_data}/OBrien_Input-{input_data}_{model}_wCL_maxlen512_{param}_seed{seed}_pred.csv")
    for input_data in ['ct', 'fp+ct', 'fp+cms+ct', 'fp+ct+cmb']
    for model, param in models_param.items()
    for seed in [1, 2, 3]
]

# Put all runs side by side; selecting ['pred'] then returns the 'pred' column
# of every run.
# NOTE(review): this assumes all files list the samples in the same row order - confirm.
df_pred_concat = pd.concat(df_pred_list, axis=1)
votes = df_pred_concat[['pred']]

# Row-wise most frequent prediction; column 0 of the mode result is used when
# several labels are tied.
majority_pred = votes.mode(axis=1)[0]

# project / context / real are copied from the run at position 6 of the list
reference_run = df_pred_list[6]
df_pred_voted = pd.DataFrame({
    'project': reference_run['project'],
    'context': reference_run['context'],
    'real': reference_run['real'],
    'pred': majority_pred
})

print(classification_report(df_pred_voted['real'], df_pred_voted['pred'], zero_division=0, digits=3))

              precision    recall  f1-score   support

        Code      0.668     0.700     0.684       207
      Defect      0.507     0.439     0.471        82
      Design      0.641     0.512     0.569        80
         Doc      0.700     0.467     0.560        15
         M&T      0.696     0.655     0.675        84
      Reqmnt      0.698     0.757     0.726       321

    accuracy                          0.668       789
   macro avg      0.652     0.588     0.614       789
weighted avg      0.664     0.668     0.664       789



In [None]:
##########################################################################