# SATD identification and classification by LLMs and in-context learning (RQ2)

In [1]:
from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
from sentence_transformers import SentenceTransformer
import sentence_transformers
import torch
import torch.nn as nn
import pandas as pd
import gc
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import random
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# add context and prompt_context colums to the dataset
def add_context(df, inputs):
    context = [] # to be used in SentenceTransform
    prompt_context = [] # to be used in prompt generation
    for _,row in df.iterrows():
        if inputs == 'ct':
            if DATASET=='Maldonado62k':
                context.append(row['comment_text'])
                prompt_context.append('### Comment text: """ ' + row['comment_text'] + ' """')
            else:
                context.append(row['comment_text'])
                prompt_context.append('### Technical debt comment: """ ' + row['comment_text'] + ' """')
        elif inputs == 'fp+ct':
            context.append(row['file_path'] + '\n' + row['comment_text'])
            prompt_context.append('### file path: ' + row['file_path'] + '\n' +
                       '### Technical debt comment: """ ' + row['comment_text'] + ' """')
        elif inputs == 'fp+cms+ct':
            context.append(row['file_path'] + '\n' + str(row['containing_method_signature']) + '\n' + row['comment_text'])
            prompt_context.append('### file path: ' + row['file_path'] + '\n' +
                       '### Containing method signature: """ ' + str(row['containing_method_signature']) + ' """\n' +
                       '### Technical debt comment: """ ' + row['comment_text'] + ' """')
        elif inputs == 'fp+ct+cmb':
            context.append(row['file_path'] + '\n' + row['comment_text'] + '\n' + str(row['containing_method']))
            prompt_context.append('### file path: ' + row['file_path'] + '\n' +
                       '### Technical debt comment: """ ' + row['comment_text'] + ' """\n' +
                       '### Containing method: """ ' + str(row['containing_method']).replace('"""',"'''") + ' """')
        else:
            print('ERROR!')

    df['context'] = context
    df['prompt_context'] = prompt_context
    return df

In [27]:
# read and prepare the Maldonado dataset (SATD identification)

df = pd.read_csv('Dataset/Maldonado-62k/maldonado_corrected.csv')
df['satd_str'] = df['satd'].apply(lambda x: 'SATD' if x == 1 else 'Not-SATD')
print(len(df))
print(df.columns)

DATASET = 'Maldonado62k'

INPUT = 'ct' # in Maldonado dataset there is only one input feature: comment text

df = add_context(df, INPUT)

print('\n-------------- An example of input data - context ---------------\n')
print(df.context[4])
print('\n-------------- An example of input data - prompt_context ---------------\n')
print(df.prompt_context[4])
print('-------------------------------------------------------\n')

df = df[['context','prompt_context','satd_str','project_name']]

# for each project, split data to train and test and save it in a dataset
dataset = {}
for project_name in sorted(set(df['project_name'])):
    test_df = df[df['project_name'] == project_name]
    train_df = df[df['project_name'] != project_name]
    train_df = train_df.drop_duplicates(subset='context') # remove duplicates from train
    train_df = train_df.sample(frac=1, random_state=42) # shuffle train

    data = DatasetDict({"train": Dataset.from_pandas(train_df), "test": Dataset.from_pandas(test_df)})
    data=data.rename_column("satd_str","label")
    data=data.remove_columns(['project_name','__index_level_0__'])
    dataset[project_name] = data

METRIC = 'accuracy'

dataset

62275
Index(['project_name', 'classification', 'comment_text', 'satd_orig', 'satd',
       'satd_str'],
      dtype='object')

-------------- An example of input data - context ---------------

// the generated classes must not be added in the generic JAR! // is that buggy on old JOnAS (2.4) ??

-------------- An example of input data - prompt_context ---------------

### Comment text: """ // the generated classes must not be added in the generic JAR! // is that buggy on old JOnAS (2.4) ?? """
-------------------------------------------------------



{'apache-ant-1.7.0': DatasetDict({
     train: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 35728
     })
     test: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 4098
     })
 }),
 'apache-jmeter-2.10': DatasetDict({
     train: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 34602
     })
     test: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 8057
     })
 }),
 'argouml': DatasetDict({
     train: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 33149
     })
     test: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 9452
     })
 }),
 'columba-1.4-src': DatasetDict({
     train: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 34541
     })
     test: Dataset({
         features: ['context', 'prompt_c

In [29]:
# read and prepare the OBrien dataset (SATD classification)

# df = pd.read_csv('Dataset/23_Shades/OBrien_789.csv') # this version doesn't have the containing_method and containing_method_signature columns
df = pd.read_csv('Dataset/23_Shades/OBrien_789_v2.csv')
print(len(df))
print(df.columns)

df = df.rename(columns={"filename": "file_path"})

DATASET = 'OBrien'

INPUT = 'ct'               # only comment text
# INPUT = 'fp+ct'            # file path + comment text
# INPUT = 'fp+cms+ct'        # file path + containing method signature + comment text
# INPUT = 'fp+ct+cmb'        # file path + comment text + containing method body

df = add_context(df, INPUT)

print('\n-------------- An example of input data - context ---------------\n')
print(df.context[2]) # 151
print('\n-------------- An example of input data - prompt_context ---------------\n')
print(df.prompt_context[2]) # 151
print('-------------------------------------------------------\n')

df = df[['context','prompt_context','satd_type','fold']]

# for each project, split data to train and test and save it in a dataset
dataset = {}
for test_fold in sorted(set(df['fold'])):
    test_df  = df[df['fold'] == test_fold]
    train_df = df[df['fold'] != test_fold]        
    train_df = train_df.sample(frac=1, random_state=42) # shuffle train

    data = DatasetDict({"train": Dataset.from_pandas(train_df), "test": Dataset.from_pandas(test_df)})        
    data=data.rename_column("satd_type","label")
    data=data.remove_columns(['fold','__index_level_0__'])
    dataset[test_fold] = data
    
METRIC = 'accuracy'

dataset

789
Index(['Unnamed: 0', 'dataset_id', 'repo_type', 'repo_name', 'filename',
       'commit_introducing_revision', 'commit_removing_revision',
       'comment_text', 'is_satd', 'satd_type', 'ml_satd_type',
       'ml_satd_type_2', 'ml_pipeline_stage', 'file_content', 'satd_line',
       'commit_message', 'containing_method', 'containing_method_signature',
       'fold'],
      dtype='object')

-------------- An example of input data - context ---------------

!TODO: An empty dictionary would actually also do here ... despite the fact that

-------------- An example of input data - prompt_context ---------------

### Technical debt comment: """ !TODO: An empty dictionary would actually also do here ... despite the fact that """
-------------------------------------------------------



{0: DatasetDict({
     train: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 710
     })
     test: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 79
     })
 }),
 1: DatasetDict({
     train: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 710
     })
     test: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 79
     })
 }),
 2: DatasetDict({
     train: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 710
     })
     test: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 79
     })
 }),
 3: DatasetDict({
     train: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 710
     })
     test: Dataset({
         features: ['context', 'prompt_context', 'label'],
         num_rows: 79
     })
 }),
 4: DatasetDict({
  

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import pandas as pd
import numpy as np

checkpoint='google/flan-t5-xxl'

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Loading checkpoint shards: 100%|██████████| 5/5 [00:34<00:00,  6.81s/it]


In [6]:
generation_config = GenerationConfig(max_new_tokens=5, do_sample=True, temperature=0.01) # when I set max_new_tokens=3, it generates 'Require' rather 'Requirement'

In [7]:
def generate_prompt(instruction: str, input_ctxt: str = None) -> str:
    if input_ctxt:
        return f"""
### Instruction:
{instruction}

### Input:
{input_ctxt}

### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""


def get_response(model, tokenizer, generation_config, prompt):
    inputs = tokenizer(prompt, return_tensors='pt')
    inputs = {key: value.to(device) for key, value in inputs.items()} # we need to move the data to cuda if the model is on cuda
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"],
            generation_config = generation_config
        )[0],
        skip_special_tokens=True
    )
    return output

In [8]:
def get_confmat_str(real, pred, labels):
    output = ''
    cm = confusion_matrix(real, pred, labels=labels)
    max_label_length = max([len(label) for label in labels] + [5])
    output = " " * max_label_length + " " + " ".join(label.ljust(max_label_length) for label in labels) + "\n"
    for i, label in enumerate(labels):
        row = " ".join([str(cm[i][j]).ljust(max_label_length) for j in range(len(labels))])
        output += label.ljust(max_label_length) + " " + row + "\n"
    return output

def split_to_tokens(text):
    tokens = re.findall(r"\w+|[^\w\s]", text, re.UNICODE)
    return tokens

In [9]:
def get_the_most_relevant_items_for_an_item(item_embed, items_embed, n):
    cos_sim = sentence_transformers.util.cos_sim(item_embed, items_embed) # Note: make sure to pass ndarray not list due to performance
    itemId_similarity = dict(zip(range(len(items_embed)),cos_sim.tolist()[0]))
    itemId_similarity = dict(sorted(itemId_similarity.items(), key=lambda item: item[1], reverse=True)) # sort
    itemId_similarity = [(k,itemId_similarity[k]) for k in list(itemId_similarity)[:n]] # take top n
    return itemId_similarity

def get_the_most_relevant_items_for_an_item_given_cos_sim(item_indx, cos_sim, n):
    cos_sim_row = cos_sim[item_indx]
    itemId_similarity = dict(zip(range(len(cos_sim_row)),cos_sim_row.tolist()))
    itemId_similarity = dict(sorted(itemId_similarity.items(), key=lambda item: item[1], reverse=True)) # sort
    itemId_similarity = [(k,itemId_similarity[k]) for k in list(itemId_similarity)[:n]] # take top n
    return itemId_similarity

   
# Self-admitted technical debt (SATD) is technical debt admitted by the developer through source code comments. Assign a label of 1 to indicate SATD or 0 to indicate Not-SATD for each source code comment.

init_prompt_for_Maldonado62k = """
Self-admitted technical debt (SATD) is technical debt admitted by the developer through source code comments. Assign the label of SATD or Not-SATD for each given source code comment.

Here are some examples:\n\n"""

# using the keywords in MAT paper
init_prompt_for_Maldonado62k_MAT = """
Self-admitted technical debt (SATD) is technical debt admitted by the developer through source code comments. SATD comments usually contains specific keywords: TODO, FIXME, HACK, and XXX. Assign the label of SATD or Not-SATD for each given source code comment.

Here are some examples:\n\n"""

# using the keywords by Jitterbug paper (their Easy approach)
init_prompt_for_Maldonado62k_Easy = """
Self-admitted technical debt (SATD) is technical debt admitted by the developer through source code comments. SATD comments usually contains specific keywords: TODO, FIXME, HACK, and WORKAROUND. Assign the label of SATD or Not-SATD for each given source code comment.

Here are some examples:\n\n"""

# using the keywords proposed by GPT4
# prompt: "what is the common keywords that developers use to highlight a code comment as self admitted technical debt."
init_prompt_for_Maldonado62k_GPT4 = """
Self-admitted technical debt (SATD) is technical debt admitted by the developer through source code comments. SATD comments usually contains specific keywords: TODO, FIXME, HACK, XXX, NOTE, DEBT, REFACTOR, OPTIMIZE, TEMP, WORKAROUND, KLUDGE, REVIEW, NOFIX, PENDING, and BUG. Assign the label of SATD or Not-SATD for each given source code comment.

Here are some examples:\n\n"""


init_prompt_for_OBrien = """
There are six types of software technical debts:

Requirement: Requirement debts can be functional or non-functional. In the functional case, implementations are left unfinished or in need of future feature support. In the non-functional case, the corresponding code does not meet the requirement standards (speed, memory usage, security, etc...).

Code: Bad coding practices leading to poor legibility of code, making it difficult to understand and maintain.

M&T: Problems found in implementations involving testing or monitoring subcomponents.

Defect: Identified defects in the system that should be addressed.

Design: Areas which violate good software design practices, causing poor flexibility to evolving business needs.

Documentation: Inadequate documentation that exists within the software system. 

Here are some examples:\n\n"""


def generate_prompt_without_adding_dynamic_examples(init_prompt, test_context):
    prompt = init_prompt
    prompt += test_context + '\n'
    prompt += '### Label: '
    return prompt

def generate_prompt_by_top_n_items(init_prompt, test_context, top_n_items, data):
    prompt = init_prompt
    for indx,similarity in top_n_items:
        if len(split_to_tokens(prompt+data['prompt_context'][indx]+test_context))<500:
            prompt += data['prompt_context'][indx] + '\n'
            prompt += '### Label: ' + data['label'][indx] + '\n\n'
    prompt += test_context + '\n'
    prompt += '### Label: '
    return prompt

def generate_prompt_by_random_n_items(init_prompt, test_context, num_rand, data):
    random_n_items = random.sample(range(len(data)), num_rand)
    prompt = init_prompt
    for indx in random_n_items:
        if len(split_to_tokens(prompt+data['prompt_context'][indx]+test_context))<500:
            prompt += data['prompt_context'][indx] + '\n'
            prompt += '### Label: ' + data['label'][indx] + '\n\n'
    prompt += test_context + '\n'
    prompt += '### Label: '
    return prompt

In [30]:
# show a prompt example that includes top n related items by SentenceTransformer

n = 3 # number of examples in the prompt

if DATASET == 'OBrien':
    project_name = 9
    indx = 11
    init_prompt = init_prompt_for_OBrien
else:
    project_name = 'apache-ant-1.7.0'
    indx = 2
    init_prompt = init_prompt_for_Maldonado62k_MAT
    
st_model = SentenceTransformer('all-MiniLM-L6-v2') # model size: 80MB
train_data_embed = st_model.encode(dataset[project_name]['train']['context'], show_progress_bar=True)
test_data_embed = st_model.encode(dataset[project_name]['test']['context'], show_progress_bar=True)
cos_sim = sentence_transformers.util.cos_sim(test_data_embed, train_data_embed)
top_n_items = get_the_most_relevant_items_for_an_item_given_cos_sim(indx, cos_sim, n)
print(generate_prompt_by_top_n_items(init_prompt, dataset[project_name]['test']['prompt_context'][indx], top_n_items, dataset[project_name]['train']))

Batches: 100%|██████████| 23/23 [00:00<00:00, 151.56it/s]
Batches: 100%|██████████| 3/3 [00:00<00:00, 186.61it/s]



There are six types of software technical debts:

Requirement: Requirement debts can be functional or non-functional. In the functional case, implementations are left unfinished or in need of future feature support. In the non-functional case, the corresponding code does not meet the requirement standards (speed, memory usage, security, etc...).

Code: Bad coding practices leading to poor legibility of code, making it difficult to understand and maintain.

M&T: Problems found in implementations involving testing or monitoring subcomponents.

Defect: Identified defects in the system that should be addressed.

Design: Areas which violate good software design practices, causing poor flexibility to evolving business needs.

Documentation: Inadequate documentation that exists within the software system. 

Here are some examples:

### Technical debt comment: """ TODO normalice to make sum up to 1? """
### Label: Requirement

### Technical debt comment: """ TODO: Set up self.batch_sum if sel

In [21]:
# show a prompt example that includes n random items

n = 3 # number of examples in the prompt

if DATASET == 'OBrien':
    project_name = 9
    indx = 11
    init_prompt = init_prompt_for_OBrien
else:
    project_name = 'apache-ant-1.7.0'
    indx = 2
    init_prompt = init_prompt_for_Maldonado62k_MAT
    
print(generate_prompt_by_random_n_items(init_prompt, dataset[project_name]['test']['prompt_context'][indx], n, dataset[project_name]['train']))


There are six types of software technical debts:

Requirement: Requirement debts can be functional or non-functional. In the functional case, implementations are left unfinished or in need of future feature support. In the non-functional case, the corresponding code does not meet the requirement standards (speed, memory usage, security, etc...).

Code: Bad coding practices leading to poor legibility of code, making it difficult to understand and maintain.

M&T: Problems found in implementations involving testing or monitoring subcomponents.

Defect: Identified defects in the system that should be addressed.

Design: Areas which violate good software design practices, causing poor flexibility to evolving business needs.

Documentation: Inadequate documentation that exists within the software system. 

Here are some examples:

### Technical debt comment: """ TODO: I believe this is not really much used """
### Label: Code

### Technical debt comment: """ TODO In future; need to update a

In [23]:
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler
from datasets import load_metric

# ICL_METHOD = 'task-level' # use the same prompt (zero-shot or the same demonstration examples) for all test data
ICL_METHOD = 'instance-level-nearest' # use different prompts (selects different demonstration examples) for different test samples by nearest examples selection
# ICL_METHOD = 'instance-level-random' # use different prompts (selects different demonstration examples) for different test samples by random selection

if DATASET == 'OBrien':
    INIT_PROMPT = init_prompt_for_OBrien
    # INIT_PROMPT = "" # provide no description for the task (i.e., just provide some examples)
elif DATASET == 'Maldonado62k':
    # INIT_PROMPT = init_prompt_for_Maldonado62k # include no keywords
    INIT_PROMPT = init_prompt_for_Maldonado62k_MAT # include MAT keywords
    # INIT_PROMPT = init_prompt_for_Maldonado62k_Easy # include Easy keywords
    # INIT_PROMPT = init_prompt_for_Maldonado62k_GPT4 # include GPT4 keywords
else:
    print("ERROR! Unknown dataset")

random.seed(42)

st_model = SentenceTransformer('all-MiniLM-L6-v2') # model size: 80MB

if 'instance' in ICL_METHOD:
    if len(INIT_PROMPT)>0:
        num_instances = [0,1,2,3,5,10,15,20]
    else:
        num_instances = [1,2,3,5,10,15,20]
else:
    num_instances = [0]

for NUM_EXAMPLES_IN_PROMPT in num_instances:
    print('Run the experiments for num_instances=',num_instances)

    if ICL_METHOD == 'task-level':
        icl_name = '_ICL-task'
    elif ICL_METHOD == 'instance-level-nearest':
        icl_name = '_ICL-nearest-' + str(NUM_EXAMPLES_IN_PROMPT).zfill(2)
    elif ICL_METHOD == 'instance-level-random':
        icl_name = '_ICL-random-' + str(NUM_EXAMPLES_IN_PROMPT).zfill(2)
    else:
        icl_name = '_ICL-error'

    print('Run the experiments for: ' + DATASET + '_Input-' + INPUT + '_' + checkpoint.split('/')[-1] + icl_name)
    file_name = 'Adding_Custom_Layers_Results/' + DATASET + '_Input-' + INPUT + '_' + checkpoint.split('/')[-1] + icl_name

    test_results = {}
    projects_real = {}
    projects_pred = {}
    all_real = []
    all_pred = []
    all_context = []
    all_project = []
    labels = list(set(dataset[project_name]['train']['label']))

    unrecognized_pred = 0 # don't move it to outer loop
    with open(file_name+'_confmat.txt', "w") as output_file:
        for project_name, data in dataset.items():
            print('---------- '+str(project_name)+' ----------')
            output_file.write('\n---------- '+str(project_name)+' ----------\n')
            torch.cuda.empty_cache()
            gc.collect()
            if True: # ICL_METHOD == 'instance-level-nearest': 
                train_data_embed = st_model.encode(data['train']['context'], show_progress_bar=False)
                test_data_embed = st_model.encode(data['test']['context'], show_progress_bar=False)
                cos_sim = sentence_transformers.util.cos_sim(test_data_embed, train_data_embed)
            test_results[project_name] = []
            projects_real[project_name] = []
            projects_pred[project_name] = []
            for indx, row, row_embed in zip(range(len(data['test'])), data['test'], test_data_embed):
                if ICL_METHOD == 'task-level':
                    # prompt = generate_prompt(INIT_PROMPT, row['prompt_context'])
                    prompt = generate_prompt_without_adding_dynamic_examples(INIT_PROMPT, row['prompt_context'])
                elif ICL_METHOD == 'instance-level-nearest':
                    top_n_items = get_the_most_relevant_items_for_an_item_given_cos_sim(indx, cos_sim, NUM_EXAMPLES_IN_PROMPT)
                    prompt = generate_prompt_by_top_n_items(INIT_PROMPT, row['prompt_context'], top_n_items, data['train'])
                elif ICL_METHOD == 'instance-level-random':
                    prompt = generate_prompt_by_random_n_items(INIT_PROMPT, row['prompt_context'], NUM_EXAMPLES_IN_PROMPT, data['train'])
                else:
                    print('ERROR!')
                #print(len(split_to_tokens(prompt)))
                if len(split_to_tokens(prompt))<1000:
                    pred = get_response(model, tokenizer, generation_config, prompt)
                else:
                    pred = ''
                for label in labels:
                    if len(pred)>0 and pred.split()[0].lower() == label.lower():
                        pred = label
                if pred not in labels:
                    #print(pred)
                    if DATASET=='Maldonado62k':
                        pred = 'Not-SATD'
                    elif DATASET=='OBrien':
                        pred = 'Requirement'
                    unrecognized_pred += 1
                if pred=='SATD' and row['label']=='Not-SATD' and False:
                    print(prompt)
                    print('--------------------------')
                projects_real[project_name].append(row['label'])               
                projects_pred[project_name].append(pred)
                all_context.append(row['prompt_context'])
                all_project.append(project_name)
            all_real += projects_real[project_name]    
            all_pred += projects_pred[project_name]
            # print precision recall and F1 for this project
            print(classification_report(projects_real[project_name], projects_pred[project_name], zero_division=0, digits=3))
            output_file.write(classification_report(projects_real[project_name], projects_pred[project_name], zero_division=0, digits=3)+"\n")
            # print confusion matrix for this project
            confmat_str = get_confmat_str(projects_real[project_name], projects_pred[project_name], labels=labels)
            print(confmat_str)
            output_file.write(confmat_str)
        print('=========== Overall ==========')
        output_file.write('\n=========== Overall ==========\n')
        # print precision recall and F1 for all data
        print(classification_report(all_real, all_pred, zero_division=0, digits=3))
        output_file.write(classification_report(all_real, all_pred, zero_division=0, digits=3)+"\n")
        # print confusion matrix for all data
        confmat_str = get_confmat_str(all_real, all_pred, labels=labels)
        print(confmat_str)
        output_file.write(confmat_str)
        print('\nNumber of unrecognized predictions:', unrecognized_pred, '\nWe considered them as the majority class.')
        output_file.write('\nNumber of unrecognized predictions: '+str(unrecognized_pred)+'\nWe considered them as the majority class.\n')

    test_result_df = pd.DataFrame({'project': all_project, 'context':all_context, 'real': all_real, 'pred': all_pred})
    test_result_df.to_csv(file_name+'_pred.csv', index=False)

Run the experiments for num_instances= [0, 1, 2, 3, 5, 10, 15, 20]
Run the experiments for: OBrien_Input-ct_flan-t5-xxl_ICL-nearest-00
---------- 0 ----------


Token indices sequence length is longer than the specified maximum sequence length for this model (717 > 512). Running this sequence through the model will result in indexing errors


               precision    recall  f1-score   support

         Code      0.429     0.375     0.400        24
       Defect      0.250     0.250     0.250         4
       Design      0.429     0.333     0.375         9
Documentation      0.000     0.000     0.000         0
          M&T      0.250     0.286     0.267         7
  Requirement      0.636     0.600     0.618        35

     accuracy                          0.456        79
    macro avg      0.332     0.307     0.318        79
 weighted avg      0.496     0.456     0.474        79

              Requirement   Code          M&T           Design        Documentation Defect       
Requirement   21            6             4             2             2             0            
Code          8             9             2             1             1             3            
M&T           3             0             2             0             2             0            
Design        1             4             0            

In [31]:
# show the F1 score over projects in Maldonado62k dataset and calculate the average across 10 projects
from sklearn.metrics import f1_score
df_pred = pd.read_csv('Paper_Results/Maldonado62k-ICL/Maldonado62k_Input-ct_flan-t5-xxl_ICL-task-MAT_pred.csv')
print(len(df_pred))
print(df_pred.columns)

f1_scores = {}

for project_name, group in df_pred.groupby('project'):
    real = group['real']
    pred = group['pred']
    f1_scores[project_name] = f1_score(real, pred, pos_label='SATD', average='binary')
    

df_f1_scores = pd.DataFrame(list(f1_scores.items()), columns=['Project', 'F1 Score'])
df_f1_scores.loc[len(df_f1_scores)] = ['Average', df_f1_scores['F1 Score'].mean()]

df_f1_scores

62275
Index(['project', 'context', 'real', 'pred'], dtype='object')


Unnamed: 0,Project,F1 Score
0,apache-ant-1.7.0,0.526316
1,apache-jmeter-2.10,0.801444
2,argouml,0.923124
3,columba-1.4-src,0.864979
4,emf-2.4.1,0.491525
5,hibernate-distribution-3.3.2.GA,0.836777
6,jEdit-4.2,0.587699
7,jfreechart-1.0.19,0.82495
8,jruby-1.4.0,0.909357
9,sql12,0.702065


In [None]:
#########################################