Import dependencies

In [None]:
import os
import pickle

import pandas as pd
from sentence_transformers import InputExample

Define functions to form the triplets

In [None]:
# for each bug extract its textual info (short decription and description)
def get_bug_textual_info(value: pd.Series):

    short_desc = ''
    description = ''

    if isinstance(value['short_desc'], str):
        short_desc = value['short_desc']
    
    if isinstance(value['description'], str):
        description = value['description']
    
    

    return f"{short_desc} - {description}"

In [None]:
# use reports from the train split and the relations file to create the triplets
def generate_triplets_with_contex(reports, relations):
    duplicates_pairs_set = set()
    train_examples = []

    # iterate trough all the report in the train split 
    for index, value in reports.iterrows():

        # check if report has duplicate
        if index in relations.index:

            # get the report's duplicates
            duplicates_id = []
            try:
                duplicates_id = [int(id) for id in relations.loc[index].values[0].split(';')]
            except:
                duplicates_id = []

            # iterate trough duplicates' ids
            for id in duplicates_id:

                # check if duplicate belongs to the train split
                if id in reports.index:
                    
                    # create duplicate pair and add it to the duplicates set if it's not there already
                    duplicates_pair = tuple(sorted([index, id]))
                    if duplicates_pair not in duplicates_pairs_set:

                        duplicates_pairs_set.add(duplicates_pair)

                        positive = reports.loc[id]

                        # select a random non duplicate to be the negative example of the triplet
                        negative = reports.sample(n=1).iloc[0]
                        
                        while (int(negative.name) == index) or (int(negative.name) in duplicates_id) or (not isinstance(negative['description'], str)):
                            negative = reports.sample(n=1).iloc[0]

                        # finally, create the triplet with the bug report (anchor), its duplicate (positive) and a non-duplicate (negative)
                        if isinstance(value['description'], str) and isinstance(positive['description'], str):
                            train_examples.append(
                                InputExample(texts=[
                                    get_bug_textual_info(value),
                                    get_bug_textual_info(positive),
                                    get_bug_textual_info(negative)
                                ])
                            )
    # return list with all triplets
    return train_examples

Import all the training splits and the relations files

In [None]:
train_file_path = 'data/splits/eclipse/eclipse_train.csv'
relations_file_path = 'data/br/eclipse/eclipse_pairs.csv'

reports_train_eclipse = pd.read_csv(train_file_path, index_col='bug_id')
relations_eclipse = pd.read_csv(relations_file_path, index_col='issue_id')

In [None]:
train_file_path = 'data/splits/openoffice/openoffice_train.csv'
relations_file_path = 'data/br/openoffice/openoffice_pairs.csv'

reports_train_openoffice = pd.read_csv(train_file_path, index_col='bug_id')
relations_openoffice = pd.read_csv(relations_file_path, index_col='issue_id')

In [None]:
train_file_path = 'data/splits/firefox/firefox_train.csv'
relations_file_path = 'data/br/firefox/firefox_pairs.csv'

reports_train_firefox = pd.read_csv(train_file_path, index_col='bug_id')
relations_firefox = pd.read_csv(relations_file_path, index_col='issue_id')

In [None]:
train_file_path = './data/splits/netbeans/netbeans_train.csv'
relations_file_path = 'data/br/netbeans/netbeans_pairs - Copia.csv'

reports_train_netbeans = pd.read_csv(train_file_path, index_col='bug_id')
relations_netbeans = pd.read_csv(relations_file_path, index_col='issue_id')

Create dictionary to store them all

In [None]:
datasets = {}

In [None]:
datasets['eclipse'] = {
    'reports': reports_train_eclipse,
    'relations': relations_eclipse
}

datasets['openoffice'] = {
    'reports': reports_train_openoffice,
    'relations': relations_openoffice
}

datasets['firefox'] = {
    'reports': reports_train_firefox,
    'relations': relations_firefox
}

datasets['netbeans'] = {
    'reports': reports_train_netbeans,
    'relations': relations_netbeans
}

For each dataset generate the triplets and save them

In [None]:
for key, value in datasets.items():

    # Generate triplets for the train dataset
    triplets = generate_triplets_with_contex(value['reports'], value['relations'])
    print(key, len(triplets))

    # Save triplets into pickle file
    triplets_file_path = f'data/splits/complete_triplets/complete_{key}_triplets.pkl'
    with open(triplets_file_path, "wb") as f:
        pickle.dump(triplets, f)

    