In [2]:
from sentence_transformers import InputExample

import pandas as pd
import pickle

In [11]:
# for each bug, extract its textual info (short description and description)
def get_bug_textual_info(value: pd.Series):
    """Return the text of a bug report as ``"<short_desc> - <description>"``.

    A field whose value is not a ``str`` (for example NaN or a number)
    contributes an empty string, so the `` - `` separator is always present.
    """

    def _text_or_blank(field):
        # Anything other than a real string is treated as missing text.
        content = value[field]
        return content if isinstance(content, str) else ''

    return f"{_text_or_blank('short_desc')} - {_text_or_blank('description')}"

In [12]:
# use reports from the train split and the relations file to create the triplets
def _get_duplicate_ids(relations, report_id):
    """Return the ids listed as duplicates of ``report_id`` in ``relations``.

    The first column of the matching row is expected to hold a ';'-separated
    string of integer ids. Any value that cannot be parsed that way (NaN, a
    number, a malformed token, ...) yields an empty list.
    """
    try:
        raw_ids = relations.loc[report_id].values[0]
        return [int(token) for token in raw_ids.split(';')]
    except (AttributeError, IndexError, TypeError, ValueError):
        # Best effort: an unparsable relation means "no usable duplicates",
        # without also swallowing KeyboardInterrupt like a bare `except` does.
        return []


def _pick_negative_id(candidate_ids, excluded_ids, rng, max_attempts=50):
    """Pick a random id from ``candidate_ids`` that is not in ``excluded_ids``.

    Returns ``None`` when no such id exists, instead of looping forever.
    """
    if not candidate_ids:
        return None

    # Cheap rejection sampling first: the excluded set is normally tiny.
    for _ in range(max_attempts):
        candidate = rng.choice(candidate_ids)
        if candidate not in excluded_ids:
            return candidate

    # Many rejections in a row: compute the exact remaining pool once.
    remaining = [c for c in candidate_ids if c not in excluded_ids]
    return rng.choice(remaining) if remaining else None


def generate_triplets_with_contex(reports, relations, random_state=None):
    """Create (anchor, positive, negative) training triplets from bug reports.

    For every report that has duplicates inside ``reports``, one triplet is
    built per unique duplicate pair: the report itself (anchor), its duplicate
    (positive) and a randomly chosen report that is neither the anchor nor one
    of the anchor's listed duplicates (negative). All three reports must have
    a string ``description``.

    Args:
        reports: DataFrame indexed by bug id, with ``short_desc`` and
            ``description`` columns.
        relations: DataFrame indexed by bug id whose first column holds the
            ';'-separated ids of that report's duplicates.
        random_state: Optional seed for the negative sampling. ``None`` (the
            default) keeps the sampling non-deterministic.

    Returns:
        A list of ``InputExample`` objects whose ``texts`` are
        ``[anchor, positive, negative]``. A pair for which no valid negative
        exists is skipped.
    """
    import random  # local import keeps this cell self-contained

    rng = random.Random(random_state)
    seen_pairs = set()
    train_examples = []

    # Only reports with a textual description can be used as negatives.
    negative_pool = [
        bug_id
        for bug_id, description in reports['description'].items()
        if isinstance(description, str)
    ]

    # iterate through all the reports in the train split
    for index, value in reports.iterrows():

        # skip reports without any recorded duplicate
        if index not in relations.index:
            continue

        duplicates_id = _get_duplicate_ids(relations, index)

        # a negative may be neither the anchor nor one of its duplicates
        excluded_ids = {index, *duplicates_id}

        for duplicate_id in duplicates_id:

            # the duplicate must belong to the train split as well
            if duplicate_id not in reports.index:
                continue

            # each unordered pair is used only once
            duplicates_pair = tuple(sorted([index, duplicate_id]))
            if duplicates_pair in seen_pairs:
                continue
            seen_pairs.add(duplicates_pair)

            positive = reports.loc[duplicate_id]

            # anchor and positive both need a textual description; checking
            # this first avoids sampling a negative that would be thrown away
            if not (isinstance(value['description'], str)
                    and isinstance(positive['description'], str)):
                continue

            negative_id = _pick_negative_id(negative_pool, excluded_ids, rng)
            if negative_id is None:
                # no report qualifies as a negative for this anchor
                continue
            negative = reports.loc[negative_id]

            # finally, create the triplet: anchor, positive, negative
            train_examples.append(
                InputExample(texts=[
                    get_bug_textual_info(value),
                    get_bug_textual_info(positive),
                    get_bug_textual_info(negative)
                ])
            )

    # return list with all triplets
    return train_examples

In [39]:
# Tiny hand-made fixture: five simulated reports keyed by bug id.
reports_sim_dict = dict(
    short_desc={
        # an int instead of text; presumably on purpose, to exercise the
        # non-string handling of get_bug_textual_info
        1: 2,
        2: "This is another report",
        3: "I need help with this",
        4: "I can't fix this",
        5: "What is happening?",
    },
    component=dict(zip(range(1, 6), ['ui', 'ui', 'code', 'code', 'code'])),
    # each description is the bug id repeated ten times, e.g. '1111111111'
    description={bug_id: str(bug_id) * 10 for bug_id in range(1, 6)},
)

# Duplicate relations: bug id -> ';'-separated ids of its duplicates.
# Report 5 has no entry of its own and only appears as a duplicate of 3.
relations_sim_dict = dict(
    duplicate={1: '2', 2: '1', 3: '4;5', 4: '3'},
)

In [40]:
# Turn the fixture dicts into frames whose index names match the real CSVs
# ('bug_id' for reports, 'issue_id' for relations).
reports_sim = pd.DataFrame(reports_sim_dict).rename_axis('bug_id')
relations_sim = pd.DataFrame(relations_sim_dict).rename_axis('issue_id')

In [41]:
# Peek at the first simulated report (bug_id 1), whose short_desc is the int 2 rather than text.
reports_sim.iloc[0]

short_desc              2
component              ui
description    1111111111
Name: 1, dtype: object

In [42]:
# Sanity check: the non-string short_desc is dropped, leaving only " - <description>".
get_bug_textual_info(reports_sim.iloc[0])

' - 1111111111'

In [43]:
# Build triplets from the simulated data. The relations describe three unique
# duplicate pairs, (1, 2), (3, 4) and (3, 5), so three triplets are expected.
test = generate_triplets_with_contex(reports_sim, relations_sim)
len(test)

3

In [44]:
# Inspect the first triplet as [anchor, positive, negative]; the negative is
# drawn at random, so it can differ between runs.
test[0].texts

[' - 1111111111',
 'This is another report - 2222222222',
 'What is happening? - 5555555555']

In [19]:
# Eclipse: load the train-split reports (indexed by bug_id) and the duplicate
# relations (indexed by issue_id).
# NOTE: train_file_path / relations_file_path are reassigned by every dataset-loading cell.
train_file_path = 'data/splits/eclipse/eclipse_train.csv'
relations_file_path = 'data/br/eclipse/eclipse_pairs.csv'

reports_train_eclipse = pd.read_csv(train_file_path, index_col='bug_id')
relations_eclipse = pd.read_csv(relations_file_path, index_col='issue_id')

In [20]:
# OpenOffice: load the train-split reports (indexed by bug_id) and the duplicate
# relations (indexed by issue_id).
train_file_path = 'data/splits/openoffice/openoffice_train.csv'
relations_file_path = 'data/br/openoffice/openoffice_pairs.csv'

reports_train_openoffice = pd.read_csv(train_file_path, index_col='bug_id')
relations_openoffice = pd.read_csv(relations_file_path, index_col='issue_id')

In [21]:
# Firefox: load the train-split reports (indexed by bug_id) and the duplicate
# relations (indexed by issue_id).
train_file_path = 'data/splits/firefox/firefox_train.csv'
relations_file_path = 'data/br/firefox/firefox_pairs.csv'

reports_train_firefox = pd.read_csv(train_file_path, index_col='bug_id')
relations_firefox = pd.read_csv(relations_file_path, index_col='issue_id')

In [22]:
# NetBeans: load the train-split reports (indexed by bug_id) and the duplicate
# relations (indexed by issue_id).
# NOTE(review): unlike the other datasets, the train path starts with './' and
# the relations file is named 'netbeans_pairs - Copia.csv', which looks like an
# ad-hoc copy. Confirm this is the intended file.
train_file_path = './data/splits/netbeans/netbeans_train.csv'
relations_file_path = 'data/br/netbeans/netbeans_pairs - Copia.csv'

reports_train_netbeans = pd.read_csv(train_file_path, index_col='bug_id')
relations_netbeans = pd.read_csv(relations_file_path, index_col='issue_id')

In [23]:
# Registry mapping a dataset name to its 'reports' and 'relations' frames.
datasets = {}

In [24]:
# Register every project's train reports together with its duplicate relations.
datasets['eclipse'] = dict(reports=reports_train_eclipse, relations=relations_eclipse)
datasets['openoffice'] = dict(reports=reports_train_openoffice, relations=relations_openoffice)
datasets['firefox'] = dict(reports=reports_train_firefox, relations=relations_firefox)
datasets['netbeans'] = dict(reports=reports_train_netbeans, relations=relations_netbeans)

In [29]:
# Generate the NetBeans triplets on their own for inspection. Negatives are
# sampled at random, so every call produces a different set of triplets.
triplets_netbeans = generate_triplets_with_contex(datasets['netbeans']['reports'], datasets['netbeans']['relations'])

In [116]:
# Spot-check one generated triplet ([anchor, positive, negative] texts).
# NOTE(review): 5790 looks like an arbitrary index; confirm it has no special meaning.
triplets_netbeans[5790].texts

['Undoing an action associated with an automatic import addition leads to a bad carret position - Undoing of "Add throws.." hint leads to bad carret position.\n\nSteps to reproduce:\n-------------------\n1) Open attached Surround.java\n2) Place carret on line 25.\n3) Click the Bulb and invoke "Add Throws Clause .."\n4) Invoke Undo.\n--> The carret is placed in the import statements block. It should be placed\nwhere it has been placed before the action, on line 25.\n\n[200508291800, JDK 1.5.0_05 b03, solaris 10 / sparc]',
 'Incorrect caret position after fast import - After performing a fast import (Ctrl-Shift-I), undo (incorrectly) places the caret position just after the removed import. \n\n1. More specifically, type something like\n\nFileInputStream in;\n\n2. Place the cursor after FileInputStream and press Ctrl-Shift-I. java.io.FileInputStream is imported and the cursor\nremains (properly) at its location.\n\n3. Press Ctrl-Z to undo. The cursor should remain static, but it is moved 

In [118]:
# Build and persist the triplets of every registered dataset.
# NOTE(review): the recorded output of this cell contains repeated "save" lines
# that nothing in the visible code prints. They are presumably left over from an
# earlier version of the code; re-run on a fresh kernel to confirm.
for key, value in datasets.items():

    # Generate triplets for the train dataset and report how many were built
    triplets = generate_triplets_with_contex(value['reports'], value['relations'])
    print(key, len(triplets))

    # Save triplets into pickle file (open() does not create missing directories,
    # so the target directory must already exist)
    triplets_file_path = f'data/splits/complete_triplets/complete_{key}_triplets.pkl'
    with open(triplets_file_path, "wb") as f:
        pickle.dump(triplets, f)

    

eclipse 20550
openoffice 13996
save
save
save
save
save
save
save
save
save
save
save
save
save
save
save
firefox 37228
netbeans 22374


In [124]:
# Reload the pickled triplets of every dataset into a dict keyed by dataset name.
train_triplets = {}

# Only the dataset names are used in this loop; `value` is not read.
for key, value in datasets.items():
    
    triplets_file_path = f'data/splits/complete_triplets/complete_{key}_triplets.pkl'

    # pickle.load can execute arbitrary code, so only load files that were
    # produced by this notebook.
    with open(triplets_file_path, "rb") as f:
        loaded_input_examples = pickle.load(f)

    train_triplets[key] = loaded_input_examples

    # Report how many triplets were loaded for this dataset
    print(key, len(loaded_input_examples))

eclipse 20550
openoffice 13996
firefox 37228
netbeans 22374


In [125]:
# Shorthand for the reloaded NetBeans triplets.
netb = train_triplets['netbeans']

In [153]:
# Spot-check one reloaded triplet ([anchor, positive, negative] texts).
# NOTE(review): 7187 looks like an arbitrary index; confirm it has no special meaning.
netb[7187].texts

['[70cat] IllegalStateException: paragraphViewEndOffset=9359 > docViewEndOffset=9358 docView: DV@00c5d01c#306:<0,9358>, WxH:960.0x5202.0; incomingMod=false; lengthyAtomicEdit=0 Doc: NbEditorDocument@0144dd0d, L - Build: NetBeans IDE Dev (Build 201102100500)\nVM: Java HotSpot(TM) Client VM, 20.0-b02, Java(TM) SE Runtime Environment, 1.7.0-ea-b118\nOS: Windows XP\n\nStacktrace: \njava.lang.IllegalStateException: paragraphViewEndOffset=9359 > docViewEndOffset=9358\ndocView:\nDV@00c5d01c#306:<0,9358>, WxH:960.0x5202.0; incomingMod=false; lengthyAtomicEdit=0\nDoc: NbEditorDocument@0144dd0d, Length=9357, Version=106, StreamDesc:org.netbeans.modules.java.JavaDataObject@e551a4[MasterFileObject[E:\\quaglan\\My Documents\\projects\\subversion\\sdi3\\Geoportal\\Common\\MavenProjects\\INSPIREGeoportalLibrary\\src\\main\\java\\eu\\europa\\ec\\inspire\\model\\Locator.java@4d834f:1bf756b,valid=true]]\n    [  0]: PV@00e212fa#0:<0,3> Y=0.0(R0.0) "/*\\n", WxH:0.0x17.0, children=null\n    [  1]: PV@004bf