Install the sentence transformers library

In [None]:
!pip install -U sentence-transformers

Import dependencies

In [None]:
from sentence_transformers import util
from torch import load, Tensor
import pandas as pd

import multiprocessing
import pickle
import boto3
import math
import io

If the encodings and the data splits are stored in an S3 bucket, read them with the boto3 library.

In [None]:
s3 = boto3.client('s3')

# Placeholder values: replace the bucket name and object keys with real ones before running.
bucket = 'bucket name'

# Fetch the object that holds the encodings.
encodings_file_key = 'encodings file key'
encodings_obj = s3.get_object(Bucket=bucket, Key=encodings_file_key)


test_encodings_file_key = 'data split file key'  # data for the specific dataset split e.g. firefox test split
relations_file_key = 'relations file key'        # ground truth file, where the actual duplicates can be found


test_obj = s3.get_object(Bucket=bucket, Key=test_encodings_file_key)
relations_obj = s3.get_object(Bucket=bucket, Key=relations_file_key)

In [None]:
# Read the whole encodings object into memory; the bytes are wrapped in a file-like buffer when loaded below.
encodings_bytes = encodings_obj['Body'].read()

In [None]:
# Parse the CSV bodies straight from the S3 response streams.
# 'bug_id' / 'issue_id' become the index so rows can be looked up by report id.
reports_test = pd.read_csv(test_obj['Body'], index_col='bug_id')
relations = pd.read_csv(relations_obj['Body'], index_col='issue_id')

In [None]:
# `load` is torch.load (see the imports); BytesIO gives it the file-like object it expects.
# NOTE(review): depending on the torch version, torch.load may unpickle arbitrary objects,
# so only load encodings that come from a trusted source.
emb_dict = load(io.BytesIO(encodings_bytes))

In [None]:
# Mapping of report id -> description encoding, plus the ids as a list so
# they can be sliced into per-process chunks further down.
emb = emb_dict['encoded_desc']
emb_keys = list(emb)

If the data splits and encodings are stored locally, read them from local files instead

In [None]:
# data_split_file_path = "data split file path"
# relations_file_path = "relations file path"

# reports_test = pd.read_csv(data_split_file_path, index_col='bug_id')
# relations = pd.read_csv(relations_file_path, index_col='issue_id')

# embedding_file_path = "embeddings file path"
# emb_dict = load(embedding_file_path)
# emb = emb_dict['encoded_desc']
# emb_keys = [x for x in emb.keys()]

Define functions to gather the model results

In [None]:
def results(prompt: Tensor, reports: pd.DataFrame, tensor_dict: dict, duplicate_ids: set, k: int):
    """Return the confusion-matrix counts for the top-k suggestions of one report.

    Every encoding in ``tensor_dict`` whose id appears in ``reports.index`` is
    ranked by cosine similarity to ``prompt``. The ``k`` best-ranked reports are
    treated as the model's suggestions and compared against ``duplicate_ids``.

    Args:
        prompt: Encoding of the report being queried. It is excluded from the
            candidates by object identity, so pass the very object that is
            stored in ``tensor_dict`` (this is what ``generalResults`` does).
        reports: Data split; only ids present in its index are candidates.
        tensor_dict: Mapping of report id to encoding. ``None`` encodings are
            skipped.
        duplicate_ids: Ids of the actual duplicates of the prompt.
        k: Number of top suggestions to evaluate.

    Returns:
        ``[true_positives, false_negatives, false_positives, true_negatives]``.
    """
    similarity_scores = []  # (report id, cosine similarity with the prompt)

    for bug_id, encoding in tensor_dict.items():
        # The prompt must never be suggested as its own duplicate. Excluding it
        # here (instead of dropping whatever ends up ranked first) stays correct
        # when the prompt is not part of the split or when another report ties
        # with it at the top of the ranking.
        if encoding is None or encoding is prompt:
            continue
        if bug_id not in reports.index:
            continue

        try:
            similarity = util.cos_sim(prompt, encoding)
        except Exception:
            # Best effort: an encoding that cannot be compared with the prompt
            # is left out of the ranking instead of aborting the whole run.
            continue

        # Store a plain float so sorting does not compare tensors.
        similarity_scores.append((bug_id, float(similarity)))

    # Highest similarity first; the sort is stable, so ties keep dict order.
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
    top_k = similarity_scores[:k]

    true_positives = sum(1 for bug_id, _ in top_k if bug_id in duplicate_ids)

    # Only reports that were actually suggested can be false positives; using
    # k directly would over-count when fewer than k candidates exist.
    false_positives = len(top_k) - true_positives
    false_negatives = len(duplicate_ids) - true_positives

    negatives = len(reports) - len(duplicate_ids)
    true_negatives = negatives - false_positives

    return [true_positives, false_negatives, false_positives, true_negatives]

In [None]:
def generalResults(reports: pd.DataFrame, relations: pd.DataFrame, tensor_dict_keys: list, tensor_dict: dict, k: int, confusion_matrix_dict):
    """Collect the top-k confusion matrix of every report in ``tensor_dict_keys``.

    A report is evaluated only when it has an encoding and at least one of its
    ground-truth duplicates is part of ``reports``.

    Args:
        reports: Data split, indexed by report id.
        relations: Ground truth, indexed by report id; the first column holds
            the duplicate ids as a ';'-separated string.
        tensor_dict_keys: Ids of the reports to use as prompts.
        tensor_dict: Mapping of report id to encoding (may contain ``None``).
        k: Number of top suggestions to evaluate per report.
        confusion_matrix_dict: Mutable mapping filled in place with
            ``report id -> [tp, fn, fp, tn]``. Nothing is returned, so the
            function can be used as a ``multiprocessing.Process`` target.

    Note:
        The module-level name ``results`` is looked up at call time, so it must
        still refer to the scoring function when this runs.
    """
    for index in tensor_dict_keys:
        prompt = tensor_dict[index]
        if prompt is None or index not in relations.index:
            continue

        try:
            all_duplicate_ids = [int(dup) for dup in relations.loc[index].values[0].split(';')]
        except (AttributeError, IndexError, TypeError, ValueError):
            # Missing (NaN) or malformed ground truth: treat the report as
            # having no known duplicates.
            continue

        # Keep only the duplicates that belong to the current split.
        duplicate_ids = {dup for dup in all_duplicate_ids if dup in reports.index}
        if not duplicate_ids:
            continue

        confusion_matrix_dict[index] = results(prompt, reports, tensor_dict, duplicate_ids, k)

Because the splits are large, computing the results for every report takes a long time. To speed this up, the work can be spread across multiple processes.

In [None]:
num_of_chunks = 33 # the number of process you can use will vary depending on your cpu
# split the reports into chunks to be sent to a process (here we just split the encodings that are used as prompts, they will still be compared to all the reports in the split)
chunk_size = math.ceil(len(emb_keys) / num_of_chunks)
chunks = []
results = []

for i in range(0, len(emb_keys), chunk_size):
    chunk = emb_keys[i:i + chunk_size]
    chunks.append(chunk)

print(len(chunks))

In [None]:
# create the processes
processes = []
k = 5

for chunk in chunks:
    manager = multiprocessing.Manager()
    result = manager.dict()
    results.append(result)

    args_process = (reports_test, relations, chunk, emb, k, result)
    process = multiprocessing.Process(target=generalResults, args=args_process)

    processes.append(process)


In [None]:
# start the processes
for process in processes:
    process.start()

# wait for all process to finish
for process in processes:
    process.join()

In [None]:
# merge all the results from each process
total_results_dict = {}
for result in results:
    test = dict(result)
    total_results_dict |= test

With the results in the [true_positives, false_negatives, false_positives, true_negatives] format, we can calculate multiple metrics, such as recall rate@k and precision@k

In [None]:
# Per-report metrics keyed by report id, plus running totals over all reports.
recall_values, precision_values = {}, {}
total_true_positives = total_false_negatives = 0

for key, value in total_results_dict.items():
    # value is [true_positives, false_negatives, false_positives, true_negatives]
    true_positives, false_negatives, false_positives = value[:3]

    # recall rate at k: duplicates found in the top k suggestions (true positives)
    # divided by all the duplicates the report has (true positives + false negatives)
    recall = true_positives / (true_positives + false_negatives)
    # precision at k: duplicates found in the top k suggestions (true positives) divided by k
    precision = true_positives / k

    recall_values[key], precision_values[key] = recall, precision

    total_true_positives += true_positives
    total_false_negatives += false_negatives

In [None]:
# Totals accumulated over every evaluated report in the loop above.
print(total_true_positives)
print(total_false_negatives)

We can use the pandas library to form data series and get statistics such as mean, median, standard deviation ...

In [None]:
# One precision@k value per report id; describe() summarises their distribution.
precision_series = pd.Series(precision_values)
precision_series.describe()

In [None]:
# One recall rate@k value per report id; describe() summarises their distribution.
rr_series = pd.Series(recall_values)
rr_series.describe()

Save the results

In [None]:
# Placeholder: replace with the S3 object key under which the results should be stored.
pickle_file_key = 'file path'
# Serialize the merged {report id: [tp, fn, fp, tn]} dict.
pickle_data = pickle.dumps(total_results_dict)

# NOTE: `s3` and `bucket` are only defined by the S3 cell near the top of the
# notebook; run it first even if the data itself was read from local files.
s3.put_object(Body=pickle_data, Bucket=bucket, Key=pickle_file_key)
