In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import util
from collections import Counter
from torch import load, Tensor
import pandas as pd

import multiprocessing
import pickle
import boto3
import math
import io

In [None]:
# --- S3 locations ------------------------------------------------------------
bucket = 'sagemaker-studio-113002098422-fuoot0q3jmu'

# inputs: stored encodings, the test split and the duplicate relations
encodings_file_key = 'thesis/encodings/all-mpnet-base-v2/netbeans/mpnet_netbeans_test_enc.pt'
netbeans_test_file_key = 'thesis/splits/netbeans/netbeans_test.csv'
netbeans_relations_file_key = 'thesis/datasets/netbeans/netbeans_pairs - Copia.csv'

# output: key under which the pickled results are uploaded
pickle_file_key = 'thesis/results/all-mpnet-base-v2/mpnet_recall_rate_netbeans_test.pkl'

# --- fetch the three input objects -------------------------------------------
s3 = boto3.client('s3')
encodings_obj = s3.get_object(Bucket=bucket, Key=encodings_file_key)
netbeans_test_obj = s3.get_object(Bucket=bucket, Key=netbeans_test_file_key)
netbeans_relations_obj = s3.get_object(Bucket=bucket, Key=netbeans_relations_file_key)

In [None]:
# Read the complete body of the encodings object into memory as raw bytes.
encodings_bytes = encodings_obj["Body"].read()

In [None]:
# NOTE: torch.load unpickles the payload, so only load objects from a trusted bucket.
# map_location='cpu' places every tensor on the CPU whatever device it was saved
# from: the file then also loads on hosts without a GPU, and the tensors can be
# used by the worker processes created further down (CUDA tensors cannot be used
# in forked subprocesses).
emb_dict = load(io.BytesIO(encodings_bytes), map_location='cpu')

In [None]:
# Encodings stored under 'encoded_desc'; its keys are later used as report ids.
emb = emb_dict['encoded_desc']
emb_keys = list(emb.keys())

In [None]:
# Reports of the test split, indexed by the 'bug_id' column.
reports_netbeans_test = pd.read_csv(
    netbeans_test_obj['Body'],
    index_col='bug_id',
)

# Duplicate relations, indexed by the 'issue_id' column.
relations_netbeans = pd.read_csv(
    netbeans_relations_obj['Body'],
    index_col='issue_id',
)

In [None]:
def rr_k_dict(prompt: Tensor, tensor_dict: dict, duplicate_ids: set, k: int):
    """Compute the recall rate at ``k`` of one report against all encoded reports.

    Every encoding in ``tensor_dict`` is scored against ``prompt`` with cosine
    similarity, the candidates are ranked by descending score, and the fraction
    of ``duplicate_ids`` found among the ``k`` best candidates is returned.

    The prompt must not compete with the candidates. When the very same object
    as ``prompt`` is stored in ``tensor_dict`` it is left out by identity, so a
    duplicate with an identical encoding (similarity tie) is never discarded in
    its place. When the prompt object is not found, the previous assumption is
    kept: the top-ranked entry is taken to be the prompt and is dropped.

    Args:
        prompt: encoding of the query report.
        tensor_dict: mapping from report id to encoding; ``None`` values are skipped.
        duplicate_ids: ids of the reports known to be duplicates of the query.
        k: number of top-ranked candidates to inspect.

    Returns:
        Number of duplicates retrieved in the top ``k`` divided by
        ``len(duplicate_ids)``.

    Raises:
        ZeroDivisionError: if ``duplicate_ids`` is empty.
    """
    similarity_scores = []  # (report id, similarity with the prompt) tuples
    prompt_found = False

    for bug_id, candidate in tensor_dict.items():
        # reports without an encoding cannot be ranked
        if candidate is None:
            continue

        # the query itself is not a retrieval candidate
        if candidate is prompt:
            prompt_found = True
            continue

        try:
            score = util.cos_sim(prompt, candidate)
        except Exception:
            # best effort: an encoding that cannot be compared with the prompt is
            # left out of the ranking (KeyboardInterrupt/SystemExit still propagate)
            continue

        # store a plain float so that sorting does not compare tensors
        similarity_scores.append((bug_id, float(score)))

    # highest similarity first; the sort is stable, so ties keep the dict order
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)

    # legacy fallback: without an identity match, rank 0 is assumed to be the prompt
    first = 0 if prompt_found else 1

    relevant_at_top_k = sum(
        1 for bug_id, _ in similarity_scores[first:first + k] if bug_id in duplicate_ids
    )

    # recall rate at k: duplicates retrieved in the top k over all known duplicates
    return relevant_at_top_k / len(duplicate_ids)

In [None]:
def generalRRK(reports: pd.DataFrame, relations: pd.DataFrame, tensor_dict_keys: list,  tensor_dict: dict, k: int, recall_rate_dict):
    """Compute the recall rate at ``k`` for a batch of reports.

    For every id in ``tensor_dict_keys`` that has an encoding and at least one
    known duplicate inside ``reports``, the recall rate returned by
    ``rr_k_dict`` is stored in ``recall_rate_dict`` under that id.

    Args:
        reports: reports of the evaluated split, indexed by report id.
        relations: duplicate relations indexed by report id; the first column is
            read as a ';'-separated list of duplicate ids.
        tensor_dict_keys: ids to evaluate in this call.
        tensor_dict: mapping from report id to encoding (``None`` when missing).
        k: cut-off passed to ``rr_k_dict``.
        recall_rate_dict: mutable mapping that receives ``{id: recall rate}``.

    Returns:
        None. Results are written into ``recall_rate_dict``.
    """
    for index in tensor_dict_keys:
        # use the mapping that was passed in, not a notebook-level global
        prompt = tensor_dict[index]
        if prompt is None:
            continue

        if index not in relations.index:
            continue

        try:
            raw_duplicates = relations.loc[index].values[0]
            # blank tokens (e.g. a trailing ';') are ignored
            duplicates_whole_dataset = [
                int(token) for token in raw_duplicates.split(';') if token.strip()
            ]
        except (AttributeError, IndexError, TypeError, ValueError):
            # missing or malformed relation entry: the report has no usable duplicates
            continue

        # only duplicates that belong to the evaluated split can be retrieved
        duplicates_id = [dup for dup in duplicates_whole_dataset if dup in reports.index]

        if len(duplicates_id) > 0:
            recall_rate_dict[index] = rr_k_dict(prompt, tensor_dict, set(duplicates_id), k)

In [None]:
num_of_chunks = 12

# max(1, ...) keeps the slicing step valid when there are no keys at all
# (a step of 0 would make range() raise ValueError)
chunk_size = max(1, math.ceil(len(emb_keys) / num_of_chunks))

results = []

# consecutive slices of at most chunk_size keys
chunks = [
    emb_keys[start:start + chunk_size]
    for start in range(0, len(emb_keys), chunk_size)
]

In [None]:
# A single manager (one server process) hosts the result dictionaries of all
# workers instead of starting one manager per chunk.
manager = multiprocessing.Manager()

# Both lists are rebuilt here so that re-running this cell does not keep result
# dictionaries of a previous run, which would otherwise be aggregated again.
processes = []
results = []

for chunk in chunks:
    result = manager.dict()
    results.append(result)

    # every worker evaluates its own chunk of keys with k = 10
    args_process = (reports_netbeans_test, relations_netbeans, chunk, emb, 10, result)
    process = multiprocessing.Process(target=generalRRK, args=args_process)

    processes.append(process)

In [None]:
# start every worker first, then wait for all of them
for process in processes:
    process.start()

for process in processes:
    process.join()

# A worker that died leaves its result dictionary incomplete; stop here instead
# of silently aggregating and uploading partial results.
failed_workers = [worker.name for worker in processes if worker.exitcode != 0]
if failed_workers:
    raise RuntimeError(f'worker processes ended abnormally: {failed_workers}')

In [None]:
# Merge the per-worker dictionaries with a plain dict update. Counter addition
# must not be used for this: it drops every entry whose value is not positive,
# so all reports with a recall rate of 0.0 would disappear from the results.
# .items() is read explicitly so that manager proxies are copied by key and value.
total_results_dict = {}

for result in results:
    total_results_dict.update(result.items())

# kept for code that still reads this name; the Counter constructor preserves zeros
total_results_counter = Counter(total_results_dict)

total_results_dict

In [None]:
pickle_data = pickle.dumps(total_results_dict)

s3.put_object(Body=pickle_data, Bucket=bucket, Key=pickle_file_key