In [1]:
import pandas as pd
from sentence_transformers import util
from torch import load, Tensor
import pickle

In [2]:
# Load the pickled Firefox samples. The evaluation loop further down iterates
# this object with .items() and reads the "reports" and "relations" entries of
# each value.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
file_path = "data/firefox_samples.pkl"
with open(file_path, 'rb') as f:
    samples_reports = pickle.load(f)

In [3]:
# Load the pre-computed encodings for the same samples. The active path points
# at the fine-tuned model's encodings; the commented-out line below switches to
# the mpnet-base encodings instead.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
file_path_encodings = "data/encodings/fine-tuned-models/mpnet-lr-1e-8-m-5-e-3/mpnet-lr-1e-8-m-5-e-3-firefox-encodings-samples.pkl"
# file_path_encodings = "data/encodings/mpnet-base/mpnet-base-firefox-encodings-samples.pkl"
with open(file_path_encodings, 'rb') as f:
    samples_embeddings = pickle.load(f)

In [4]:
def rr_k_dict(prompt: Tensor, reports: pd.DataFrame, tensor_dict: dict, duplicate_ids: set, k: int):
    """Rank reports by cosine similarity to ``prompt`` and count top-k hits.

    Every id in ``tensor_dict`` that also appears in ``reports.index`` is scored
    with ``util.cos_sim(prompt, tensor_dict[id])``. The scored ids are sorted by
    descending similarity, the first entry is dropped (it is assumed to be the
    prompt itself) and the next ``k`` entries form the retrieved set.

    Args:
        prompt: Embedding of the query report.
        reports: Reports of the sample; only its index and length are used.
        tensor_dict: Mapping from report id to embedding. ``None`` embeddings
            are skipped.
        duplicate_ids: Ids of the known duplicates of the prompt.
        k: Maximum number of ranked candidates to inspect. Negative values are
            treated as 0.

    Returns:
        ``[true_positives, false_negatives, false_positives, true_negatives]``
        where false positives are counted over the candidates that were
        actually retrieved (at most ``k``).

    NOTE(review): dropping the top-ranked entry assumes the prompt's own
    embedding is among the scored candidates and ranks first. If it is missing
    or tied with another report, a genuine candidate is dropped instead --
    confirm against the callers.
    """
    # (report id, similarity to the prompt) for every candidate we can score
    scored = []
    for bug_id, embedding in tensor_dict.items():
        if embedding is None or bug_id not in reports.index:
            continue
        try:
            score = util.cos_sim(prompt, embedding)[0].item()
        except Exception:
            # Best-effort: an embedding that cannot be compared is left out of
            # the ranking. Unlike a bare ``except`` this does not swallow
            # KeyboardInterrupt / SystemExit.
            continue
        scored.append((bug_id, score))

    # Highest similarity first; the sort is stable, so ties keep dict order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Skip the first entry (assumed to be the prompt) and keep at most k more.
    retrieved = scored[1:1 + max(k, 0)]

    true_positives = sum(1 for bug_id, _ in retrieved if bug_id in duplicate_ids)

    negatives = len(reports) - len(duplicate_ids)

    # Count false positives over what was really retrieved: with fewer than k
    # candidates, ``k - true_positives`` would over-count false positives and
    # push true_negatives below zero.
    false_positives = len(retrieved) - true_positives
    false_negatives = len(duplicate_ids) - true_positives
    true_negatives = negatives - false_positives

    return [true_positives, false_negatives, false_positives, true_negatives]
    

In [5]:
def generalRRK2(reports: pd.DataFrame, relations: pd.DataFrame, tensor_dict_keys: list,  tensor_dict: dict, k: int):
    """Aggregate the top-k confusion counts over every prompt that has duplicates.

    For each id in ``tensor_dict_keys`` whose embedding is not ``None`` and
    whose ``relations`` row lists at least one duplicate, ``rr_k_dict`` is run
    with that embedding as the prompt. The four counts it returns are summed
    over all such prompts.

    Args:
        reports: Reports of the sample, forwarded to ``rr_k_dict``.
        relations: Frame indexed by report id with a ``'duplicates'`` column.
        tensor_dict_keys: Ids to use as prompts.
        tensor_dict: Mapping from report id to embedding (or ``None``).
        k: Number of ranked candidates inspected per prompt.

    Returns:
        Dict with the summed ``true_positives``, ``false_negatives``,
        ``false_positives`` and ``true_negatives``.
    """
    per_prompt_counts = {}

    for prompt_id in tensor_dict_keys:
        prompt_embedding = tensor_dict[prompt_id]
        if prompt_embedding is None:
            continue

        # prompts without a relations row are treated as having no duplicates
        if prompt_id in relations.index:
            known_duplicates = relations.loc[prompt_id]['duplicates']
        else:
            known_duplicates = []

        if len(known_duplicates) == 0:
            continue

        per_prompt_counts[prompt_id] = rr_k_dict(
            prompt_embedding, reports, tensor_dict, set(known_duplicates), k
        )

    # position of each count inside the list returned by rr_k_dict
    labels = ("true_positives", "false_negatives", "false_positives", "true_negatives")
    return {
        label: sum(counts[position] for counts in per_prompt_counts.values())
        for position, label in enumerate(labels)
    }

In [6]:
# Number of top-ranked candidates inspected per prompt (the "k" passed to
# generalRRK2).
TOP_K = 10

# sample key -> summed confusion counts returned by generalRRK2
results = {}

# Evaluate every sample: pair its reports/relations with the embeddings stored
# under the same key and aggregate the top-k confusion counts.
for key, value in samples_reports.items():

    emb = samples_embeddings[key]
    emb_keys = list(emb.keys())

    # progress indicator: number of embeddings in the current sample
    print(len(emb))

    results[key] = generalRRK2(value["reports"], value["relations"], emb_keys, emb, TOP_K)

436
418
425
428
407
476
405
430
402
352
394
429
366
440
465
409
458
474
525
390
422
453
419
459
484
433
427
401
394
422
439
446
444
435
446
413


In [8]:
# Write the per-sample confusion counts to disk as a pickle.
# NOTE(review): the output path is hard-coded to the fine-tuned results file.
# If the encodings path is switched to the commented-out mpnet-base file, this
# cell would presumably overwrite the fine-tuned results -- confirm the path
# before running.
results_file_path = "data/results/fine-tuned/firefox-fine-tuned-results.pkl"
with open(results_file_path, 'wb') as f:
    pickle.dump(results, f)

In [9]:
# Recall per sample: TP / (TP + FN), i.e. the share of known duplicates that
# were found among the inspected candidates.
recall_values = {}
for key, value in results.items():
    total_duplicates = value['true_positives'] + value['false_negatives']
    # A sample in which no prompt had duplicates yields all-zero counts; report
    # NaN for it instead of raising ZeroDivisionError.
    recall = value['true_positives'] / total_duplicates if total_duplicates else float('nan')
    recall_values[key] = recall

In [10]:
# Summary statistics (count, mean, std, quartiles) of the per-sample recall.
recal_series = pd.Series(recall_values)
recal_series.describe()

count    36.000000
mean      0.738164
std       0.056966
min       0.637500
25%       0.699929
50%       0.727434
75%       0.766169
max       0.913208
dtype: float64

In [10]:
# Summary statistics (count, mean, std, quartiles) of the per-sample recall.
# NOTE(review): this cell has the same code and the same execution count
# (In [10]) as the cell right before it but shows different statistics --
# presumably one of the two outputs is left over from a run with different
# encodings. Confirm which is current and remove the stale cell.
recal_series = pd.Series(recall_values)
recal_series.describe()

count    36.000000
mean      0.724678
std       0.054804
min       0.639583
25%       0.685408
50%       0.719260
75%       0.746369
max       0.898113
dtype: float64

In [12]:
# Display the per-sample recall mapping (sample key -> recall).
recall_values

{1: 0.7397660818713451,
 2: 0.6928934010152284,
 3: 0.7247474747474747,
 4: 0.7041564792176039,
 5: 0.7566844919786097,
 6: 0.7096018735362998,
 7: 0.6821705426356589,
 8: 0.6762402088772846,
 9: 0.7112299465240641,
 10: 0.8981132075471698,
 11: 0.7439024390243902,
 12: 0.7537688442211056,
 13: 0.7255520504731862,
 14: 0.7628571428571429,
 15: 0.6395833333333333,
 16: 0.8006230529595015,
 17: 0.6650366748166259,
 18: 0.6556603773584906,
 19: 0.6602564102564102,
 20: 0.8074534161490683,
 21: 0.7994579945799458,
 22: 0.7008928571428571,
 23: 0.7327327327327328,
 24: 0.735632183908046,
 25: 0.6636971046770601,
 26: 0.7252124645892352,
 27: 0.7146401985111662,
 28: 0.6798866855524079,
 29: 0.8353658536585366,
 30: 0.7043010752688172,
 31: 0.7263681592039801,
 32: 0.6744791666666666,
 33: 0.7238805970149254,
 34: 0.6864864864864865,
 35: 0.6948717948717948,
 36: 0.7802197802197802}

In [11]:
# Reciprocal of each sample's recall (1 / recall): 1.0 means every duplicate
# was found, larger values mean lower recall.
inverse = 1 / recal_series
inverse

1     1.335938
2     1.417266
3     1.337838
4     1.377104
5     1.303136
6     1.377419
7     1.438662
8     1.450758
9     1.411321
10    1.095041
11    1.322581
12    1.283871
13    1.372294
14    1.277372
15    1.568627
16    1.220532
17    1.471223
18    1.503546
19    1.490446
20    1.219697
21    1.217822
22    1.395639
23    1.305882
24    1.342593
25    1.491694
26    1.368217
27    1.384880
28    1.429150
29    1.184116
30    1.409091
31    1.348993
32    1.449057
33    1.344482
34    1.428571
35    1.407942
36    1.255172
dtype: float64

In [12]:
# Summary statistics of the inverse-recall series.
inverse.describe()

count    36.000000
mean      1.362166
std       0.099894
min       1.095041
25%       1.305196
50%       1.374699
75%       1.428716
max       1.568627
dtype: float64

In [15]:
# Summary statistics of the inverse-recall series.
# NOTE(review): same code as the previous cell but with different output --
# presumably from a different run; confirm which one is current and drop the
# other.
inverse.describe()

count    36.000000
mean      1.387140
std       0.098676
min       1.113445
25%       1.339863
50%       1.390374
75%       1.458997
max       1.563518
dtype: float64

In [16]:
# Display the raw per-sample confusion counts.
# NOTE(review): this prints the whole nested dict; a tabular view such as
# pd.DataFrame(results).T would be more compact -- consider switching.
results

{1: {'true_positives': 253,
  'false_negatives': 89,
  'false_positives': 747,
  'true_negatives': 42511},
 2: {'true_positives': 273,
  'false_negatives': 121,
  'false_positives': 727,
  'true_negatives': 40679},
 3: {'true_positives': 287,
  'false_negatives': 109,
  'false_positives': 713,
  'true_negatives': 41391},
 4: {'true_positives': 288,
  'false_negatives': 121,
  'false_positives': 712,
  'true_negatives': 41679},
 5: {'true_positives': 283,
  'false_negatives': 91,
  'false_positives': 717,
  'true_negatives': 39609},
 6: {'true_positives': 303,
  'false_negatives': 124,
  'false_positives': 697,
  'true_negatives': 46476},
 7: {'true_positives': 264,
  'false_negatives': 123,
  'false_positives': 736,
  'true_negatives': 39377},
 8: {'true_positives': 259,
  'false_negatives': 124,
  'false_positives': 741,
  'true_negatives': 41876},
 9: {'true_positives': 266,
  'false_negatives': 108,
  'false_positives': 734,
  'true_negatives': 39092},
 10: {'true_positives': 238,
 