# Introduction - Using COSINE Metric

In this notebook we demonstrate the use of **LSI (Latent Semantic Indexing)** technique of Information Retrieval context to make trace link recovery between Features and Bug Reports.

We model our study as follows:

* Each bug report title, summary and description compose a single query.
* We use each feature description and title as an entire document that must be returned to the query made

This notebook follows the analysis made in **oracle_v2_analysis**, where we obtained an Cohen's kappa score of _0.41_ between the answers of the researcher and the answers of the volunteers.

# Import Libraries

In [1]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import pprint

from modules.models_runner.feat_br_models_runner import Feat_BR_Models_Runner
from modules.utils import aux_functions
from modules.utils import model_evaluator as m_eval
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import similarity_measures as sm

from IPython.display import display

import warnings; warnings.simplefilter('ignore')

# Load Datasets

In [2]:
bugreports = fd.Datasets.read_selected_bugreports_df()
features = fd.Datasets.read_features_df()
br_2_features_matrix_final = fd.Feat_BR_Oracles.read_br_2_features_matrix_final_df()

SelectedBugReports.shape: (91, 18)
Features.shape: (21, 8)
BR_2_Features Matrix Final.shape: (91, 4)


# Running LSI Model

In [3]:
models_runner_1 = Feat_BR_Models_Runner()

lsi_model_1 = models_runner_1.run_lsi_model()
evaluator_1 = m_eval.ModelEvaluator(oracle=fd.Feat_BR_Oracles.read_feat_br_expert_volunteers_union_df().T)
lsi_eval_1 = evaluator_1.run_evaluator(model=lsi_model_1, top_values=[1], sim_thresholds=[(sm.SimilarityMeasure.COSINE, 0.0)])

evaluator_1.get_evaluations_df().head()

#pprint.pprint(lsi_model_1.model_setup())

Features.shape: (21, 8)
SelectedBugReports.shape: (91, 18)
Running LSI model -----
Expert and Volunteers Matrix UNION.shape: (91, 21)
Evaluating LSI Model ----- 


Unnamed: 0,model,ref_name,perc_precision,perc_recall,perc_fscore
0,lsi,top_1_cosine_0.0,42.86,31.55,34.7


In [4]:
aux_functions.highlight_df(evaluator_1.oracle.iloc[0:23, 0:9])

Bug_Number,1248267,1248268,1257087,1264988,1267480,1267501,1269348,1269485,1270274
new_awesome_bar,0,0,0,1,0,0,0,0,0
windows_child_mode,0,0,0,0,0,0,0,0,0
apz_async_scrolling,0,0,0,0,0,1,0,1,0
browser_customization,0,1,0,0,0,0,0,0,0
pdf_viewer,0,0,0,0,0,0,0,0,0
context_menu,1,0,0,0,0,0,0,0,0
w10_comp,0,0,0,0,0,0,0,0,0
tts_in_desktop,0,0,0,0,0,0,0,0,0
tts_in_rm,0,0,0,0,0,0,0,0,0
webgl_comp,0,0,0,0,0,0,0,0,0


In [5]:
aux_functions.highlight_df(evaluator_1.get_trace_links_df().iloc[0:20, 0:9])

Bug_Number,1248267,1248268,1257087,1264988,1267480,1267501,1269348,1269485,1270274
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
new_awesome_bar,0,1,1,1,0,1,0,1,0
windows_child_mode,0,0,0,0,0,0,0,0,0
apz_async_scrolling,0,0,0,0,0,0,0,0,0
browser_customization,0,0,0,0,0,0,0,0,0
pdf_viewer,0,0,0,0,0,0,0,0,0
context_menu,1,0,0,0,0,0,0,0,0
w10_comp,0,0,0,0,0,0,0,0,0
tts_in_desktop,0,0,0,0,0,0,0,0,0
tts_in_rm,0,0,0,0,0,0,0,0,0
webgl_comp,0,0,0,0,0,0,0,0,0


In [6]:
aux_functions.highlight_df(lsi_model_1.get_sim_matrix().iloc[0:20, 0:9])

Bug_Number,1248267,1248268,1257087,1264988,1267480,1267501,1269348,1269485,1270274
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
new_awesome_bar,0.320741,0.568394,0.671144,0.931044,0.164355,0.631198,0.181918,0.640709,0.164355
windows_child_mode,0.0561108,0.406251,0.165129,0.0772069,0.212747,0.206565,0.126042,0.223301,0.212747
apz_async_scrolling,0.00122383,0.0295661,0.047032,0.0630632,0.042386,0.260099,0.00498738,0.173511,0.042386
browser_customization,0.0252172,0.279508,0.057479,0.139368,0.520398,0.0431226,0.102766,0.0478245,0.520398
pdf_viewer,0.00557269,0.048238,0.0127021,0.0170318,0.193004,0.00952956,0.0227099,0.0105686,0.193004
context_menu,0.971288,0.429624,0.44198,0.131035,0.0496182,0.33884,0.0514961,0.430191,0.0496182
w10_comp,0.190804,0.318948,0.220759,0.23969,0.433016,0.491275,0.179211,0.514725,0.433016
tts_in_desktop,0.0175157,0.202421,0.0399245,0.0898432,0.400903,0.59806,0.0713803,0.528585,0.400903
tts_in_rm,0.0231009,0.256051,0.0526551,0.127672,0.476724,0.430549,0.0941412,0.433093,0.476724
webgl_comp,0.0160424,0.288438,0.0365664,0.0822864,0.367183,0.0274333,0.0653764,0.0304245,0.367183


# Analysis of BR x Feat Generated Traces

In [7]:
trace_links = evaluator_1.get_trace_links_df()
oracle = evaluator_1.get_oracle_df()

mistaken_bugs = []
for bug_id in trace_links.columns:
    if str(list(trace_links[bug_id])) != str(list(oracle[bug_id])):
        mistaken_bugs.append(bug_id)

print(mistaken_bugs)

[1248268, 1257087, 1267480, 1267501, 1269348, 1269485, 1270274, 1271607, 1276120, 1277937, 1278388, 1279140, 1279143, 1281190, 1281493, 1282759, 1283542, 1285041, 1285328, 1287687, 1287748, 1289240, 1290424, 1291770, 1292566, 1295502, 1296322, 1296366, 1297336, 1297686, 1297976, 1299458, 1300738, 1301056, 1301784, 1302468, 1303339, 1305195, 1305676, 1305737, 1306639, 1312018, 1313290, 1313778, 1313969, 1314643, 1316126, 1318903, 1319119, 1319433, 1319919, 1320557, 1323211, 1325902, 1328913, 1335538, 1336227, 1343256, 1344446, 1345687, 1352539, 1353831, 1357085, 1357458, 1365887, 1408361, 1430603, 1432915, 1449700, 1451475]


In [8]:
bugreports[bugreports.br_name == mistaken_bugs[0]]

Unnamed: 0,Bug_Number,Summary,Platform,Component,Version,Creation_Time,Whiteboard,QA_Whiteboard,First_Comment_Text,First_Comment_Creation_Time,Status,Product,Priority,Resolution,Severity,Is_Confirmed,br_name,br_desc


In [22]:
def get_features(br_id, matrix):
    features_ids = ""
    matrix = matrix.T
      
    for col in matrix.columns:
        if matrix.at[br_id, col] == 1:
            if features_ids == "":
                features_ids = str(matrix.columns.get_loc(col) + 1)
            else:
                features_ids = features_ids + " " + str(matrix.columns.get_loc(col) + 1)
    
    return features_ids

In [25]:
a = bugreports.apply(lambda row : get_features(row['Bug_Number'], evaluator_1.get_trace_links_df()), axis=1)
print(a)

#br_2_features_matrix_final['Features_IDs_lsi_t1_m'] = 
#br_2_features_matrix_final['Features_IDs_orc'] = bugreports.apply(lambda row : get_features(row['Bug_Number'], evaluator_1.get_oracle_df()), axis=1)
#br_2_features_matrix_final.replace(" ", "", inplace=True)

#br_2_features_matrix_final.head(10)

1      6
2      1
3      1
4      1
5     16
6      1
7     18
8      1
9     16
10     6
11     7
12     7
13     1
14    16
15     8
16     7
17     1
18    18
19     1
20     9
21    16
22     2
23     1
24    15
25    18
26     4
27    15
28     1
29     6
30     1
      ..
62     1
64     1
65     7
66    19
67    18
68     8
69     1
70     1
71     1
72    21
73     6
74     1
75    19
76     1
77     7
78     1
79     1
80     6
81     1
82    18
83     4
84     4
85    18
86     1
87     1
88    18
89    13
90     9
91     1
92    16
Length: 91, dtype: object


In [None]:
br_2_features_final_matrix[['']]