# Introduction - Using COSINE Metric

In this notebook we demonstrate the use of the **LSI (Latent Semantic Indexing)** technique, from the field of Information Retrieval, to recover trace links between Features and Bug Reports.

We model our study as follows:

* Each bug report title, summary and description compose a single query.
* We use each feature's content as an entire document that must be returned for the query made.

# Import Libraries

In [1]:
# Project bootstrap: this call must run before any `modules.*` import below.
# NOTE(review): presumably it extends the import search path so the project's
# `modules` package can be resolved -- confirm in mod_finder_util.
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Project-local helpers: plotting, dataset readers, tokenizers, evaluation.
from modules.utils import plots
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok
from modules.utils import aux_functions
from modules.utils import model_evaluator as m_eval

# LSI model wrapper and the enum naming its hyperparameters.
from modules.models.lsi import LSI
from modules.models.model_hyperps import LSI_Model_Hyperp

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Enable plotly's offline notebook rendering; connected=True loads plotly.js
# from the online CDN instead of embedding it in the notebook.
init_notebook_mode(connected=True)

from IPython.display import display

# Silence every warning for the rest of the session (keeps outputs clean, but
# also hides deprecation/convergence warnings from the libraries used below).
import warnings; warnings.simplefilter('ignore')

# Load Dataset

In [2]:
# Read the inputs in a fixed order: features, selected bug reports, and the
# feature x bug-report oracle used as ground truth for the evaluation.
features_df = fd.Datasets.read_features_df()
bug_reports_df = fd.Datasets.read_selected_bug_reports_2_df()
orc = fd.Feat_BR_Oracles.read_feat_br_expert_volunteers_df()

# Documents (corpus) come from the features; queries come from the bug reports.
corpus, features_names = features_df.feat_desc, features_df.feat_name
query, bug_reports_names = bug_reports_df.br_desc, bug_reports_df.br_name

Features.shape: (21, 8)
BugReports.shape: (93, 18)
Feat_BR_Trace.shape: (21, 93)


# Running LSI Model

To evaluate the effectiveness of the tested algorithm (LSI), we use metrics commonly applied in the field of IR:

* Precision
* Recall
* F1-score

In [3]:
%%time

lsi_hyperp = {
    LSI_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value : ('cosine' , .75),
    LSI_Model_Hyperp.TOP.value : 10,
    LSI_Model_Hyperp.SVD_MODEL_N_COMPONENTS.value: 100,
    LSI_Model_Hyperp.VECTORIZER_NGRAM_RANGE.value: (1,1),
    LSI_Model_Hyperp.VECTORIZER.value : TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=False),
    LSI_Model_Hyperp.VECTORIZER_TOKENIZER.value : tok.WordNetBased_LemmaTokenizer()
}

lsi_model = LSI(**lsi_hyperp)
lsi_model.set_name('LSI_Model_AllData')
lsi_model.recover_links(corpus, query, features_names, bug_reports_names)

print("\nModel Evaluation -------------------------------------------")
evaluator = m_eval.ModelEvaluator(orc, lsi_model)
evaluator.evaluate_model(verbose=True)


Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of LSI_Model_AllData': 0.06451612903225806,
              'Mean Precision of LSI_Model_AllData': 0.06451612903225806,
              'Mean Recall of LSI_Model_AllData': 0.06451612903225806},
 'Setup': [{'Name': 'LSI_Model_AllData'},
           {'Similarity Measure and Minimum Threshold': ('cosine', 0.75)},
           {'Top Value': 10},
           {'SVD Model': {'algorithm': 'randomized',
                          'n_components': 100,
                          'n_iter': 10,
                          'random_state': 42,
                          'tol': 0.0}},
           {'Vectorizer': {'analyzer': 'word',
                           'binary': False,
                           'decode_error': 'strict',
                           'dtype': <class 'numpy.float64'>,
                           'encoding': 'utf-8',
                           'input': 'content',
                           'lowercase': True,
  

In [4]:
# Oracle (ground truth) preview: at most the first 20 rows and first 7 columns.
aux_functions.highlight_df(orc.iloc[:20, :7])

Unnamed: 0_level_0,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
new_awesome_bar,0,0,0,1,0,0,0
windows_child_mode,0,0,0,0,0,0,0
apz_async_scrolling,0,0,0,0,0,0,0
browser_customization,0,0,0,0,0,0,0
pdf_viewer,0,0,0,0,0,0,0
context_menu,1,0,0,0,0,0,0
w10_comp,0,0,0,0,0,0,0
tts_in_desktop,0,0,0,0,0,0,0
tts_in_rm,0,0,0,0,0,0,0
webgl_comp,0,0,0,0,0,0,0


In [5]:
# Trace links recovered by the model, same window as the oracle preview
# (at most the first 20 rows and first 7 columns).
aux_functions.highlight_df(lsi_model.get_trace_links_df().iloc[:20, :7])

br_name,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
new_awesome_bar,0,0,0,0,1,0,0
windows_child_mode,0,0,0,0,0,0,0
apz_async_scrolling,0,0,0,0,0,0,0
browser_customization,0,0,0,0,0,0,0
pdf_viewer,0,0,0,0,0,0,0
context_menu,0,1,0,0,0,0,0
w10_comp,0,0,0,0,0,0,0
tts_in_desktop,0,0,0,0,0,0,0
tts_in_rm,0,0,0,0,0,0,0
webgl_comp,0,0,0,0,0,0,0


In [6]:
# Similarity matrix behind the recovered links
# (at most the first 20 rows and first 7 columns).
aux_functions.highlight_df(lsi_model.get_sim_matrix().iloc[:20, :7])

br_name,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
new_awesome_bar,0.344829,0.29829,0.590112,0.654052,0.944804,0.158683,0.650807
windows_child_mode,0.410335,0.0476185,0.387082,0.148649,0.0601579,0.202815,0.198118
apz_async_scrolling,0.132236,0.000877407,0.0270014,0.0384289,0.0518481,0.0387036,0.247854
browser_customization,0.148189,0.0188055,0.2358,0.0438369,0.107292,0.491887,0.03531
pdf_viewer,0.018386,0.00423263,0.0401236,0.00986654,0.0133119,0.186707,0.00794736
context_menu,0.0781588,0.973157,0.42588,0.425435,0.126453,0.047533,0.314562
w10_comp,0.489118,0.175973,0.29404,0.188912,0.195068,0.42177,0.494389
tts_in_desktop,0.261668,0.0136667,0.182279,0.031858,0.0724875,0.395945,0.569138
tts_in_rm,0.429789,0.0179371,0.224911,0.0418125,0.102337,0.469172,0.402295
webgl_comp,0.239497,0.0125087,0.263929,0.0291587,0.0663458,0.362398,0.023487
