# Introduction - Using COSINE Metric

In this notebook we demonstrate the use of the **LSI (Latent Semantic Indexing)** technique, from the field of Information Retrieval, to recover trace links between Test Cases and Bug Reports.

We model our study as follows:

* Each bug report's title, summary and description together compose a single query.
* We use the content of each test case as an entire document that must be returned for the query made.

## Import Libraries

In [4]:
import sys
if '../..' not in sys.path:
    sys.path.append('../..')

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from modules.utils import plots
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok
from modules.utils import aux_functions
from modules.utils import model_evaluator as m_eval

from modules.models.lsi import LSI
from modules.models.model_hyperps import LSI_Model_Hyperp

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

from IPython.display import display

import warnings; warnings.simplefilter('ignore')

## Load Dataset

In [5]:
# Load the two artifact sets: test cases act as the documents and
# bug reports act as the queries.
test_cases_df = fd.read_testcases_df()
bug_reports_df = fd.read_bugreports_df()

# Texts fed to the model: test-case descriptions form the corpus,
# bug-report descriptions form the queries.
corpus = test_cases_df['tc_desc']
query = bug_reports_df['br_desc']

# Identifiers handed to the model alongside the texts.
test_cases_names = test_cases_df['tc_name']
bug_reports_names = bug_reports_df['br_name']

# Oracle trace matrix, later passed to the model evaluator.
orc = fd.read_trace_df()

TestCases.shape: (207, 12)
BugReports.shape: (93, 19)
Oracle.shape: (207, 93)


## Evaluate Recovery Effectiveness

To evaluate the effectiveness of the tested algorithm (LSI), we use metrics commonly applied in the field of Information Retrieval (IR):

* Precision
* Recall
* F1-score

## Running LSI Model

In [4]:
%%time

lsi_hyperp = {
    LSI_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value : ('cosine' , .80),
    LSI_Model_Hyperp.TOP.value : 10,
    LSI_Model_Hyperp.SVD_MODEL_N_COMPONENTS.value: 100,
    LSI_Model_Hyperp.VECTORIZER_NGRAM_RANGE.value: (1,1),
    LSI_Model_Hyperp.VECTORIZER.value : TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True),
    LSI_Model_Hyperp.VECTORIZER_TOKENIZER.value : tok.WordNetBased_LemmaTokenizer()
}

lsi_model = LSI(**lsi_hyperp)
lsi_model.set_name('LSI_Model_AllData')
lsi_model.recover_links(corpus, query, test_cases_names, bug_reports_names)

print("\nModel Evaluation -------------------------------------------")
evaluator = m_eval.ModelEvaluator(orc, lsi_model)
evaluator.evaluate_model(verbose=True)


Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of LSI_Model_AllData': 0.0030721966205837174,
              'Mean Precision of LSI_Model_AllData': 0.010752688172043012,
              'Mean Recall of LSI_Model_AllData': 0.0017921146953405018},
 'Setup': [{'Name': 'LSI_Model_AllData'},
           {'Similarity Measure and Minimum Threshold': ('cosine', 0.8)},
           {'Top Value': 100},
           {'SVD Model': {'algorithm': 'randomized',
                          'n_components': 100,
                          'n_iter': 10,
                          'random_state': 42,
                          'tol': 0.0}},
           {'Vectorizer': {'analyzer': 'word',
                           'binary': False,
                           'decode_error': 'strict',
                           'dtype': <class 'numpy.float64'>,
                           'encoding': 'utf-8',
                           'input': 'content',
                           'lowercase': Tru

In [5]:
# Preview the top-left corner of the oracle matrix (up to 20 rows, 7 columns).
aux_functions.highlight_df(orc.iloc[:20, :7])

Unnamed: 0_level_0,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TC_1_TRG,0,0,0,0,0,0,0
TC_2_TRG,0,0,0,0,0,0,0
TC_3_TRG,0,0,0,0,0,0,0
TC_4_TRG,0,0,0,0,0,0,0
TC_5_TRG,0,0,0,0,0,0,0
TC_6_TRG,0,0,0,0,0,0,0
TC_7_TRG,0,0,0,0,0,0,0
TC_8_TRG,0,0,0,0,0,0,0
TC_9_TRG,0,0,0,0,0,0,0
TC_10_TRG,0,0,0,0,0,0,0


In [6]:
# Preview the same corner of the trace links recovered by the model, for
# side-by-side comparison with the oracle slice.
aux_functions.highlight_df(lsi_model.get_trace_links_df().iloc[:20, :7])

br_name,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TC_1_TRG,0,0,0,0,0,0,0
TC_2_TRG,0,0,0,0,0,0,0
TC_3_TRG,0,0,0,0,0,0,0
TC_4_TRG,0,0,0,0,0,0,0
TC_5_TRG,0,0,0,0,0,0,0
TC_6_TRG,0,0,0,0,0,0,0
TC_7_TRG,0,0,0,0,0,0,0
TC_8_TRG,0,0,0,0,0,0,0
TC_9_TRG,0,0,0,0,0,0,0
TC_10_TRG,0,0,0,0,0,0,0


In [7]:
# Preview the same corner of the model's similarity matrix (up to 20 rows,
# 7 columns).
aux_functions.highlight_df(lsi_model.get_sim_matrix().iloc[:20, :7])

br_name,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TC_1_TRG,0.166009,0.0302695,0.118855,0.0402435,0.197487,0.169482,0.0875081
TC_2_TRG,0.178568,0.0419425,0.0667533,0.0694235,0.238795,0.108699,0.0511044
TC_3_TRG,0.207219,0.0313484,0.0424547,0.0382382,0.201793,0.123772,0.0203888
TC_4_TRG,0.154775,0.0368831,0.0435161,0.0338954,0.289283,0.106224,0.0166459
TC_5_TRG,0.244159,0.0356419,0.0745114,0.0375381,0.157371,0.179249,0.0516371
TC_6_TRG,0.190981,0.0726353,0.269722,0.0857263,0.270214,0.145943,0.0641959
TC_7_TRG,0.0951049,0.067788,0.338148,0.130708,0.18727,0.0634075,0.0489821
TC_8_TRG,0.37241,0.0489751,0.235335,0.0782138,0.226403,0.141243,0.0621119
TC_9_TRG,0.379654,0.0368069,0.223415,0.0907266,0.20294,0.109846,0.059637
TC_10_TRG,0.059329,0.0355497,0.14702,0.0501149,0.126843,0.0784329,0.0402505
