# Introduction - Using COSINE Metric

In this notebook we demonstrate the use of the **LSI (Latent Semantic Indexing)** technique, from the field of Information Retrieval, to recover trace links between Features and Bug Reports.

We model our study as follows:

* The title, summary and description of each bug report together compose a single query.
* The description of each feature is treated as a single document that must be retrieved in response to the query.

## Import Libraries

In [13]:
# Must run before any `modules.*` import below: this call is expected to make
# the project's `modules` package importable (presumably by extending
# sys.path — confirm in mod_finder_util).
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Project-local modules; `fd`, `tok`, `aux_functions` and `m_eval` are the
# aliases used by the cells further down.
from modules.utils import plots
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok
from modules.utils import aux_functions
from modules.utils import model_evaluator as m_eval

# LSI model class and the object whose members (via `.value`) name its
# hyperparameters.
from modules.models.lsi import LSI
from modules.models.model_hyperps import LSI_Model_Hyperp

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise Plotly's offline notebook mode; connected=True loads plotly.js
# from the online CDN instead of embedding it in the notebook.
init_notebook_mode(connected=True)

from IPython.display import display

# Installs an 'ignore' filter for all warning categories from this point on.
import warnings; warnings.simplefilter('ignore')

## Load Dataset

In [14]:
# Read the three tables up front, in the same order as before so the shape
# lines shown in the cell output keep their order.
features_df = fd.read_features_df()
bug_reports_df = fd.read_bugreports_df()
orc = fd.read_feat_br_trace_df()  # reference trace matrix handed to the evaluator later

# Documents to be retrieved: feature descriptions and their names.
corpus = features_df['feat_desc']
features_names = features_df['feat_name']

# Queries: bug report descriptions and their names.
query = bug_reports_df['br_desc']
bug_reports_names = bug_reports_df['br_name']

Features.shape: (21, 8)
BugReports.shape: (93, 19)
Feat_BR_Trace.shape: (21, 93)


## Evaluate Recovering Efficiency

In order to evaluate the efficiency of the algorithm tested (LSI), we use common metrics applied in the field of IR:

* Precision
* Recall
* F1-score

## Running LSI Model

In [29]:
%%time

lsi_hyperp = {
    LSI_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value : ('cosine' , .75),
    LSI_Model_Hyperp.TOP.value : 10,
    LSI_Model_Hyperp.SVD_MODEL_N_COMPONENTS.value: 100,
    LSI_Model_Hyperp.VECTORIZER_NGRAM_RANGE.value: (1,1),
    LSI_Model_Hyperp.VECTORIZER.value : TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=False),
    LSI_Model_Hyperp.VECTORIZER_TOKENIZER.value : tok.WordNetBased_LemmaTokenizer()
}

lsi_model = LSI(**lsi_hyperp)
lsi_model.set_name('LSI_Model_AllData')
lsi_model.recover_links(corpus, query, features_names, bug_reports_names)

print("\nModel Evaluation -------------------------------------------")
evaluator = m_eval.ModelEvaluator(orc, lsi_model)
evaluator.evaluate_model(verbose=True)


Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of LSI_Model_AllData': 0.06451612903225806,
              'Mean Precision of LSI_Model_AllData': 0.06451612903225806,
              'Mean Recall of LSI_Model_AllData': 0.06451612903225806},
 'Setup': [{'Name': 'LSI_Model_AllData'},
           {'Similarity Measure and Minimum Threshold': ('cosine', 0.75)},
           {'Top Value': 10},
           {'SVD Model': {'algorithm': 'randomized',
                          'n_components': 100,
                          'n_iter': 10,
                          'random_state': 42,
                          'tol': 0.0}},
           {'Vectorizer': {'analyzer': 'word',
                           'binary': False,
                           'decode_error': 'strict',
                           'dtype': <class 'numpy.float64'>,
                           'encoding': 'utf-8',
                           'input': 'content',
                           'lowercase': True,
  

In [16]:
# Preview of the reference trace matrix `orc`: at most the first 20 rows and first 7 columns.
aux_functions.highlight_df(orc.iloc[:20, :7])

Unnamed: 0_level_0,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
new_awesome_bar,0,0,0,1,0,0,0
windows_child_mode,0,0,0,0,0,0,0
apz_async_scrolling,0,0,0,0,0,0,0
browser_customization,0,0,0,0,0,0,0
pdf_viewer,0,0,0,0,0,0,0
context_menu,1,0,0,0,0,0,0
w10_comp,0,0,0,0,0,0,0
tts_in_desktop,0,0,0,0,0,0,0
tts_in_rm,0,0,0,0,0,0,0
webgl_comp,0,0,0,0,0,0,0


In [17]:
# Preview of the trace links recovered by the model: at most the first 20 rows and first 7 columns.
aux_functions.highlight_df(lsi_model.get_trace_links_df().iloc[:20, :7])

br_name,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
new_awesome_bar,0,0,0,0,0,0,0
windows_child_mode,0,0,0,0,0,0,0
apz_async_scrolling,0,0,0,0,0,0,0
browser_customization,0,0,0,0,0,0,0
pdf_viewer,0,0,0,0,0,0,0
context_menu,0,1,0,0,0,0,0
w10_comp,0,0,0,0,0,0,0
tts_in_desktop,0,0,0,0,0,0,0
tts_in_rm,0,0,0,0,0,0,0
webgl_comp,0,0,0,0,0,0,0


In [18]:
# Preview of the model's similarity matrix: at most the first 20 rows and first 7 columns.
aux_functions.highlight_df(lsi_model.get_sim_matrix().iloc[:20, :7])

br_name,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
new_awesome_bar,0.340071,0.399195,0.425931,0.70485,0.741355,0.268173,0.603528
windows_child_mode,0.507652,0.117614,0.462524,0.261069,0.262108,0.28444,0.256055
apz_async_scrolling,0.117173,0.0128948,0.0707576,0.104083,0.114947,0.10395,0.311923
browser_customization,0.485774,0.128302,0.625804,0.258902,0.476544,0.689528,0.15518
pdf_viewer,0.110163,0.0484935,0.221748,0.0978558,0.10807,0.390926,0.0586524
context_menu,0.100598,0.963148,0.32399,0.497059,0.160365,0.0669339,0.418434
w10_comp,0.706617,0.285129,0.52153,0.418449,0.577659,0.522396,0.501617
tts_in_desktop,0.425711,0.0936982,0.428459,0.189075,0.313217,0.566505,0.679963
tts_in_rm,0.58897,0.111112,0.541962,0.224216,0.412699,0.597149,0.537558
webgl_comp,0.388619,0.0855344,0.508466,0.172601,0.285927,0.517146,0.103453
