# Introduction - Using COSINE Metric

In this notebook we demonstrate the use of **LSI (Latent Semantic Indexing)** technique of Information Retrieval context to make trace link recovery between Test Cases and Bug Reports.

We model our study as follows:

* Each bug report title, summary and description compose a single query.
* We use each use case content as an entire document that must be returned to the query made

# Import Libraries

In [1]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd

from modules.models_runner.tc_br_models_runner import TC_BR_Runner
from modules.models_runner.tc_br_models_runner import TC_BR_Models_Hyperp
from modules.utils import aux_functions
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok

from modules.models.lsi import LSI

from IPython.display import display

import warnings; warnings.simplefilter('ignore')

# Load Datasets

In [2]:
tcs = [x for x in range(37,59)]
orc = fd.Tc_BR_Oracles.read_oracle_expert_df()
orc_subset = orc[orc.index.isin(tcs)]
#aux_functions.highlight_df(orc_subset)

OracleExpert.shape: (195, 91)


In [3]:
tcs = [13,37,60]
brs = [1267501]

testcases = fd.Datasets.read_testcases_df()
testcases = testcases[testcases.TC_Number.isin(tcs)]
bugreports = fd.Datasets.read_selected_bugreports_df()
bugreports = bugreports[bugreports.Bug_Number.isin(brs)]

print('tc.shape: {}'.format(testcases.shape))
print('br.shape: {}'.format(bugreports.shape))

TestCases.shape: (195, 12)
SelectedBugReports.shape: (91, 18)
tc.shape: (3, 12)
br.shape: (1, 18)


In [4]:
print(bugreports.iloc[0,:].Summary)
bugreports

New Private Browsing start-page overflows off the *left side of the window* (making content unscrollable) for small window sizes


Unnamed: 0,Bug_Number,Summary,Platform,Component,Version,Creation_Time,Whiteboard,QA_Whiteboard,First_Comment_Text,First_Comment_Creation_Time,Status,Product,Priority,Resolution,Severity,Is_Confirmed,br_name,br_desc
6,1267501,New Private Browsing start-page overflows off ...,Unspecified,Private Browsing,48 Branch,2016-04-26T01:12:11Z,[fxprivacy],,STR: 1. Open a new private browsing window. ...,2016-04-26T01:12:11Z,RESOLVED,Firefox,P1,FIXED,normal,True,BR_1267501_SRC,1267501 New Private Browsing start-page overfl...


In [5]:
testcases

Unnamed: 0,TC_Number,TestDay,Feature_ID,Firefox_Feature,Gen_Title,Crt_Nr,Title,Preconditions,Steps,Expected_Result,tc_name,tc_desc
12,13,20160603 + 20160624 + 20161014,1,New Awesome Bar,Awesome Bar Search,1,Default State,,1. Launch Firefox.\t\n2. No AwesomeBar Entry,1. Firefox launches without any issues.\n2. UR...,TC_13_TRG,13 20160603 + 20160624 + 20161014 1 New Awesom...
36,37,20160603 + 20160708,3,APZ - Async Scrolling,APZ - Async Scrolling,1,Scroll through a long web page,- make sure layers.async-pan-zoom.enabled is t...,1. Launch Firefox.\t\n2. Open: https://en.wiki...,"1. \n2.\n3. The scrolling is smooth, without a...",TC_37_TRG,37 20160603 + 20160708 3 APZ - Async Scrolling...
59,60,20160722,4,Browser Customization,browser customization,2,Install and use complete themes,,1. Install a few complete themes.\n2. Restart ...,1. The user is able to initiate installation p...,TC_60_TRG,60 20160722 4 Browser Customization browser cu...


# Running LSI Model

In [6]:
corpus = testcases.tc_desc
query = bugreports.br_desc
test_cases_names = testcases.tc_name
bug_reports_names = bugreports.br_name

lsi_hyperp = TC_BR_Models_Hyperp.get_lsi_model_hyperp()
lsi_model = LSI(**lsi_hyperp)
lsi_model.set_name('LSI_Model_TC_BR')
lsi_model.recover_links(corpus, query, test_cases_names, bug_reports_names)

In [7]:
lsi_model.get_sim_matrix().shape

(3, 1)

In [8]:
sim_matrix = lsi_model.get_sim_matrix()
aux_functions.highlight_df(sim_matrix)

br_name,BR_1267501_SRC
tc_name,Unnamed: 1_level_1
TC_13_TRG,0.462094
TC_37_TRG,0.948398
TC_60_TRG,0.0642657


In [9]:
lsi_model.get_svd_matrix().shape

(3, 3)

In [10]:
svd_matrix = pd.DataFrame(lsi_model.get_svd_matrix())
#svd_matrix.index = test_cases_names
aux_functions.highlight_df(svd_matrix)

Unnamed: 0,0,1,2
0,0.763211,0.0,0.646149
1,0.750932,-0.178656,-0.635753
2,0.136353,0.983912,-0.115439


In [11]:
query_vec = lsi_model._query_vector
query_vec = pd.DataFrame(query_vec)
query_vec.index = bug_reports_names
query_vec

Unnamed: 0_level_0,0,1,2
br_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BR_1267501_SRC,0.434341,-0.050055,-0.175978


In [12]:
from sklearn.metrics import pairwise
results = pd.DataFrame(pairwise.cosine_similarity(X=svd_matrix, Y=query_vec))
results.index = test_cases_names
results.rename(columns={0:bug_reports_names.values[0]}, inplace=True)
aux_functions.highlight_df(results)

Unnamed: 0_level_0,BR_1267501_SRC
tc_name,Unnamed: 1_level_1
TC_13_TRG,0.462094
TC_37_TRG,0.948398
TC_60_TRG,0.0642657


In [13]:
import numpy as np

tokenizer = tok.WordNetBased_LemmaTokenizer()
tokens = [tokenizer.__call__(doc) for doc in testcases.tc_desc]
final_tokens = []
for token_list in tokens:
    for t in token_list:
        final_tokens.append(t)

print(np.unique(final_tokens))
print(len(np.unique(final_tokens)))

['able' 'active' 'all' 'appearance' 'appears' 'apz' 'arrow' 'async'
 'awesome' 'awesomebar' 'bar' 'browser' 'complete' 'config' 'ctrl'
 'customization' 'default' 'disabled' 'display' 'enabled' 'entry'
 'firefox' 'home' 'http' 'initiate' 'install' 'installation' 'installed'
 'issue' 'jerkiness' 'key' 'latest' 'launch' 'lightweight' 'long' 'make'
 'manager' 'mouse' 'nan' 'new' 'no' 'once' 'open' 'page' 'previous'
 'previously' 'process' 'rendering' 'replaces' 'restart' 'restarted'
 'scroll' 'scrolling' 'search' 'section' 'set' 'smooth' 'space' 'state'
 'sure' 'the' 'theme' 'true' 'url' 'use' 'user' 'using' 'web' 'wheel'
 'without']
70


In [18]:
df = pd.DataFrame(lsi_model.svd_model.components_.T)
df.index = lsi_model.vectorizer.get_feature_names()
df.rename(columns={0:'TC_13_TRG',1:'TC_37_TRG',2:'TC_60_TRG'}, inplace=True)
print(df.shape)
aux_functions.highlight_df(df)

(65, 3)


Unnamed: 0,TC_13_TRG,TC_37_TRG,TC_60_TRG
able,0.00928981,0.0780943,-0.0109728
active,0.00928981,0.0780943,-0.0109728
appearance,0.00928981,0.0780943,-0.0109728
appears,0.00928981,0.0780943,-0.0109728
apz,0.153688,-0.0425969,-0.181532
arrow,0.0768442,-0.0212985,-0.090766
async,0.153688,-0.0425969,-0.181532
awesome,0.280415,-1.53494e-16,0.331217
awesomebar,0.140207,-7.67472e-17,0.165609
bar,0.330146,-0.032396,0.113839


In [16]:
import numpy as np

tokenizer = tok.WordNetBased_LemmaTokenizer()
tokens = [tokenizer.__call__(doc) for doc in bugreports.br_desc]
final_tokens = []
for token_list in tokens:
    for t in token_list:
        final_tokens.append(t)

dff = pd.DataFrame(final_tokens)
dff.shape
display(dff)

Unnamed: 0,0
0,new
1,private
2,browsing
3,overflow
4,side
5,making
6,content
7,unscrollable
8,small
9,window
