# Introduction

In this notebook we demonstrate the use of the **BM25 (Best Matching 25)** information retrieval technique to recover trace links between test cases and bug reports.

We model our study as follows:

* Each bug report's title, summary, and description together compose a single query.
* Each test case's content is treated as a single document to be retrieved in response to a query.


# Import Libraries

In [1]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd

from modules.models_runner.tc_br_models_runner import TC_BR_Runner
from modules.models_runner.tc_br_models_runner import TC_BR_Models_Hyperp
from modules.utils import aux_functions
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok

from modules.models.bm25 import BM_25

from IPython.display import display

import warnings; warnings.simplefilter('ignore')

# Load Datasets

In [2]:
# Keep only the expert-oracle rows whose index value lies in 37..58
# (presumably test case numbers — confirm against the oracle's index).
tcs = list(range(37, 59))
orc = fd.Tc_BR_Oracles.read_oracle_expert_df()
orc_subset = orc.loc[orc.index.isin(tcs)]
# Uncomment to render the subset:
# aux_functions.highlight_df(orc_subset)

OracleExpert.shape: (195, 91)


In [3]:
# Hand-picked sample for this walkthrough: three test cases and one bug report.
# Note that `tcs` is rebound here; it no longer holds the 37..58 range.
tcs = [13, 37, 60]
brs = [1267501]

# Read each dataset and immediately narrow it to the selected identifiers.
testcases = fd.Datasets.read_testcases_df().loc[
    lambda frame: frame.TC_Number.isin(tcs)
]
bugreports = fd.Datasets.read_selected_bugreports_df().loc[
    lambda frame: frame.Bug_Number.isin(brs)
]

print('tc.shape: {}'.format(testcases.shape))
print('br.shape: {}'.format(bugreports.shape))

TestCases.shape: (195, 12)
SelectedBugReports.shape: (91, 18)
tc.shape: (3, 12)
br.shape: (1, 18)


# Running BM25 Model

In [4]:
# Instantiate the BM25 model with the hyperparameters supplied by
# TC_BR_Models_Hyperp and give it a name.
bm25_hyperp = TC_BR_Models_Hyperp.get_bm25_model_hyperp()
bm25_model = BM_25(**bm25_hyperp)
bm25_model.set_name('BM25_Model_TC_BR')

# Test case descriptions are the corpus; bug report descriptions are the queries.
corpus, test_cases_names = testcases.tc_desc, testcases.tc_name
query, bug_reports_names = bugreports.br_desc, bugreports.br_name

bm25_model.recover_links(corpus, query, test_cases_names, bug_reports_names)

In [5]:
# Sanity check on the similarity matrix dimensions; with the sample selected
# above this is 3 test cases (rows) x 1 bug report (column).
bm25_model.get_sim_matrix().shape

(3, 1)

In [6]:
# Fetch the similarity matrix (rows indexed by tc_name, columns by br_name)
# and render it through the project's highlight helper.
sim_matrix = bm25_model.get_sim_matrix()
aux_functions.highlight_df(sim_matrix)

br_name,BR_1267501_SRC
tc_name,Unnamed: 1_level_1
TC_13_TRG,0.201572
TC_37_TRG,1.0
TC_60_TRG,0.0


In [14]:
# Tokenized corpus held by the underlying BM25 object: one row per document.
# NOTE(review): assumes `bm25.corpus` keeps the documents in the same order as
# `test_cases_names` (the order they were passed to `recover_links`) — confirm.
df = pd.DataFrame(bm25_model.bm25.corpus)
df.index = pd.Index(test_cases_names, name='tc_name')

# Transpose for display: one column per test case, one row per token position.
df.T

Unnamed: 0,0,1,2
0,new,apz,browser
1,awesom,async,custom
2,bar,scroll,browser
3,awesom,apz,custom
4,bar,async,instal
5,search,scroll,use
6,default,scroll,complet
7,state,long,theme
8,,web,
9,launch,page,instal


In [None]:
# Query side of the retrieval: the bug report text that was passed to
# `recover_links`, one row per bug report.
# NOTE(review): nothing used in this notebook exposes the BM25 wrapper's
# internal query representation, so the raw query text is shown instead;
# swap in the tokenized query if the model makes one available.
df_q = pd.DataFrame(
    {'br_desc': query.values},
    index=pd.Index(bug_reports_names, name='br_name'),
)
print(df_q.shape)
display(df_q)