# Introduction

In this notebook we demonstrate the use of the **BM25 (Best Matching 25)** Information Retrieval technique to recover trace links between Features and Bug Reports.

We model our study as follows:

* Each bug report title, summary and description compose a single query.
* We use each feature description as an entire document that must be returned for the query made.


## Import Libraries

In [8]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import numpy as np

from modules.utils import plots
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok
from modules.utils import aux_functions
from modules.utils import model_evaluator as m_eval

from modules.models.bm25 import BM_25
from modules.models.model_hyperps import BM25_Model_Hyperp

import warnings; warnings.simplefilter('ignore')

## Load Dataset

In [9]:
# Load both sides of the trace-link problem. The readers are called in this
# order because each one prints the shape of what it loaded.
features_df = fd.read_features_df()
bug_reports_df = fd.read_bugreports_df()

# Feature side: descriptions form the corpus, names label the documents.
corpus = features_df["feat_desc"]
features_names = features_df["feat_name"]

# Bug-report side: descriptions are the queries, names label them.
query = bug_reports_df["br_desc"]
bug_reports_names = bug_reports_df["br_name"]

# Feature x bug-report trace matrix, later handed to the model evaluator
# as the reference to score against.
orc = fd.read_feat_br_trace_df()

Features.shape: (21, 8)
BugReports.shape: (93, 19)
Feat_BR_Trace.shape: (21, 93)


# BM25 Model

## Evaluate Recovering Efficiency

In order to evaluate the efficiency of the algorithm tested (BM25), we use common metrics applied in the field of IR:

* Precision
* Recall
* F1-score

## Running BM25 Model

In [10]:
%%time

bm25_hyperp = {
    BM25_Model_Hyperp.TOP.value : 10,
    BM25_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value : ('-', 0.0),
    BM25_Model_Hyperp.TOKENIZER.value : tok.PorterStemmerBased_Tokenizer()
}

bm25_model = BM_25(**bm25_hyperp)
bm25_model.set_name('BM25_Model_AllData')
bm25_model.recover_links(corpus, query, features_names, bug_reports_names)

print("\nModel Evaluation -------------------------------------------")
evaluator = m_eval.ModelEvaluator(orc, bm25_model)
evaluator.evaluate_model(verbose=True)


Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of BM25_Model_AllData': 0.04496578690127076,
              'Mean Precision of BM25_Model_AllData': 0.02473118279569893,
              'Mean Recall of BM25_Model_AllData': 0.24731182795698925},
 'Setup': [{'Name': 'BM25_Model_AllData'},
           {'Top Value': 10},
           {'Sim Measure Min Threshold': ('-', 0.0)},
           {'K': 1.2},
           {'B': 0.75},
           {'Epsilon': 0.25},
           {'Tokenizer Type': <class 'modules.utils.tokenizers.PorterStemmerBased_Tokenizer'>}]}
CPU times: user 649 ms, sys: 4.02 ms, total: 653 ms
Wall time: 651 ms


In [11]:
# Preview the top-left corner of the oracle trace matrix
# (at most the first 20 rows and first 7 columns).
aux_functions.highlight_df(orc.iloc[:20, :7])

Unnamed: 0_level_0,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
new_awesome_bar,0,0,0,1,0,0,0
windows_child_mode,0,0,0,0,0,0,0
apz_async_scrolling,0,0,0,0,0,0,0
browser_customization,0,0,0,0,0,0,0
pdf_viewer,0,0,0,0,0,0,0
context_menu,1,0,0,0,0,0,0
w10_comp,0,0,0,0,0,0,0
tts_in_desktop,0,0,0,0,0,0,0
tts_in_rm,0,0,0,0,0,0,0
webgl_comp,0,0,0,0,0,0,0


In [12]:
# Preview the top-left corner of the trace links recovered by the model
# (at most the first 20 rows and first 7 columns).
aux_functions.highlight_df(bm25_model.get_trace_links_df().iloc[:20, :7])

br_name,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
new_awesome_bar,1,1,1,1,1,1,1
windows_child_mode,1,1,1,1,1,1,1
apz_async_scrolling,1,1,1,1,1,1,1
browser_customization,0,0,0,0,0,1,0
pdf_viewer,0,1,0,0,0,0,0
context_menu,0,1,1,1,1,1,1
w10_comp,1,1,0,1,1,1,1
tts_in_desktop,0,0,0,0,0,0,1
tts_in_rm,0,0,0,0,0,0,1
webgl_comp,0,0,0,0,0,0,0


In [13]:
# Preview the top-left corner of the model's similarity matrix
# (at most the first 20 rows and first 7 columns).
aux_functions.highlight_df(bm25_model.get_sim_matrix().iloc[:20, :7])

br_name,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
new_awesome_bar,36.6928,48.3773,88.4544,42.8078,40.3212,6.43651,55.7437
windows_child_mode,50.4416,23.9618,74.5526,23.3641,23.501,6.95738,40.7501
apz_async_scrolling,50.0095,13.226,64.6668,18.8481,16.2101,5.90332,54.6896
browser_customization,9.22648,10.8258,29.089,10.0557,13.2674,6.19986,17.0895
pdf_viewer,12.8431,16.1602,31.2747,8.1284,11.0537,4.51578,18.2389
context_menu,15.4563,90.6807,129.948,27.4717,25.5741,6.65394,43.0543
w10_comp,22.154,25.4508,36.4506,17.3667,14.9348,5.71471,60.8178
tts_in_desktop,11.0643,8.81533,24.9455,8.81533,11.4599,5.2892,30.1534
tts_in_rm,13.4592,8.83424,26.5824,8.83424,11.642,5.39056,29.2717
webgl_comp,17.7433,8.67684,37.8852,8.67684,12.1476,5.20611,17.3537
