# Introduction

In this notebook we demonstrate the use of **BM25 (Best Matching 25)** Information Retrieval technique to make trace link recovery between Test Cases and Bug Reports.

We model our study as follows:

* Each bug report title, summary and description compose a single query.
* We use each test case content as an entire document that must be returned to the query made


## Import Libraries

In [5]:
import pandas as pd
import numpy as np

from utils import plots
from utils import firefox_dataset as fd
from utils import tokenizers as tok
from utils import aux_functions
from utils import model_evaluator as m_eval

from models.bm25 import BM_25
from models.model_hyperps import BM25_Model_Hyperp

import warnings; warnings.simplefilter('ignore')

## Load Dataset

In [6]:
test_cases_df = fd.read_testcases_df()
bug_reports_df = fd.read_bugreports_df()

corpus = test_cases_df.tc_desc
query = bug_reports_df.br_desc

test_cases_names = test_cases_df.tc_name
bug_reports_names = bug_reports_df.br_name

orc = fd.read_trace_df()
orc.set_index('tc_name', inplace=True, drop=True)

(35314, 12)
(207, 10)
(207, 35314)


# BM25 Model

#### Quick Test with Model

## Evaluate Recovering Efficiency

In order to evaluate the efficiency of the algorithm tested (LSI), we use common metrics applied in the field of IR:

    * Precision
    * Recall
    * F1-score

#### Analysis with Default Values of BM25 Model

In [5]:
best_model = BM_25()
best_model.recover_links(corpus, query, use_cases_names, bug_reports_names)

df = pd.DataFrame(best_model.get_sim_matrix())
df.head(10)

#evaluator = ModelEvaluator(orc.oracle, best_model)
#evaluator.evaluate_model(verbose=True)
#evaluator.plot_precision_vs_recall()

artf_name,BR_4020_SRC,BR_3890_SRC,BR_3844_SRC,BR_4065_SRC,BR_3880_SRC,BR_3987_SRC,BR_4067_SRC,BR_3973_SRC,BR_3898_SRC,BR_3908_SRC,BR_4058_SRC,BR_4018_SRC,BR_4005_SRC,BR_3974_SRC
artf_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
UC_003_TRG,7.089528,0.0,0.923743,1.11465,2.150948,0.355476,4.012595,1.697277,3.342732,3.461319,1.227205,5.756607,7.164948,0.0
UC_007_TRG,9.834788,1.797495,1.810439,0.0,0.742184,0.366688,3.204764,0.542561,0.742184,1.096859,9.115214,4.190159,0.33044,0.0
UC_010_TRG,16.810307,0.0,6.353957,0.183335,1.333368,0.347976,0.916676,0.183335,2.280733,17.809227,0.183335,2.415047,1.46993,0.0
UC_002_TRG,5.330881,0.0,0.0,0.19285,1.533159,0.342094,2.128454,2.101317,1.219134,0.877039,0.342094,4.030716,2.237427,0.0
UC_006_TRG,5.080511,0.0,0.0,0.241239,0.368304,0.338581,2.452156,0.609543,1.711465,0.958172,0.368304,4.94281,2.290405,0.0
UC_004_TRG,3.906332,0.0,0.0,0.0,1.524473,0.351135,3.682771,2.108979,1.053406,0.713617,0.351135,3.886018,5.45972,0.0
UC_005_TRG,5.319835,0.0,0.0,0.186401,0.371506,0.349795,2.502583,0.816893,1.704921,0.907702,0.371506,5.111546,2.367286,0.0
UC_008_TRG,13.300459,0.0,0.0,3.028716,9.086149,0.350704,6.057432,0.485578,6.057432,3.364868,0.0,7.028083,31.087417,0.0
UC_001_TRG,2.346083,0.0,1.031577,1.293069,1.135627,0.356966,3.094731,0.455406,2.949481,8.640988,0.0,4.156527,4.60412,0.0
UC_009_TRG,5.418284,1.931059,1.124214,0.177497,1.301711,0.344392,0.887485,0.177497,70.831526,45.959684,3.38884,4.203115,0.500531,0.0


## Running BM25 Model with Different Types of Oracles

### Strong and Weak Links Datasets

In [7]:
br_tc_strong_df = pd.read_csv('../../data/mozilla_firefox_v2/firefoxDataset/oracle/output/BR_TC_Strong.csv')
br_tc_weak_df = pd.read_csv('../../data/mozilla_firefox_v2/firefoxDataset/oracle/output/BR_TC_Weak.csv')
br_tc_mix_df = pd.read_csv('../../data/mozilla_firefox_v2/firefoxDataset/oracle/output/BR_TC_Mix.csv')

print(br_tc_strong_df.shape)
print(br_tc_weak_df.shape)
print(br_tc_mix_df.shape)

(10, 4)
(10, 4)
(10, 4)


### Define **run_bm25_model()** Function

In [8]:
def run_bm25_model(selected_tcs, selected_brs):
    tcs_df = test_cases_df[test_cases_df.tc_name.isin(selected_tcs)]
    brs_df = bug_reports_df[bug_reports_df.br_name.isin(selected_brs)]

    corpus_subset = tcs_df.tc_desc
    query_subset = brs_df.br_desc
    testcases_names_subset = tcs_df.tc_name
    bug_reports_names_subset = brs_df.br_name
    orc_subset_df = orc.loc[testcases_names_subset, bug_reports_names_subset]

    print('TestCases Subset Shape: {}'.format(tcs_df.shape))
    print('BugReports Subset Shape: {}'.format(brs_df.shape))
    print('Oracle Subset Shape: {}'.format(orc_subset_df.shape))

    bm25_hyperp = {
        BM25_Model_Hyperp.TOP.value : 5,
        BM25_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value : ('-', 0.0),
        BM25_Model_Hyperp.TOKENIZER.value : tok.PorterStemmerBased_Tokenizer()
    }

    bm25_model = BM_25(**bm25_hyperp)
    bm25_model.set_name('BM25_Model_0')
    bm25_model.recover_links(corpus_subset, query_subset, testcases_names_subset, bug_reports_names_subset)

    print("\nModel Evaluation -------------------------------------------")
    evaluator = m_eval.ModelEvaluator(orc_subset_df, bm25_model)
    evaluator.evaluate_model(verbose=True)
    
    print("\n\nTraceLinks Matrix --------------------------------------")
    display(aux_functions.highlight_df(bm25_model.get_trace_links_df()))

    print("\n\nOracle -----------------------------------------")
    display(aux_functions.highlight_df(orc_subset_df))

### Oracle with Strong Links Only

In [9]:
selected_tcs = ['TC_{}_TRG'.format(tc_num) for tc_num in br_tc_strong_df.TC.values]
selected_brs = ['BR_{}_SRC'.format(bg_num) for bg_num in br_tc_strong_df.BR.values]

run_bm25_model(selected_tcs, selected_brs)

TestCases Subset Shape: (10, 10)
BugReports Subset Shape: (6, 12)
Oracle Subset Shape: (10, 6)

Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of BM25_Model_0': 0.5555555555555555,
              'Mean Precision of BM25_Model_0': 0.6333333333333333,
              'Mean Recall of BM25_Model_0': 0.5158730158730158},
 'Setup': [{'Name': 'BM25_Model_0'},
           {'Top Value': 5},
           {'Sim Measure Min Threshold': ('-', 0.0)},
           {'K': 1.2},
           {'B': 0.75},
           {'Epsilon': 0.25},
           {'Tokenizer Type': <class 'utils.tokenizers.PorterStemmerBased_Tokenizer'>}]}


TraceLinks Matrix --------------------------------------


br_name,BR_1298575_SRC,BR_1313805_SRC,BR_1320658_SRC,BR_1329292_SRC,BR_1329421_SRC,BR_1329430_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TC_118_TRG,0,0,0,0,0,1
TC_120_TRG,1,1,0,1,1,0
TC_121_TRG,1,1,1,1,1,0
TC_143_TRG,0,1,1,0,0,1
TC_155_TRG,0,1,1,0,0,1
TC_172_TRG,1,1,0,1,0,1
TC_181_TRG,0,0,1,0,1,1
TC_183_TRG,0,0,1,0,0,0
TC_196_TRG,1,0,0,1,1,0
TC_197_TRG,1,0,0,1,1,0




Oracle -----------------------------------------


Unnamed: 0_level_0,BR_1298575_SRC,BR_1313805_SRC,BR_1320658_SRC,BR_1329292_SRC,BR_1329421_SRC,BR_1329430_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TC_118_TRG,1,0,0,0,0,0
TC_120_TRG,1,0,0,0,0,0
TC_121_TRG,1,0,0,0,0,0
TC_143_TRG,0,1,1,1,1,1
TC_155_TRG,0,1,1,1,1,1
TC_172_TRG,0,1,1,1,1,1
TC_181_TRG,0,1,1,1,1,1
TC_183_TRG,0,1,1,1,1,1
TC_196_TRG,0,1,1,1,1,1
TC_197_TRG,0,1,1,1,1,1


### Oracle with Weak Links Only

In [10]:
selected_tcs = ['TC_{}_TRG'.format(tc_num) for tc_num in br_tc_weak_df.TC.values]
selected_brs = ['BR_{}_SRC'.format(bg_num) for bg_num in br_tc_weak_df.BR.values]

run_bm25_model(selected_tcs, selected_brs)

TestCases Subset Shape: (10, 10)
BugReports Subset Shape: (5, 12)
Oracle Subset Shape: (10, 5)

Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of BM25_Model_0': 0.480952380952381,
              'Mean Precision of BM25_Model_0': 0.4800000000000001,
              'Mean Recall of BM25_Model_0': 0.657142857142857},
 'Setup': [{'Name': 'BM25_Model_0'},
           {'Top Value': 5},
           {'Sim Measure Min Threshold': ('-', 0.0)},
           {'K': 1.2},
           {'B': 0.75},
           {'Epsilon': 0.25},
           {'Tokenizer Type': <class 'utils.tokenizers.PorterStemmerBased_Tokenizer'>}]}


TraceLinks Matrix --------------------------------------


br_name,BR_1285719_SRC,BR_1298575_SRC,BR_1329292_SRC,BR_1329421_SRC,BR_1329430_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TC_35_TRG,1,0,1,1,1
TC_75_TRG,0,1,0,1,1
TC_105_TRG,0,1,1,0,0
TC_154_TRG,0,0,0,0,0
TC_155_TRG,1,0,0,0,1
TC_174_TRG,1,1,1,1,0
TC_196_TRG,0,1,1,1,0
TC_197_TRG,1,1,1,1,0
TC_200_TRG,0,0,0,0,1
TC_207_TRG,1,0,0,0,1




Oracle -----------------------------------------


Unnamed: 0_level_0,BR_1285719_SRC,BR_1298575_SRC,BR_1329292_SRC,BR_1329421_SRC,BR_1329430_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TC_35_TRG,1,0,0,0,0
TC_75_TRG,0,1,0,0,0
TC_105_TRG,0,1,0,0,0
TC_154_TRG,0,0,1,1,1
TC_155_TRG,0,0,1,1,1
TC_174_TRG,0,0,1,1,1
TC_196_TRG,0,0,1,1,1
TC_197_TRG,0,0,1,1,1
TC_200_TRG,0,0,1,1,1
TC_207_TRG,0,0,1,1,1


### Oracle with Mixed Links (Strong and Weak)

In [11]:
selected_tcs = ['TC_{}_TRG'.format(tc_num) for tc_num in br_tc_mix_df.TC.values]
selected_brs = ['BR_{}_SRC'.format(bg_num) for bg_num in br_tc_mix_df.BR.values]

run_bm25_model(selected_tcs, selected_brs)

TestCases Subset Shape: (10, 10)
BugReports Subset Shape: (5, 12)
Oracle Subset Shape: (10, 5)

Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of BM25_Model_0': 0.3712121212121212,
              'Mean Precision of BM25_Model_0': 0.36000000000000004,
              'Mean Recall of BM25_Model_0': 0.5},
 'Setup': [{'Name': 'BM25_Model_0'},
           {'Top Value': 5},
           {'Sim Measure Min Threshold': ('-', 0.0)},
           {'K': 1.2},
           {'B': 0.75},
           {'Epsilon': 0.25},
           {'Tokenizer Type': <class 'utils.tokenizers.PorterStemmerBased_Tokenizer'>}]}


TraceLinks Matrix --------------------------------------


br_name,BR_1285719_SRC,BR_1298575_SRC,BR_1329292_SRC,BR_1329421_SRC,BR_1329430_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TC_35_TRG,1,0,1,1,1
TC_105_TRG,1,0,1,1,0
TC_118_TRG,1,0,0,0,1
TC_121_TRG,1,1,1,1,0
TC_143_TRG,0,0,0,0,1
TC_154_TRG,0,0,0,0,0
TC_155_TRG,0,1,0,0,1
TC_172_TRG,1,1,1,1,1
TC_174_TRG,0,1,0,0,0
TC_197_TRG,0,1,1,1,0




Oracle -----------------------------------------


Unnamed: 0_level_0,BR_1285719_SRC,BR_1298575_SRC,BR_1329292_SRC,BR_1329421_SRC,BR_1329430_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TC_35_TRG,1,0,0,0,0
TC_105_TRG,0,1,0,0,0
TC_118_TRG,0,1,0,0,0
TC_121_TRG,0,1,0,0,0
TC_143_TRG,0,0,1,1,1
TC_154_TRG,0,0,1,1,1
TC_155_TRG,0,0,1,1,1
TC_172_TRG,0,0,1,1,1
TC_174_TRG,0,0,1,1,1
TC_197_TRG,0,0,1,1,1


### General Test

In [4]:
bugreports_subset_df = bug_reports_df[(bug_reports_df.Version == '48 Branch') | (bug_reports_df.Version == '60 Branch')].sample(15, random_state=42)
testcases_subset_df = test_cases_df[(test_cases_df.TestDay.str.contains('20161014')) | (test_cases_df.TestDay.str.contains('20161028'))].sample(10, random_state=1000)

selected_testcases = ['TC_{}_TRG'.format(tc_num) for tc_num in [13, 14, 15, 16, 17, 18]]  # should link with 48 Branch
aux_tc = test_cases_df[test_cases_df.tc_name.isin(selected_testcases)]

selected_bugreports = bugreports_subset_df.br_name
run_bm25_model(selected_testcases, selected_bugreports)


TestCases Subset Shape: (6, 10)
BugReports Subset Shape: (15, 12)
Oracle Subset Shape: (6, 15)

Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of BM25_Model_0': 0.2424242424242424,
              'Mean Precision of BM25_Model_0': 0.26666666666666666,
              'Mean Recall of BM25_Model_0': 0.22222222222222224},
 'Setup': [{'Name': 'BM25_Model_0'},
           {'Top Value': 5},
           {'Sim Measure Min Threshold': ('-', 0.0)},
           {'K': 1.2},
           {'B': 0.75},
           {'Epsilon': 0.25},
           {'Tokenizer Type': <class 'utils.tokenizers.PorterStemmerBased_Tokenizer'>}]}


TraceLinks Matrix --------------------------------------


br_name,BR_1268934_SRC,BR_1282551_SRC,BR_1291175_SRC,BR_1299787_SRC,BR_1418983_SRC,BR_1432520_SRC,BR_1436749_SRC,BR_1443632_SRC,BR_1443754_SRC,BR_1450216_SRC,BR_1461828_SRC,BR_1463274_SRC,BR_1463735_SRC,BR_1497738_SRC,BR_1513270_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
TC_13_TRG,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1
TC_14_TRG,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1
TC_15_TRG,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
TC_16_TRG,1,0,0,1,1,1,1,1,0,1,0,1,1,0,1
TC_17_TRG,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
TC_18_TRG,1,1,1,0,0,1,1,0,1,0,1,0,0,1,0




Oracle -----------------------------------------


Unnamed: 0_level_0,BR_1268934_SRC,BR_1282551_SRC,BR_1291175_SRC,BR_1299787_SRC,BR_1418983_SRC,BR_1432520_SRC,BR_1436749_SRC,BR_1443632_SRC,BR_1443754_SRC,BR_1450216_SRC,BR_1461828_SRC,BR_1463274_SRC,BR_1463735_SRC,BR_1497738_SRC,BR_1513270_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
TC_13_TRG,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
TC_14_TRG,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
TC_15_TRG,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
TC_16_TRG,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
TC_17_TRG,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
TC_18_TRG,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
