# Introduction

In this notebook we demonstrate the use of the **BM25 (Best Matching 25)** Information Retrieval technique to recover trace links between Use Cases and Bug Reports.

We model our study as follows:

* The title, summary and description of each bug report together form a single query.
* The content of each use case is treated as a single document that must be retrieved for the queries it is related to.


## Import Libraries

In [1]:
import pandas as pd
import numpy as np

from gensim.summarization.bm25 import BM25

from sklearn.metrics import precision_recall_fscore_support, pairwise
from sklearn.externals.joblib import Parallel, delayed

from enum import Enum
import pickle

from utils import plots
from utils import oracle_loader as ol
from utils import jedit_dataset as jd
from utils import tokenizers as tok
from utils import aux_functions
from utils import model_evaluator as m_eval
from utils import generic_model as g_model

import warnings; warnings.simplefilter('ignore')

## Load Dataset and Oracle

In [2]:
# Raw inputs: the trace matrix and the table of artifact descriptions.
trace_df = jd.read_trace_df()
artfs_desc_df = jd.read_artfs_desc_df()

# Artifacts are told apart by a marker string found in their description text.
is_use_case = artfs_desc_df.artf_description.str.contains('Use Case ID')
is_bug_report = artfs_desc_df.artf_description.str.contains('Bug Number')
use_cases_df = artfs_desc_df[is_use_case]
bug_reports_df = artfs_desc_df[is_bug_report]

# Use case descriptions are the documents; bug report descriptions are the queries.
corpus = use_cases_df.artf_description
use_cases_names = use_cases_df.artf_name

query = bug_reports_df.artf_description
bug_reports_names = bug_reports_df.artf_name

# The oracle is built from the artifact names and then loaded from the trace matrix.
orc = ol.OracleLoader(use_cases_names, bug_reports_names)
orc.load(trace_df)

# BM25 Model

#### Model Hyperparameters

In [3]:
class BM25_Model_Hyperp(Enum):
    """Names of the keyword arguments read by the BM25 model in this notebook.

    Each member's value is the string key that the model looks up in the
    ``**kwargs`` it receives; a key that is absent falls back to a default.
    """
    # Model name.
    NAME = 'bm25__name'
    # NOTE(review): single underscore, unlike every other key ('bm25__...').
    # Looks like a typo, but in the code visible in this notebook the key is only
    # referenced through this enum -- confirm no external user before changing it.
    TOP = 'bm25_top'
    # BM25 parameters k, b and epsilon.
    K = 'bm25__k'
    B = 'bm25__b'
    EPSILON = 'bm25__epsilon'
    # Tokenizer object applied to every document and query.
    TOKENIZER = 'bm25__tokenizer'
    # Minimum-threshold setting; this notebook passes tuples such as ('-', 0.0).
    SIM_MEASURE_MIN_THRESHOLD = 'bm25__sim_measure_min_threshold'

#### Quick Test with Model

#### Model Definition

In [4]:
"""
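Example of the keyword arguments accepted by BM_25
(keys are the values of BM25_Model_Hyperp):

params_dict = {
    'bm25__k' : 1.2,
    'bm25__b' : 0.75,
    'bm25__epsilon' : 0.25,
    'bm25__name' : 'BM25',
    'bm25_top' : 3,
    'bm25__tokenizer' : tok.WordNetBased_LemmaTokenizer(),
    'bm25__sim_measure_min_threshold' : ('-', 0.0)
}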
"""
class BM_25(g_model.GenericModel):
    """Trace link recovery model that scores use cases against bug reports with BM25.

    Keyword arguments are looked up by the string values of ``BM25_Model_Hyperp``;
    a key that is absent falls back to the default listed here:

    * ``NAME`` -- model name (default ``'BM25'``).
    * ``TOP`` -- value forwarded to ``set_top`` (default ``3``).
    * ``SIM_MEASURE_MIN_THRESHOLD`` -- value forwarded to
      ``set_sim_measure_min_threshold`` (default ``('', 0.0)``).
    * ``K``, ``B``, ``EPSILON`` -- stored as ``self.k``, ``self.b`` and
      ``self.epsilon`` (defaults ``1.2``, ``0.75``, ``0.25``).
    * ``TOKENIZER`` -- callable applied to every document and query
      (default ``tok.WordNetBased_LemmaTokenizer()``).

    NOTE(review): ``k``, ``b`` and ``epsilon`` are stored and reported by
    ``model_setup`` but are not passed to the gensim ``BM25`` object built in
    ``recover_links``; presumably the scores are computed with gensim's own
    defaults -- TODO confirm and wire the values through.
    """

    def __init__(self, **kwargs):
        """Create the model and read its hyperparameters from ``kwargs``."""
        self.k = None
        self.b = None
        self.epsilon = None
        self.tokenizer = None
        self._sim_matrix = None

        super().__init__()

        self.set_basic_params(**kwargs)
        self.set_tokenizer(**kwargs)

    # The accessors below only delegate to GenericModel. They are kept because the
    # base class is not visible from this notebook (it may require the overrides).

    def set_name(self, name):
        """Set the model name (delegates to the base class)."""
        super().set_name(name)

    def set_model_gen_name(self, gen_name):
        """Set the generic model name (delegates to the base class)."""
        super().set_model_gen_name(gen_name)

    def set_top(self, top):
        """Set the "top" value (delegates to the base class)."""
        super().set_top(top)

    def set_sim_measure_min_threshold(self, threshold):
        """Set the minimum-threshold setting (delegates to the base class)."""
        super().set_sim_measure_min_threshold(threshold)

    def set_basic_params(self, **kwargs):
        """Read name, top, threshold, k, b and epsilon from ``kwargs``.

        Every value is optional; missing keys take the defaults documented on
        the class. The generic model name is always set to ``'bm25'``.
        """
        self.set_name(kwargs.get(BM25_Model_Hyperp.NAME.value, 'BM25'))
        self.set_top(kwargs.get(BM25_Model_Hyperp.TOP.value, 3))
        self.set_sim_measure_min_threshold(
            kwargs.get(BM25_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value, ('', 0.0)))
        self.set_model_gen_name('bm25')

        self.k = kwargs.get(BM25_Model_Hyperp.K.value, 1.2)
        self.b = kwargs.get(BM25_Model_Hyperp.B.value, 0.75)
        self.epsilon = kwargs.get(BM25_Model_Hyperp.EPSILON.value, 0.25)

    def set_tokenizer(self, **kwargs):
        """Use the tokenizer given in ``kwargs`` or fall back to the WordNet lemmatizer."""
        tokenizer_key = BM25_Model_Hyperp.TOKENIZER.value
        if tokenizer_key in kwargs:
            self.tokenizer = kwargs[tokenizer_key]
        else:
            # Built only when needed, so no default tokenizer is created for nothing.
            self.tokenizer = tok.WordNetBased_LemmaTokenizer()

    def recover_links(self, corpus, query, use_cases_names, bug_reports_names):
        """Score every use case against every bug report and derive the trace links.

        Parameters
        ----------
        corpus : iterable of str
            Use case texts; each one is a BM25 document.
        query : iterable of str
            Bug report texts; each one is a BM25 query.
        use_cases_names : sequence
            Labels of the use cases, used as the rows of the similarity matrix.
        bug_reports_names : sequence
            Labels of the bug reports, used as the columns of the similarity matrix.

        The scores are stored in ``self._sim_matrix`` (rows: use cases, columns:
        bug reports) and then handed to ``GenericModel._fillUp_traceLinksDf``.
        """
        bm25 = BM25([self.tokenizer(doc) for doc in corpus])

        # Guard the mean: the idf table is empty when no document yields a term.
        idf_values = [float(idf) for idf in bm25.idf.values()]
        average_idf = sum(idf_values) / len(idf_values) if idf_values else 0.0

        tokenized_queries = [self.tokenizer(doc) for doc in query]

        n_use_cases = len(use_cases_names)
        n_bug_reports = len(bug_reports_names)
        sim_values = np.zeros(shape=(n_use_cases, n_bug_reports), dtype='float64')

        # One BM25 call per bug report fills a whole column at once. Positions
        # that have no score (shorter inputs) keep their initial 0.0.
        for col, bug_tokens in zip(range(n_bug_reports), tokenized_queries):
            scores = bm25.get_scores(bug_tokens, average_idf=average_idf)
            n_rows = min(n_use_cases, len(scores))
            sim_values[:n_rows, col] = scores[:n_rows]

        self._sim_matrix = pd.DataFrame(data=sim_values,
                                        index=use_cases_names,
                                        columns=bug_reports_names)
        super()._fillUp_traceLinksDf(use_cases_names, bug_reports_names, self._sim_matrix)

    def model_setup(self):
        """Return the model configuration as ``{"Setup": [ {label: value}, ... ]}``."""
        return {"Setup" :
                  [
                      {"Name" : self.get_name()},
                      {"Top Value" : self.get_top_value()},
                      {"Sim Measure Min Threshold" : self.get_sim_measure_min_threshold()},
                      {"K" : self.k},
                      {"B" : self.b},
                      {"Epsilon" : self.epsilon},
                      {"Tokenizer Type" : type(self.tokenizer)}
                  ]
               }

    def get_name(self):
        """Return the model name (delegates to the base class)."""
        return super().get_name()

    def get_model_gen_name(self):
        """Return the generic model name (delegates to the base class)."""
        return super().get_model_gen_name()

    def get_top_value(self):
        """Return the "top" value (delegates to the base class)."""
        return super().get_top_value()

    def get_sim_measure_min_threshold(self):
        """Return the minimum-threshold setting (delegates to the base class)."""
        return super().get_sim_measure_min_threshold()

    def get_sim_matrix(self):
        """Return the similarity matrix (delegates to the base class)."""
        return super().get_sim_matrix()

    def get_tokenizer_type(self):
        """Return the class of the tokenizer in use."""
        return type(self.tokenizer)

    def get_trace_links_df(self):
        """Return the recovered trace links (delegates to the base class)."""
        return super().get_trace_links_df()

    def save_sim_matrix(self):
        """Save the similarity matrix (delegates to the base class)."""
        super().save_sim_matrix()

    def get_model_dump_path(self):
        """Return the model dump path (delegates to the base class)."""
        return super().get_model_dump_path()

## Evaluate Recovering Efficiency

In order to evaluate the effectiveness of the algorithm tested (BM25), we use common metrics applied in the field of IR:

    * Precision
    * Recall
    * F1-score

#### Analysis with Default Values of BM25 Model

In [5]:
# Baseline: a BM_25 model built without arguments, i.e. with all default hyperparameters.
best_model = BM_25()
best_model.recover_links(corpus=corpus,
                         query=query,
                         use_cases_names=use_cases_names,
                         bug_reports_names=bug_reports_names)

# Evaluation of the models is done in the hyperparameter search cell further down;
# here only the first rows of the similarity matrix are shown.
df = pd.DataFrame(best_model.get_sim_matrix())
df.head(10)

artf_name,BR_4020_SRC,BR_3890_SRC,BR_3844_SRC,BR_4065_SRC,BR_3880_SRC,BR_3987_SRC,BR_4067_SRC,BR_3973_SRC,BR_3898_SRC,BR_3908_SRC,BR_4058_SRC,BR_4018_SRC,BR_4005_SRC,BR_3974_SRC
artf_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
UC_003_TRG,7.089528,0.0,0.923743,1.11465,2.150948,0.355476,4.012595,1.697277,3.342732,3.461319,1.227205,5.756607,7.164948,0.0
UC_007_TRG,9.834788,1.797495,1.810439,0.0,0.742184,0.366688,3.204764,0.542561,0.742184,1.096859,9.115214,4.190159,0.33044,0.0
UC_010_TRG,16.810307,0.0,6.353957,0.183335,1.333368,0.347976,0.916676,0.183335,2.280733,17.809227,0.183335,2.415047,1.46993,0.0
UC_002_TRG,5.330881,0.0,0.0,0.19285,1.533159,0.342094,2.128454,2.101317,1.219134,0.877039,0.342094,4.030716,2.237427,0.0
UC_006_TRG,5.080511,0.0,0.0,0.241239,0.368304,0.338581,2.452156,0.609543,1.711465,0.958172,0.368304,4.94281,2.290405,0.0
UC_004_TRG,3.906332,0.0,0.0,0.0,1.524473,0.351135,3.682771,2.108979,1.053406,0.713617,0.351135,3.886018,5.45972,0.0
UC_005_TRG,5.319835,0.0,0.0,0.186401,0.371506,0.349795,2.502583,0.816893,1.704921,0.907702,0.371506,5.111546,2.367286,0.0
UC_008_TRG,13.300459,0.0,0.0,3.028716,9.086149,0.350704,6.057432,0.485578,6.057432,3.364868,0.0,7.028083,31.087417,0.0
UC_001_TRG,2.346083,0.0,1.031577,1.293069,1.135627,0.356966,3.094731,0.455406,2.949481,8.640988,0.0,4.156527,4.60412,0.0
UC_009_TRG,5.418284,1.931059,1.124214,0.177497,1.301711,0.344392,0.887485,0.177497,70.831526,45.959684,3.38884,4.203115,0.500531,0.0


### Find The Best Model

In [6]:
# Tokenizers to compare; one instance of each is shared by all combinations.
candidate_tokenizers = [
    tok.PorterStemmerBased_Tokenizer(),
    tok.LancasterStemmerBased_Tokenizer(),
    tok.WordNetBased_LemmaTokenizer(),
    tok.SnowballStemmerBased_Tokenizer(),
]

# Search space: every key maps to the list of values to try for that hyperparameter.
all_hyperparams = {
    BM25_Model_Hyperp.TOP.value : [3, 5],
    BM25_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value : [('-', 0.0)],
    BM25_Model_Hyperp.TOKENIZER.value : candidate_tokenizers,
}

# Expand the search space into the list of combinations to evaluate.
hyperparams = aux_functions.generate_params_comb_list(**all_hyperparams)

print('Performing model hyperparameters search...')

def run_model(idx, **hyperp):
    """Build, run, evaluate and dump one BM_25 model for a hyperparameter combination.

    ``idx`` is used to name the model ``BM25_Model_<idx>``; ``hyperp`` is passed
    to ``BM_25`` unchanged. The notebook-level ``corpus``, ``query``,
    ``use_cases_names``, ``bug_reports_names`` and ``orc`` are used as inputs.

    Returns a list: mean precision, mean recall, mean f-score, model name,
    top value, tokenizer type, model dump path and evaluator dump path.
    """
    model = BM_25(**hyperp)
    model.set_name('BM25_Model_{}'.format(idx))
    model.recover_links(corpus, query, use_cases_names, bug_reports_names)

    model_evaluator = m_eval.ModelEvaluator(orc.oracle, model)
    model_evaluator.evaluate_model()
    model_evaluator.dump_model()
    model_evaluator.dump_evaluator()

    # Measures first, then the description of the evaluated model, then the dump paths.
    result_row = [
        model_evaluator.get_mean_precision(),
        model_evaluator.get_mean_recall(),
        model_evaluator.get_mean_fscore(),
    ]
    evaluated_model = model_evaluator.get_model()
    result_row.extend([
        evaluated_model.get_name(),
        evaluated_model.get_top_value(),
        evaluated_model.get_tokenizer_type(),
        evaluated_model.get_model_dump_path(),
    ])
    result_row.append(model_evaluator.get_evaluator_dump_path())
    return result_row

# Number the combinations so that every model gets a distinct name.
tasks = list(enumerate(hyperparams))

# Evaluate all combinations in parallel (n_jobs=-1).
parallel_runner = Parallel(n_jobs=-1, verbose=1)
results = parallel_runner(delayed(run_model)(idx, **hp) for idx, hp in tasks)

# One row per combination, in the order of the values returned by run_model.
result_columns = ['precision', 'recall', 'fscore', 'model_name',
                  'top_value', 'tokenizer', 'model_dump', 'evaluator_dump']
results_df = pd.DataFrame(data=results, columns=result_columns)
results_df = results_df.astype(dtype={'model_dump' : str, 'evaluator_dump' : str})


Performing model hyperparameters search...


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.1s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.2s finished


### Report

In [7]:
# Report on the search results and keep the returned object; the cells below use it
# as a model (save_sim_matrix, get_trace_links_df). This rebinds `best_model`, which
# previously held the default-valued model.
best_model = aux_functions.report_best_model(results_df)

------------ Report -------------------

Total of Analyzed Hyperparameters Combinations: 8

Best Model Hyperparameters Combination Found:

{'Measures': {'Mean FScore of BM25_Model_6': 0.2859461966604825,
              'Mean Precision of BM25_Model_6': 0.17857142857142858,
              'Mean Recall of BM25_Model_6': 0.8095238095238095},
 'Setup': [{'Name': 'BM25_Model_6'},
           {'Top Value': 5},
           {'Sim Measure Min Threshold': ('-', 0.0)},
           {'K': 1.2},
           {'B': 0.75},
           {'Epsilon': 0.25},
           {'Tokenizer Type': <class 'utils.tokenizers.WordNetBased_LemmaTokenizer'>}]}


### Save Similarity Matrix

In [8]:
# Save the similarity matrix of the selected model.
# NOTE(review): the destination is decided by the model's save_sim_matrix and is not visible here.
best_model.save_sim_matrix()

#### Best Model for TOP 3 and 5

In [9]:
# Print one report for the top-3 setting and one for the top-5 setting (see the output below).
aux_functions.print_report_top_3_and_5_v1(results_df)

{'Measures': {'Mean FScore of BM25_Model_2': 0.30346320346320343,
              'Mean Precision of BM25_Model_2': 0.2119047619047619,
              'Mean Recall of BM25_Model_2': 0.630952380952381},
 'Setup': [{'Name': 'BM25_Model_2'},
           {'Top Value': 3},
           {'Sim Measure Min Threshold': ('-', 0.0)},
           {'K': 1.2},
           {'B': 0.75},
           {'Epsilon': 0.25},
           {'Tokenizer Type': <class 'utils.tokenizers.WordNetBased_LemmaTokenizer'>}]}
------------------------------------------------------------------
{'Measures': {'Mean FScore of BM25_Model_6': 0.2859461966604825,
              'Mean Precision of BM25_Model_6': 0.17857142857142858,
              'Mean Recall of BM25_Model_6': 0.8095238095238095},
 'Setup': [{'Name': 'BM25_Model_6'},
           {'Top Value': 5},
           {'Sim Measure Min Threshold': ('-', 0.0)},
           {'K': 1.2},
           {'B': 0.75},
           {'Epsilon': 0.25},
           {'Tokenizer Type': <class 'utils.tokenize

### Plot Highlights

In [10]:
# Show the trace links recovered by the selected model (rows: use cases, columns: bug reports).
aux_functions.highlight_df(best_model.get_trace_links_df())

artf_name,BR_4020_SRC,BR_3890_SRC,BR_3844_SRC,BR_4065_SRC,BR_3880_SRC,BR_3987_SRC,BR_4067_SRC,BR_3973_SRC,BR_3898_SRC,BR_3908_SRC,BR_4058_SRC,BR_4018_SRC,BR_4005_SRC,BR_3974_SRC
artf_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
UC_003_TRG,1,1,1,1,1,1,1,1,1,1,1,1,1,1
UC_007_TRG,1,1,1,0,0,1,1,0,0,0,1,0,0,1
UC_010_TRG,1,1,1,0,1,0,0,0,1,1,0,0,0,1
UC_002_TRG,0,1,0,1,1,0,0,1,0,0,0,0,0,1
UC_006_TRG,0,1,0,1,0,0,0,1,0,0,1,1,0,1
UC_004_TRG,0,1,0,0,1,1,1,1,0,0,0,0,1,1
UC_005_TRG,0,1,0,0,0,0,0,1,0,0,1,1,1,1
UC_008_TRG,1,1,0,1,1,1,1,0,1,1,0,1,1,1
UC_001_TRG,0,1,1,1,0,1,1,0,1,1,0,0,1,1
UC_009_TRG,1,1,1,0,0,0,0,0,1,1,1,1,0,1


In [11]:
# Show the oracle in the same layout, for comparison with the recovered links above.
aux_functions.highlight_df(orc.oracle)

Unnamed: 0_level_0,BR_4020_SRC,BR_3890_SRC,BR_3844_SRC,BR_4065_SRC,BR_3880_SRC,BR_3987_SRC,BR_4067_SRC,BR_3973_SRC,BR_3898_SRC,BR_3908_SRC,BR_4058_SRC,BR_4018_SRC,BR_4005_SRC,BR_3974_SRC
artf_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
UC_003_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UC_007_TRG,0,0,0,0,0,0,0,0,0,0,1,0,0,0
UC_010_TRG,0,0,1,0,0,0,0,0,0,0,0,0,0,0
UC_002_TRG,0,0,0,0,1,0,0,1,0,0,0,0,0,0
UC_006_TRG,1,1,0,1,1,0,0,0,0,0,0,1,0,1
UC_004_TRG,0,0,0,0,0,1,0,0,0,0,0,0,0,0
UC_005_TRG,1,1,0,0,1,0,1,0,0,0,0,1,0,0
UC_008_TRG,0,0,0,0,0,0,0,0,0,0,0,0,1,0
UC_001_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UC_009_TRG,0,0,0,0,0,0,0,0,1,1,0,0,0,0
