# Introduction

In this notebook we demonstrate the use of **BM25 (Best Matching 25)** Information Retrieval technique to make trace link recovery between Use Cases and Bug Reports.

We model our study as follows:

* Each bug report title, summary and description compose a single query.
* We use each use case content as an entire document that must be returned to the query made


## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from gensim.summarization.bm25 import BM25

import matplotlib.pyplot as plt

from sklearn.metrics import precision_recall_fscore_support, pairwise
from sklearn.externals.joblib import Parallel, delayed
from sklearn.preprocessing import Normalizer, normalize

from scipy.sparse import csr_matrix

import nltk
import datetime
import pprint
from enum import Enum
import pickle

from utils import plots
from utils import oracle_loader as ol
from utils import jedit_dataset as jd
from utils.tokenizers import LancasterStemmerBased_Tokenizer, PorterStemmerBased_Tokenizer, SnowballStemmerBased_Tokenizer, WordNetBased_LemmaTokenizer
from utils import aux_functions
from utils import model_evaluator

import warnings; warnings.simplefilter('ignore')

## Load Dataset and Preprocessing

In [2]:
trace_df = jd.read_trace_df()
artfs_desc_df = jd.read_artfs_desc_df()

use_cases_df = artfs_desc_df[artfs_desc_df.artf_description.str.contains('Use Case ID')]
bug_reports_df = artfs_desc_df[artfs_desc_df.artf_description.str.contains('Bug Number')]

corpus = use_cases_df.artf_description
query = bug_reports_df.artf_description

use_cases_names = use_cases_df.artf_name
bug_reports_names = bug_reports_df.artf_name

orc = ol.OracleLoader(use_cases_names, bug_reports_names)
orc.load(trace_df)

# BM25 Model

#### Model Hyperparameters

In [3]:
class BM25_Model_Hyperp(Enum):
    NAME = 'bm25__name'
    TOP = 'bm25_top'
    K = 'bm25__k'
    B = 'bm25__b'
    EPSILON = 'bm25__epsilon'
    TOKENIZER = 'bm25__tokenizer'
    SIM_MEASURE_MIN_THRESHOLD = 'bm25__sim_measure_min_threshold'

#### Quick Test with Model

#### Model Defintion

In [4]:
"""
params_dict = {
    'bm25__k' : 1.2,
    'bm25__b' : 0.75,
    'bm25__epsilon' : 0.25,
    'bm25__name' : 'BM25',
    'bm25__tokenizer' : Tokenizer(),
    'bm25__min_threshold' : 3
}
"""
class BM_25:
    # k = 1.2, b = 0.75, coord_factor = False
    def __init__(self, **kwargs):
        self.k = None
        self.b = None
        self.epsilon = None
        self.name = None
        self.top = None
        self.sim_measure_min_threshold = None
        self.tokenizer = None
        self.trace_links_df = None
        
        self._sim_matrix = None
        
        self.set_basic_params(**kwargs)
        self.set_tokenizer(**kwargs)
    
    
    def set_basic_params(self, **kwargs):
        self.name = 'BM25' if BM25_Model_Hyperp.NAME.value not in kwargs.keys() else kwargs[BM25_Model_Hyperp.NAME.value]
        self.k = 1.2 if BM25_Model_Hyperp.K.value not in kwargs.keys() else kwargs[BM25_Model_Hyperp.K.value]
        self.b = 0.75 if BM25_Model_Hyperp.B.value not in kwargs.keys() else kwargs[BM25_Model_Hyperp.B.value]
        self.epsilon = 0.25 if BM25_Model_Hyperp.EPSILON.value not in kwargs.keys() else kwargs[BM25_Model_Hyperp.EPSILON.value]
        self.top = 3 if BM25_Model_Hyperp.TOP.value not in kwargs.keys() else kwargs[BM25_Model_Hyperp.TOP.value]
        self.sim_measure_min_threshold = ('', 0.0) if BM25_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value not in kwargs.keys() else kwargs[BM25_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value]
        
    def set_tokenizer(self, **kwargs):
        self.tokenizer = WordNetBased_LemmaTokenizer() if BM25_Model_Hyperp.TOKENIZER.value not in kwargs.keys() else kwargs[BM25_Model_Hyperp.TOKENIZER.value]
        
        #tokenizer_params = {key.split('__')[2]:kwargs[key] for key,val in kwargs.items() if '__tokenizer__' in key}
        #self.tokenizer.set_params(**tokenizer_params)
    
    def set_name(self, name):
        self.name = name
        
    def recover_links(self, corpus, query, use_cases_names, bug_reports_names):
        bm25 = BM25([self.tokenizer.__call__(doc) for doc in corpus])
        average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys())
        query = [self.tokenizer.__call__(doc) for doc in query]
        
        self._sim_matrix = pd.DataFrame(index = use_cases_names, 
                                           columns = bug_reports_names,
                                           data=np.zeros(shape=(len(use_cases_names), len(bug_reports_names)),dtype='float64'))
        
        for bug_id, bug_desc in zip(bug_reports_names, query):
            scores = bm25.get_scores(bug_desc, average_idf=average_idf)
            for uc_id, sc in zip(use_cases_names, scores):
                self._sim_matrix.at[uc_id, bug_id] = sc
        
        self._sim_matrix = pd.DataFrame(self._sim_matrix, index=use_cases_names, columns=bug_reports_names)
        self._fillUp_traceLinksDf(use_cases_names, bug_reports_names, self._sim_matrix)
    
    def _fillUp_traceLinksDf(self, use_cases_names, bug_reports_names, sim_matrix):
        self.trace_links_df = pd.DataFrame(index = use_cases_names,
                                           columns = bug_reports_names,
                                           data = sim_matrix)
                    
        for col in self.trace_links_df.columns:
            nlargest_df = self.trace_links_df.nlargest(n = self.top, columns=col, keep='first')    
            self.trace_links_df[col] = [1 if x in nlargest_df[col].tolist() and x >= self.sim_measure_min_threshold[1] else 0 for x in self.trace_links_df[col]]
    
    def model_setup(self):
        return {"Setup" : 
                  [
                      {"Name" : self.name},
                      {"Top Value" : self.top},
                      {"Sim Measure Min Threshold" : self.sim_measure_min_threshold},
                      {"K" : self.k},
                      {"B" : self.b},
                      {"Epsilon" : self.epsilon},
                      {"Tokenizer Type" : type(self.tokenizer)}
                  ]
               }
    
    def get_name(self):
        return self.name
    
    def get_top_value(self):
        return self.top
    
    def get_sim_matrix(self):
        return self._sim_matrix
    
    def get_tokenizer_type(self):
        return type(self.tokenizer)
    
    def get_trace_links_df(self):
        return self.trace_links_df
    
    def save_sim_matrix(self):
        self._sim_matrix.to_csv('best_models_sim_matrix/bm25.csv')
    
    def get_model_dump(self):
        return 'dumps/bm25/model/{}.p'.format(self.get_name())

## Evaluate Recovering Efficiency

In order to evaluate the efficiency of the algorithm tested (LSI), we use common metrics applied in the field of IR:

    * Precision
    * Recall
    * F1-score

#### Analysis with Default Values of BM25 Model

In [5]:
best_model = BM_25()
best_model.recover_links(corpus, query, use_cases_names, bug_reports_names)

df = pd.DataFrame(best_model.get_sim_matrix())
df.head(10)

#evaluator = ModelEvaluator(orc.oracle, best_model)
#evaluator.evaluate_model(verbose=True)
#evaluator.plot_precision_vs_recall()

artf_name,BR_4020_SRC,BR_3890_SRC,BR_3844_SRC,BR_4065_SRC,BR_3880_SRC,BR_3987_SRC,BR_4067_SRC,BR_3973_SRC,BR_3898_SRC,BR_3908_SRC,BR_4058_SRC,BR_4018_SRC,BR_4005_SRC,BR_3974_SRC
artf_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
UC_003_TRG,7.089528,0.0,0.923743,1.11465,2.150948,0.355476,4.012595,1.697277,3.342732,3.461319,1.227205,5.756607,7.164948,0.0
UC_007_TRG,9.834788,1.797495,1.810439,0.0,0.742184,0.366688,3.204764,0.542561,0.742184,1.096859,9.115214,4.190159,0.33044,0.0
UC_010_TRG,16.810307,0.0,6.353957,0.183335,1.333368,0.347976,0.916676,0.183335,2.280733,17.809227,0.183335,2.415047,1.46993,0.0
UC_002_TRG,5.330881,0.0,0.0,0.19285,1.533159,0.342094,2.128454,2.101317,1.219134,0.877039,0.342094,4.030716,2.237427,0.0
UC_006_TRG,5.080511,0.0,0.0,0.241239,0.368304,0.338581,2.452156,0.609543,1.711465,0.958172,0.368304,4.94281,2.290405,0.0
UC_004_TRG,3.906332,0.0,0.0,0.0,1.524473,0.351135,3.682771,2.108979,1.053406,0.713617,0.351135,3.886018,5.45972,0.0
UC_005_TRG,5.319835,0.0,0.0,0.186401,0.371506,0.349795,2.502583,0.816893,1.704921,0.907702,0.371506,5.111546,2.367286,0.0
UC_008_TRG,13.300459,0.0,0.0,3.028716,9.086149,0.350704,6.057432,0.485578,6.057432,3.364868,0.0,7.028083,31.087417,0.0
UC_001_TRG,2.346083,0.0,1.031577,1.293069,1.135627,0.356966,3.094731,0.455406,2.949481,8.640988,0.0,4.156527,4.60412,0.0
UC_009_TRG,5.418284,1.931059,1.124214,0.177497,1.301711,0.344392,0.887485,0.177497,70.831526,45.959684,3.38884,4.203115,0.500531,0.0


### Find The Best Model

In [6]:
all_hyperparams = {
    BM25_Model_Hyperp.TOP.value : [3,5],
    BM25_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value : [('-', 0.0)],
    BM25_Model_Hyperp.TOKENIZER.value : [PorterStemmerBased_Tokenizer(), LancasterStemmerBased_Tokenizer(), 
                                                   WordNetBased_LemmaTokenizer(), SnowballStemmerBased_Tokenizer()]
}

hyperparams = aux_functions.generate_params_comb_list(**all_hyperparams)

print('Performing model hyperparameters search...')

def run_model(idx, model_name, **hyperp):    
    current_model = BM_25(**hyperp)
    current_model.set_name(model_name.format(idx))
    current_model.recover_links(corpus, query, use_cases_names, bug_reports_names)
    
    evaluator = ModelEvaluator(orc.oracle, current_model)
    evaluator.evaluate_model()
    
    model_dump = current_model.get_model_dump()
    evaluator_dump = evaluator.get_evaluator_dump()
    
    pickle.dump(evaluator.get_model(), open(model_dump, 'wb'))
    pickle.dump(evaluator, open(evaluator_dump, 'wb'))
    
    return([evaluator.get_mean_precision(), 
                    evaluator.get_mean_recall(),
                    evaluator.get_mean_fscore(), 
                    evaluator.get_model().get_name(),
                    evaluator.get_model().get_top_value(),
                    evaluator.get_model().get_tokenizer_type(),
                    model_dump,
                    evaluator_dump
           ])

tasks = [(idx,'BM25_Model_{}',hp) for idx,hp in enumerate(hyperparams)]
results = Parallel(n_jobs=-1, verbose=1)(delayed(run_model)(idx, mn, **hp) for idx,mn,hp in tasks)
results_df = pd.DataFrame(data=results, 
                          columns=['precision', 'recall', 'fscore', 'model_name', 'top_value', 'tokenizer', 'model_dump', 'evaluator_dump'])
results_df = results_df.astype(dtype={'model_dump' : str, 'evaluator_dump' : str})


Performing model hyperparameters search...


JoblibNameError: JoblibNameError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x7fd701e86930, file "/...3.6/site-packages/ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/home/guilherme/anaconda3/envs/trace-link-recove...ges/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/home/guilherme/anaconda3/envs/trace-link-recove...lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/home/guilhe.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x7fd701e86930, file "/...3.6/site-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/home/guilherme/anaconda3/envs/trace-link-recove...ges/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/home/guilherme/anaconda3/envs/trace-link-recove...lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/home/guilhe.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    492         if self.poller is not None:
    493             self.poller.start()
    494         self.kernel.start()
    495         self.io_loop = ioloop.IOLoop.current()
    496         try:
--> 497             self.io_loop.start()
        self.io_loop.start = <bound method BaseAsyncIOLoop.start of <tornado.platform.asyncio.AsyncIOMainLoop object>>
    498         except KeyboardInterrupt:
    499             pass
    500 
    501 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/tornado/platform/asyncio.py in start(self=<tornado.platform.asyncio.AsyncIOMainLoop object>)
    127         except (RuntimeError, AssertionError):
    128             old_loop = None
    129         try:
    130             self._setup_logging()
    131             asyncio.set_event_loop(self.asyncio_loop)
--> 132             self.asyncio_loop.run_forever()
        self.asyncio_loop.run_forever = <bound method BaseEventLoop.run_forever of <_Uni...EventLoop running=True closed=False debug=False>>
    133         finally:
    134             asyncio.set_event_loop(old_loop)
    135 
    136     def stop(self):

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/asyncio/base_events.py in run_forever(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
    417             sys.set_asyncgen_hooks(firstiter=self._asyncgen_firstiter_hook,
    418                                    finalizer=self._asyncgen_finalizer_hook)
    419         try:
    420             events._set_running_loop(self)
    421             while True:
--> 422                 self._run_once()
        self._run_once = <bound method BaseEventLoop._run_once of <_UnixS...EventLoop running=True closed=False debug=False>>
    423                 if self._stopping:
    424                     break
    425         finally:
    426             self._stopping = False

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/asyncio/base_events.py in _run_once(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
   1429                         logger.warning('Executing %s took %.3f seconds',
   1430                                        _format_handle(handle), dt)
   1431                 finally:
   1432                     self._current_handle = None
   1433             else:
-> 1434                 handle._run()
        handle._run = <bound method Handle._run of <Handle IOLoop._run_callback(functools.par...7fd6f80d4048>))>>
   1435         handle = None  # Needed to break cycles when an exception occurs.
   1436 
   1437     def _set_coroutine_wrapper(self, enabled):
   1438         try:

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/asyncio/events.py in _run(self=<Handle IOLoop._run_callback(functools.par...7fd6f80d4048>))>)
    140             self._callback = None
    141             self._args = None
    142 
    143     def _run(self):
    144         try:
--> 145             self._callback(*self._args)
        self._callback = <bound method IOLoop._run_callback of <tornado.platform.asyncio.AsyncIOMainLoop object>>
        self._args = (functools.partial(<function wrap.<locals>.null_wrapper at 0x7fd6f80d4048>),)
    146         except Exception as exc:
    147             cb = _format_callback_source(self._callback, self._args)
    148             msg = 'Exception in callback {}'.format(cb)
    149             context = {

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/tornado/ioloop.py in _run_callback(self=<tornado.platform.asyncio.AsyncIOMainLoop object>, callback=functools.partial(<function wrap.<locals>.null_wrapper at 0x7fd6f80d4048>))
    753         """Runs a callback with error handling.
    754 
    755         For use in subclasses.
    756         """
    757         try:
--> 758             ret = callback()
        ret = undefined
        callback = functools.partial(<function wrap.<locals>.null_wrapper at 0x7fd6f80d4048>)
    759             if ret is not None:
    760                 from tornado import gen
    761                 # Functions that return Futures typically swallow all
    762                 # exceptions and store them in the Future.  If a Future

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=(), **kwargs={})
    295         # Fast path when there are no active contexts.
    296         def null_wrapper(*args, **kwargs):
    297             try:
    298                 current_state = _state.contexts
    299                 _state.contexts = cap_contexts[0]
--> 300                 return fn(*args, **kwargs)
        args = ()
        kwargs = {}
    301             finally:
    302                 _state.contexts = current_state
    303         null_wrapper._wrapped = True
    304         return null_wrapper

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in <lambda>()
    531             return
    532 
    533         if state & self.socket.events:
    534             # events still exist that haven't been processed
    535             # explicitly schedule handling to avoid missing events due to edge-triggered FDs
--> 536             self.io_loop.add_callback(lambda : self._handle_events(self.socket, 0))
    537 
    538     def _init_io_state(self):
    539         """initialize the ioloop event handler"""
    540         with stack_context.NullContext():

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=0)
    445             return
    446         zmq_events = self.socket.EVENTS
    447         try:
    448             # dispatch events:
    449             if zmq_events & zmq.POLLIN and self.receiving():
--> 450                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    451                 if not self.socket:
    452                     return
    453             if zmq_events & zmq.POLLOUT and self.sending():
    454                 self._handle_send()

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    475             else:
    476                 raise
    477         else:
    478             if self._recv_callback:
    479                 callback = self._recv_callback
--> 480                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    481         
    482 
    483     def _handle_send(self):
    484         """Handle a send event."""

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    427         close our socket."""
    428         try:
    429             # Use a NullContext to ensure that all StackContexts are run
    430             # inside our blanket exception handler rather than outside.
    431             with stack_context.NullContext():
--> 432                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    433         except:
    434             gen_log.error("Uncaught exception in ZMQStream callback",
    435                           exc_info=True)
    436             # Re-raise the exception so that IOLoop.handle_callback_exception

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    295         # Fast path when there are no active contexts.
    296         def null_wrapper(*args, **kwargs):
    297             try:
    298                 current_state = _state.contexts
    299                 _state.contexts = cap_contexts[0]
--> 300                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    301             finally:
    302                 _state.contexts = current_state
    303         null_wrapper._wrapped = True
    304         return null_wrapper

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': "all_hyperparams = {\n    BM25_Model_Hyperp.TOP.va...pe={'model_dump' : str, 'evaluator_dump' : str})\n", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 11, 26, 14, 41, 49, 247011, tzinfo=tzutc()), 'msg_id': 'c5b21ae3-c958-4b1b-a995-2b1a01a29dc1', 'msg_type': 'execute_request', 'session': 'bfc29ab5-b8e5-4d2e-9853-8a3f91d2a847', 'username': '', 'version': '5.2'}, 'metadata': {'cellId': '892e548b-41be-4e16-bdf8-faa249231fec', 'deletedCells': []}, 'msg_id': 'c5b21ae3-c958-4b1b-a995-2b1a01a29dc1', 'msg_type': 'execute_request', 'parent_header': {}})
    228             self.log.warning("Unknown message type: %r", msg_type)
    229         else:
    230             self.log.debug("%s: %s", msg_type, msg)
    231             self.pre_handler_hook()
    232             try:
--> 233                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'bfc29ab5-b8e5-4d2e-9853-8a3f91d2a847']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': "all_hyperparams = {\n    BM25_Model_Hyperp.TOP.va...pe={'model_dump' : str, 'evaluator_dump' : str})\n", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 11, 26, 14, 41, 49, 247011, tzinfo=tzutc()), 'msg_id': 'c5b21ae3-c958-4b1b-a995-2b1a01a29dc1', 'msg_type': 'execute_request', 'session': 'bfc29ab5-b8e5-4d2e-9853-8a3f91d2a847', 'username': '', 'version': '5.2'}, 'metadata': {'cellId': '892e548b-41be-4e16-bdf8-faa249231fec', 'deletedCells': []}, 'msg_id': 'c5b21ae3-c958-4b1b-a995-2b1a01a29dc1', 'msg_type': 'execute_request', 'parent_header': {}}
    234             except Exception:
    235                 self.log.error("Exception in message handler:", exc_info=True)
    236             finally:
    237                 self.post_handler_hook()

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'bfc29ab5-b8e5-4d2e-9853-8a3f91d2a847'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': "all_hyperparams = {\n    BM25_Model_Hyperp.TOP.va...pe={'model_dump' : str, 'evaluator_dump' : str})\n", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 11, 26, 14, 41, 49, 247011, tzinfo=tzutc()), 'msg_id': 'c5b21ae3-c958-4b1b-a995-2b1a01a29dc1', 'msg_type': 'execute_request', 'session': 'bfc29ab5-b8e5-4d2e-9853-8a3f91d2a847', 'username': '', 'version': '5.2'}, 'metadata': {'cellId': '892e548b-41be-4e16-bdf8-faa249231fec', 'deletedCells': []}, 'msg_id': 'c5b21ae3-c958-4b1b-a995-2b1a01a29dc1', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code="all_hyperparams = {\n    BM25_Model_Hyperp.TOP.va...pe={'model_dump' : str, 'evaluator_dump' : str})\n", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    203 
    204         self._forward_input(allow_stdin)
    205 
    206         reply_content = {}
    207         try:
--> 208             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = "all_hyperparams = {\n    BM25_Model_Hyperp.TOP.va...pe={'model_dump' : str, 'evaluator_dump' : str})\n"
        store_history = True
        silent = False
    209         finally:
    210             self._restore_input()
    211 
    212         if res.error_before_exec is not None:

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=("all_hyperparams = {\n    BM25_Model_Hyperp.TOP.va...pe={'model_dump' : str, 'evaluator_dump' : str})\n",), **kwargs={'silent': False, 'store_history': True})
    532             )
    533         self.payload_manager.write_payload(payload)
    534 
    535     def run_cell(self, *args, **kwargs):
    536         self._last_traceback = None
--> 537         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ("all_hyperparams = {\n    BM25_Model_Hyperp.TOP.va...pe={'model_dump' : str, 'evaluator_dump' : str})\n",)
        kwargs = {'silent': False, 'store_history': True}
    538 
    539     def _showtraceback(self, etype, evalue, stb):
    540         # try to preserve ordering of tracebacks and print statements
    541         sys.stdout.flush()

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="all_hyperparams = {\n    BM25_Model_Hyperp.TOP.va...pe={'model_dump' : str, 'evaluator_dump' : str})\n", store_history=True, silent=False, shell_futures=True)
   2657         -------
   2658         result : :class:`ExecutionResult`
   2659         """
   2660         try:
   2661             result = self._run_cell(
-> 2662                 raw_cell, store_history, silent, shell_futures)
        raw_cell = "all_hyperparams = {\n    BM25_Model_Hyperp.TOP.va...pe={'model_dump' : str, 'evaluator_dump' : str})\n"
        store_history = True
        silent = False
        shell_futures = True
   2663         finally:
   2664             self.events.trigger('post_execute')
   2665             if not silent:
   2666                 self.events.trigger('post_run_cell', result)

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/IPython/core/interactiveshell.py in _run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="all_hyperparams = {\n    BM25_Model_Hyperp.TOP.va...pe={'model_dump' : str, 'evaluator_dump' : str})\n", store_history=True, silent=False, shell_futures=True)
   2780                 self.displayhook.exec_result = result
   2781 
   2782                 # Execute the user code
   2783                 interactivity = 'none' if silent else self.ast_node_interactivity
   2784                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2785                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2786                 
   2787                 self.last_execution_succeeded = not has_raised
   2788                 self.last_execution_result = result
   2789 

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>, <_ast.FunctionDef object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>], cell_name='<ipython-input-6-f88927b7077b>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 7fd6bf52a1d0, executi...rue silent=False shell_futures=True> result=None>)
   2896             raise ValueError("Interactivity was %r" % interactivity)
   2897         try:
   2898             for i, node in enumerate(to_run_exec):
   2899                 mod = ast.Module([node])
   2900                 code = compiler(mod, cell_name, "exec")
-> 2901                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7fd6bacde9c0, file "<ipython-input-6-f88927b7077b>", line 37>
        result = <ExecutionResult object at 7fd6bf52a1d0, executi...rue silent=False shell_futures=True> result=None>
   2902                     return True
   2903 
   2904             for i, node in enumerate(to_run_interactive):
   2905                 mod = ast.Interactive([node])

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7fd6bacde9c0, file "<ipython-input-6-f88927b7077b>", line 37>, result=<ExecutionResult object at 7fd6bf52a1d0, executi...rue silent=False shell_futures=True> result=None>)
   2956         outflag = True  # happens in more places, so it's easier as default
   2957         try:
   2958             try:
   2959                 self.hooks.pre_run_code_hook()
   2960                 #rprint('Running code', repr(code_obj)) # dbg
-> 2961                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7fd6bacde9c0, file "<ipython-input-6-f88927b7077b>", line 37>
        self.user_global_ns = {'BM25': <class 'gensim.summarization.bm25.BM25'>, 'BM25_Model_Hyperp': <enum 'BM25_Model_Hyperp'>, 'BM_25': <class '__main__.BM_25'>, 'Enum': <enum 'Enum'>, 'In': ['', "import pandas as pd\nimport numpy as np\nimport se...\nimport warnings; warnings.simplefilter('ignore')", 'trace_df = jd.read_trace_df()\nartfs_desc_df = jd...ases_names, bug_reports_names)\norc.load(trace_df)', "class BM25_Model_Hyperp(Enum):\n    NAME = 'bm25_...MIN_THRESHOLD = 'bm25__sim_measure_min_threshold'", '"""\nparams_dict = {\n    \'bm25__k\' : 1.2,\n    \'bm...n \'dumps/bm25/model/{}.p\'.format(self.get_name())', 'best_model = BM_25()\nbest_model.recover_links(co...rbose=True)\n#evaluator.plot_precision_vs_recall()', "all_hyperparams = {\n    BM25_Model_Hyperp.TOP.va...ype={'model_dump' : str, 'evaluator_dump' : str})"], 'LancasterStemmerBased_Tokenizer': <class 'utils.tokenizers.LancasterStemmerBased_Tokenizer'>, 'Normalizer': <class 'sklearn.preprocessing.data.Normalizer'>, 'Out': {5: artf_name   BR_4020_SRC  BR_3890_SRC  BR_3844_SR...3.388840     4.203115     0.500531          0.0  }, 'Parallel': <class 'sklearn.externals.joblib.parallel.Parallel'>, 'PorterStemmerBased_Tokenizer': <class 'utils.tokenizers.PorterStemmerBased_Tokenizer'>, ...}
        self.user_ns = {'BM25': <class 'gensim.summarization.bm25.BM25'>, 'BM25_Model_Hyperp': <enum 'BM25_Model_Hyperp'>, 'BM_25': <class '__main__.BM_25'>, 'Enum': <enum 'Enum'>, 'In': ['', "import pandas as pd\nimport numpy as np\nimport se...\nimport warnings; warnings.simplefilter('ignore')", 'trace_df = jd.read_trace_df()\nartfs_desc_df = jd...ases_names, bug_reports_names)\norc.load(trace_df)', "class BM25_Model_Hyperp(Enum):\n    NAME = 'bm25_...MIN_THRESHOLD = 'bm25__sim_measure_min_threshold'", '"""\nparams_dict = {\n    \'bm25__k\' : 1.2,\n    \'bm...n \'dumps/bm25/model/{}.p\'.format(self.get_name())', 'best_model = BM_25()\nbest_model.recover_links(co...rbose=True)\n#evaluator.plot_precision_vs_recall()', "all_hyperparams = {\n    BM25_Model_Hyperp.TOP.va...ype={'model_dump' : str, 'evaluator_dump' : str})"], 'LancasterStemmerBased_Tokenizer': <class 'utils.tokenizers.LancasterStemmerBased_Tokenizer'>, 'Normalizer': <class 'sklearn.preprocessing.data.Normalizer'>, 'Out': {5: artf_name   BR_4020_SRC  BR_3890_SRC  BR_3844_SR...3.388840     4.203115     0.500531          0.0  }, 'Parallel': <class 'sklearn.externals.joblib.parallel.Parallel'>, 'PorterStemmerBased_Tokenizer': <class 'utils.tokenizers.PorterStemmerBased_Tokenizer'>, ...}
   2962             finally:
   2963                 # Reset our crash handler in place
   2964                 sys.excepthook = old_excepthook
   2965         except SystemExit as e:

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/notebooks/<ipython-input-6-f88927b7077b> in <module>()
     32                     model_dump,
     33                     evaluator_dump
     34            ])
     35 
     36 tasks = [(idx,'BM25_Model_{}',hp) for idx,hp in enumerate(hyperparams)]
---> 37 results = Parallel(n_jobs=-1, verbose=1)(delayed(run_model)(idx, mn, **hp) for idx,mn,hp in tasks)
     38 results_df = pd.DataFrame(data=results, 
     39                           columns=['precision', 'recall', 'fscore', 'model_name', 'top_value', 'tokenizer', 'model_dump', 'evaluator_dump'])
     40 results_df = results_df.astype(dtype={'model_dump' : str, 'evaluator_dump' : str})

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object <genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
NameError                                          Mon Nov 26 11:41:51 2018
PID: 17143Python 3.6.6: /home/guilherme/anaconda3/envs/trace-link-recovery-study/bin/python
...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function run_model>, (0, 'BM25_Model_{}'), {'bm25__sim_measure_min_threshold': ('-', 0.0), 'bm25__tokenizer': <utils.tokenizers.PorterStemmerBased_Tokenizer object>, 'bm25_top': 3})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function run_model>
        args = (0, 'BM25_Model_{}')
        kwargs = {'bm25__sim_measure_min_threshold': ('-', 0.0), 'bm25__tokenizer': <utils.tokenizers.PorterStemmerBased_Tokenizer object>, 'bm25_top': 3}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/home/guilherme/anaconda3/envs/trace-link-recovery-study/notebooks/<ipython-input-6-f88927b7077b> in run_model(idx=0, model_name='BM25_Model_{}', **hyperp={'bm25__sim_measure_min_threshold': ('-', 0.0), 'bm25__tokenizer': <utils.tokenizers.PorterStemmerBased_Tokenizer object>, 'bm25_top': 3})
     12 def run_model(idx, model_name, **hyperp):    
     13     current_model = BM_25(**hyperp)
     14     current_model.set_name(model_name.format(idx))
     15     current_model.recover_links(corpus, query, use_cases_names, bug_reports_names)
     16     
---> 17     evaluator = ModelEvaluator(orc.oracle, current_model)
     18     evaluator.evaluate_model()
     19     
     20     model_dump = current_model.get_model_dump()
     21     evaluator_dump = evaluator.get_evaluator_dump()

NameError: name 'ModelEvaluator' is not defined
___________________________________________________________________________

### Report

In [None]:
print("------------ Report -------------------\n")
print("Total of Analyzed Hyperparameters Combinations: {}".format(len(hyperparams)))

print("\nBest Model Hyperparameters Combination Found:\n")            

row_idx = results_df['model_dump'][results_df.recall == results_df.recall.max()].index[0]
best_model = pickle.load(open(results_df['model_dump'][row_idx], 'rb'))
evalu = pickle.load(open(results_df['evaluator_dump'][row_idx], 'rb'))
evalu.evaluate_model(verbose=True)

#print("\nPlot Precision vs Recall - Best Model")
#evalu.plot_precision_vs_recall()

#print("\nHeatmap of All Models")
#plot_heatmap(results_df)

#evalu.save_log()

### Save Similarity Matrix

In [None]:
best_model.save_sim_matrix()

#### Best Model for TOP 3 and 5

In [None]:
for top in [3,5]:
    row_idx_top = results_df[results_df.top_value == top].recall.argmax()
    best_model_top = pickle.load(open(results_df['model_dump'][row_idx_top], 'rb'))
    evalu_top = pickle.load(open(results_df['evaluator_dump'][row_idx_top], 'rb'))
    evalu_top.evaluate_model(verbose=True)
    print("------------------------------------------------------------------")

### Plot Highlights

In [None]:
highlight_df(best_model.get_trace_links_df())

In [None]:
highlight_df(orc.oracle)