# Introduction

In this notebook we demonstrate the use of the **Word Embeddings (Word2Vec)** technique in Information Retrieval to recover trace links between Use Cases and Bug Reports.

We model our study as follows:

* Each bug report title, summary and description compose a single query.
* The content of each use case is treated as a whole document that must be retrieved for a given query.

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy

from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support, pairwise_distances, pairwise
from sklearn.externals.joblib import Parallel, delayed

from enum import Enum
import pickle
from tqdm import tqdm

from utils import plots
from utils import oracle_loader as ol
from utils import jedit_dataset as jd
from utils import tokenizers as tok
from utils import aux_functions
from utils import model_evaluator as m_eval
from utils import generic_model as g_model

import warnings; warnings.simplefilter('ignore')

## Load Dataset and Preprocessing

In [2]:
# Load the trace data frame and the artifact descriptions through the
# project's jedit_dataset helpers.
trace_df = jd.read_trace_df()
artfs_desc_df = jd.read_artfs_desc_df()

# Split the artifacts by kind, using the marker text contained in each description.
use_cases_df = artfs_desc_df.loc[artfs_desc_df['artf_description'].str.contains('Use Case ID')]
bug_reports_df = artfs_desc_df.loc[artfs_desc_df['artf_description'].str.contains('Bug Number')]

use_cases_names = use_cases_df['artf_name']
bug_reports_names = bug_reports_df['artf_name']

# Use case descriptions are the documents; bug report descriptions are the queries.
corpus = use_cases_df['artf_description']
query = bug_reports_df['artf_description']

# The oracle is built from the artifact names and filled from the trace data frame.
orc = ol.OracleLoader(use_cases_names, bug_reports_names)
orc.load(trace_df)

# WordVec Based Model

#### Model Hyperparameters

In [3]:
class WordVec_Model_Hyperp(Enum):
    """Keyword-argument keys understood by the WordVec based model."""

    # Name given to the model.
    NAME = "wordvec__name"
    # Top value used by the model.
    TOP = "wordvec__top"
    # Tokenizer object applied to every document and query.
    TOKENIZER = "wordvec__tokenizer"
    # (similarity measure, minimum threshold) tuple.
    SIM_MEASURE_MIN_THRESHOLD = "wordvec__sim_measure_min_threshold"

#### Distance Functions / Similarity Measures Available

In [4]:
class SimilarityMeasure(Enum):
    """Similarity measures available to the WordVec based model."""

    # Cosine similarity is the only measure defined here.
    COSINE = "cosine"

#### Model Definition

In [5]:
"""
params_dict = {
    'wordvec__sim_measure_min_threshold' : ('cosine',.9),
    'wordvec__name' : 'WordVec',
    'wordvec__top' : 3,
    'wordvec__tokenizer' : tok.WordNetBased_LemmaTokenizer()
}
"""
class WordVec_BasedModel(g_model.GenericModel):
    """Trace-link recovery model based on spaCy word vectors.

    Every bug report (query) is compared with every use case (document)
    through the similarity of their spaCy ``Doc`` objects. The scores are
    stored in a use-case x bug-report similarity matrix that is handed to
    the generic model to fill the trace links data frame.

    Keyword arguments (all optional; keys come from ``WordVec_Model_Hyperp``):
        wordvec__name: model name (default ``'WordVec'``).
        wordvec__sim_measure_min_threshold: ``(measure, threshold)`` tuple
            (default ``('cosine', .80)``).
        wordvec__top: top value (default ``3``).
        wordvec__tokenizer: callable that turns a text into a sequence of
            string tokens (default ``tok.WordNetBased_LemmaTokenizer()``).
    """

    def __init__(self, **kwargs):
        self._nlp_model = None
        self.tokenizer = None

        super().__init__()

        self.set_basic_params(**kwargs)
        self.set_nlp_model()

    def set_name(self, name):
        """Set the model name (delegates to the generic model)."""
        super().set_name(name)

    def set_basic_params(self, **kwargs):
        """Apply the hyperparameters found in ``kwargs``, falling back to the defaults."""
        name_key = WordVec_Model_Hyperp.NAME.value
        threshold_key = WordVec_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value
        top_key = WordVec_Model_Hyperp.TOP.value
        tokenizer_key = WordVec_Model_Hyperp.TOKENIZER.value

        super().set_name(kwargs.get(name_key, 'WordVec'))
        super().set_sim_measure_min_threshold(
            kwargs.get(threshold_key, (SimilarityMeasure.COSINE.value, .80)))
        super().set_top(kwargs.get(top_key, 3))
        super().set_model_gen_name('wordvector')

        # Explicit branch (instead of kwargs.get) so the default tokenizer is
        # only instantiated when the caller did not provide one.
        if tokenizer_key in kwargs:
            self.tokenizer = kwargs[tokenizer_key]
        else:
            self.tokenizer = tok.WordNetBased_LemmaTokenizer()

    def set_nlp_model(self):
        """
            WordVec based on GloVe 1.1M keys x 300 dim
            300-dimensional word vectors trained on Common Crawl with GloVe.
        """
        self._nlp_model = spacy.load('en_vectors_web_lg')

    def _ensure_nlp_model(self):
        """Load the spaCy model when this instance holds none (e.g. after unpickling)."""
        if getattr(self, '_nlp_model', None) is None:
            self.set_nlp_model()

    def __getstate__(self):
        """Pickle support: serialize everything except the loaded spaCy model."""
        d = dict(self.__dict__)
        d.pop('_nlp_model', None)
        return d

    def __setstate__(self, d):
        """Pickle support: restore the state; the spaCy model is reloaded lazily."""
        self.__dict__.update(d)
        # __getstate__ drops the spaCy model, so the attribute must be recreated
        # here; otherwise recover_links would raise AttributeError on an
        # unpickled instance.
        self.__dict__.setdefault('_nlp_model', None)

    def recover_links(self, corpus, query, use_cases_names, bug_reports_names):
        """Recover the trace links between the use cases (corpus) and the bug reports (query).

        Args:
            corpus: iterable with the use case texts.
            query: iterable with the bug report texts.
            use_cases_names: use case identifiers, aligned with ``corpus``.
            bug_reports_names: bug report identifiers, aligned with ``query``.
        """
        return self._recover_links_cosine(corpus, query, use_cases_names, bug_reports_names)

    def _recover_links_cosine(self, corpus, query, use_cases_names, bug_reports_names):
        """Fill the similarity matrix with ``Doc.similarity`` scores and build the trace links."""
        self._ensure_nlp_model()

        # Tokenize, re-join and parse every text exactly once; parsing inside
        # the pairwise loop would repeat the same spaCy work for each pair.
        uc_docs = [self._nlp_model(' '.join(self.tokenizer(doc))) for doc in corpus]
        bug_docs = [self._nlp_model(' '.join(self.tokenizer(doc))) for doc in query]

        self._sim_matrix = pd.DataFrame(
            index=use_cases_names,
            columns=bug_reports_names,
            data=np.zeros(shape=(len(use_cases_names), len(bug_reports_names)), dtype='float64'))

        for bug_id, bug_doc in zip(bug_reports_names, bug_docs):
            for uc_id, uc_doc in zip(use_cases_names, uc_docs):
                # cosine similarity is the default of Doc.similarity
                self._sim_matrix.at[uc_id, bug_id] = bug_doc.similarity(uc_doc)

        super()._fillUp_traceLinksDf(use_cases_names, bug_reports_names, self._sim_matrix)

    def model_setup(self):
        """Return a dictionary describing the configuration of this model."""
        return {"Setup" :
                  [
                      {"Name" : super().get_name()},
                      {"Similarity Measure and Minimum Threshold" : super().get_sim_measure_min_threshold()},
                      {"Top Value" : super().get_top_value()},
                      {"Tokenizer" : self.tokenizer}
                  ]
               }

    def get_name(self):
        """Return the model name (delegates to the generic model)."""
        return super().get_name()

    def get_top_value(self):
        """Return the top value (delegates to the generic model)."""
        return super().get_top_value()

    def get_sim_measure_min_threshold(self):
        """Return the (similarity measure, minimum threshold) setting."""
        return super().get_sim_measure_min_threshold()

    def get_sim_matrix(self):
        """Return the similarity matrix (delegates to the generic model)."""
        return super().get_sim_matrix()

    def get_tokenizer_type(self):
        """Return the class of the tokenizer in use."""
        return type(self.tokenizer)

    def get_trace_links_df(self):
        """Return the trace links data frame (delegates to the generic model)."""
        return super().get_trace_links_df()

    def save_sim_matrix(self):
        """Save the similarity matrix (delegates to the generic model)."""
        super().save_sim_matrix()

    def get_model_dump_path(self):
        """Return the model dump path (delegates to the generic model)."""
        return super().get_model_dump_path()

## Evaluate Recovering Efficiency

In order to evaluate the efficiency of the tested model (WordVec), we use common metrics applied in the field of IR:

    * Precision
    * Recall
    * F1-score

### Analysis with Default Values of WordVec Model

In [6]:
# Build the model with no keyword arguments, so every hyperparameter takes
# the default defined in WordVec_BasedModel.set_basic_params.
best_model = WordVec_BasedModel()
best_model.recover_links(corpus, query, use_cases_names, bug_reports_names)
# Evaluate the recovered links against the oracle.
# NOTE(review): the effect of verbose=True is defined in
# m_eval.ModelEvaluator.evaluate_model, which is not visible here.
evaluator = m_eval.ModelEvaluator(orc.oracle, best_model)
evaluator.evaluate_model(verbose=True)

{'Measures': {'Mean FScore of WordVec': 0.2714285714285714,
              'Mean Precision of WordVec': 0.19047619047619047,
              'Mean Recall of WordVec': 0.5},
 'Setup': [{'Name': 'WordVec'},
           {'Similarity Measure and Minimum Threshold': ('cosine', 0.8)},
           {'Top Value': 3},
           {'Tokenizer': <utils.tokenizers.WordNetBased_LemmaTokenizer object at 0x7f75d6575358>}]}


### Find The Best Model

In [None]:
# Values explored for each hyperparameter of the WordVec model.
all_hyperparams = {
    WordVec_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value : [(SimilarityMeasure.COSINE.value, x) for x in [.75, .85, .95]],
    WordVec_Model_Hyperp.TOP.value : [3, 5],
    WordVec_Model_Hyperp.TOKENIZER.value : [tok.PorterStemmerBased_Tokenizer(), tok.LancasterStemmerBased_Tokenizer(),
                                            tok.WordNetBased_LemmaTokenizer(), tok.SnowballStemmerBased_Tokenizer()]
}

hyperparams = aux_functions.generate_params_comb_list(**all_hyperparams)

print('Performing model hyperparameters search...')

result_columns = ['precision', 'recall', 'fscore', 'model_name', 'top_value', 'tokenizer',
                  'metric', 'metric_value', 'model_dump', 'evaluator_dump']

# Rows are collected in a plain list and turned into a DataFrame once, after
# the loop: DataFrame.append copied the whole frame on every iteration and no
# longer exists in pandas >= 2.0.
result_rows = []

# tqdm wraps the iterable itself (not enumerate) so it can show the total
# whenever `hyperparams` has a length.
for idx, hp in enumerate(tqdm(hyperparams)):
    current_model = WordVec_BasedModel(**hp)
    current_model.set_name('WordVec_Based_Model_{}'.format(idx))
    current_model.recover_links(corpus, query, use_cases_names, bug_reports_names)

    evaluator = m_eval.ModelEvaluator(orc.oracle, current_model)
    evaluator.evaluate_model()
    evaluator.dump_model()
    evaluator.dump_evaluator()

    evaluated_model = evaluator.get_model()
    sim_measure_min_threshold = evaluated_model.get_sim_measure_min_threshold()

    result_rows.append([evaluator.get_mean_precision(),
                        evaluator.get_mean_recall(),
                        evaluator.get_mean_fscore(),
                        evaluated_model.get_name(),
                        evaluated_model.get_top_value(),
                        evaluated_model.get_tokenizer_type(),
                        sim_measure_min_threshold[0],
                        sim_measure_min_threshold[1],
                        evaluated_model.get_model_dump_path(),
                        evaluator.get_evaluator_dump_path()])

results_df = pd.DataFrame(result_rows, columns=result_columns)
results_df = results_df.astype(dtype={'model_dump' : str, 'evaluator_dump' : str, 'top_value': int})

0it [00:00, ?it/s]

Performing model hyperparameters search...


3it [00:32, 10.59s/it]

### Report

In [None]:
# Pick the best model out of the search results with the project helper.
# NOTE(review): the selection criterion is implemented in
# aux_functions.report_best_model and is not visible here — confirm.
# This rebinds `best_model`, replacing the default-parameter model created earlier.
best_model = aux_functions.report_best_model(results_df)

### Save Similarity Matrix

In [None]:
# Save the similarity matrix held by `best_model`; the destination is decided
# by the model's save_sim_matrix implementation (not visible here).
best_model.save_sim_matrix()

#### Best Model for TOP 3 and 5 - Cosine

In [None]:
# Report on the search results for the cosine measure (project helper).
aux_functions.print_report_top_3_and_5_v3(results_df, metric=SimilarityMeasure.COSINE.value)

### Plot Highlights

In [None]:
# Show the trace links recovered by the best model.
# NOTE(review): highlight_df presumably returns a styled data frame — confirm in aux_functions.
aux_functions.highlight_df(best_model.get_trace_links_df())

In [None]:
# Show the oracle with the same helper, for visual comparison with the recovered links.
aux_functions.highlight_df(orc.oracle)