# Introduction

In this notebook we demonstrate the use of the **LDA (Latent Dirichlet Allocation)** generative statistical model as an Information Retrieval technique for trace link recovery between Test Cases and Bug Reports.

We model our study as follows:

* Each bug report title, summary and description compose a single query.
* The full content of each test case is treated as a single document to be retrieved in response to a query.

## Import Libraries

In [2]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import numpy as np

from dit.divergences import jensen_shannon_divergence

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from modules.utils import plots
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok
from modules.utils import aux_functions
from modules.utils import model_evaluator as m_eval

from modules.models.lda import LDA
from modules.models.model_hyperps import LDA_Model_Hyperp

from IPython.display import display

import warnings; warnings.simplefilter('ignore')

## Load Dataset

In [3]:
# Load the test-case and bug-report tables through the project's dataset helper.
# The cell output reports their shapes as (207, 12) and (93, 19).
test_cases_df = fd.read_testcases_df()
bug_reports_df = fd.read_bugreports_df()

# Test-case descriptions are the documents; bug-report descriptions are the queries.
# NOTE(review): the introduction says a query is title + summary + description,
# but only `br_desc` is used here — confirm that `br_desc` already combines those fields.
corpus = test_cases_df.tc_desc
query = bug_reports_df.br_desc

# Identifiers handed to `recover_links` together with the texts; presumably they
# become the row/column labels of the matrices displayed later — verify in the LDA class.
test_cases_names = test_cases_df.tc_name
bug_reports_names = bug_reports_df.br_name

# Ground-truth oracle used by ModelEvaluator below; the cell output reports shape (207, 93),
# i.e. one row per test case and one column per bug report.
orc = fd.read_oracle_expert_volunteers_df()

TestCases.shape: (207, 12)
BugReports.shape: (93, 19)
Oracle.shape: (207, 93)


## Evaluate Recovering Efficiency

In order to evaluate the efficiency of the algorithm tested (LDA), we use common metrics applied in the field of IR:

* Precision
* Recall
* F1-score

### Analysis with Default Values of LDA Model

In [17]:
# Baseline run: LDA is constructed with no arguments, so the project's default
# hyperparameters apply (the setup shown in the cell output lists 'jsd' with
# threshold 0.3, top value 3 and n_components 10).
# NOTE(review): that same setup shows 'random_state': None, so these scores are
# presumably not reproducible from run to run — consider whether a seed is wanted.
model = LDA()
model.recover_links(corpus, query, test_cases_names, bug_reports_names)
# Compare the recovered links with the oracle; the measures/setup dict appears in the cell output.
evaluator = m_eval.ModelEvaluator(orc, model)
evaluator.evaluate_model(verbose=True)

{'Measures': {'Mean FScore of LDA': 0.011351909184726523,
              'Mean Precision of LDA': 0.050179211469534045,
              'Mean Recall of LDA': 0.006481963232743671},
 'Setup': [{'Name': 'LDA'},
           {'Similarity Measure and Minimum Threshold': ('jsd', 0.3)},
           {'Top Value': 3},
           {'LDA Model': {'batch_size': 128,
                          'doc_topic_prior': None,
                          'evaluate_every': -1,
                          'learning_decay': 0.7,
                          'learning_method': 'batch',
                          'learning_offset': 10.0,
                          'max_doc_update_iter': 100,
                          'max_iter': 10,
                          'mean_change_tol': 0.001,
                          'n_components': 10,
                          'n_jobs': -1,
                          'n_topics': None,
                          'perp_tol': 0.1,
                          'random_state': None,
                          '

## Running LDA Model

In [18]:
%%time

# Hand-picked hyperparameters for the tuned run, keyed by the LDA_Model_Hyperp enum values.
# The setup echoed in the cell output confirms top value 100, ('cosine', 0.75) and
# n_components 50; random_state is pinned to 2 for this run.
# NOTE(review): LDA is normally fitted on raw term counts (scikit-learn's examples
# use CountVectorizer); a TfidfVectorizer is supplied here — confirm this is intentional.
# NOTE(review): the tokenizer and n-gram range are passed separately from the vectorizer
# instance; presumably the LDA wrapper applies them to it — verify in modules.models.lda.
lda_hyperp = {
    LDA_Model_Hyperp.TOP.value : 100,
    LDA_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value : ('cosine',.75),
    LDA_Model_Hyperp.LDA_MODEL_N_COMPONENTS.value: 50,
    LDA_Model_Hyperp.LDA_MODEL_RANDOM_STATE.value : 2,
    LDA_Model_Hyperp.VECTORIZER_NGRAM_RANGE.value: (1,1),
    LDA_Model_Hyperp.VECTORIZER.value : TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True),
    LDA_Model_Hyperp.VECTORIZER_TOKENIZER.value : tok.PorterStemmerBased_Tokenizer() 
}

# Build the model from the dict, give it the name that appears in the evaluation
# output, and recover links over the same corpus/queries as the baseline run.
lda_model = LDA(**lda_hyperp)
lda_model.set_name('LDA_Model_AllData')
lda_model.recover_links(corpus, query, test_cases_names, bug_reports_names)

# Evaluate against the oracle; `evaluator` is rebound here, replacing the baseline evaluator.
print("\nModel Evaluation -------------------------------------------")
evaluator = m_eval.ModelEvaluator(orc, lda_model)
evaluator.evaluate_model(verbose=True)


Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of LDA_Model_AllData': 0.08542313017991841,
              'Mean Precision of LDA_Model_AllData': 0.05509820627037862,
              'Mean Recall of LDA_Model_AllData': 0.2786315363332886},
 'Setup': [{'Name': 'LDA_Model_AllData'},
           {'Similarity Measure and Minimum Threshold': ('cosine', 0.75)},
           {'Top Value': 100},
           {'LDA Model': {'batch_size': 128,
                          'doc_topic_prior': None,
                          'evaluate_every': -1,
                          'learning_decay': 0.7,
                          'learning_method': 'batch',
                          'learning_offset': 10.0,
                          'max_doc_update_iter': 100,
                          'max_iter': 10,
                          'mean_change_tol': 0.001,
                          'n_components': 50,
                          'n_jobs': -1,
                          'n_topics': None

In [19]:
# Preview the top-left corner of the oracle matrix: at most the first 20 rows and 7 columns.
aux_functions.highlight_df(orc.iloc[0:20, 0:7])

Unnamed: 0_level_0,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TC_1_TRG,0,0,0,0,0,0,0
TC_2_TRG,0,0,0,0,0,0,0
TC_3_TRG,0,0,0,0,0,0,0
TC_4_TRG,0,0,0,0,0,0,0
TC_5_TRG,0,0,0,0,0,0,0
TC_6_TRG,0,0,0,0,0,0,0
TC_7_TRG,0,0,0,0,0,0,0
TC_8_TRG,0,0,0,0,0,0,0
TC_9_TRG,0,0,0,0,0,0,0
TC_10_TRG,0,0,0,0,0,0,0


In [20]:
# Same slice (first 20 rows, 7 columns) of the trace links recovered by the tuned model,
# for side-by-side comparison with the oracle slice shown in the previous cell.
aux_functions.highlight_df(lda_model.get_trace_links_df().iloc[0:20, 0:7])

br_name,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TC_1_TRG,0,1,1,0,1,0,0
TC_2_TRG,0,0,0,0,0,0,0
TC_3_TRG,0,0,1,0,0,0,0
TC_4_TRG,0,0,0,0,0,0,0
TC_5_TRG,0,0,1,0,0,0,0
TC_6_TRG,0,0,0,0,0,0,0
TC_7_TRG,0,0,0,0,0,0,0
TC_8_TRG,0,0,1,0,0,0,0
TC_9_TRG,0,0,0,0,0,0,0
TC_10_TRG,0,0,1,0,0,0,0


In [21]:
# Same slice of the model's similarity matrix. In the displayed rows, the entries at or
# above the 0.75 threshold coincide with the 1s in the trace-link slice of the previous cell.
aux_functions.highlight_df(lda_model.get_sim_matrix().iloc[0:20, 0:7])

br_name,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TC_1_TRG,0.714393,0.792231,0.816649,0.59479,0.789727,0.74114,0.635427
TC_2_TRG,0.422642,0.517238,0.73819,0.214688,0.534402,0.564233,0.191987
TC_3_TRG,0.490006,0.571917,0.779283,0.309195,0.598051,0.628718,0.294042
TC_4_TRG,0.39816,0.492381,0.708192,0.190494,0.506775,0.535785,0.165935
TC_5_TRG,0.517259,0.741753,0.802477,0.33323,0.701863,0.641749,0.329617
TC_6_TRG,0.393564,0.491039,0.7128,0.178981,0.504644,0.533704,0.153478
TC_7_TRG,0.454145,0.497928,0.699728,0.248024,0.57332,0.619893,0.230353
TC_8_TRG,0.620335,0.606368,0.830769,0.428904,0.642551,0.670614,0.432022
TC_9_TRG,0.598285,0.541765,0.732133,0.428449,0.572976,0.592374,0.436556
TC_10_TRG,0.632569,0.507051,0.764654,0.247384,0.526577,0.554912,0.228457
