# Introduction

In this notebook we demonstrate the use of the **LDA (Latent Dirichlet Allocation)** generative statistical model as an Information Retrieval technique to recover trace links between Features and Bug Reports.

We model our study as follows:

* The title, summary and description of each bug report together compose a single query.
* Each feature description is treated as an entire document to be retrieved in response to a query.

# Import Libraries

In [1]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import numpy as np

from dit.divergences import jensen_shannon_divergence

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from modules.utils import plots
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok
from modules.utils import aux_functions
from modules.utils import model_evaluator as m_eval

from modules.models.lda import LDA
from modules.models.model_hyperps import LDA_Model_Hyperp

from IPython.display import display

import warnings; warnings.simplefilter('ignore')

# Load Dataset

In [2]:
# Read the dataset tables. The call order is kept because each reader is
# expected to print the shape of the table it loads.
features_df = fd.read_features_df()
bug_reports_df = fd.read_bugreports_df()
orc = fd.read_feat_br_trace_df()  # feature x bug-report trace table, used as the reference when evaluating

# Documents to be retrieved (feature descriptions) and their names.
corpus, features_names = features_df["feat_desc"], features_df["feat_name"]

# Queries (bug-report descriptions) and their names.
query, bug_reports_names = bug_reports_df["br_desc"], bug_reports_df["br_name"]

Features.shape: (21, 8)
BugReports.shape: (93, 18)
Feat_BR_Trace.shape: (21, 93)


# Running LDA Model

In order to evaluate the effectiveness of the algorithm tested (LDA), we use common metrics applied in the field of IR:

    * Precision
    * Recall
    * F1-score

In [3]:
%%time

lda_hyperp = {
    LDA_Model_Hyperp.TOP.value : 10,
    LDA_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value : ('cosine',.75),
    LDA_Model_Hyperp.LDA_MODEL_N_COMPONENTS.value: 50,
    LDA_Model_Hyperp.LDA_MODEL_RANDOM_STATE.value : 2,
    LDA_Model_Hyperp.VECTORIZER_NGRAM_RANGE.value: (1,1),
    LDA_Model_Hyperp.VECTORIZER.value : TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True),
    LDA_Model_Hyperp.VECTORIZER_TOKENIZER.value : tok.PorterStemmerBased_Tokenizer() 
}

lda_model = LDA(**lda_hyperp)
lda_model.set_name('LDA_Model_AllData')
lda_model.recover_links(corpus, query, features_names, bug_reports_names)

print("\nModel Evaluation -------------------------------------------")
evaluator = m_eval.ModelEvaluator(orc, lda_model)
evaluator.evaluate_model(verbose=True)


Model Evaluation -------------------------------------------
{'Measures': {'Mean FScore of LDA_Model_AllData': 0.05077658303464755,
              'Mean Precision of LDA_Model_AllData': 0.028993855606758826,
              'Mean Recall of LDA_Model_AllData': 0.20430107526881722},
 'Setup': [{'Name': 'LDA_Model_AllData'},
           {'Similarity Measure and Minimum Threshold': ('cosine', 0.75)},
           {'Top Value': 10},
           {'LDA Model': {'batch_size': 128,
                          'doc_topic_prior': None,
                          'evaluate_every': -1,
                          'learning_decay': 0.7,
                          'learning_method': 'batch',
                          'learning_offset': 10.0,
                          'max_doc_update_iter': 100,
                          'max_iter': 10,
                          'mean_change_tol': 0.001,
                          'n_components': 50,
                          'n_jobs': -1,
                          'n_topics': Non

In [4]:
# Preview of the reference trace table: at most the first 20 rows and first 7 columns.
aux_functions.highlight_df(orc.iloc[:20, :7])

Unnamed: 0_level_0,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
new_awesome_bar,0,0,0,1,0,0,0
windows_child_mode,0,0,0,0,0,0,0
apz_async_scrolling,0,0,0,0,0,0,0
browser_customization,0,0,0,0,0,0,0
pdf_viewer,0,0,0,0,0,0,0
context_menu,1,0,0,0,0,0,0
w10_comp,0,0,0,0,0,0,0
tts_in_desktop,0,0,0,0,0,0,0
tts_in_rm,0,0,0,0,0,0,0
webgl_comp,0,0,0,0,0,0,0


In [5]:
# Preview of the trace links recovered by the model, using the same 20 x 7 window as the reference preview.
aux_functions.highlight_df(lda_model.get_trace_links_df().iloc[:20, :7])

br_name,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
new_awesome_bar,0,1,1,1,1,1,1
windows_child_mode,1,0,0,0,0,0,0
apz_async_scrolling,0,0,0,0,0,0,0
browser_customization,0,1,1,1,1,1,1
pdf_viewer,0,0,0,0,0,0,0
context_menu,0,1,1,0,0,0,0
w10_comp,0,1,1,1,1,1,1
tts_in_desktop,0,0,0,0,0,0,0
tts_in_rm,0,0,0,0,0,0,0
webgl_comp,0,1,1,1,1,1,1


In [6]:
# Preview of the model's similarity matrix, using the same 20 x 7 window as the previews above.
aux_functions.highlight_df(lda_model.get_sim_matrix().iloc[:20, :7])

br_name,BR_1181835_SRC,BR_1248267_SRC,BR_1248268_SRC,BR_1257087_SRC,BR_1264988_SRC,BR_1267480_SRC,BR_1267501_SRC
feat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
new_awesome_bar,0.562958,0.822464,0.845121,0.93454,0.809617,0.984298,0.849874
windows_child_mode,0.844234,0.278577,0.449095,0.302385,0.630703,0.379786,0.360603
apz_async_scrolling,0.567435,0.0608301,0.155969,0.20732,0.155283,0.0867842,0.48282
browser_customization,0.522734,0.817867,0.83052,0.929313,0.784048,0.974596,0.84096
pdf_viewer,0.140446,0.281083,0.208148,0.231773,0.199583,0.246156,0.24736
context_menu,0.270284,0.84752,0.777522,0.667222,0.681913,0.422551,0.59521
w10_comp,0.519753,0.815923,0.828606,0.927764,0.781869,0.973367,0.838796
tts_in_desktop,0.154437,0.230407,0.23238,0.259295,0.222188,0.275264,0.236096
tts_in_rm,0.495699,0.221529,0.275184,0.392815,0.213714,0.26457,0.628166
webgl_comp,0.521424,0.817024,0.829691,0.928648,0.7831,0.974074,0.84002
