# Introduction

In this notebook we demonstrate the use of **LDA (Latent Dirichlet Allocation)** generative statistical model for Information Retrieval technique to make trace link recovery between Use Cases and Bug Reports.

We model our study as follows:

* Each bug report title, summary and description compose a single query.
* We use each use case content as an entire document that must be returned to the query made

## Import Libraries

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns

from dit.divergences import jensen_shannon_divergence

import matplotlib.pyplot as plt

from sklearn.metrics import precision_recall_fscore_support, pairwise_distances, pairwise
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from scipy.sparse import csr_matrix
from scipy.stats import entropy

import nltk
import datetime
import pprint
from enum import Enum
import pickle

import warnings; warnings.simplefilter('ignore')

### Oracle Loader

In [26]:
class OracleLoader:
    def __init__(self, rows_names, columns_names):
        self.oracle = None
        self._columns_names = columns_names
        self._rows_names = rows_names
    
    def load(self):
        self.oracle = pd.DataFrame(columns=list(self._columns_names), 
                                   data=np.zeros(shape=(len(self._rows_names), len(self._columns_names)), 
                                                 dtype='int64'))
        self.oracle.insert(0, 'artf_name', list(self._rows_names))
        
        for index, row in trace_df.iterrows():
            idx = self.oracle[self.oracle.artf_name == row['trg_artf']].index
            self.oracle.at[idx, row['src_artf']] = row['link']

        self.oracle.set_index('artf_name', inplace=True)

## Load Dataset and Preprocessing

In [27]:
trace_df = pd.read_csv('../../data/jEdit/jEditDataset/oracle/output/trace_matrix.csv')
artfs_desc_df = pd.read_csv('../../data/jEdit/jEditDataset/oracle/output/artifacts_descriptions.csv', sep="|")

use_cases_df = artfs_desc_df[artfs_desc_df.artf_description.str.contains('Use Case ID')]
bug_reports_df = artfs_desc_df[artfs_desc_df.artf_description.str.contains('Bug Number')]

corpus = use_cases_df.artf_description
query = bug_reports_df.artf_description

use_cases_names = use_cases_df.artf_name
bug_reports_names = bug_reports_df.artf_name

orc = OracleLoader(use_cases_names, bug_reports_names)
orc.load()

# LDA Model

#### Model Hyperparameters

In [28]:
class LDA_Model_Hyperp(Enum):
    NAME = 'lda__name'
    TOP = 'lda__top_value'
    SIM_MEASURE_MIN_THRESHOLD = 'lda__sim_measure_min_threshold'
    VECTORIZER = 'lda__vectorizer'
    VECTORIZER_STOP_WORDS = 'lda__vectorizer__stop_words'
    VECTORIZER_TOKENIZER = 'lda__vectorizer__tokenizer'
    VECTORIZER_USE_IDF = 'lda__vectorizer__use_idf'
    VECTORIZER_SMOOTH_IDF = 'lda__vectorizer__smooth_idf'
    VECTORIZER_NGRAM_RANGE = 'lda__vectorizer__ngram_range'
    LDA_MODEL = 'lda__lda_model'
    LDA_MODEL_N_COMPONENTS = 'lda__lda_model__n_components'
    LDA_MODEL_RANDOM_STATE = 'lda__lda_model__random_state'
    TOKENIZER = 'lda__tokenizer'

#### Tokenizers

In [29]:
"""
Others stemmers are not relevant for our analysis:
 . RSLP Stemmer: portuguese language
 . ISRIS Stemmer: returns Arabic root for the given token 
 . Regexp Stemmer: uses regulax expressions to identify morphological affixes
 
Relevant Stemmers/Lemmatizers are implemented below. 
"""

class GenericTokenizer(object):
    def __init__(self):
        self.stopwords = nltk.corpus.stopwords.words('english')
    def __call__(self, doc):
        tokens = [self.stemmer.stem(token) for token in nltk.word_tokenize(doc)]
        #return [token.lower() for token in tokens if token.isalpha() and token not in self.stopwords and len(token) > 1]
        #return [unicode(token.lower(), 'utf-8') for token in tokens if token.isalpha() and token not in self.stopwords]
        return [token.lower() for token in tokens if token not in self.stopwords]
        
class WordNetBased_LemmaTokenizer(GenericTokenizer):
    def __init__(self):
        super().__init__()
        self.wnl = nltk.stem.WordNetLemmatizer()
    def __call__(self, doc):
        tokens = [self.wnl.lemmatize(token) for token in nltk.word_tokenize(doc)]
        return [token.lower() for token in tokens if token.isalpha() and token not in self.stopwords]

class LancasterStemmerBased_Tokenizer(GenericTokenizer):
    def __init__(self):
        super().__init__()
        self.stemmer = nltk.stem.LancasterStemmer()
    def __call__(self, doc):
        return super().__call__(doc)

class PorterStemmerBased_Tokenizer(GenericTokenizer):
    def __init__(self):
        super().__init__()
        self.stemmer = nltk.stem.PorterStemmer()
    def __call__(self, doc):
        return super().__call__(doc)
    
class SnowballStemmerBased_Tokenizer(GenericTokenizer):    
    def __init__(self):
        super().__init__()
        self.stemmer = nltk.stem.SnowballStemmer('english')    
    def __call__(self, doc):
        return super().__call__(doc)
        
        

#### Similarity Measure

In [39]:
class SimilarityMeasureName(Enum):
    JSD = 'jsd'

class SimilarityMeasure:
    def __init__(self):
        self.name = SimilarityMeasureName.JSD
    
    # static method
    def jsd(p, q):
        p = np.asarray(p)
        q = np.asarray(q)
        # normalize
        #p /= p.sum()
        #q /= q.sum()
        m = (p + q) / 2
        return (entropy(p, m) + entropy(q, m)) / 2

### Scikit Learn Model Defintion

In [99]:
"""
params_dict = {
    'lda__name' : 'LDA',
    'lda__sim_measure_min_threshold' : ('cosine',.9),
    'lda__vectorizer' : TfidfVectorizer(),
    'lda__vectorizer__stop_words' : 'english',
    'lda__vectorizer__tokenizer' : Tokenizer(),
    'lda__vectorizer__use_idf' : True,          # optional if type(Vectorizer) == TfidfVectorizer
    'lda__vectorizer__smooth_idf' : True,       # optional if type(Vectorizer) == TfidfVectorizer
    'lda__vectorizer__ngram_range' : (1,2),
    'lda__lda_model' : TruncatedSVD(),
    'lda__lda_model__n_components' : 5
}
"""
class LDA:
    def __init__(self, **kwargs):
        self._corpus_matrix = None
        self._query_vector = None
        
        self.name = None
        self.top = None
        self.sim_measure_min_threshold = None
        self.trace_links_df = None
        self.vectorizer = None
        self.lda_model = LatentDirichletAllocation()
        
        self.set_basic_params(**kwargs)
        
        self.set_vectorizer(**kwargs)
        self.set_lda_model(**kwargs)
    
    def set_name(self, name):
        self.name = name
    
    def set_basic_params(self, **kwargs):
        self.name = 'LDA' if LDA_Model_Hyperp.NAME.value not in kwargs.keys() else kwargs[LDA_Model_Hyperp.NAME.value]
        self.sim_measure_min_threshold = (SimilarityMeasureName.JSD.value, .3) if LDA_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value not in kwargs.keys() else kwargs[LDA_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value]       
        self.top = 3 if LDA_Model_Hyperp.TOP.value not in kwargs.keys() else kwargs[LDA_Model_Hyperp.value]
    
    def set_vectorizer(self, **kwargs):
        self.vectorizer = TfidfVectorizer(stop_words='english',
                                             use_idf=True, 
                                             smooth_idf=True) if LDA_Model_Hyperp.VECTORIZER.value not in kwargs.keys() else kwargs[LDA_Model_Hyperp.VECTORIZER.value]
        
        vec_params = {key.split('__')[2]:kwargs[key] for key,val in kwargs.items() if '__vectorizer__' in key}
        self.vectorizer.set_params(**vec_params)
    
    def set_lda_model(self, **kwargs):      
        lda_model_params = {key.split('__')[2]:kwargs[key] for key,val in kwargs.items() if '__lda_model__' in key}
        self.lda_model.set_params(**lda_model_params)
    
    def recover_links(self, corpus, query, use_cases_names, bug_reports_names):
        self._corpus_matrix = self.vectorizer.fit_transform(corpus)
        self._query_vector = self.vectorizer.transform(query)
        
        out_1 = self.lda_model.fit_transform(self._corpus_matrix)
        out_2 = self.lda_model.transform(self._query_vector)
        
        # D 14 x 10
        self._sim_matrix = pairwise_distances(X=out_1, Y=out_2, metric=SimilarityMeasure.jsd)
            
        self.trace_links_df = pd.DataFrame(index = use_cases_names, 
                                           columns = bug_reports_names,
                                           data = self._sim_matrix)
        
        for col in self.trace_links_df.columns:
            self.trace_links_df[col] = [1 if x >= self.sim_measure_min_threshold[1] else 0 for x in self.trace_links_df[col]]


    def model_setup(self):
        return {"Setup" : 
                  [
                      {"Name" : self.name},
                      {"Similarity Measure and Minimum Threshold" : self.sim_measure_min_threshold},
                      {"Top Value" : self.top},
                      {"LDA Model" : self.lda_model.get_params()},
                      {"Vectorizer" : self.vectorizer.get_params()},
                      {"Vectorizer Type" : type(self.vectorizer)}
                  ]
               }
    
    def get_name(self):
        return self.name
    
    def get_query_vector(self):
        return self._query_vector
    
    def get_corpus_matrix(self):
        return self._corpus_matrix
    
    def get_sim_measure_min_threshold(self):
        return self.sim_measure_min_threshold
    
    def get_trace_links_df(self):
        return self.trace_links_df
    
    def get_sim_matrix(self):
        return self._sim_matrix

In [100]:
model = LDA()
model.recover_links(corpus, query, use_cases_names, bug_reports_names)
evaluator = ModelEvaluator(orc.oracle, model)
evaluator.evaluate_model(verbose=True)

{'Measures': {'Mean FScore of LDA': 0.11190476190476191,
              'Mean Precision of LDA': 0.07539682539682539,
              'Mean Recall of LDA': 0.2857142857142857},
 'Setup': [{'Name': 'LDA'},
           {'Similarity Measure and Minimum Threshold': ('jsd', 0.3)},
           {'Top Value': 3},
           {'LDA Model': {'batch_size': 128,
                          'doc_topic_prior': None,
                          'evaluate_every': -1,
                          'learning_decay': 0.7,
                          'learning_method': None,
                          'learning_offset': 10.0,
                          'max_doc_update_iter': 100,
                          'max_iter': 10,
                          'mean_change_tol': 0.001,
                          'n_components': 10,
                          'n_jobs': 1,
                          'n_topics': None,
                          'perp_tol': 0.1,
                          'random_state': None,
                          'topic_wo

In [101]:
model.get_sim_matrix()
#model.get_trace_links_df()

array([[0.00456899, 0.02245819, 0.02163138, 0.42322775, 0.45040189,
        0.06368359, 0.01963009, 0.45654209, 0.0175761 , 0.01754003,
        0.0350373 , 0.0167226 , 0.46034281, 0.33459058],
       [0.00361593, 0.02031754, 0.01952987, 0.41724524, 0.44489478,
        0.06016244, 0.0176269 , 0.4511469 , 0.01567975, 0.01564561,
        0.03238004, 0.01487264, 0.4550201 , 0.32760279],
       [0.00280842, 0.0183668 , 0.01761691, 0.41155137, 0.43965727,
        0.05687646, 0.01580879, 0.44601573, 0.01396465, 0.01393237,
        0.02993035, 0.01320227, 0.44995934, 0.32094313],
       [0.49428799, 0.45492169, 0.45619991, 0.41095309, 0.43913354,
        0.4109366 , 0.45940894, 0.0242606 , 0.46291535, 0.46297922,
        0.43821196, 0.46443094, 0.44951692, 0.32024172],
       [0.00186788, 0.01585041, 0.01515271, 0.40380167, 0.4325342 ,
        0.05250949, 0.01347564, 0.43903672, 0.01177378, 0.01174409,
        0.02672353, 0.01107316, 0.44307858, 0.31186671],
       [0.5007996 , 0.46253101, 0.4

### Gensim OLDA Model Definition

In [57]:
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
import dit

"""
params_dict = {
    'lda__name' : 'LDA',
    'lda__sim_measure_min_threshold' : ('cosine',.9),
    'lda__vectorizer' : TfidfVectorizer(),
    'lda__vectorizer__stop_words' : 'english',
    'lda__vectorizer__tokenizer' : Tokenizer(),
    'lda__vectorizer__use_idf' : True,          # optional if type(Vectorizer) == TfidfVectorizer
    'lda__vectorizer__smooth_idf' : True,       # optional if type(Vectorizer) == TfidfVectorizer
    'lda__vectorizer__ngram_range' : (1,2),
    'lda__lda_model' : TruncatedSVD(),
    'lda__lda_model__n_components' : 5,
    'lda__tokenizer' : WordNetBased_LemmaTokenizer()
}
"""
class OLDA:
    def __init__(self, **kwargs):
        self.name = None
        self.sim_measure_min_threshold = None
        self.top = None
        self.trace_links_df = None
        self.tokenizer = None
        
        self.set_basic_params(**kwargs)
    
    def set_basic_params(self, **kwargs):
        self.name = 'LDA' if LDA_Model_Hyperp.NAME.value not in kwargs.keys() else kwargs[LDA_Model_Hyperp.NAME.value]
        self.sim_measure_min_threshold = (SimilarityMeasureName.JSD.value, 0.8) if LDA_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value not in kwargs.keys() else kwargs[LDA_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value]
        self.top = 3 if LDA_Model_Hyperp.TOP.value not in kwargs.keys() else kwargs[LDA_Model_Hyperp.value]
        self.tokenizer = WordNetBased_LemmaTokenizer() if LDA_Model_Hyperp.TOKENIZER.value not in kwargs.keys() else kwargs[LDA_Model_Hyperp.TOKENIZER.value]
    
    def set_name(self, name):
        self.name = name

    def recover_links(self, corpus, query, use_cases_names, bug_reports_names):                
        corpus_tokens = [self.tokenizer.__call__(doc) for doc in corpus]
        dictionary = Dictionary(corpus_tokens)
        print('dictionary: {}'.format(dictionary))
        
        corpus_docs_lengths = [len(x) for x in corpus_tokens]
        print('corpus_docs_lengths: {}'.format(corpus_docs_lengths))
        
        MIN_AMOUNT_TOKENS_PER_DOC = 50
        corpus_tokens = list(filter(lambda x : len(x) >= MIN_AMOUNT_TOKENS_PER_DOC, corpus_tokens))
        
        print("corpus_tokens_length: {}".format(len(corpus_tokens)))
        
        corpus_bow = [dictionary.doc2bow(doc) for doc in corpus_tokens]
        
        ldamodel = LdaModel(corpus_bow, num_topics=4, id2word=dictionary, passes=2)
        
        topics = ldamodel.print_topics(num_words=10)
        for topic in topics:
            print('topic: {}'.format(topic))
        
        queries_tokens = [self.tokenizer.__call__(doc) for doc in query]
        
        queries_docs_lengths = [len(x) for x in queries_tokens]
        print('queries_docs_lengths: {}'.format(queries_docs_lengths))
        
        queries_bow = [dictionary.doc2bow(doc) for doc in queries_tokens]
        print('queries bows: {}'.format([ldamodel.get_document_topics(doc_bow) for doc_bow in queries_bow]))
        
        self.trace_links_df = pd.DataFrame(index = use_cases_names, 
                                           columns = bug_reports_names,
                                           data=np.zeros(shape=(len(use_cases_names), len(bug_reports_names)), dtype='float64'))
        
        doc_topic_dist = np.array([[tup[1] for tup in lst] for lst in ldamodel[corpus_bow]])
        print("doc_topic_dist.shape: {}".format(doc_topic_dist.shape))
        print("doc_topic_dist: {}".format(doc_topic_dist))
        
        #queries_topic_dist = np.array([[tup[1] for tup in lst] for lst in ldamodel.get_document_topics(bow=query) for query in queries_bow])
        #print(queries_topic_dist)
        
        for lst in ldamodel[corpus_bow]:
            print("lst: {}".format(lst))
        
        for bug_id, bug_tokens in zip(bug_reports_names, queries_bow):
            #print('bug_tokens: {}'.format(bug_tokens))
            print("ldamodel[bug_tokens]: {}".format(ldamodel[bug_tokens]))
            print("len(ldamodel[bug_tokens]): {}".format(len(ldamodel.get_document_topics(bug_tokens))))
            topics_distrib_bug = [x[1] for x in ldamodel.get_document_topics(bug_tokens)]
            #print('topics_distrib_bug: {}'.format(topics_distrib_bug))
            for uc_id, uc_bow in zip(use_cases_names, corpus_bow):
                print('ldamodel.get_document_topics(uc_bow): {}'.format(ldamodel[uc_bow]))
                topics_distrib_uc = [x[1] for x in ldamodel[uc_bow]]
                #print('topics_distrib_uc: {}'.format(topics_distrib_uc))
                self.trace_links_df.at[uc_id, bug_id] = SimilarityMeasure.jsd(topics_distrib_bug, topics_distrib_uc)
        
        for col in self.trace_links_df.columns:
            nlargest_df = self.trace_links_df.nlargest(n = self.sim_measure_min_threshold[1], columns=col, keep='first')    
            self.trace_links_df[col] = [1 if x in nlargest_df[col].tolist() else 0 for x in self.trace_links_df[col]]
        

    def model_setup(self):
        return {"Setup" : 
                  [
                      {"Name" : self.name},
                      {"Similarity Measure and Minimum Threshold" : self.sim_measure_min_threshold},
                      {"Top Value" : self.top},
                      {"LDA Model" : self.lda_model.get_params()},
                      {"Vectorizer" : self.vectorizer.get_params()},
                      {"Vectorizer Type" : type(self.vectorizer)}
                  ]
               }
    
    def get_name(self):
        return self.name
    
    def get_top_value(self):
        return self.top
    
    def get_sim_measure_min_threshold(self):
        return self.sim_measure_min_threshold
    
    def get_trace_links_df(self):
        return self.trace_links_df

### Test with Default Parameters

In [58]:
model = OLDA()
model.recover_links(corpus, query, use_cases_names, bug_reports_names)

dictionary: Dictionary(135 unique tokens: ['area', 'asks', 'basic', 'beginning', 'box']...)
corpus_docs_lengths: [138, 94, 74, 65, 90, 118, 71, 53, 104, 80]
corpus_tokens_length: 10
topic: (0, '0.087*"user" + 0.066*"system" + 0.045*"display" + 0.033*"button" + 0.032*"click" + 0.031*"new" + 0.025*"text" + 0.024*"window" + 0.022*"use" + 0.022*"shortcut"')
topic: (1, '0.068*"user" + 0.059*"system" + 0.044*"text" + 0.043*"click" + 0.038*"button" + 0.025*"display" + 0.024*"found" + 0.023*"window" + 0.022*"file" + 0.022*"replace"')
topic: (2, '0.049*"user" + 0.037*"system" + 0.026*"option" + 0.022*"text" + 0.020*"button" + 0.019*"display" + 0.019*"indent" + 0.019*"file" + 0.016*"indented" + 0.016*"caret"')
topic: (3, '0.080*"system" + 0.071*"user" + 0.062*"file" + 0.039*"display" + 0.035*"click" + 0.033*"button" + 0.031*"text" + 0.028*"view" + 0.027*"name" + 0.025*"main"')
queries_docs_lengths: [140, 17, 24, 27, 33, 21, 37, 15, 83, 105, 34, 71, 66, 37]
queries bows: [[(0, 0.19594266), (2, 0.

ValueError: qk and pk must have same length.

### Model Evaluator

In [66]:
class ModelEvaluator:
    def __init__(self, oracle, model):
        self.model = model
        self.oracle = oracle
        self.recovered_links = model.trace_links_df
        
        self.eval_df = pd.DataFrame(columns=['precision','recall','fscore','support'])
        self.mean_precision = -1
        self.mean_recall = -1
        self.mean_fscore = -1
    
    def evaluate_model(self, verbose=False, file=None):
        y_true = csr_matrix(self.oracle.values, dtype=int)
        y_pred = csr_matrix(self.recovered_links.values, dtype=int)
        
        p, r, f, sp = precision_recall_fscore_support(y_true, y_pred)

        i = 0
        for idx, row in self.oracle.iteritems():
            self.eval_df.at[idx, 'precision'] = p[i]
            self.eval_df.at[idx, 'recall'] = r[i]
            self.eval_df.at[idx, 'fscore'] = f[i]
            self.eval_df.at[idx, 'support'] = sp[i]
            i += 1
        
        self.mean_precision = self.eval_df.precision.mean()
        self.mean_recall = self.eval_df.recall.mean()
        self.mean_fscore = self.eval_df.fscore.mean()
        
        if verbose:
            self.print_report(file)
    
    #def check_best_model(self, best_pre, best_rec, best_fs, best_md):
    #    if best_rec <= self.get_mean_recall():
    #        if best_pre <= self.get_mean_precision():
    #            return (self.get_mean_precision(), self.get_mean_recall(), self.get_mean_fscore(), self.get_model())
    #    return (best_pre, best_rec, best_fs, best_md)
    
    def print_report(self, file=None):
        dic = self.model.model_setup()
        dic['Measures'] = {}
        dic['Measures']['Mean Precision of {}'.format(self.model.get_name())] = self.get_mean_precision()
        dic['Measures']['Mean Recall of {}'.format(self.model.get_name())] = self.get_mean_recall()
        dic['Measures']['Mean FScore of {}'.format(self.model.get_name())] = self.get_mean_fscore()
        
        if file is None:    
            pprint.pprint(dic)
        else:
            file.write(pprint.pformat(dic))
        
    def plot_precision_vs_recall(self):
        plt.figure(figsize=(6,6))
        plt.plot(self.eval_df.recall, self.eval_df.precision, 'ro', label='Precision vs Recall')

        plt.ylabel('Precision')
        plt.xlabel('Recall')

        plt.axis([0, 1.1, 0, 1.1])
        plt.title("Precision vs Recall Plot - " + self.model.get_name())
        plt.show()
    
    def save_log(self):
        print("\nSaving model log...")
        with open('../logs/' + str(datetime.datetime.now()) + '.txt', 'a') as f:
            evaluator.evaluate_model(verbose=True, file=f)
        print("Model log saved with success!")
            
    def get_mean_precision(self):
        return self.mean_precision
    
    def get_mean_recall(self):
        return self.mean_recall
    
    def get_mean_fscore(self):
        return self.mean_fscore

    def get_model(self):
        return self.model

## Evaluate Recovering Efficiency

In order to evaluate the efficiency of the algorithm tested (LSI), we use common metrics applied in the field of IR:

    * Precision
    * Recall
    * F1-score

### Auxiliary Functions

In [63]:
from itertools import product

def generate_params_comb_list(**kwargs):
    list_params = []
    for key, values in kwargs.items():
        aux_list = []
        for v in values:
            aux_list.append((key, v))
        list_params.append(aux_list)
    
    list_tuples = list(product(*list_params))
    
    list_dicts = []
    for ex_tup in list_tuples:
        dic = {}
        for in_tup in ex_tup:
            dic[in_tup[0]] = in_tup[1]
        list_dicts.append(dic)
        
    return list_dicts


def plot_heatmap(results_df):
    tmp_df = pd.DataFrame({'precision': results_df['precision'], 
                           'recall' : results_df['recall'], 
                           'fscore': results_df['fscore'], 
                           'model': results_df['model_name']})
    tmp_df.set_index('model', inplace=True)
    fig, ax = plt.subplots(figsize=(10, 4 * 100)) 
    ax = sns.heatmap(tmp_df, vmin=0, vmax=1, linewidths=.5, cmap="Greens", annot=True, cbar=False, ax=ax)


def highlight_df(df):
    cm = sns.light_palette("green", as_cmap=True)
    return df.style.background_gradient(cmap=cm)    

### Test with One Combination of Hyperparameters

In [75]:
all_hyperparams = {
    LDA_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value : [('jsd', .3)],
    #LDA_Model_Hyperp.LDA_MODEL_N_COMPONENTS.value: [100],
    #LDA_Model_Hyperp.LDA_MODEL_RANDOM_STATE.value : [2],
    #LDA_Model_Hyperp.VECTORIZER_NGRAM_RANGE.value: [(1,1)],
    #LDA_Model_Hyperp.VECTORIZER.value : [CountVectorizer(stop_words='english')],
    #LDA_Model_Hyperp.VECTORIZER_TOKENIZER.value : [WordNetBased_LemmaTokenizer()]    
    LDA_Model_Hyperp.TOKENIZER.value : [WordNetBased_LemmaTokenizer()]
}

hyperparams = generate_params_comb_list(**all_hyperparams)

print('Performing model optimizations...')
best_precision = 0.0
best_recall = 0.0
best_fscore = 0.0
best_model = None

results = {'precision': [], 'recall': [], 'fscore': [], 'model': []}

i = 0
for hyperp in hyperparams:
    current_model = LDA(**hyperp)
    current_model.set_name('LDA_Model_{}'.format(i))
    current_model.recover_links(corpus, query, use_cases_names, bug_reports_names)
    
    evaluator = ModelEvaluator(orc.oracle, current_model)
    evaluator.evaluate_model()
    
    if best_recall <= evaluator.get_mean_recall():
        best_recall = evaluator.get_mean_recall()
        best_precision = evaluator.get_mean_precision()
        best_fscore = evaluator.get_mean_fscore()
        best_model = current_model
    
    results['precision'].append(evaluator.get_mean_precision())
    results['recall'].append(evaluator.get_mean_recall())
    results['fscore'].append(evaluator.get_mean_fscore())
    results['model'].append(current_model.get_name())
    
    i += 1

print("------------ Report -------------------\n")
print("Total of Analyzed Hyperparameters Combinations: {}".format(len(hyperparams)))

print("\nBest Model and Hyperparameters Found: {}\n".format(best_model.get_name()))            
evaluator = ModelEvaluator(orc.oracle, best_model)
evaluator.evaluate_model(verbose=True)

#print("\nPlot Precision vs Recall - Best Model")
#evaluator.plot_precision_vs_recall()

#print("\nHeatmap of All Models")
#plot_heatmap(results)

Performing model optimizations...
------------ Report -------------------

Total of Analyzed Hyperparameters Combinations: 1

Best Model and Hyperparameters Found: LDA_Model_0

{'Measures': {'Mean FScore of LDA_Model_0': 0.06587301587301587,
              'Mean Precision of LDA_Model_0': 0.040674603174603176,
              'Mean Recall of LDA_Model_0': 0.21428571428571427},
 'Setup': [{'Name': 'LDA_Model_0'},
           {'Similarity Measure and Minimum Threshold': ('jsd', 0.3)},
           {'Top Value': 3},
           {'LDA Model': {'batch_size': 128,
                          'doc_topic_prior': None,
                          'evaluate_every': -1,
                          'learning_decay': 0.7,
                          'learning_method': None,
                          'learning_offset': 10.0,
                          'max_doc_update_iter': 100,
                          'max_iter': 10,
                          'mean_change_tol': 0.001,
                          'n_components': 10

### Find The Best Model

In [76]:
all_hyperparams = {
    LDA_Model_Hyperp.SIM_MEASURE_MIN_THRESHOLD.value : [('jsd', .80), ('jsd', .85), ('jsd', .90), ('jsd', .95)],
    LDA_Model_Hyperp.LDA_MODEL_N_COMPONENTS.value: [5,10,20,50,100],
    LDA_Model_Hyperp.LDA_MODEL_RANDOM_STATE.value : [2],
    LDA_Model_Hyperp.VECTORIZER_NGRAM_RANGE.value: [(1,1), (1,2)],
    LDA_Model_Hyperp.VECTORIZER.value : [TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True), 
                         CountVectorizer(stop_words='english')],
    LDA_Model_Hyperp.VECTORIZER_TOKENIZER.value : [PorterStemmerBased_Tokenizer(), LancasterStemmerBased_Tokenizer(), 
                                                   WordNetBased_LemmaTokenizer(), SnowballStemmerBased_Tokenizer()]
}

hyperparams = generate_params_comb_list(**all_hyperparams)

print('Performing model optimizations...')
best_precision = -1
best_recall = -1
best_fscore = -1
best_model = None

results = {'precision': [], 'recall': [], 'fscore': [], 'model': []}

i = 0
for hyperp in hyperparams:
    hyperp[LDA_Model_Hyperp.NAME.value] = 'LDA_Model_{}'.format(i)
    current_model = LDA(**hyperp)
    current_model.recover_links(corpus, query, use_cases_names, bug_reports_names)
    
    evaluator = ModelEvaluator(orc.oracle, current_model)
    evaluator.evaluate_model()

    if best_recall <= evaluator.get_mean_recall():
        if best_precision <= evaluator.get_mean_precision():
            best_recall = evaluator.get_mean_recall()
            best_precision = evaluator.get_mean_precision()
            best_fscore = evaluator.get_mean_fscore()
            best_model = current_model
    
    results['precision'].append(evaluator.get_mean_precision())
    results['recall'].append(evaluator.get_mean_recall())
    results['fscore'].append(evaluator.get_mean_fscore())
    results['model'].append(current_model.get_name())
    
    i += 1

print("------------ Report -------------------\n")
print("Total of Analyzed Hyperparameters Combinations: {}".format(len(hyperparams)))

print("\nBest Model and Hyperparameters Found: {}\n".format(best_model.get_name()))            
evaluator = ModelEvaluator(orc.oracle, best_model)
evaluator.evaluate_model(verbose=True)

#print("\nPlot Precision vs Recall - Best Model")
#evaluator.plot_precision_vs_recall()

#print("\nHeatmap of All Models")
#plot_heatmap(results)

#evaluator.save_log()

Performing model optimizations...
------------ Report -------------------

Total of Analyzed Hyperparameters Combinations: 320

Best Model and Hyperparameters Found: LDA_Model_319

{'Measures': {'Mean FScore of LDA_Model_319': 0.0,
              'Mean Precision of LDA_Model_319': 0.0,
              'Mean Recall of LDA_Model_319': 0.0},
 'Setup': [{'Name': 'LDA_Model_319'},
           {'Similarity Measure and Minimum Threshold': ('jsd', 0.95)},
           {'Top Value': 3},
           {'LDA Model': {'batch_size': 128,
                          'doc_topic_prior': None,
                          'evaluate_every': -1,
                          'learning_decay': 0.7,
                          'learning_method': None,
                          'learning_offset': 10.0,
                          'max_doc_update_iter': 100,
                          'max_iter': 10,
                          'mean_change_tol': 0.001,
                          'n_components': 100,
                          'n_jobs

### Plot Highlights

In [None]:
highlight_df(best_model.get_trace_links_df())

In [None]:
highlight_df(orc.oracle)