# Introduction

In this notebook we demonstrate the use of a **Stacking Model**, based on a **Logistic Regression** model, in an Information Retrieval context to recover trace links between Use Cases and Bug Reports.

We model our study as follows:

* The title, summary and description of each bug report together form a single query.
* The content of each use case is treated as a single document that must be retrieved in response to a query.

### Import Libraries

In [13]:
from enum import Enum

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

#from utils import plots
from utils import oracle_loader as ol
from utils import jedit_dataset as jd
from utils import model_evaluator as m_eval
from utils import generic_model as g_model

### Load Dataset and Preprocessing

In [2]:
# Read the trace data and the artifact descriptions through the dataset helper.
trace_df = jd.read_trace_df()
artfs_desc_df = jd.read_artfs_desc_df()

# Artifacts are told apart by a marker string inside their description text.
is_use_case = artfs_desc_df.artf_description.str.contains('Use Case ID')
is_bug_report = artfs_desc_df.artf_description.str.contains('Bug Number')
use_cases_df = artfs_desc_df[is_use_case]
bug_reports_df = artfs_desc_df[is_bug_report]

# Use case descriptions form the corpus; bug report descriptions are the queries.
corpus, query = use_cases_df.artf_description, bug_reports_df.artf_description
use_cases_names, bug_reports_names = use_cases_df.artf_name, bug_reports_df.artf_name

# NOTE(review): presumably load() fills the `orc.oracle` matrix read further
# down in the notebook -- confirm against utils.oracle_loader.
orc = ol.OracleLoader(use_cases_names, bug_reports_names)
orc.load(trace_df)

### Load Models Results

In [3]:
# One CSV per base model; the 'artf_name' column becomes the row index of each frame.
lsi_results_df = pd.read_csv('best_models_sim_matrix/lsi.csv').set_index('artf_name')
lda_results_df = pd.read_csv('best_models_sim_matrix/lda.csv').set_index('artf_name')
bm25_results_df = pd.read_csv('best_models_sim_matrix/bm25.csv').set_index('artf_name')
wv_results_df = pd.read_csv('best_models_sim_matrix/wordvector.csv').set_index('artf_name')

### Transform Results Matrices to Vectors

In [4]:
def transform_sim_matrix_to_sim_vec(sim_matrix_df, model_name):
    """Flatten a matrix-shaped DataFrame into a single-column DataFrame.

    Every cell of ``sim_matrix_df`` becomes one row labelled
    ``'<row label>_<column label>'``. Rows are emitted column by column
    (all rows of the first column, then all rows of the second, and so on).

    The result is built in one step, so the values keep a numeric dtype
    instead of being written one by one into an initially empty column.

    Parameters
    ----------
    sim_matrix_df : pandas.DataFrame
        Matrix to flatten.
    model_name : str
        Name of the single column of the returned frame.

    Returns
    -------
    pandas.DataFrame
        One column named ``model_name``, indexed by the combined labels.
        If two cells produce the same label, the first occurrence fixes the
        row position and the last occurrence supplies the value.
    """
    labels = [
        '{}_{}'.format(idx, col)
        for col in sim_matrix_df.columns
        for idx in sim_matrix_df.index
    ]
    # Fortran order walks the matrix column by column, matching `labels`.
    flat_values = sim_matrix_df.values.ravel(order='F')

    # A dict keeps the first position and the last value of a repeated label.
    sim_by_label = dict(zip(labels, flat_values))

    return pd.DataFrame(
        {model_name: list(sim_by_label.values())},
        index=list(sim_by_label.keys()),
    )

# Flatten each model's result matrix into a one-column frame named after the model.
sim_vec_lsi, sim_vec_lda, sim_vec_bm25, sim_vec_wv = [
    transform_sim_matrix_to_sim_vec(results_df, model_name)
    for results_df, model_name in (
        (lsi_results_df, 'lsi'),
        (lda_results_df, 'lda'),
        (bm25_results_df, 'bm25'),
        (wv_results_df, 'wv'),
    )
]

### Transform Vectors to DataFrame

In [5]:
# Start from an empty 'ens_pred' column indexed like the LSI vector.
ensemble_input_df = pd.DataFrame(columns=['ens_pred'], index=sim_vec_lsi.index)

# Join the four model vectors on the index, one after another.
out_df = ensemble_input_df
for sim_vec in (sim_vec_lsi, sim_vec_lda, sim_vec_bm25, sim_vec_wv):
    out_df = pd.merge(out_df, sim_vec, left_index=True, right_index=True)

# 'ens_pred' sits at position 0 after the merges; move it to the end.
new_order = [1,2,3,4,0]
out_df = out_df.iloc[:, new_order]

ensemble_input_df = out_df.copy()
ensemble_input_df.head()

Unnamed: 0,lsi,lda,bm25,wv,ens_pred
UC_003_TRG_BR_4020_SRC,0.361541,0.988073,7.08953,0.922483,
UC_007_TRG_BR_4020_SRC,0.46841,0.9881,9.83479,0.878566,
UC_010_TRG_BR_4020_SRC,0.690679,0.154894,16.8103,0.906589,
UC_002_TRG_BR_4020_SRC,0.897997,0.988134,5.33088,0.923327,
UC_006_TRG_BR_4020_SRC,0.911746,0.988105,5.08051,0.924785,


### Insert Oracle Data

In [6]:
# Flatten the oracle matrix with the same labelling scheme as the model vectors.
orc_vec_df = transform_sim_matrix_to_sim_vec(orc.oracle, 'oracle')

# Drop any 'oracle' column left by a previous run of this cell before merging:
# otherwise a re-run yields 'oracle_x'/'oracle_y' and the positional reorder
# below silently selects the wrong columns.
features_df = ensemble_input_df.drop(columns=['oracle'], errors='ignore')
ensemble_input_df = pd.merge(features_df, orc_vec_df, left_index=True, right_index=True)

# After the merge the columns are lsi, lda, bm25, wv, ens_pred, oracle;
# swap the last two so 'ens_pred' stays the final column.
new_order = [0,1,2,3,5,4]
ensemble_input_df = ensemble_input_df[ensemble_input_df.columns[new_order]]

ensemble_input_df.head()

Unnamed: 0,lsi,lda,bm25,wv,oracle,ens_pred
UC_003_TRG_BR_4020_SRC,0.361541,0.988073,7.08953,0.922483,0.0,
UC_007_TRG_BR_4020_SRC,0.46841,0.9881,9.83479,0.878566,0.0,
UC_010_TRG_BR_4020_SRC,0.690679,0.154894,16.8103,0.906589,0.0,
UC_002_TRG_BR_4020_SRC,0.897997,0.988134,5.33088,0.923327,0.0,
UC_006_TRG_BR_4020_SRC,0.911746,0.988105,5.08051,0.924785,1.0,


### Define Stacked Model

In [7]:
class STACK_MODEL_Hyperp(Enum):
    """Keyword-argument names read by ``StackedModel.set_basic_params``."""
    # Key whose value is passed to set_name
    NAME = 'stacked_model__name'
    # Key whose value is passed to set_top
    TOP = 'stacked_model__top'
    # Key whose value is passed to set_threshold
    THRESHOLD = 'stacked_model__threshold'

class StackedModel(g_model.GenericModel):
    """Stacking model built on top of ``g_model.GenericModel``.

    Holds an estimator (``self.model``) and a decision threshold
    (``self.threshold``). The name, generic name and top value are delegated
    to the base class.
    """

    def __init__(self):
        # NOTE(review): the base-class initializer is not invoked here (same as
        # before) -- confirm GenericModel does not require it.
        self.model = None
        self.threshold = None

    def set_basic_params(self, **kwargs):
        """Configure the model from keyword arguments.

        Recognised keys are the values of ``STACK_MODEL_Hyperp``. Missing keys
        fall back to name ``'Stacked Model'``, top ``3`` and threshold ``0.75``.
        The generic name is always ``'stacked_model'`` and the estimator is
        always a fresh ``LogisticRegression``.
        """
        self.set_name(kwargs.get(STACK_MODEL_Hyperp.NAME.value, 'Stacked Model'))
        self.set_top(kwargs.get(STACK_MODEL_Hyperp.TOP.value, 3))
        self.set_threshold(kwargs.get(STACK_MODEL_Hyperp.THRESHOLD.value, 0.75))
        self.set_model_gen_name('stacked_model')
        self.set_model(LogisticRegression())

    def set_name(self, name):
        """Forward ``name`` to the base class."""
        super().set_name(name)

    def set_model_gen_name(self, gen_name):
        """Forward ``gen_name`` to the base class."""
        super().set_model_gen_name(gen_name)

    def set_top(self, top):
        """Forward ``top`` to the base class."""
        super().set_top(top)

    def set_threshold(self, threshold):
        """Store the decision threshold on this instance."""
        self.threshold = threshold

    def set_model(self, model):
        """Store the underlying estimator on this instance."""
        self.model = model

    def recover_links(self, X, y):
        """Placeholder: link recovery is not implemented for this model yet.

        Does nothing and returns ``None``.
        """
        # TODO: implement link recovery for the stacked model. The previous
        # body was commented-out code that referenced an SVD transformer and
        # a cosine-similarity matrix, which this class does not use.
        pass

    def model_setup(self):
        """Return the model configuration as a nested dict.

        The ``"Model"`` entry holds the estimator parameters, or ``None`` when
        no estimator has been set yet.
        """
        model_params = self.model.get_params() if self.model is not None else None
        return {"Setup" :
                  [
                      {"Name" : self.get_name()},
                      {"Threshold" : self.get_threshold()},
                      {"Top Value" : self.get_top_value()},
                      {"Model" : model_params}
                  ]
               }

    def get_name(self):
        """Return the name held by the base class."""
        return super().get_name()

    def get_model_gen_name(self):
        """Return the generic model name held by the base class."""
        return super().get_model_gen_name()

    def get_top_value(self):
        """Return the top value held by the base class."""
        return super().get_top_value()

    def get_threshold(self):
        """Return the decision threshold stored on this instance."""
        return self.threshold

    def get_model(self):
        """Return the underlying estimator (``None`` until one is set)."""
        return self.model

### Split Data on Train and Test

In [8]:
# Features: one score per base model. Target: the 'oracle' column.
# The later model cells already call infer_objects(); doing it here, before the
# split, means the first estimator fitted below also receives properly typed
# data on a fresh top-to-bottom run (no dependence on cell execution order).
X = ensemble_input_df[['lsi','lda','bm25','wv']].infer_objects()
y = ensemble_input_df['oracle'].infer_objects()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((112, 4), (112,), (28, 4), (28,))

### Logistic Regressor

In [45]:
def discretizer(x):
    """Map a score to a class label: 0 when ``x < 0.5``, otherwise 1."""
    return 0 if x < 0.5 else 1

# Cross-validated logistic regression (3 folds), selected by recall.
stack_model = LogisticRegressionCV(cv=3, scoring='recall')
stack_model.fit(X_train, y_train)

# Second predict_proba column = probability of the second class in
# stack_model.classes_; turn it into 0/1 labels with the 0.5 cut-off.
positive_proba = stack_model.predict_proba(X_test)[:,1]
preds = [discretizer(proba) for proba in positive_proba]

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data: {:2.3}'.format(recall))
print('Precision - Test Data: {:2.3}'.format(precision))
print('F-Score  - Test Data: {:2.3}'.format(fscore))

# Learned weight of each base model (order: lsi, lda, bm25, wv).
print(stack_model.coef_)

Recall: 0.333
Precision: 1.0
F-Score 0.5
[[ 1.07826905e+01 -4.73084591e-03  5.06331915e-02 -7.55623531e+00]]


#### Predict Probabilities over Entire Dataset

In [48]:
# Score every row of the dataset (training rows included) with the fitted model.
positive_proba = stack_model.predict_proba(X)[:,1]
preds = [discretizer(proba) for proba in positive_proba]

precision = precision_score(y_true=y, y_pred=preds)
recall = recall_score(y_true=y, y_pred=preds)
fscore = f1_score(y_true=y, y_pred=preds)

print('Recall - All Data: {:2.3}'.format(recall))
print('Precision - All Data: {:2.3}'.format(precision))
print('F-Score - All Data: {:2.3}'.format(fscore))

print(stack_model.coef_)

Recall: 0.368
Precision: 0.7
F-Score 0.483
[[ 1.07826905e+01 -4.73084591e-03  5.06331915e-02 -7.55623531e+00]]


### Test with Other Model Types

#### XGBoost

In [43]:
# XGBClassifier is imported in the notebook's import cell.

# Make sure features and targets carry inferred (non-object) dtypes;
# this is a no-op when they already do.
X_train, y_train = X_train.infer_objects(), y_train.infer_objects()
X_test, y_test = X_test.infer_objects(), y_test.infer_objects()

xgb = XGBClassifier(seed=42).fit(X_train, y_train)

# Second predict_proba column, cut at 0.5 into 0/1 labels.
positive_proba = xgb.predict_proba(X_test)[:,1]
preds = [discretizer(proba) for proba in positive_proba]

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data: {:2.3}'.format(recall))
print('Precision - Test Data: {:2.3}'.format(precision))
print('F-Score - Test Data: {:2.3}'.format(fscore))

# Relative importance of each base model (order: lsi, lda, bm25, wv).
print(xgb.feature_importances_)

Recall: 0.333
Precision: 1.0
F-Score 0.5
[0.47671232 0.12602739 0.16164383 0.23561645]


In [49]:
# Evaluate the fitted XGBoost model on the full dataset.
# NOTE: X/y include the rows the model was trained on, so these figures are
# not comparable to the held-out test metrics.
X = X.infer_objects()
y = y.infer_objects()

positive_proba = xgb.predict_proba(X)[:,1]
preds = [discretizer(proba) for proba in positive_proba]

precision = precision_score(y_true=y, y_pred=preds)
recall = recall_score(y_true=y, y_pred=preds)
fscore = f1_score(y_true=y, y_pred=preds)

# Labels say 'All Data': the metrics above are computed on X/y, not on the test split.
print('Recall - All Data: {:2.3}'.format(recall))
print('Precision - All Data: {:2.3}'.format(precision))
print('F-Score - All Data: {:2.3}'.format(fscore))

print(xgb.feature_importances_)

Recall - Test Data: 0.895
Precision - Test Data: 1.0
F-Score - Test Data: 0.944
[0.47671232 0.12602739 0.16164383 0.23561645]


#### Extra Trees Classifier

In [50]:
# ExtraTreesClassifier is imported in the notebook's import cell.

# Make sure features and targets carry inferred (non-object) dtypes;
# this is a no-op when they already do.
X_train, y_train = X_train.infer_objects(), y_train.infer_objects()
X_test, y_test = X_test.infer_objects(), y_test.infer_objects()

et = ExtraTreesClassifier(random_state=42)
et.fit(X_train, y_train)

# Second predict_proba column, cut at 0.5 into 0/1 labels.
positive_proba = et.predict_proba(X_test)[:,1]
preds = [discretizer(proba) for proba in positive_proba]

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

# The recall label now has the same 'Metric - Split: value' shape as the other two.
print('Recall - Test Data: {:2.3}'.format(recall))
print('Precision - Test Data: {:2.3}'.format(precision))
print('F-Score - Test Data: {:2.3}'.format(fscore))

# Relative importance of each base model (order: lsi, lda, bm25, wv).
print(et.feature_importances_)

Recall: 0.333
Precision: 0.5
F-Score 0.4
[0.43848474 0.13354034 0.21145338 0.21652154]


In [52]:
# Score every row of the dataset (training rows included) with the fitted Extra Trees model.
X, y = X.infer_objects(), y.infer_objects()

positive_proba = et.predict_proba(X)[:,1]
preds = [discretizer(proba) for proba in positive_proba]

precision = precision_score(y_true=y, y_pred=preds)
recall = recall_score(y_true=y, y_pred=preds)
fscore = f1_score(y_true=y, y_pred=preds)

print('Recall - All Data: {:2.3}'.format(recall))
print('Precision - All Data: {:2.3}'.format(precision))
print('F-Score - All Data: {:2.3}'.format(fscore))

print(et.feature_importances_)

Recall - All Data: 0.895
Precision - All Data: 0.944
F-Score - All Data: 0.919
[0.43848474 0.13354034 0.21145338 0.21652154]
