# Introduction

In this notebook we demonstrate the use of an **Stacking Model**, based on **Logistic Regression** model, in the Information Retrieval context to make trace link recovery between Use Cases and Bug Reports.

We model our study as follows:

* Each bug report title, summary and description compose a single query.
* We use each use case content as an entire document that must be returned to the query made

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegressionCV

from imblearn.over_sampling import SMOTE, ADASYN

from enum import Enum
from collections import Counter

#from utils import plots
from utils import oracle_loader as ol
from utils import jedit_dataset as jd
from utils import model_evaluator as m_eval
from utils import generic_model as g_model

### Load Dataset and Preprocessing

In [2]:
trace_df = jd.read_trace_df()
artfs_desc_df = jd.read_artfs_desc_df()

use_cases_df = artfs_desc_df[artfs_desc_df.artf_description.str.contains('Use Case ID')]
bug_reports_df = artfs_desc_df[artfs_desc_df.artf_description.str.contains('Bug Number')]

corpus = use_cases_df.artf_description
query = bug_reports_df.artf_description

use_cases_names = use_cases_df.artf_name
bug_reports_names = bug_reports_df.artf_name

orc = ol.OracleLoader(use_cases_names, bug_reports_names)
orc.load(trace_df)

### Load Models Results

In [3]:
lsi_results_df = pd.read_csv('best_models_sim_matrix/lsi.csv')
lda_results_df = pd.read_csv('best_models_sim_matrix/lda.csv')
bm25_results_df = pd.read_csv('best_models_sim_matrix/bm25.csv')
wv_results_df = pd.read_csv('best_models_sim_matrix/wordvector.csv')

lsi_results_df.set_index('artf_name', inplace=True)
lda_results_df.set_index('artf_name', inplace=True)
bm25_results_df.set_index('artf_name', inplace=True)
wv_results_df.set_index('artf_name', inplace=True)

### Tranform Results Matrices to Vectors

In [4]:
def transform_sim_matrix_to_sim_vec(sim_matrix_df, model_name):
    sim_vec_df = pd.DataFrame(columns=[model_name])
    for col in sim_matrix_df.columns:
        for idx, row in sim_matrix_df.iterrows():
            artfs_names = '{}_{}'.format(idx, col)
            sim_vec_df.at[artfs_names, model_name] = row[col]
    return sim_vec_df

sim_vec_lsi = transform_sim_matrix_to_sim_vec(lsi_results_df, 'lsi')
sim_vec_lda = transform_sim_matrix_to_sim_vec(lda_results_df, 'lda')
sim_vec_bm25 = transform_sim_matrix_to_sim_vec(bm25_results_df, 'bm25')
sim_vec_wv = transform_sim_matrix_to_sim_vec(wv_results_df, 'wv')

### Transform Vectors to DataFrame

In [5]:
ensemble_input_df = pd.DataFrame(columns=['ens_pred'], index=sim_vec_lsi.index)

out_df = pd.merge(ensemble_input_df, sim_vec_lsi, left_index=True, right_index=True)
out_df = pd.merge(out_df, sim_vec_lda, left_index=True, right_index=True)
out_df = pd.merge(out_df, sim_vec_bm25, left_index=True, right_index=True)
out_df = pd.merge(out_df, sim_vec_wv, left_index=True, right_index=True)

new_order = [1,2,3,4,0]
out_df = out_df[out_df.columns[new_order]]

ensemble_input_df = out_df.copy()
ensemble_input_df.head()

Unnamed: 0,lsi,lda,bm25,wv,ens_pred
UC_003_TRG_BR_4020_SRC,0.361541,0.988073,7.08953,0.922483,
UC_007_TRG_BR_4020_SRC,0.46841,0.9881,9.83479,0.878566,
UC_010_TRG_BR_4020_SRC,0.690679,0.154894,16.8103,0.906589,
UC_002_TRG_BR_4020_SRC,0.897997,0.988134,5.33088,0.923327,
UC_006_TRG_BR_4020_SRC,0.911746,0.988105,5.08051,0.924785,


### Insert Oracle Data

In [8]:
orc_vec_df = transform_sim_matrix_to_sim_vec(orc.oracle, 'oracle')

ensemble_input_df = pd.merge(ensemble_input_df, orc_vec_df, left_index=True, right_index=True)

new_order = [0,1,2,3,5,4]
ensemble_input_df = ensemble_input_df[ensemble_input_df.columns[new_order]]

ensemble_input_df.head(15)

Unnamed: 0,lsi,lda,bm25,wv,oracle_x,ens_pred
UC_003_TRG_BR_4020_SRC,0.361541,0.988073,7.08953,0.922483,0.0,
UC_007_TRG_BR_4020_SRC,0.46841,0.9881,9.83479,0.878566,0.0,
UC_010_TRG_BR_4020_SRC,0.690679,0.154894,16.8103,0.906589,0.0,
UC_002_TRG_BR_4020_SRC,0.897997,0.988134,5.33088,0.923327,0.0,
UC_006_TRG_BR_4020_SRC,0.911746,0.988105,5.08051,0.924785,1.0,
UC_004_TRG_BR_4020_SRC,0.814917,0.988081,3.90633,0.915229,0.0,
UC_005_TRG_BR_4020_SRC,0.921105,0.988126,5.31983,0.917549,1.0,
UC_008_TRG_BR_4020_SRC,0.567673,0.988148,13.3005,0.900552,0.0,
UC_001_TRG_BR_4020_SRC,0.347406,0.988092,2.34608,0.91602,0.0,
UC_009_TRG_BR_4020_SRC,0.565652,0.154902,5.41828,0.899775,0.0,


### Balancing Dataset and Split Data on Train and Test

In [48]:
ensemble_input_df = ensemble_input_df.infer_objects()

X = ensemble_input_df[['lsi','lda','bm25','wv']]
y = ensemble_input_df['oracle']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

X_train, y_train = SMOTE().fit_resample(X_train, y_train)
print("Train SMOTE: {}".format(sorted(Counter(y_train).items())))

X_train = pd.DataFrame(X_train, columns=['lsi','lda','bm25','wv'])
y_train = pd.DataFrame(y_train)

(112, 4) (112,) (28, 4) (28,)
Train SMOTE: [(0.0, 96), (1.0, 96)]


### Discretizer Function

In [49]:
def discretizer(x):
    return 0 if x < 0.5 else 1

### Logistic Regressor

In [50]:
stack_model = LogisticRegressionCV(cv=3, scoring='recall').fit(X_train, y_train)

preds = stack_model.predict_proba(X_test)[:,1]
preds = list(map(discretizer, preds))
    
precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data: {:2.3}'.format(recall))
print('Precision - Test Data: {:2.3}'.format(precision))
print('F-Score  - Test Data: {:2.3}'.format(fscore))

print(stack_model.coef_)

Recall - Test Data: 1.0
Precision - Test Data: 0.5
F-Score  - Test Data: 0.667
[[ 8.96483893  1.26924417  0.03050525 -6.34493974]]


  y = column_or_1d(y, warn=True)


#### Predict Probabilities over Entire Dataset

In [51]:
preds = stack_model.predict_proba(X)[:,1]
preds = list(map(discretizer, preds))
    
precision = precision_score(y_true=y, y_pred=preds)
recall = recall_score(y_true=y, y_pred=preds)
fscore = f1_score(y_true=y, y_pred=preds)

print('Recall - All Data: {:2.3}'.format(recall))
print('Precision - All Data: {:2.3}'.format(precision))
print('F-Score - All Data: {:2.3}'.format(fscore))

print(stack_model.coef_)

Recall - All Data: 0.947
Precision - All Data: 0.409
F-Score - All Data: 0.571
[[ 8.96483893  1.26924417  0.03050525 -6.34493974]]


### Test with Other Model Types

#### XGBoost

In [52]:
from xgboost import XGBClassifier

xgb = XGBClassifier(seed=42).fit(X_train, y_train)

preds = xgb.predict_proba(X_test)[:,1]
preds = list(map(discretizer, preds))

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data: {:2.3}'.format(recall))
print('Precision - Test Data: {:2.3}'.format(precision))
print('F-Score - Test Data: {:2.3}'.format(fscore))

print(xgb.feature_importances_)

Recall - Test Data: 0.333
Precision - Test Data: 0.25
F-Score - Test Data: 0.286
[0.32142857 0.11428571 0.25       0.31428573]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


#### Predict Probabilities over Entire Dataset

In [53]:
preds = xgb.predict_proba(X)[:,1]
preds = list(map(discretizer, preds))

precision = precision_score(y_true=y, y_pred=preds)
recall = recall_score(y_true=y, y_pred=preds)
fscore = f1_score(y_true=y, y_pred=preds)

print('Recall - Test Data: {:2.3}'.format(recall))
print('Precision - Test Data: {:2.3}'.format(precision))
print('F-Score - Test Data: {:2.3}'.format(fscore))

print(xgb.feature_importances_)

Recall - Test Data: 0.842
Precision - Test Data: 0.8
F-Score - Test Data: 0.821
[0.32142857 0.11428571 0.25       0.31428573]


#### Extra Trees Classifier

In [54]:
from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier(random_state=42).fit(X_train, y_train)

preds = et.predict_proba(X_test)[:,1]
preds = list(map(discretizer, preds))

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data {:2.3}'.format(recall))
print('Precision - Test Data: {:2.3}'.format(precision))
print('F-Score - Test Data: {:2.3}'.format(fscore))

print(et.feature_importances_)

Recall - Test Data 0.667
Precision - Test Data: 0.4
F-Score - Test Data: 0.5
[0.50901156 0.14735919 0.12649077 0.21713847]


  


#### Predict Probabilities over Entire Dataset

In [55]:
preds = et.predict_proba(X)[:,1]
preds = list(map(discretizer, preds))

precision = precision_score(y_true=y, y_pred=preds)
recall = recall_score(y_true=y, y_pred=preds)
fscore = f1_score(y_true=y, y_pred=preds)

print('Recall - All Data: {:2.3}'.format(recall))
print('Precision - All Data: {:2.3}'.format(precision))
print('F-Score - All Data: {:2.3}'.format(fscore))

print(et.feature_importances_)

Recall - All Data: 0.947
Precision - All Data: 0.857
F-Score - All Data: 0.9
[0.50901156 0.14735919 0.12649077 0.21713847]
