# Introduction

Notebook to test a **boosting** model in the context of traceability between test cases and bug reports.

# Load Libraries and Datasets

In [6]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import numpy as np

from modules.models_runner.tc_br_models_runner import TC_BR_Runner
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import aux_functions
from modules.utils import model_evaluator as m_eval
from modules.utils import similarity_measures as sm

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegressionCV

from imblearn.over_sampling import SMOTE, ADASYN

from enum import Enum
from collections import Counter

import warnings; warnings.simplefilter('ignore')

# Run All Models

## Volunteers Only Strategy

In [8]:
models_runner_4 = TC_BR_Runner()
lsi_model_4 = models_runner_4.run_lsi_model()
lda_model_4 = models_runner_4.run_lda_model()
bm25_model_4 = models_runner_4.run_bm25_model()
w2v_model_4 = models_runner_4.run_word2vec_model()

TestCases.shape: (195, 12)
SelectedBugReports.shape: (91, 18)
Running LSI Model ------
Running LDA Model -----
Running BM25 Model -----
Running W2V Model ------


# Ensemble Model

## Tranform Results Matrices to Vectors

In [9]:
def transform_sim_matrix_to_sim_vec(sim_matrix_df, model_name):
    sim_vec_df = pd.DataFrame(columns=[model_name])
    for col in sim_matrix_df.columns:
        for idx, row in sim_matrix_df.iterrows():
            artfs_names = '{}_{}'.format(idx, col)
            sim_vec_df.at[artfs_names, model_name] = row[col]
    return sim_vec_df

sim_vec_lsi = transform_sim_matrix_to_sim_vec(lsi_model_4.get_sim_matrix(), 'lsi')
sim_vec_lda = transform_sim_matrix_to_sim_vec(lda_model_4.get_sim_matrix(), 'lda')
sim_vec_bm25 = transform_sim_matrix_to_sim_vec(bm25_model_4.get_sim_matrix(), 'bm25')
sim_vec_wv = transform_sim_matrix_to_sim_vec(w2v_model_4.get_sim_matrix(), 'wv')

## Transform Vectors to DataFrame

In [10]:
ensemble_input_df = pd.DataFrame(columns=['pred'], index=sim_vec_lsi.index)

out_df = pd.merge(ensemble_input_df, sim_vec_lsi, left_index=True, right_index=True)
out_df = pd.merge(out_df, sim_vec_lda, left_index=True, right_index=True)
out_df = pd.merge(out_df, sim_vec_bm25, left_index=True, right_index=True)
out_df = pd.merge(out_df, sim_vec_wv, left_index=True, right_index=True)

new_order = [1,2,3,4,0]
out_df = out_df[out_df.columns[new_order]]

ensemble_input_df = out_df.copy()
ensemble_input_df.head()

Unnamed: 0,lsi,lda,bm25,wv,pred
TC_13_TRG_BR_1248267_SRC,0.0572857,0.859508,20.4839,0.891912,
TC_14_TRG_BR_1248267_SRC,0.0586889,0.654907,32.6923,0.921482,
TC_15_TRG_BR_1248267_SRC,0.0237049,0.749372,15.4464,0.809223,
TC_16_TRG_BR_1248267_SRC,0.160639,0.546288,41.1061,0.889683,
TC_17_TRG_BR_1248267_SRC,0.113961,0.729185,38.2526,0.903198,


## Insert Oracle Data

In [12]:
orc_vec_df = transform_sim_matrix_to_sim_vec(fd.Tc_BR_Oracles.read_oracle_volunteers_df(), 'oracle')

ensemble_input_df = pd.merge(ensemble_input_df, orc_vec_df, left_index=True, right_index=True)

new_order = [0,1,2,3,5,4]
ensemble_input_df = ensemble_input_df[ensemble_input_df.columns[new_order]]

ensemble_input_df.head(15)

OracleVolunteers.shape: (195, 91)


Unnamed: 0,lsi,lda,bm25,wv,oracle,pred
TC_13_TRG_BR_1248267_SRC,0.0572857,0.859508,20.4839,0.891912,0.0,
TC_14_TRG_BR_1248267_SRC,0.0586889,0.654907,32.6923,0.921482,0.0,
TC_15_TRG_BR_1248267_SRC,0.0237049,0.749372,15.4464,0.809223,0.0,
TC_16_TRG_BR_1248267_SRC,0.160639,0.546288,41.1061,0.889683,0.0,
TC_17_TRG_BR_1248267_SRC,0.113961,0.729185,38.2526,0.903198,0.0,
TC_18_TRG_BR_1248267_SRC,0.0276158,0.822241,15.1044,0.86637,0.0,
TC_19_TRG_BR_1248267_SRC,0.0280664,0.87262,29.8445,0.871488,0.0,
TC_20_TRG_BR_1248267_SRC,0.0249423,0.918118,29.7718,0.88673,0.0,
TC_21_TRG_BR_1248267_SRC,0.0283293,0.922962,29.7718,0.888665,0.0,
TC_22_TRG_BR_1248267_SRC,0.0314005,0.758897,20.7144,0.891587,0.0,


## Balancing Dataset and Split Data on Train and Test

In [13]:
ensemble_input_df = ensemble_input_df.infer_objects()

X = ensemble_input_df[['lsi','lda','bm25','wv']]
y = ensemble_input_df['oracle']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

X_train, y_train = SMOTE().fit_resample(X_train, y_train)
print("Train SMOTE: {}".format(sorted(Counter(y_train).items())))

X_train = pd.DataFrame(X_train, columns=['lsi','lda','bm25','wv'])
y_train = pd.DataFrame(y_train)

(14196, 4) (14196,) (3549, 4) (3549,)
Train SMOTE: [(0.0, 13247), (1.0, 13247)]


## Discretizer Function

In [14]:
def discretizer(x):
    return 0 if x < 0.5 else 1

## Logistic Regressor

In [15]:
ensemb_model = LogisticRegressionCV(cv=3, scoring='recall').fit(X_train, y_train)

preds = ensemb_model.predict_proba(X_test)[:,1]
preds = list(map(discretizer, preds))
    
precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data: {:2.3%}'.format(recall))
print('Precision - Test Data: {:2.3%}'.format(precision))
print('F-Score  - Test Data: {:2.3%}'.format(fscore))

print(ensemb_model.coef_)

Recall - Test Data: 56.641%
Precision - Test Data: 10.147%
F-Score  - Test Data: 17.211%
[[ 1.713023    1.35066602  0.00245523 -0.30618117]]


# Test with Other Model Types

## XGBoost

In [21]:
from xgboost import XGBClassifier

xgb = XGBClassifier(seed=42).fit(X_train, y_train)

preds = xgb.predict_proba(X_test)[:,1]
preds = list(map(discretizer, preds))

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data: {:2.3%}'.format(recall))
print('Precision - Test Data: {:2.3%}'.format(precision))
print('F-Score - Test Data: {:2.3%}'.format(fscore))

print(xgb.feature_importances_)

Recall - Test Data: 60.938%
Precision - Test Data: 14.338%
F-Score - Test Data: 23.214%
[0.14354838 0.27580646 0.3935484  0.18709677]


## Extra Trees Classifier

In [23]:
from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier(random_state=42).fit(X_train, y_train)

preds = et.predict_proba(X_test)[:,1]
preds = list(map(discretizer, preds))

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data {:2.3%}'.format(recall))
print('Precision - Test Data: {:2.3%}'.format(precision))
print('F-Score - Test Data: {:2.3%}'.format(fscore))

print(et.feature_importances_)

Recall - Test Data 56.641%
Precision - Test Data: 16.763%
F-Score - Test Data: 25.870%
[0.26435072 0.25915275 0.27043427 0.20606226]
