# Introduction

Notebook to test a **boosting** model in the context of traceability between features and bug reports.

# Load Libraries and Datasets

In [1]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import numpy as np

from modules.models_runner.feat_br_models_runner import Feat_BR_Models_Runner
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import aux_functions
from modules.utils import model_evaluator as m_eval
from modules.utils import similarity_measures as sm

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegressionCV

from imblearn.over_sampling import SMOTE, ADASYN

from xgboost import XGBClassifier

from enum import Enum
from collections import Counter

import warnings; warnings.simplefilter('ignore')

# Run All Models

## Volunteers Only Strategy

In [2]:
# Run the four feature <-> bug-report similarity models through one shared runner.
# The cell output shows the runner logging the dataset shapes and one
# "Running ... model" line per call, so the call order below is the log order.
# NOTE(review): the runner may keep state between calls -- keep this order unless verified.
models_runner_4 = Feat_BR_Models_Runner()
lsi_model_4 = models_runner_4.run_lsi_model()        # LSI
lda_model_4 = models_runner_4.run_lda_model()        # LDA
bm25_model_4 = models_runner_4.run_bm25_model()      # BM25
w2v_model_4 = models_runner_4.run_word2vec_model()   # word2vec

Features.shape: (19, 8)
SelectedBugReports.shape: (91, 18)
Running LSI model -----
Running LDA model -----
Running BM25 model -----
Running W2V model -----


# Ensemble Model

## Transform Results Matrices to Vectors

In [3]:
def transform_sim_matrix_to_sim_vec(sim_matrix_df, model_name):
    """Flatten a similarity matrix into a one-column DataFrame.

    Each cell ``(idx, col)`` of ``sim_matrix_df`` becomes one row labelled
    ``'{idx}_{col}'``. Rows are emitted column by column (all row labels of
    the first column, then all of the second, ...).

    Parameters
    ----------
    sim_matrix_df : pandas.DataFrame
        Matrix whose row and column labels identify the two artifacts.
        Label pairs are expected to be unique; a repeated pair is kept as a
        separate row instead of being overwritten.
    model_name : str
        Name of the single column of the returned frame.

    Returns
    -------
    pandas.DataFrame
        One column named ``model_name`` holding the matrix values, indexed by
        the ``'{idx}_{col}'`` labels. The column keeps the dtype of the matrix
        values rather than being stored as generic Python objects.
    """
    # Build every label up front instead of enlarging the frame one cell at a time.
    pair_labels = [
        '{}_{}'.format(idx, col)
        for col in sim_matrix_df.columns
        for idx in sim_matrix_df.index
    ]
    # Column-major ('F') flattening walks the values in the same order as the labels.
    flat_values = sim_matrix_df.values.ravel(order='F')
    return pd.DataFrame({model_name: flat_values}, index=pair_labels)

# Flatten each model's similarity matrix into a one-column frame whose column
# is named after the model ('lsi', 'lda', 'bm25', 'wv').
sim_vec_lsi, sim_vec_lda, sim_vec_bm25, sim_vec_wv = [
    transform_sim_matrix_to_sim_vec(fitted_model.get_sim_matrix(), column_name)
    for fitted_model, column_name in (
        (lsi_model_4, 'lsi'),
        (lda_model_4, 'lda'),
        (bm25_model_4, 'bm25'),
        (w2v_model_4, 'wv'),
    )
]

## Transform Vectors to DataFrame

In [4]:
# Placeholder frame: one row per entry of the LSI vector's index, with an
# empty 'pred' column.
ensemble_input_df = pd.DataFrame(columns=['pred'], index=sim_vec_lsi.index)

# Attach each model's similarity column by joining on the shared index.
out_df = ensemble_input_df
for sim_vec in (sim_vec_lsi, sim_vec_lda, sim_vec_bm25, sim_vec_wv):
    out_df = pd.merge(out_df, sim_vec, left_index=True, right_index=True)

# Order the columns by name (model scores first, 'pred' last) rather than by
# position, so the layout does not depend on the merge order above.
out_df = out_df[['lsi', 'lda', 'bm25', 'wv', 'pred']]

ensemble_input_df = out_df.copy()
ensemble_input_df.head()

Unnamed: 0,lsi,lda,bm25,wv,pred
new_awesome_bar_1248267,0.31417,0.869301,46.17,0.934949,
windows_child_mode_1248267,0.0572629,0.374227,23.155,0.843091,
apz_async_scrolling_1248267,0.00130207,0.605183,12.7326,0.822531,
browser_customization_1248267,0.0285014,0.871522,10.4111,0.835262,
pdf_viewer_1248267,0.00596331,0.304652,15.5965,0.778693,


## Insert Oracle Data

In [5]:
# Flatten the transposed volunteers oracle matrix into an 'oracle' column.
orc_vec_df = transform_sim_matrix_to_sim_vec(
    fd.Feat_BR_Oracles.read_feat_br_volunteers_df().T, 'oracle')

# Merge from `out_df` (the model-scores frame built in the previous section)
# instead of merging `ensemble_input_df` into itself: re-running this cell then
# always yields exactly one 'oracle' column.
ensemble_input_df = pd.merge(out_df, orc_vec_df, left_index=True, right_index=True)

# Select columns by name so 'oracle' sits before 'pred' regardless of position.
ensemble_input_df = ensemble_input_df[['lsi', 'lda', 'bm25', 'wv', 'oracle', 'pred']]

ensemble_input_df.head(15)

Feat_BR Volunteers Matrix shape: (91, 19)


Unnamed: 0,lsi,lda,bm25,wv,oracle,pred
new_awesome_bar_1248267,0.31417,0.869301,46.17,0.934949,0.0,
windows_child_mode_1248267,0.0572629,0.374227,23.155,0.843091,0.0,
apz_async_scrolling_1248267,0.00130207,0.605183,12.7326,0.822531,0.0,
browser_customization_1248267,0.0285014,0.871522,10.4111,0.835262,0.0,
pdf_viewer_1248267,0.00596331,0.304652,15.5965,0.778693,0.0,
context_menu_1248267,0.975032,0.788344,86.6177,0.920058,1.0,
w10_comp_1248267,0.191083,0.869828,24.3176,0.880215,0.0,
tts_in_desktop_1248267,0.0197739,0.248741,8.4998,0.831585,0.0,
tts_in_rm_1248267,0.0262221,0.240972,8.52168,0.84053,0.0,
webgl_comp_1248267,0.0180356,0.870782,8.3707,0.610047,0.0,


## Split Data into Train and Test Sets and Balance the Training Set

In [6]:
# Let pandas infer concrete dtypes for any object-typed columns.
ensemble_input_df = ensemble_input_df.infer_objects()

feature_cols = ['lsi', 'lda', 'bm25', 'wv']
X = ensemble_input_df[feature_cols]
y = ensemble_input_df['oracle']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Oversample the training split only, so the test split keeps its original
# class balance. SMOTE is seeded so the resampled set -- and every metric
# computed from models trained on it -- is reproducible across runs.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)
print("Train SMOTE: {}".format(sorted(Counter(y_train).items())))

X_train = pd.DataFrame(X_train, columns=feature_cols)
y_train = pd.DataFrame(y_train)

(1383, 4) (1383,) (346, 4) (346,)
Train SMOTE: [(0.0, 1310), (1.0, 1310)]


## Discretizer Function

In [7]:
def discretizer(x, threshold=0.5):
    """Turn a score into a hard 0/1 label.

    Parameters
    ----------
    x : float
        Score to discretize (e.g. a predicted positive-class probability).
    threshold : float, optional
        Cut-off; defaults to 0.5.

    Returns
    -------
    int
        0 when ``x < threshold``, otherwise 1 (a score equal to the threshold
        maps to 1).
    """
    return 0 if x < threshold else 1

## Logistic Regressor

In [8]:
# `y_train` was wrapped in a one-column DataFrame earlier in the notebook;
# np.ravel flattens it to the 1-D label vector scikit-learn expects.
ensemb_model = LogisticRegressionCV(cv=3, scoring='recall').fit(X_train, np.ravel(y_train))

# Positive-class probabilities, turned into hard 0/1 labels.
preds = ensemb_model.predict_proba(X_test)[:,1]
preds = list(map(discretizer, preds))

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data: {:2.3%}'.format(recall))
print('Precision - Test Data: {:2.3%}'.format(precision))
print('F-Score  - Test Data: {:2.3%}'.format(fscore))

# One coefficient per input column, in the column order of X_train.
print(ensemb_model.coef_)

Recall - Test Data: 60.000%
Precision - Test Data: 12.766%
F-Score  - Test Data: 21.053%
[[ 3.02221666  2.44564657  0.00500791 -0.18914145]]


# Test with Other Model Types

## XGBoost

In [14]:
# Seeded through `random_state`, the scikit-learn-style parameter of XGBClassifier.
# `y_train` was wrapped in a one-column DataFrame earlier in the notebook;
# np.ravel flattens it to a 1-D label vector.
xgb = XGBClassifier(random_state=42).fit(X_train, np.ravel(y_train))

# Positive-class probabilities, turned into hard 0/1 labels.
preds = xgb.predict_proba(X_test)[:,1]
preds = list(map(discretizer, preds))

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data: {:2.3%}'.format(recall))
print('Precision - Test Data: {:2.3%}'.format(precision))
print('F-Score - Test Data: {:2.3%}'.format(fscore))

# One importance value per input column, in the column order of X_train.
print(xgb.feature_importances_)

Recall - Test Data: 55.000%
Precision - Test Data: 14.286%
F-Score - Test Data: 22.680%
[0.17235188 0.22800718 0.3913824  0.20825852]


## Extra Trees Classifier

In [16]:
# `y_train` was wrapped in a one-column DataFrame earlier in the notebook;
# np.ravel flattens it to the 1-D label vector scikit-learn expects.
et = ExtraTreesClassifier(random_state=42).fit(X_train, np.ravel(y_train))

# Positive-class probabilities, turned into hard 0/1 labels.
preds = et.predict_proba(X_test)[:,1]
preds = list(map(discretizer, preds))

precision = precision_score(y_true=y_test, y_pred=preds)
recall = recall_score(y_true=y_test, y_pred=preds)
fscore = f1_score(y_true=y_test, y_pred=preds)

print('Recall - Test Data {:2.3%}'.format(recall))
print('Precision - Test Data: {:2.3%}'.format(precision))
print('F-Score - Test Data: {:2.3%}'.format(fscore))

# One importance value per input column, in the column order of X_train.
print(et.feature_importances_)

Recall - Test Data 40.000%
Precision - Test Data: 11.940%
F-Score - Test Data: 18.391%
[0.27934896 0.27734954 0.2602341  0.1830674 ]
