## Import libraries

In [5]:
import numpy as np
import pandas as pd
import shap
import lime
import os
import pickle


from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn import svm

from sklearn import tree
import xgboost as xgb
from sklearn.datasets import make_regression
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


import matplotlib.pyplot as plt

## Dataset Importing

In [6]:
df = pd.read_csv(('/Users/saumenduroy/Desktop/Defect_Detection/data/lucene-2.9.0.csv'), index_col = 'File')

## Select a specific row or column from the dataset and dataset splitting

In [7]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Dataset Balancing

In [8]:
from imblearn.over_sampling import RandomOverSampler

In [9]:
from collections import Counter
oversample =  RandomOverSampler()
X_train,y_train = oversample.fit_resample(X_train,y_train)

In [10]:
Counter(y_train)

Counter({0: 767,
         1: 767,
         3: 767,
         4: 767,
         2: 767,
         11: 767,
         12: 767,
         6: 767,
         8: 767,
         5: 767})

In [11]:
Counter(y_test)

Counter({0: 328, 4: 5, 1: 41, 3: 11, 2: 20, 5: 2, 9: 1, 6: 2, 8: 1})

In [12]:
print("X_Train shape: ", X_train.shape)
print("Train label shape: ", y_train.shape)
print("X_Test shape: ", X_test.shape)
print("Test label shape: ", y_test.shape)

X_Train shape:  (7670, 68)
Train label shape:  (7670,)
X_Test shape:  (411, 68)
Test label shape:  (411,)


In [13]:
X_train

Unnamed: 0,CountDeclMethodPrivate,AvgLineCode,CountLine,MaxCyclomatic,CountDeclMethodDefault,AvgEssential,CountDeclClassVariable,SumCyclomaticStrict,AvgCyclomatic,AvgLine,...,Del_lines,OWN_LINE,OWN_COMMIT,MINOR_COMMIT,MINOR_LINE,MAJOR_COMMIT,MAJOR_LINE,HeuBug,HeuBugCount,RealBug
0,0,6,55,3,1,2,0,6,2,6,...,0,1.000000,0.0,0,1,0,0,False,0,False
1,0,14,329,5,2,2,2,13,4,14,...,0,0.945289,0.0,0,1,0,5,False,0,False
2,0,11,269,4,0,1,0,21,3,44,...,0,0.988848,1.0,0,1,1,1,False,0,False
3,0,1,34,1,0,1,0,2,1,1,...,0,0.941176,1.0,0,2,1,0,False,0,False
4,1,14,327,2,0,1,0,20,1,18,...,4,0.694190,0.5,0,3,2,4,True,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7665,7,15,1811,6,5,1,3,151,1,19,...,226,0.685809,0.7,0,2,3,6,True,3,True
7666,7,15,1811,6,5,1,3,151,1,19,...,226,0.685809,0.7,0,2,3,6,True,3,True
7667,7,15,1811,6,5,1,3,151,1,19,...,226,0.685809,0.7,0,2,3,6,True,3,True
7668,7,15,1811,6,5,1,3,151,1,19,...,226,0.685809,0.7,0,2,3,6,True,3,True


In [14]:
df.columns

Index(['CountDeclMethodPrivate', 'AvgLineCode', 'CountLine', 'MaxCyclomatic',
       'CountDeclMethodDefault', 'AvgEssential', 'CountDeclClassVariable',
       'SumCyclomaticStrict', 'AvgCyclomatic', 'AvgLine',
       'CountDeclClassMethod', 'AvgLineComment', 'AvgCyclomaticModified',
       'CountDeclFunction', 'CountLineComment', 'CountDeclClass',
       'CountDeclMethod', 'SumCyclomaticModified', 'CountLineCodeDecl',
       'CountDeclMethodProtected', 'CountDeclInstanceVariable',
       'MaxCyclomaticStrict', 'CountDeclMethodPublic', 'CountLineCodeExe',
       'SumCyclomatic', 'SumEssential', 'CountStmtDecl', 'CountLineCode',
       'CountStmtExe', 'RatioCommentToCode', 'CountLineBlank', 'CountStmt',
       'MaxCyclomaticModified', 'CountSemicolon', 'AvgLineBlank',
       'CountDeclInstanceMethod', 'AvgCyclomaticStrict',
       'PercentLackOfCohesion', 'MaxInheritanceTree', 'CountClassDerived',
       'CountClassCoupled', 'CountClassBase', 'CountInput_Max',
       'CountInput_Mean', 

## Checking the null value

In [15]:
df.isnull().sum()

CountDeclMethodPrivate    0
AvgLineCode               0
CountLine                 0
MaxCyclomatic             0
CountDeclMethodDefault    0
                         ..
MAJOR_LINE                0
HeuBug                    0
HeuBugCount               0
RealBug                   0
RealBugCount              0
Length: 69, dtype: int64

In [16]:
df.columns[df.dtypes == 'object']

Index([], dtype='object')

In [17]:
df.dtypes

CountDeclMethodPrivate    int64
AvgLineCode               int64
CountLine                 int64
MaxCyclomatic             int64
CountDeclMethodDefault    int64
                          ...  
MAJOR_LINE                int64
HeuBug                     bool
HeuBugCount               int64
RealBug                    bool
RealBugCount              int64
Length: 69, dtype: object

In [18]:
#our_rf_model = RandomForestClassifier(random_state=0)
#our_rf_model.fit(X_train, y_train) 

In [19]:
## For SVM data fitting
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)

## Create classifier object, Train, and Test 

In [20]:
# Create classifer object
our_rf_model = RandomForestClassifier()
our_dt_model = DecisionTreeClassifier()
our_lr_model = LogisticRegression()
our_mlp_model = MLPClassifier()
our_xgb_model = XGBClassifier()
our_svm_model = svm.SVC()

# Train Classifer
our_rf_model = our_rf_model.fit(X_train,y_train)
our_dt_model = our_dt_model.fit(X_train,y_train)
our_lr_model = our_lr_model.fit(X_train,y_train)
our_mlp_model = our_mlp_model.fit(X_train,y_train)
our_xgb_model = our_xgb_model.fit(X_train,y_train)
our_svm_model = our_svm_model.fit(X_train [:500], y_train[:500])

#Predict the response for test dataset

y_pred_our_rf_model = our_rf_model.predict(X_test)
y_pred_our_dt_model = our_dt_model.predict(X_test)
y_pred_our_lr_model = our_lr_model.predict(X_test)
y_pred_our_mlp_model = our_mlp_model.predict(X_test)
y_pred_our_xgb_model = our_xgb_model.predict(X_test)
y_pred_our_svm_model = our_svm_model.predict(X_test)

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## Measuring Accuracy

In [21]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy for Random Forest (RF):",metrics.accuracy_score(y_test, y_pred_our_rf_model))
print("Accuracy for Decission Tree (DT):",metrics.accuracy_score(y_test, y_pred_our_dt_model))
print("Accuracy for Logistic Regression (LR):",metrics.accuracy_score(y_test, y_pred_our_lr_model))
print("Accuracy for Multi-Layer Perceptron Neural Network (MLP):",metrics.accuracy_score(y_test, y_pred_our_mlp_model))
print("Accuracy for Gradient Boosting (XGB):",metrics.accuracy_score(y_test, y_pred_our_xgb_model))
print("Accuracy for Support Vector Machine (SVM):",metrics.accuracy_score(y_test, y_pred_our_svm_model))


Accuracy for Random Forest (RF): 0.9124087591240876
Accuracy for Decission Tree (DT): 0.878345498783455
Accuracy for Logistic Regression (LR): 0.0072992700729927005
Accuracy for Multi-Layer Perceptron Neural Network (MLP): 0.681265206812652
Accuracy for Gradient Boosting (XGB): 0.9099756690997567
Accuracy for Support Vector Machine (SVM): 0.7980535279805353


## Precision, Recall, and f1 Score, and AUC for RF

In [608]:
print("Precision:",metrics.precision_score(y_test, y_pred_our_rf_model, average = "macro"))
print("Recall:",metrics.recall_score(y_test, y_pred_our_rf_model, average = "macro"))
print("f1 score:",metrics.f1_score(y_test,y_pred_our_rf_model, average = "macro"))

Precision: 0.35539814947984083
Recall: 0.3
f1 score: 0.3219522335901646


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


In [609]:
from sklearn import metrics
u_value= list(y_test.unique())
all_auc = []
for i in u_value:
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_our_rf_model, pos_label=i)
    all_auc.append(metrics.auc(fpr, tpr))
print(all_auc)
print("AUC for RF:",sum(all_auc)/len(all_auc))

[0.16666666666666669, 0.8121119733924611, 0.9940405244338498, 0.8377403846153847, 0.48573127229488705]
AUC for RF: 0.6592581642806499


In [610]:
list(y_test.unique())

[0, 2, 3, 1, 4]

## Precision, Recall, and f1 Score, and AUC for DT

In [611]:
print("Precision:",metrics.precision_score(y_test, y_pred_our_dt_model, average = "macro"))
print("Recall:",metrics.recall_score(y_test, y_pred_our_dt_model, average = "macro"))
print("f1 score:",metrics.f1_score(y_test,y_pred_our_dt_model, average = "macro"))

Precision: 0.3095238095238095
Recall: 0.29350649350649355
f1 score: 0.297999297999298


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


In [612]:
from sklearn import metrics
u_value= list(y_test.unique())
all_auc = []
for i in u_value:
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_our_dt_model, pos_label=i)
    all_auc.append(metrics.auc(fpr, tpr))
print(all_auc)
print("AUC for DT:",sum(all_auc)/len(all_auc))

[0.0, 0.9927937915742794, 0.9884783472387764, 0.9788461538461538, 0.9809750297265161]
AUC for DT: 0.7882186644771452


## Precision, Recall, and f1 Score, and AUC for LR

In [613]:
print("Precision:",metrics.precision_score(y_test, y_pred_our_lr_model, average = "micro"))
print("Recall:",metrics.recall_score(y_test, y_pred_our_lr_model, average = "macro"))
print("f1 score:",metrics.f1_score(y_test,y_pred_our_lr_model, average = "macro"))

Precision: 0.029691211401425176
Recall: 0.06570526247945603
f1 score: 0.01753361167273814


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


In [614]:
from sklearn import metrics
u_value= list(y_test.unique())
all_auc = []
for i in u_value:
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_our_lr_model, pos_label=i)
    all_auc.append(metrics.auc(fpr, tpr))
print(all_auc)
print("AUC for LR:",sum(all_auc)/len(all_auc))

[0.3850461814171492, 0.6560421286031042, 0.5154946364719905, 0.6072115384615384, 0.011890606420927485]
AUC for LR: 0.43513701827494194


## Precision, Recall, and f1 Score, and AUC for MLP

In [615]:
print("Precision:",metrics.precision_score(y_test, y_pred_our_mlp_model, average = "macro"))
print("Recall:",metrics.recall_score(y_test, y_pred_our_mlp_model, average = "macro"))
print("f1 score:",metrics.f1_score(y_test,y_pred_our_mlp_model, average = "macro"))

Precision: 0.24617546874057997
Recall: 0.30667343409278897
f1 score: 0.2667093791397808


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


In [616]:
from sklearn import metrics
u_value= list(y_test.unique())
all_auc = []
for i in u_value:
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_our_mlp_model, pos_label=i)
    all_auc.append(metrics.auc(fpr, tpr))
print(all_auc)
print("AUC for MLP:",sum(all_auc)/len(all_auc))

[0.1770402536531569, 0.858259423503326, 0.6301152165276122, 0.8141225961538461, 0.46076099881093935]
AUC for MLP: 0.5880596977297761


## Precision, Recall, and f1 Score, and AUC for XGB

In [617]:
print("Precision:",metrics.precision_score(y_test, y_pred_our_xgb_model, average = "macro"))
print("Recall:",metrics.recall_score(y_test, y_pred_our_xgb_model, average = "macro"))
print("f1 score:",metrics.f1_score(y_test,y_pred_our_xgb_model, average = "macro"))

Precision: 0.3136363636363636
Recall: 0.312987012987013
f1 score: 0.31292517006802717


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


In [618]:
from sklearn import metrics
u_value= list(y_test.unique())
all_auc = []
for i in u_value:
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_our_xgb_model, pos_label=i)
    all_auc.append(metrics.auc(fpr, tpr))
print(all_auc)
print("AUC for XGB:",sum(all_auc)/len(all_auc))

[0.0, 0.9934866962305987, 0.9896702423520064, 0.9768028846153847, 0.9827586206896552]
AUC for XGB: 0.788543688777529


## Precision, Recall, and f1 Score, and AUC for SVM

In [619]:
print("Precision:",metrics.precision_score(y_test, y_pred_our_svm_model, average = "micro"))
print("Recall:",metrics.recall_score(y_test, y_pred_our_svm_model, average = "micro"))
print("f1 score:",metrics.f1_score(y_test,y_pred_our_svm_model, average = "micro"))

Precision: 0.9572446555819477
Recall: 0.9572446555819477
f1 score: 0.9572446555819477


In [620]:
from sklearn import metrics
u_value= list(y_test.unique())
all_auc = []
for i in u_value:
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_our_svm_model, pos_label=i)
    all_auc.append(metrics.auc(fpr, tpr))
print(all_auc)
print("AUC for SVM:",sum(all_auc)/len(all_auc))

[0.5, 0.5, 0.5, 0.5, 0.5]
AUC for SVM: 0.5


## CV Score for RF

In [621]:
scores = cross_val_score(our_rf_model, X_train, y_train, cv=10)
print('Cross-Validation Accuracy Scores for RF', scores)

Cross-Validation Accuracy Scores for RF [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


## After CV: Minimum, Mean, and Maximum Value for RF

In [622]:
scores = pd.Series(scores)
scores.min(), scores.mean(), scores.max()

(1.0, 1.0, 1.0)

In [623]:
print('Mean Accuracy Scores for RF', scores.mean())

Mean Accuracy Scores for RF 1.0


## CV Score for DT

In [452]:
scores = cross_val_score(our_dt_model, X_train, y_train, cv=10)
print('Cross-Validation Accuracy Scores for DT', scores)

Cross-Validation Accuracy Scores for DT [1.         1.         1.         0.99886364 1.         1.
 1.         1.         1.         1.        ]


## After CV: Minimum, Mean, and Maximum Value for DT

In [453]:
scores = pd.Series(scores)
scores.min(), scores.mean(), scores.max()

(0.9988636363636364, 0.9998863636363637, 1.0)

In [454]:
print('Mean Accuracy Scores for DT', scores.mean())

Mean Accuracy Scores for DT 0.9998863636363637


## CV Score for LR

In [455]:
scores = cross_val_score(our_lr_model, X_train, y_train, cv=10)
print('Cross-Validation Accuracy Scores for LR', scores)

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the doc

Cross-Validation Accuracy Scores for LR [0.72045455 0.70681818 0.74545455 0.65681818 0.72272727 0.74772727
 0.75227273 0.73863636 0.68295455 0.72840909]


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## After CV: Minimum, Mean, and Maximum Value for LR

In [456]:
scores = pd.Series(scores)
scores.min(), scores.mean(), scores.max()

(0.6568181818181819, 0.7202272727272726, 0.7522727272727273)

In [457]:
print('Mean Accuracy Scores for LR', scores.mean())

Mean Accuracy Scores for LR 0.7202272727272726


## CV Score for MLP

In [458]:
scores = cross_val_score(our_mlp_model, X_train, y_train, cv=10)
print('Cross-Validation Accuracy Scores for MLP', scores)

Cross-Validation Accuracy Scores for MLP [0.99659091 0.99090909 0.99090909 0.9875     0.99545455 0.9875
 0.99431818 0.99772727 0.98522727 0.99318182]


## After CV: Minimum, Mean, and Maximum Value for MLP 

In [459]:
scores = pd.Series(scores)
scores.min(), scores.mean(), scores.max()

(0.9852272727272727, 0.9919318181818181, 0.9977272727272727)

In [460]:
print('Mean Accuracy Scores for MLP', scores.mean())

Mean Accuracy Scores for MLP 0.9919318181818181


## CV Score for XGB

In [461]:
scores = cross_val_score(our_xgb_model, X_train, y_train, cv=10)
print('Cross-Validation Accuracy Scores for XGB', scores)

Cross-Validation Accuracy Scores for XGB [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


## After CV: Minimum, Mean, and Maximum Value for XGB

In [462]:
scores = pd.Series(scores)
scores.min(), scores.mean(), scores.max()

(1.0, 1.0, 1.0)

In [463]:
print('Mean Accuracy Scores for XGB', scores.mean())

Mean Accuracy Scores for XGB 1.0


## CV Score for SVM

In [140]:
scores = cross_val_score(our_svm_model, X_train, y_train, cv=10)
print('Cross-Validation Accuracy Scores for SVM', scores)

Cross-Validation Accuracy Scores for SVM [0.2289282  0.22476587 0.22060354 0.22083333 0.25       0.33229167
 0.32604167 0.253125   0.25625    0.228125  ]


## After CV: Minimum, Mean, and Maximum Value for SVM

In [94]:
scores = pd.Series(scores)
scores.min(), scores.mean(), scores.max()

(0.12521150592216582, 0.2216816435725065, 0.43243243243243246)

In [95]:
print('Mean Accuracy Scores for XGB', scores.mean())

Mean Accuracy Scores for XGB 0.2216816435725065
