# Comparative analysis for defect detection in software applications

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive

# Mount Google Drive so files under /content/drive become readable.
# mount() is called for its side effect only; its result is deliberately not
# bound to `data`, which is the name used for the dataset DataFrame.
drive.mount('/content/drive')

In [None]:
# Load the csv_result-jm1 export from the mounted Google Drive.
csv_path = '/content/drive/MyDrive/Colab Notebooks/csv_result-jm1.csv'
data = pd.read_csv(csv_path)

In [None]:
# Drop the 'id' column. errors='ignore' keeps the cell re-runnable: without it
# a second execution raises KeyError because 'id' is already gone.
data = data.drop(columns='id', errors='ignore')

# Preview last: a notebook only renders the value of a cell's final expression.
data.head()

In [None]:
# Summary statistics; describe() covers only the numeric columns by default.
data.describe()

# Exploratory Data Analysis

In [None]:
# Inspect the column dtypes to spot columns that were not parsed as numbers.
data.dtypes

### 1. Replacing ? with Not a Number in the Miscellaneous Attributes

In [None]:
# Turn the '?' placeholder into a real missing value in each affected column.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for column in ('uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'):
    data[column] = data[column].replace('?', np.nan)

### 2. Converting Object Data type to numeric

In [None]:
# Re-parse the five cleaned columns as numbers, one column at a time.
for column in ('uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount'):
    data[column] = pd.to_numeric(data[column])

In [None]:
# Re-check the dtypes after the pd.to_numeric conversion of the five columns.
data.dtypes

### 3. Removing rows with Not a Number values

In [None]:
# Drop every row that is missing any of the five metrics whose '?' markers
# were replaced by NaN. Filtering on uniq_Op alone would keep a row in which
# only one of the other four columns is missing.
converted_cols = ['uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount']
data = data.dropna(subset=converted_cols)

In [None]:
# Summary statistics again, now that the rows with missing values are removed.
data.describe()

# 2. Feature Label Split

In [None]:
# Separate the target column from the predictors.
# NOTE: despite its name, `features` lists the label column that is removed from X.
features = ['defects']
y = data['defects']
X = data.drop(columns=features)

In [None]:
# Class balance of the full dataset.
# The series is passed by keyword: recent seaborn releases no longer accept the
# data vector as the first positional argument.
ax = sns.countplot(x=y, label="Count")

# Count each class explicitly rather than unpacking value_counts(), whose order
# depends on which class happens to be more frequent.
# Assumes `defects` is boolean or 0/1 - TODO confirm against the CSV.
T = int(y.astype(bool).sum())
F = len(y) - T
print('Number of True: ',T)
print('Number of False : ',F)

# 3. Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# random_state makes the split, and every score derived from it, reproducible.
# stratify=y keeps the defect ratio identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Class balance of the TRAINING split (the original cell plotted the full `y`).
ax = sns.countplot(x=y_train, label="Count")

# Count each class explicitly. The previous `T, F = y_train.value_counts()`
# unpacked the majority class into T, so the two printed labels were swapped.
# Assumes `defects` is boolean or 0/1 - TODO confirm against the CSV.
T = int(y_train.astype(bool).sum())
F = len(y_train) - T
print('Number of True: ',T)
print('Number of False : ',F)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the held-out test features.
X_test.shape

# 4. Normalized Data

## 4.1. Standard Scaler Normalization

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply them
# to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_norm = pd.DataFrame(sc.transform(X_train))
X_test_norm = sc.transform(X_test)

# Restore the feature names on the scaled training frame.
X_train_norm.columns = X_train.columns
X_train_norm.head()

## 4.2 Min Max Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature using the range observed in the training split.
# NOTE: this rebinds X_train_norm / X_test_norm, replacing whatever an earlier
# cell stored under those names.
mms = MinMaxScaler()
mms.fit(X_train)
X_train_norm = pd.DataFrame(mms.transform(X_train))
X_test_norm = mms.transform(X_test)

# Restore the feature names on the scaled training frame.
X_train_norm.columns = X_train.columns
X_train_norm.head()

# Convert Normalized Data into Data Frame

In [None]:
# Wrap the scaled training data in a DataFrame and label it with the original
# feature names. NOTE(review): the Min-Max cell already ends with these same
# three statements, so this cell looks redundant when run in order.
X_train_norm = pd.DataFrame(X_train_norm)
X_train_norm.columns = X_train.columns
X_train_norm.head()

# 5. Correlation Coefficient

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    A column is reported when the absolute value of its ``dataset.corr()``
    coefficient with at least one column positioned before it is strictly
    greater than ``threshold``. The first column of any correlated group is
    therefore never reported.

    Args:
        dataset: DataFrame whose columns are compared pairwise.
        threshold: Absolute-correlation cut-off (exclusive).

    Returns:
        A set of column names.
    """
    strength = dataset.corr().abs()
    flagged = set()
    for position, name in enumerate(strength.columns):
        # Only look left of the diagonal so each pair is inspected once.
        if (strength.iloc[position, :position] > threshold).any():
            flagged.add(name)
    return flagged

In [None]:
# Pearson correlation heat map across all scaled training features.
_, heat_ax = plt.subplots(figsize=(25, 20))
cor = X_train_norm.corr()
sns.heatmap(cor, annot=True, ax=heat_ax)
plt.show()

In [None]:
# Columns whose absolute correlation with an earlier column exceeds 0.9.
corr_features = correlation(dataset=X_train, threshold=0.9)
flagged = set(corr_features)
print(flagged)
len(flagged)

In [None]:
# Remove the highly correlated columns from the scaled training frame.
X_drop = X_train_norm.drop(columns=list(corr_features))

In [None]:
# Pearson correlation heat map of the features that survived the correlation filter.
_, heat_ax = plt.subplots(figsize=(15, 10))
cor = X_drop.corr()
sns.heatmap(cor, annot=True, ax=heat_ax)
plt.show()

In [None]:
# Preview the scaled training features left after dropping correlated columns.
X_drop.head()

# 5. Applying chi square method for Feature Selection

In [None]:
# Load libraries
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

# Keep the five features with the highest chi-squared statistics.
# The unscaled training values of the surviving columns are scored.
chi2_features = SelectKBest(chi2, k = 5)
candidates = X_train[X_drop.columns]
X_kbest_features = chi2_features.fit_transform(candidates, y_train)

# Reduced features
print('Original feature number:', X_drop.shape[1])
print('Reduced feature number:', X_kbest_features.shape[1])

# get_support() indices are positions within `candidates`, i.e. within the
# columns that survived the correlation filter. Looking them up in the full
# X_train, as the original cell did, returns different columns as soon as any
# earlier column has been dropped.
cols = chi2_features.get_support(indices=True)
features_df_new = candidates.columns[cols]
print(features_df_new)

# 6. Balancing using Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Balance the classes by resampling the training split only.
ros = RandomOverSampler(random_state=2)

# Series.ravel() is deprecated in recent pandas releases; to_numpy() yields the
# same 1-D label array.
X_train_ros, y_train_ros = ros.fit_resample(X_train[features_df_new], y_train.to_numpy())

print('Random over-sampling:')
y_train_ros = pd.DataFrame(y_train_ros, columns=['defects'])

print(y_train_ros.defects.value_counts())
y_train_ros.defects.value_counts().plot(kind='bar', title='Count (target)');

In [None]:
# (rows, columns) of the oversampled training features.
X_train_ros.shape

In [None]:
# Shape of the oversampled training labels; the row count should match X_train_ros.
y_train_ros.shape

In [None]:
# Shape of the test features restricted to the selected columns.
X_test[features_df_new].shape

In [None]:
# Number of test labels; the test split is not oversampled.
y_test.shape

In [None]:
from sklearn import metrics
from sklearn.metrics import roc_curve,accuracy_score,classification_report,f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

from sklearn import tree
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# 7. Hyper Parameter Tuning & Classification

In [None]:
# Training matrices used by every model below.
X = X_train_ros[features_df_new]
# Use the 1-D label column rather than the whole (n, 1) DataFrame: scikit-learn
# estimators expect a 1-D target and otherwise emit a DataConversionWarning.
Y = y_train_ros['defects']

# Restrict the test features to the same selected columns.
X_test = X_test[features_df_new]

In [None]:
# Preview the final training feature matrix.
X.head()

In [None]:
# Shared 10-fold stratified, shuffled splitter used by all cross-validation below.
# NOTE(review): X/Y were oversampled before cross-validation, so copies of the
# same row can fall into both the training and validation folds, which may
# inflate the CV scores - confirm this is acceptable.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

### 1.1 Parameter Tuning for KNN

In [None]:
# Baseline: cross-validated accuracy of KNN with default hyper-parameters.
score_knn = cross_val_score(KNeighborsClassifier(), X, Y, cv= kf, scoring="accuracy")
print(f'Scores for each fold are: {score_knn}')
print(f'Average score: {score_knn.mean():.2f}')

# Step 1: pick n_neighbors (3..24) with the best mean CV accuracy.
acc_max = 0
k_max = 2
for val in range(3,25):
    score_knn = cross_val_score(KNeighborsClassifier(n_neighbors = val), X, Y, cv= kf, scoring="accuracy")
    curr_score = score_knn.mean()
    if curr_score > acc_max:
        acc_max = curr_score
        k_max = val
print(f'Average Max score at ({k_max}): {acc_max:.3f}')

# Step 2: pick the weighting scheme, holding n_neighbors at the tuned k_max
# (the original hard-coded n_neighbors=3 and ignored step 1).
weights = ['uniform','distance']
acc_max = 0
wei = 'na'
for val in weights:
    score_knn = cross_val_score(KNeighborsClassifier(n_neighbors = k_max, weights = val), X, Y, cv= kf, scoring="accuracy")
    curr_score = score_knn.mean()
    if curr_score > acc_max:
        acc_max = curr_score
        wei = val
# Report the best score; the original printed the score of the last option tried.
print(f'Average score({wei}): {acc_max:.3f}')

# Step 3: pick the neighbour-search algorithm, holding k_max and wei fixed
# (the original hard-coded n_neighbors=3 and weights='distance').
algo = ['ball_tree','kd_tree','brute','auto']
acc_max = 0
algo_max = 'na'
for val in algo:
    score_knn = cross_val_score(KNeighborsClassifier(n_neighbors = k_max, weights = wei, algorithm = val), X, Y, cv= kf, scoring="accuracy")
    curr_score = score_knn.mean()
    if curr_score > acc_max:
        acc_max = curr_score
        algo_max = val
print(f'Average score({algo_max}): {acc_max:.3f}')

### 1.2 Classification Report for k Nearest Neighbor

In [None]:
# Train KNN with the tuned hyper-parameters and evaluate it on the test split.
knn = KNeighborsClassifier(n_neighbors = k_max, weights = wei, algorithm = algo_max)
knn.fit(X, Y)
y_pred = knn.predict(X_test)

print("k Nearest Neighbor Algorithm")
cls_rep = classification_report(y_test, y_pred)
print(cls_rep)

# Scores as percentages (F1, precision and recall are macro-averaged).
# Scale before rounding: round(score, 2) * 100 prints float artefacts such as
# 56.99999999999999.
acc_knn = round(100 * accuracy_score(y_test, y_pred), 2)
f1_knn = round(100 * f1_score(y_test, y_pred, average="macro"), 2)
pr_knn = round(100 * precision_score(y_test, y_pred, average="macro"), 2)
rc_knn = round(100 * recall_score(y_test, y_pred, average="macro"), 2)

print("Accuracy: ",acc_knn,"%")
print("F1 Score: ",f1_knn,"%")
print("Precision: ",pr_knn,"%")
print("Recall: ",rc_knn,"%")

# Predicted probability of the second class, used as the ROC score.
y_pred_prob = knn.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for KNN')
plt.show()

knn_auc = metrics.auc(fpr, tpr)
print("Area Under the curve for ROC =",knn_auc)

### 2.1 Parameter Tuning for Decision Tree

In [None]:
from sklearn import tree

# Baseline: cross-validated accuracy of a tree capped at depth 33.
baseline_tree = tree.DecisionTreeClassifier(random_state=42, max_depth=33)
score_dt = cross_val_score(baseline_tree, X, Y, cv=kf, scoring="accuracy")
print(f'Scores for each fold are: {score_dt}')
print(f'Average score: {score_dt.mean():.2f}')

# Sweep max_depth over 1..49 and keep the first depth with the best mean accuracy.
acc_max, max_depth = 0, 1
for val in range(1, 50):
    candidate = tree.DecisionTreeClassifier(max_depth=val, random_state=42)
    score_dt = cross_val_score(candidate, X, Y, cv=kf, scoring="accuracy")
    curr_max_depth, curr_score = val, score_dt.mean()
    if curr_score > acc_max:
        acc_max, max_depth = curr_score, curr_max_depth
print(f'Average Max score at ({max_depth}): {acc_max:.3f}')

### 2.2 Classification Report for Decision Tree

In [None]:
# Train the decision tree at the tuned depth and evaluate it on the test split.
tr = tree.DecisionTreeClassifier(max_depth = max_depth, random_state = 42)
tr.fit(X, Y)
y_pred = tr.predict(X_test)

print("Decision Tree Algorithm")
tr_rep = classification_report(y_test, y_pred)
print(tr_rep)

# Scores as percentages (F1, precision and recall are macro-averaged).
# Scale before rounding: round(score, 2) * 100 prints float artefacts such as
# 56.99999999999999.
acc_tr = round(100 * accuracy_score(y_test, y_pred), 2)
f1_tr = round(100 * f1_score(y_test, y_pred, average="macro"), 2)
pr_tr = round(100 * precision_score(y_test, y_pred, average="macro"), 2)
rc_tr = round(100 * recall_score(y_test, y_pred, average="macro"), 2)

print("Accuracy: ",acc_tr,"%")
print("F1 Score: ",f1_tr,"%")
print("Precision: ",pr_tr,"%")
print("Recall: ",rc_tr,"%")

# Predicted probability of the second class, used as the ROC score.
y_pred_prob = tr.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Decision Tree')
plt.show()

tr_auc = metrics.auc(fpr, tpr)
print("Area Under the curve for ROC =",tr_auc)

### 3.1 Parameter Tuning for Logistic Regression

In [None]:
from sklearn import linear_model

algorithms = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

# Baseline: cross-validated accuracy with the default solver.
baseline_lr = linear_model.LogisticRegression(random_state=42)
score_lr = cross_val_score(baseline_lr, X, Y, cv=kf, scoring="accuracy")
print(f'Scores for each fold are:\n {score_lr}')
print(f'Average score: {score_lr.mean():.2f}')

# Try each solver and keep the first one that reaches the best mean accuracy.
acc_max, max_solver = 0, 'na'
for val in algorithms:
    candidate = linear_model.LogisticRegression(solver=val, random_state=42)
    score_lr = cross_val_score(candidate, X, Y, cv=kf, scoring="accuracy")
    curr_algo, curr_score = val, score_lr.mean()
    if curr_score > acc_max:
        acc_max, max_solver = curr_score, curr_algo
print(f'Average Max score at ({max_solver}): {acc_max:.3f}')

### 3.2 Classification Report for Logistic Regression

In [None]:
# Train logistic regression with the tuned solver and evaluate it on the test split.
lr = LogisticRegression(solver = max_solver, random_state= 42)
lr.fit(X, Y)
y_pred = lr.predict(X_test)

print("Logistic Regression Algorithm")
cls_rep = classification_report(y_test, y_pred)
print(cls_rep)

# Scores as percentages (F1, precision and recall are macro-averaged).
# Scale before rounding: round(score, 2) * 100 prints float artefacts such as
# 56.99999999999999.
acc_lr = round(100 * accuracy_score(y_test, y_pred), 2)
f1_lr = round(100 * f1_score(y_test, y_pred, average="macro"), 2)
pr_lr = round(100 * precision_score(y_test, y_pred, average="macro"), 2)
rc_lr = round(100 * recall_score(y_test, y_pred, average="macro"), 2)

print("Accuracy: ",acc_lr,"%")
print("F1 Score: ",f1_lr,"%")
print("Precision: ",pr_lr,"%")
print("Recall: ",rc_lr,"%")

# Predicted probability of the second class, used as the ROC score.
# Computed once; the original cell repeated this statement.
y_pred_prob = lr.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Logistic Regression')
plt.show()

lr_auc = metrics.auc(fpr, tpr)
print("Area Under the curve for ROC =",lr_auc)

### 4. Classification Report for Random Forest

In [None]:
# Train a 100-tree random forest and evaluate it on the test split.
# random_state pins the bootstrap/feature sampling so reruns give the same
# numbers, matching the seeded decision tree and logistic regression.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, Y)
y_pred = rf.predict(X_test)

print("Random Forest Algorithm")
rf_rep = classification_report(y_test, y_pred)
print(rf_rep)

# Scores as percentages (F1, precision and recall are macro-averaged).
# Scale before rounding: round(score, 2) * 100 prints float artefacts such as
# 56.99999999999999.
acc_rf = round(100 * accuracy_score(y_test, y_pred), 2)
f1_rf = round(100 * f1_score(y_test, y_pred, average="macro"), 2)
pr_rf = round(100 * precision_score(y_test, y_pred, average="macro"), 2)
rc_rf = round(100 * recall_score(y_test, y_pred, average="macro"), 2)

print("Accuracy: ",acc_rf,"%")
print("F1 Score: ",f1_rf,"%")
print("Precision: ",pr_rf,"%")
print("Recall: ",rc_rf,"%")

# Predicted probability of the second class, used as the ROC score.
y_pred_prob = rf.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Random Forest')
plt.show()

rf_auc = metrics.auc(fpr, tpr)
print("Area Under the curve for ROC =",rf_auc)

### 5. Classification Report for Naive Bayes

In [None]:
# Train Gaussian Naive Bayes and evaluate it on the test split.
gnb = GaussianNB()
gnb.fit(X, Y)
y_pred = gnb.predict(X_test)

print("Naive Bayes Algorithm")
gnb_rep = classification_report(y_test, y_pred)
print(gnb_rep)

# Scores as percentages (F1, precision and recall are macro-averaged).
# Scale before rounding: round(score, 2) * 100 prints float artefacts such as
# 56.99999999999999.
acc_gnb = round(100 * accuracy_score(y_test, y_pred), 2)
f1_gnb = round(100 * f1_score(y_test, y_pred, average="macro"), 2)
pr_gnb = round(100 * precision_score(y_test, y_pred, average="macro"), 2)
rc_gnb = round(100 * recall_score(y_test, y_pred, average="macro"), 2)

print("Accuracy: ",acc_gnb,"%")
print("F1 Score: ",f1_gnb,"%")
print("Precision: ",pr_gnb,"%")
print("Recall: ",rc_gnb,"%")

# Predicted probability of the second class, used as the ROC score.
y_pred_prob = gnb.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Naive Bayes')
plt.show()

gnb_auc = metrics.auc(fpr, tpr)
print("Area Under the curve for ROC =",gnb_auc)

### 6. Classification Report for Multi Layer Perceptron

In [None]:
# Train a multi-layer perceptron and evaluate it on the test split.
# random_state pins the weight initialisation so reruns give the same numbers.
mlp = MLPClassifier(random_state=42)
mlp.fit(X, Y)
y_pred = mlp.predict(X_test)

#Summary of the predictions made by the classifier
print("Multi Layer Perceptron Algorithm")
mlp_rep = classification_report(y_test, y_pred)
print(mlp_rep)

# Scores as percentages (F1, precision and recall are macro-averaged).
# Scale before rounding: round(score, 2) * 100 prints float artefacts such as
# 56.99999999999999.
acc_mlp = round(100 * accuracy_score(y_test, y_pred), 2)
f1_mlp = round(100 * f1_score(y_test, y_pred, average="macro"), 2)
pr_mlp = round(100 * precision_score(y_test, y_pred, average="macro"), 2)
rc_mlp = round(100 * recall_score(y_test, y_pred, average="macro"), 2)

print("Accuracy: ",acc_mlp,"%")
print("F1 Score: ",f1_mlp,"%")
print("Precision: ",pr_mlp,"%")
print("Recall: ",rc_mlp,"%")

# Predicted probability of the second class, used as the ROC score.
y_pred_prob = mlp.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
plt.plot([0, 1],[0, 1],'k--')  # chance diagonal
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Multi Layer Perceptron')
plt.show()

mlp_auc = metrics.auc(fpr, tpr)
print(mlp_auc)

### 7. Classification report for SVM

In [None]:
# Train an SVM and evaluate it on the test split.
# probability=True is required for the predict_proba call below.
svm = SVC(probability=True)
svm.fit(X, Y)
y_pred = svm.predict(X_test)

#Summary of the predictions made by the classifier
print("Support Vector Machine Algorithm")
svm_rep = classification_report(y_test, y_pred)
print(svm_rep)

# Scores as percentages (F1, precision and recall are macro-averaged).
# Scale before rounding: round(score, 2) * 100 prints float artefacts such as
# 56.99999999999999.
acc_svm = round(100 * accuracy_score(y_test, y_pred), 2)
f1_svm = round(100 * f1_score(y_test, y_pred, average="macro"), 2)
pr_svm = round(100 * precision_score(y_test, y_pred, average="macro"), 2)
rc_svm = round(100 * recall_score(y_test, y_pred, average="macro"), 2)

print("Accuracy: ",acc_svm,"%")
print("F1 Score: ",f1_svm,"%")
print("Precision: ",pr_svm,"%")
print("Recall: ",rc_svm,"%")

# Predicted probability of the second class, used as the ROC score.
# The curve is computed once and reused for both the plot and the AUC.
y_pred_prob = svm.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
plt.plot([0, 1],[0, 1],'k--')  # chance diagonal
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Support Vector Machine')
plt.show()

svm_auc = metrics.auc(fpr, tpr)
print(svm_auc)

### 8. Classification Report for AdaBoost

In [None]:
# Boost the tuned decision tree `tr` with AdaBoost and evaluate on the test split.
# scikit-learn renamed `base_estimator` to `estimator` in 1.2 and removed the
# old keyword in 1.4; try the new name first and fall back for older releases.
try:
    adb = AdaBoostClassifier(estimator=tr, n_estimators=50, learning_rate=1)
except TypeError:
    adb = AdaBoostClassifier(base_estimator=tr, n_estimators=50, learning_rate=1)
adb.fit(X, Y)
y_pred = adb.predict(X_test)

#Summary of the predictions made by the classifier
print("Ada Boost Algorithm")
adb_rep = classification_report(y_test, y_pred)
print(adb_rep)

# Scores as percentages (F1, precision and recall are macro-averaged).
# Scale before rounding: round(score, 2) * 100 prints float artefacts such as
# 56.99999999999999.
acc_adb = round(100 * accuracy_score(y_test, y_pred), 2)
f1_adb = round(100 * f1_score(y_test, y_pred, average="macro"), 2)
pr_adb = round(100 * precision_score(y_test, y_pred, average="macro"), 2)
rc_adb = round(100 * recall_score(y_test, y_pred, average="macro"), 2)

print("Accuracy: ",acc_adb,"%")
print("F1 Score: ",f1_adb,"%")
print("Precision: ",pr_adb,"%")
print("Recall: ",rc_adb,"%")

# Predicted probability of the second class, used as the ROC score.
# The curve is computed once and reused for both the plot and the AUC.
y_pred_prob = adb.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
plt.plot([0, 1],[0, 1],'k--')  # chance diagonal
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Ada Boost Algorithm')
plt.show()

adb_auc = metrics.auc(fpr, tpr)
print(adb_auc)

### 9. Classification Report for GradientBoosting

In [None]:
# Train gradient boosting with default settings and evaluate on the test split.
gbc = GradientBoostingClassifier()
gbc.fit(X, Y)
y_pred = gbc.predict(X_test)

#Summary of the predictions made by the classifier
print("Gradient Boosting Algorithm")
gbc_rep = classification_report(y_test, y_pred)
print(gbc_rep)

# Scores as percentages (F1, precision and recall are macro-averaged).
# Scale before rounding: round(score, 2) * 100 prints float artefacts such as
# 56.99999999999999.
acc_gbc = round(100 * accuracy_score(y_test, y_pred), 2)
f1_gbc = round(100 * f1_score(y_test, y_pred, average="macro"), 2)
pr_gbc = round(100 * precision_score(y_test, y_pred, average="macro"), 2)
rc_gbc = round(100 * recall_score(y_test, y_pred, average="macro"), 2)

print("Accuracy: ",acc_gbc,"%")
print("F1 Score: ",f1_gbc,"%")
print("Precision: ",pr_gbc,"%")
print("Recall: ",rc_gbc,"%")

# Predicted probability of the second class, used as the ROC score.
# The curve is computed once and reused for both the plot and the AUC.
y_pred_prob = gbc.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
plt.plot([0, 1],[0, 1],'k--')  # chance diagonal
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Gradient Boosting Algorithm')
plt.show()

gbc_auc = metrics.auc(fpr, tpr)
print(gbc_auc)

### 10. Classification Report for XGBoost

In [None]:
# Train XGBoost with default settings and evaluate on the test split.
xgb = XGBClassifier()
xgb.fit(X, Y)
y_pred = xgb.predict(X_test)

#Summary of the predictions made by the classifier
print("XG Boost Algorithm")
xgb_rep = classification_report(y_test, y_pred)
print(xgb_rep)

# Scores as percentages (F1, precision and recall are macro-averaged).
# Scale before rounding: round(score, 2) * 100 prints float artefacts such as
# 56.99999999999999.
acc_xgb = round(100 * accuracy_score(y_test, y_pred), 2)
f1_xgb = round(100 * f1_score(y_test, y_pred, average="macro"), 2)
pr_xgb = round(100 * precision_score(y_test, y_pred, average="macro"), 2)
rc_xgb = round(100 * recall_score(y_test, y_pred, average="macro"), 2)

print("Accuracy: ",acc_xgb,"%")
print("F1 Score: ",f1_xgb,"%")
print("Precision: ",pr_xgb,"%")
print("Recall: ",rc_xgb,"%")

# Predicted probability of the second class, used as the ROC score.
# The curve is computed once and reused for both the plot and the AUC.
y_pred_prob = xgb.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
plt.plot([0, 1],[0, 1],'k--')  # chance diagonal
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for XGBoost Algorithm')
plt.show()

xgb_auc = metrics.auc(fpr, tpr)
print(xgb_auc)

### 11. Voting Classifier

In [None]:
# Soft-voting ensemble over five of the base models; it averages their
# predicted class probabilities.
clf = VotingClassifier(estimators = [('gnb',gnb), ('lr',lr), ('tr',tr), ('svm', svm), ('mlp',mlp)], voting='soft')
clf.fit(X, Y)
y_pred = clf.predict(X_test)

print("Voting Algorithm")
print(classification_report(y_test, y_pred))

# Scores as percentages (F1, precision and recall are macro-averaged).
# Scale before rounding: round(score, 2) * 100 prints float artefacts such as
# 56.99999999999999.
acc_clf = round(100 * accuracy_score(y_test, y_pred), 2)
f1_clf = round(100 * f1_score(y_test, y_pred, average="macro"), 2)
pr_clf = round(100 * precision_score(y_test, y_pred, average="macro"), 2)
rc_clf = round(100 * recall_score(y_test, y_pred, average="macro"), 2)

print("Accuracy: ",acc_clf,"%")
print("F1 Score: ",f1_clf,"%")
print("Precision: ",pr_clf,"%")
print("Recall: ",rc_clf,"%")

# Predicted probability of the second class, used as the ROC score.
# The curve is computed once and reused for both the plot and the AUC.
y_pred_prob = clf.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
plt.plot([0, 1],[0, 1],'k--')  # chance diagonal
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Voting Algorithm')
plt.show()

ea_auc = metrics.auc(fpr, tpr)
print(ea_auc)

### 12. Heterogeneous Adaboost Classifier

In [None]:
# Heterogeneous AdaBoost: boost a soft-voting ensemble of four base models.
clf1 =  VotingClassifier(estimators = [('gnb',gnb), ('lr',lr),
                                       ('tr',tr), ('svm', svm)], voting='soft')

# scikit-learn renamed `base_estimator` to `estimator` in 1.2 and removed the
# old keyword in 1.4; try the new name first and fall back for older releases.
try:
    adb1 = AdaBoostClassifier(estimator=clf1, n_estimators=5, learning_rate=1)
except TypeError:
    adb1 = AdaBoostClassifier(base_estimator=clf1, n_estimators=5, learning_rate=1)
adb1.fit(X, Y)
y_pred = adb1.predict(X_test)

#Summary of the predictions made by the classifier
print("Ada Boost Algorithm")
adb_rep1 = classification_report(y_test, y_pred)
print(adb_rep1)

# Scores as percentages (F1, precision and recall are macro-averaged).
# Scale before rounding: round(score, 2) * 100 prints float artefacts such as
# 56.99999999999999.
acc_adb1 = round(100 * accuracy_score(y_test, y_pred), 2)
f1_adb1 = round(100 * f1_score(y_test, y_pred, average="macro"), 2)
pr_adb1 = round(100 * precision_score(y_test, y_pred, average="macro"), 2)
rc_adb1 = round(100 * recall_score(y_test, y_pred, average="macro"), 2)

print("Accuracy: ",acc_adb1,"%")
print("F1 Score: ",f1_adb1,"%")
print("Precision: ",pr_adb1,"%")
print("Recall: ",rc_adb1,"%")

# Predicted probability of the second class, used as the ROC score.
# The curve is computed once and reused for both the plot and the AUC.
y_pred_prob = adb1.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve (the title previously said "Support Vector Machine").
plt.plot([0, 1],[0, 1],'k--')  # chance diagonal
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Heterogeneous Ada Boost Algorithm')
plt.show()

adb_auc1 = metrics.auc(fpr, tpr)
print(adb_auc1)

In [None]:
# Side-by-side test accuracy (in %) of every model evaluated above.
# XGBoost and the voting ensemble were computed earlier but missing from the list.
print("Accuracy")
print("1. KNN",acc_knn)
print("2. NB ",acc_gnb)
print("3. LR ",acc_lr)
print("4. DT ",acc_tr)
print("5. SVM",acc_svm)
print("6. MLP",acc_mlp)
print("7. RF ",acc_rf)
print("8. ADB Homogeneous",acc_adb)
print("9. ADB Heterogeneous",acc_adb1)
print("10. GBC",acc_gbc)
print("11. XGB",acc_xgb)
print("12. Voting",acc_clf)

In [None]:
# Side-by-side ROC AUC of every model evaluated above.
# XGBoost and the voting ensemble were computed earlier but missing from the
# list; the "Heterogenous" label typo is also fixed.
print("AUC")
print("1. KNN",knn_auc)
print("2. NB ",gnb_auc)
print("3. LR ",lr_auc)
print("4. DT ",tr_auc)
print("5. SVM",svm_auc)
print("6. MLP",mlp_auc)
print("7. RF ",rf_auc)
print("8. ADB Homogeneous",adb_auc)
print("9. ADB Heterogeneous",adb_auc1)
print("10. GBC",gbc_auc)
print("11. XGB",xgb_auc)
print("12. Voting",ea_auc)