In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline
sns.set_style("whitegrid")

# <p style="text-align:center;">What causes heart problems💔? </p>
#  <p style="text-align:center;"> Analysis and comprehensive classification </p>

<center><img src="https://img.webmd.com/dtmcms/live/webmd/consumer_assets/site_images/article_thumbnails/slideshows/did_you_know_this_could_lead_to_heart_disease_slideshow/650x350_did_you_know_this_could_lead_to_heart_disease_slideshow.jpg"></center>

One of the key applications of machine learning methods is certainly health care. The hardest part of applying black-box models is explaining what drove the result. Since human life is the most important thing in medicine, data scientists must face up to the challenge and be able to answer the fundamental question - **why**?

The presented data includes numerous qualitative and quantitative features that will allow us to build a machine learning model. However, as I mentioned, we work with human data, so the key element in building the model is to understand what really causes heart disease. Starting with an extensive exploratory data analysis, I will try to answer some questions about heart disease. Then the data will be cleaned and fed into machine learning models.

### This kernel is therefore focused on learning and exploration. Please feel free to give advice, recommendations, better approaches or anything else on the code below.

## If you find this kernel helpful, any **<font color='orange'>UPVOTES</font>** would be very much appreciated.

# Data overview

In [None]:
# Load the heart-disease CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('/kaggle/input/heart-disease-uci/heart.csv')

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Preview the first five rows.
data.head()

## Info

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Render floats with two decimals in every DataFrame displayed from here on.
# The full option name is spelled out: pandas resolves abbreviated names by
# pattern match, which can break if another matching option is ever added.
pd.set_option('display.float_format', '{:.2f}'.format)
data.describe()

Columns description:

* age - age in years 
* sex - (1 = male; 0 = female) 
* cp - chest pain type 
* trestbps - resting blood pressure (in mm Hg on admission to the hospital) 
* chol - serum cholesterol in mg/dl 
* fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
* restecg - resting electrocardiographic results 
* thalach - maximum heart rate achieved 
* exang - exercise induced angina (1 = yes; 0 = no) 
* oldpeak - ST depression induced by exercise relative to rest 
* slope - the slope of the peak exercise ST segment 
* ca - number of major vessels (0-3) colored by fluoroscopy 
* thal - 3 = normal; 6 = fixed defect; 7 = reversible defect 
* target - have disease or not (1=yes, 0=no)

## Target

* 1 - (YES) have heart disease
* 0 - (NO) does not have heart disease

In [None]:
# Number of rows in each target class.
data.target.value_counts()

In [None]:
# Class balance of the target column, drawn as an exploded pie chart.
disease = int((data['target'] == 1).sum())
no_disease = int((data['target'] == 0).sum())

labels = 'Have heart disease', 'Have not heart disease'
sizes = [disease, no_disease]
explode = (0.1, 0)  # pull the "disease" wedge out of the pie

fig, ax = plt.subplots(figsize=(12, 6))
ax.pie(
    sizes,
    labels=labels,
    explode=explode,
    colors=['orangered', 'skyblue'],
    autopct='%1.2f%%',
    startangle=90,
    shadow=True,
    textprops={'fontsize': 12},
)
ax.axis('equal')  # equal aspect ratio keeps the pie circular
ax.set_title('Percentage of target', size=16)
plt.show()

## Missing values

In [None]:
# Count of missing values per column.
data.isna().sum()

Great! Dataset is free from NaN values. 

# 1.Exploratory Data Analysis (EDA)

## Categorical values
Apart from the target, some of the features contained in the dataset are categorical. Let's discover them.

In [None]:
# Classify columns by cardinality: at most 8 distinct values counts as
# categorical (qualitative), anything above that as numeric (quantitative).
qualitative = [col for col in data.columns if len(data[col].unique()) <= 8]
quantitative = [col for col in data.columns if len(data[col].unique()) > 8]

In [None]:
# Columns classified as categorical (at most 8 distinct values).
qualitative

In [None]:
# Columns classified as numeric (more than 8 distinct values).
# The previous cell already displays `qualitative`; this one shows the other list.
quantitative

## Correlation Matrix

In [None]:
# Heatmap of pairwise correlations, with columns ordered by how strongly
# (most positive first) they correlate with the target.
top = 15
corr = data.corr()
top15 = corr['target'].nlargest(top).index
corr_top15 = data[top15].corr()

f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    corr_top15,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 12},
    cmap='coolwarm',
    square=True,
    ax=ax,
)
ax.set_title('Top correlated features of dataset', size=16)
plt.show()

## Age visualization

In [None]:
# Boxplot of age (left) next to a table of its summary statistics (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))

sns.boxplot(y=data['age'], color='orangered', ax=ax1)

describe = data['age'].describe().round(2).to_frame()

# The right-hand axes only hosts the table, so its frame and ticks are hidden.
ax2.axis('off')
font_size = 16
bbox = [0, 0, 1, 1]  # table fills the whole axes
table = ax2.table(
    cellText=describe.values,
    rowLabels=describe.index,
    colLabels=describe.columns,
    bbox=bbox,
)
table.set_fontsize(font_size)

fig.suptitle('Distribution of age', fontsize=16)
plt.show()

### Conclusion
* Based on the boxplot, most people in the data are between 47 and 61 years old, because these ages lie between the first and third quartiles (the interquartile range, IQR).

In [None]:
# Number of patients of each sex at every age, annotated with the overall sex split.
fig, ax = plt.subplots(figsize=(15, 8))

# x= is passed by keyword: seaborn >= 0.12 no longer accepts the data vector
# as a positional argument.
sns.countplot(x=data['age'], hue=data['sex'], palette=['skyblue', 'orangered'],
              saturation=0.8, ax=ax)
ax.set_xlabel('Age')
ax.set_ylabel('Count')
ax.set_title('Gender count', fontsize=16)
# Legend entries follow the hue order 0, 1 (0 = female, 1 = male).
ax.legend(loc='upper right', fontsize=16, labels=['Female', 'Male'])

# Share of each sex in percent, looked up by the sex code.
sex_share = data['sex'].value_counts(normalize=True) * 100
ax.text(30, 11, 'Total male: {:.2f}%'.format(sex_share.loc[1]), fontsize=16)
ax.text(30, 10.5, 'Total female: {:.2f}%'.format(sex_share.loc[0]), fontsize=16)
plt.show()

### Conclusion
* The data examined included mainly men, at the age mentioned. 

## Target gender count

In [None]:
# Disease / no-disease counts for each sex.
fig, ax = plt.subplots(figsize=(12, 4))
labels = ['female', 'male']

# x= is passed by keyword: seaborn >= 0.12 no longer accepts the data vector
# as a positional argument.
sns.countplot(x=data['sex'], hue=data['target'], palette=['skyblue', 'orangered'],
              saturation=0.8, ax=ax)
# Replace the raw 0/1 codes on the x axis with readable names (0 = female, 1 = male).
ax.set_xticks([0, 1])
ax.set_xticklabels(labels)
ax.set_xlabel('Sex')
ax.set_ylabel('Count')
ax.set_title('Target count in genders', fontsize=16)
# Legend entries follow the hue order 0, 1 (0 = no disease, 1 = disease).
ax.legend(loc='upper right', fontsize=16, labels=['No disease', 'Disease'])
plt.show()

### Conclusion
* In the data under study, men more often had heart problems.

## Sex, age vs target

In [None]:
# Letter-value (boxen) plot of age by sex, split by target class.
fig, ax = plt.subplots(figsize=(12, 6))
palette = ['skyblue', 'orangered']
s1 = sns.boxenplot(x=data.sex, y=data.age, hue=data.target,
                   palette=palette, linewidth=3, ax=ax)

# Keep seaborn's legend handles but swap the 0/1 codes for readable names.
handles, _ = s1.get_legend_handles_labels()
s1.legend(handles, ['No disease', 'Disease'])
s1.set_title("Sex, age, target boxenplot", fontsize=16)
plt.show()

#### Conclusion
* On average, men start having heart problems at an earlier age than women.

## Key categorical features vs heart disease
Based on the correlation matrix, I took the three qualitative features most correlated with the target.

In [None]:
# Disease / no-disease counts for the three categorical features most
# correlated with the target, one panel per feature.
# The three panels are created in a single plt.subplots call. Creating one
# full-figure Axes first and then calling plt.subplot(13x) leaves that first
# Axes visible behind the panels on matplotlib >= 3.8.
fig, axes = plt.subplots(1, 3, figsize=(24, 6))
palette = ['skyblue', 'orangered']

# (column, panel title, legend position)
panels = [
    ('cp', 'Chest pain type vs heart disease', 'upper right'),
    ('thal', 'Thalassaemia vs heart disease', 'upper left'),
    ('slope', 'Slope of the peak exercise ST segment vs heart disease', 'upper left'),
]

for ax, (column, title, legend_loc) in zip(axes, panels):
    sns.countplot(x=data[column], hue=data['target'], palette=palette, linewidth=3, ax=ax)
    ax.set_title(title, size=16)
    # Legend entries follow the hue order 0, 1 (0 = no disease, 1 = disease).
    ax.legend(loc=legend_loc, fontsize=12, labels=['No disease', 'Disease'])

plt.show()

### Conclusions
* Second type of chest pain most often accompanies heart problems.
* Second thalassaemia type(fixed defect) most often accompanies heart problems.
* Second slope  most often accompanies heart problems.

## Key quantitative features vs heart disease
Based on the correlation matrix, I took the three quantitative features most correlated with the target.

In [None]:
# Disease / no-disease counts per value range for three numeric features.
# The three panels are created in a single plt.subplots call. Creating one
# full-figure Axes first and then calling plt.subplot(1, 3, k) leaves that
# first Axes visible behind the panels on matplotlib >= 3.8.
fig, axes = plt.subplots(1, 3, figsize=(24, 6))
palette = ['darkblue', 'darkred']

# (source column, helper column holding the bins, bin edges, panel title)
# NOTE(review): pd.cut maps values outside the outer edges (and values equal
# to the lowest edge) to NaN, so such rows are not counted; confirm the edges
# cover the data range.
binned_panels = [
    ('thalach', 'bin_thalach', [80, 100, 125, 150, 175, 200], "Thalach vs heart disease"),
    ('chol', 'bin_chol', [100, 150, 200, 250, 300, 350, 400], "Cholesterol vs heart disease"),
    ('trestbps', 'bin_trestbps', [80, 100, 120, 140, 160, 180, 200], "Trestbps vs heart disease"),
]

for ax, (source_col, bin_col, edges, title) in zip(axes, binned_panels):
    # The helper column is stored on `data`; a later cell drops these columns again.
    data[bin_col] = pd.cut(data[source_col], bins=edges)
    sns.countplot(x=data[bin_col], hue=data['target'], palette=palette, linewidth=3, ax=ax)
    ax.set_title(title, size=16)
    # Legend entries follow the hue order 0, 1 (0 = no disease, 1 = disease).
    ax.legend(loc='upper left', fontsize=12, labels=['No disease', 'Disease'])

plt.show()

### Conclusions
* The most frequent heart rate(thalach) suggesting heart disease is 175.
* Cholesterol levels suggestive of heart disease are most likely to be 250.
* The most frequent resting blood pressure, which is associated with heart disease, is 140.

## Oldpeak and Slope
According to the high correlation score between oldpeak and slope, it is worth taking a closer look at this dependence.

In [None]:
# Left: disease / no-disease counts per oldpeak range.
# Right: mean oldpeak per slope value, split by target.
# Both panels are created in a single plt.subplots call. Creating one
# full-figure Axes first and then calling plt.subplot(12x) leaves that first
# Axes visible behind the panels on matplotlib >= 3.8.
fig, (ax_counts, ax_points) = plt.subplots(1, 2, figsize=(24, 6))

old_bins = [0, 1, 2, 3, 4, 5, 6]
# include_lowest=True keeps rows with oldpeak exactly 0: pd.cut intervals are
# left-open by default, so those rows would otherwise become NaN and vanish
# from the count plot.
# NOTE(review): values above 6 still fall outside the bins; confirm the upper edge.
data['oldpeak_bin'] = pd.cut(data.oldpeak, bins=old_bins, include_lowest=True)
o1 = sns.countplot(x='oldpeak_bin', hue='target', data=data, palette='bright', ax=ax_counts)
# Legend entries follow the hue order 0, 1 (0 = no disease, 1 = disease).
o1.legend(loc='upper right', fontsize=12, labels=['No disease', 'Disease'])

o2 = sns.pointplot(x='slope', y='oldpeak', data=data, hue='target', palette='bright', ax=ax_points)
handles = o2.get_legend_handles_labels()[0]
o2.legend(handles, ['No disease', 'Disease'])

fig.suptitle('Oldpeak, slope vs target', size=22)
plt.show()

### Conclusions
* As oldpeak increases, the share of patients with heart disease decreases.
* As slope increases, patients with heart disease have a lower oldpeak.

## Age and thalach

In [None]:
# Mean maximum heart rate (thalach) at each age, one line per target class.
fig, ax = plt.subplots(figsize=(24, 6))
z1 = sns.pointplot(x=data.age, y=data.thalach, hue=data.target,
                   palette='bright', linewidth=3, ax=ax)
ax.set_title('Age, thalach vs target', size=22)
plt.show()

### Conclusions
* As the age increases, the thalach slightly decreases
* In almost every age, for disease samples thalach feature has a higher value.

In [None]:
# Drop the helper columns that were added only for the binned plots.
# errors='ignore' and rebinding (instead of inplace=True) make the cell safe
# to re-run: a second run finds the columns already gone and does nothing.
data = data.drop(columns=['bin_chol', 'bin_thalach', 'bin_trestbps', 'oldpeak_bin'],
                 errors='ignore')

# 2.Outliers
To detect outliers I will use the IQR method on quantitative features.

In [None]:
# Preview the numeric columns that are screened for outliers below.
data[quantitative].head()

In [None]:
def iqr(df, column, factor=1.5):
    """Return the index labels of rows whose `column` value is an IQR outlier.

    A value is an outlier when it lies more than `factor` interquartile ranges
    below the first quartile or above the third quartile.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiple of the IQR that defines the fences.

    Returns
    -------
    pandas.Index
        Index labels of the outlying rows (empty when there are none).
    """
    values = df[column]
    # Series.quantile skips NaN. np.percentile would return NaN for a column
    # containing one, making every comparison below False and silently
    # reporting no outliers at all.
    q1, q3 = values.quantile([0.25, 0.75])
    outlier_step = factor * (q3 - q1)
    is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
    return df.index[is_outlier]

### Outliers in trestbps

In [None]:
# Remove the rows flagged as trestbps outliers and renumber the index from 0.
outliers_index = iqr(data, 'trestbps')
data = data.drop(index=outliers_index).reset_index(drop=True)

### Outliers in cholesterol

In [None]:
# Remove the rows flagged as chol outliers and renumber the index from 0.
outliers_index = iqr(data, 'chol')
data = data.drop(index=outliers_index).reset_index(drop=True)

### Outliers in thalach

In [None]:
# Remove the rows flagged as thalach outliers and renumber the index from 0.
outliers_index = iqr(data, 'thalach')
data = data.drop(index=outliers_index).reset_index(drop=True)

### Outliers in oldpeak

In [None]:
# Remove the rows flagged as oldpeak outliers and renumber the index from 0.
outliers_index = iqr(data, 'oldpeak')
data = data.drop(index=outliers_index).reset_index(drop=True)

# 3.Data preparation

### Split data into features and target

In [None]:
# Feature matrix: every column except the label. Target vector: the label column.
X = data.drop(columns='target')
y = data['target']

### Encode categorical variables

In [None]:
# One-hot encode every categorical feature.
# The list is derived without mutating `qualitative`, and X is rebuilt from
# `data`, so the cell can be re-run: qualitative.remove('target') raises
# ValueError on a second run.
categorical_features = [col for col in qualitative if col != 'target']
# dtype is pinned because the default dummy dtype differs between pandas
# versions (uint8 before 2.0, bool from 2.0); an explicit integer dtype keeps
# the matrix purely numeric for every model below.
X = pd.get_dummies(data.drop('target', axis=1), columns=categorical_features, dtype='uint8')

In [None]:
# Preview the one-hot encoded feature matrix.
X.head()

### Features scaling

In [None]:
# Standardise the numeric columns using statistics from the training rows only.
# Fitting the scaler on every row would leak the test set's mean and variance
# into training. Without stratification the split depends only on the number
# of rows and the seed, so splitting row positions here with the same
# test_size / random_state as the train/test split cell yields the same
# training rows.
# NOTE: keep test_size and random_state in sync with the train/test split cell.
train_positions, _ = train_test_split(np.arange(len(X)), test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X.iloc[train_positions][quantitative])
X[quantitative] = scaler.transform(X[quantitative])

### Train test split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 4.Models

# K-Nearest Neighbor Algorithm
implemented with GridSearchCV for hyperparameter tuning.

### Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()

# Hyperparameter grid for K-NN.
# leaf_size is deliberately not searched: it only tunes the speed and memory
# of the BallTree/KDTree lookup, not which neighbours are found, so sweeping
# 19 values would multiply the search time 19x without changing any score.
params = {'n_neighbors': list(range(1, 20)),   # neighbourhood size k
          'p': list(range(1, 11)),             # Minkowski exponent (1 = Manhattan, 2 = Euclidean)
          'weights': ['uniform', 'distance']}

In [None]:
# Exhaustive 5-fold cross-validated search over `params`, using all CPU cores.
knn_param = GridSearchCV(knn, params, cv=5, n_jobs=-1)

In [None]:
# Run the grid search on the training set.
knn_param.fit(X_train, y_train)
# Hyperparameter combination with the best mean cross-validation score.
knn_param.best_params_

### Predictions

In [None]:
# Class predictions of the tuned K-NN for the held-out test rows.
predict = knn_param.predict(X_test)

In [None]:
# Accuracy of the tuned K-NN on the training and test sets, in percent.
knn_acc_train = 100 * knn_param.score(X_train, y_train)
knn_acc_test = 100 * knn_param.score(X_test, y_test)

print("Train Accuracy {:.2f}%".format(knn_acc_train))
print("Test Accuracy {:.2f}%".format(knn_acc_test))

### Report

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 of the current `predict` values against the test labels.
print(classification_report(y_test,predict))

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the K-NN test predictions, drawn as an annotated heatmap.
class_names = [0, 1]
# The result gets its own name: assigning it to `confusion_matrix` would
# shadow the imported function for the rest of the session.
# labels= pins the matrix to 2x2 even if one class is absent.
cm = confusion_matrix(y_test, predict, labels=class_names)

fig, ax = plt.subplots()
# Tick labels come from the DataFrame's index/columns; ticks set before the
# heatmap call would be overwritten by it.
sns.heatmap(pd.DataFrame(cm, index=class_names, columns=class_names),
            annot=True, cmap='Blues', fmt='g', ax=ax)
ax.xaxis.set_label_position('top')
ax.set_title('Confusion matrix for K-NN')
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
fig.tight_layout()
plt.show()

### ROC Curve for K-Nearest Neighbor

In [None]:
from sklearn.metrics import roc_auc_score,roc_curve

In [None]:
# Predicted probability of the positive class (column 1) for each test row.
y_probabilities = knn_param.predict_proba(X_test)[:,1]

In [None]:
# ROC operating points (FPR, TPR) and their thresholds; kept for the combined plot in the summary.
false_positive_rate_knn,true_positive_rate_knn,threshold_knn = roc_curve(y_test,y_probabilities)

In [None]:
# ROC curve of the tuned K-NN on the test set, annotated with its AUC.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('ROC for K-NN')
ax.plot(false_positive_rate_knn, true_positive_rate_knn, color='red', linewidth=5)
ax.plot([0, 1], ls='--', linewidth=5)   # chance diagonal
ax.plot([0, 0], [1, 0], c='.5')         # left edge of the unit square
ax.plot([1, 1], c='.5')                 # top edge of the unit square
auc_label = 'AUC: {:.2f}'.format(roc_auc_score(y_test, y_probabilities))
ax.text(0.2, 0.6, auc_label, size=16)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()

# Logistic Regression
implemented with GridSearchCV for hyperparameter tuning.

In [None]:
from sklearn.linear_model import LogisticRegression

# The liblinear solver supports both the 'l1' and 'l2' penalties searched in
# the grid below. The default lbfgs solver rejects 'l1', so every l1 candidate
# would fail to fit and be scored as NaN (silently, because warnings are
# suppressed in this notebook).
log = LogisticRegression(solver='liblinear')

In [None]:
# Hyperparameter grid for logistic regression: penalty type, inverse
# regularisation strength C, and optional class re-weighting.
params = dict(
    penalty=['l1', 'l2'],
    C=[0.01, 0.1, 1, 10, 100],
    class_weight=['balanced', None],
)

# Exhaustive 10-fold cross-validated search over the grid.
log_param = GridSearchCV(log, param_grid=params, cv=10, verbose=0)

In [None]:
# Run the grid search on the training set.
log_param.fit(X_train,y_train)
# Hyperparameter combination with the best mean cross-validation score.
log_param.best_params_

### Predictions

In [None]:
# Class predictions of the tuned logistic regression for the held-out test rows.
predict = log_param.predict(X_test)

In [None]:
# Accuracy of the tuned logistic regression on the training and test sets, in percent.
log_acc_train = 100 * log_param.score(X_train, y_train)
log_acc_test = 100 * log_param.score(X_test, y_test)

print("Train Accuracy {:.2f}%".format(log_acc_train))
print("Test Accuracy {:.2f}%".format(log_acc_test))

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 of the current `predict` values against the test labels.
print(classification_report(y_test,predict))

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the logistic regression test predictions, drawn as an annotated heatmap.
class_names = [0, 1]
# The result gets its own name: assigning it to `confusion_matrix` would
# shadow the imported function for the rest of the session.
# labels= pins the matrix to 2x2 even if one class is absent.
cm = confusion_matrix(y_test, predict, labels=class_names)

fig, ax = plt.subplots()
# Tick labels come from the DataFrame's index/columns; ticks set before the
# heatmap call would be overwritten by it.
sns.heatmap(pd.DataFrame(cm, index=class_names, columns=class_names),
            annot=True, cmap='Blues', fmt='g', ax=ax)
ax.xaxis.set_label_position('top')
ax.set_title('Confusion matrix for Logistic Regression')
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
fig.tight_layout()
plt.show()

### ROC Curve for Logistic Regression

In [None]:
# Predicted probability of the positive class (column 1) for each test row.
y_probabilities = log_param.predict_proba(X_test)[:,1]

In [None]:
# ROC operating points (FPR, TPR) and their thresholds; kept for the combined plot in the summary.
false_positive_rate_log,true_positive_rate_log,threshold_log = roc_curve(y_test,y_probabilities)

In [None]:
# ROC curve of the tuned logistic regression on the test set, annotated with its AUC.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('ROC for Logistic Regression')
ax.plot(false_positive_rate_log, true_positive_rate_log, color='red', linewidth=5)
ax.plot([0, 1], ls='--', linewidth=5)   # chance diagonal
ax.plot([0, 0], [1, 0], c='.5')         # left edge of the unit square
ax.plot([1, 1], c='.5')                 # top edge of the unit square
auc_label = 'AUC: {:.2f}'.format(roc_auc_score(y_test, y_probabilities))
ax.text(0.2, 0.6, auc_label, size=16)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()

# Supported Vector Classifier
implemented with GridSearchCV for hyperparameter tuning.

In [None]:
from sklearn.svm import SVC

# Base SVC estimator. probability=True is needed for the predict_proba call
# used by the ROC curve; C, gamma and kernel are overridden by the grid search.
svm_clf = SVC(
    C=1.0,
    kernel='rbf',
    gamma=0.1,
    probability=True,
)

In [None]:
# Hyperparameter grid for the SVC: regularisation strength C, kernel
# coefficient gamma and kernel type.
params = dict(
    C=(0.1, 0.5, 1, 2, 5, 10, 20),
    gamma=(0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1),
    kernel=('linear', 'poly', 'rbf'),
)

# Exhaustive 5-fold cross-validated search scored by accuracy, using all CPU cores.
svm_param = GridSearchCV(
    svm_clf,
    params,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search on the training set.
svm_param.fit(X_train, y_train)
# Hyperparameter combination with the best mean cross-validation score.
svm_param.best_params_

### Predictions

In [None]:
# Class predictions of the tuned SVC for the held-out test rows.
predict = svm_param.predict(X_test)

In [None]:
# Accuracy of the tuned SVC on the training and test sets, in percent.
svc_acc_train = 100 * svm_param.score(X_train, y_train)
svc_acc_test = 100 * svm_param.score(X_test, y_test)

print("Train Accuracy {:.2f}%".format(svc_acc_train))
print("Test Accuracy {:.2f}%".format(svc_acc_test))

In [None]:
# Per-class precision, recall and F1 of the current `predict` values against the test labels.
print(classification_report(y_test,predict))

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the SVC test predictions, drawn as an annotated heatmap.
class_names = [0, 1]
# The result gets its own name: assigning it to `confusion_matrix` would
# shadow the imported function for the rest of the session.
# labels= pins the matrix to 2x2 even if one class is absent.
cm = confusion_matrix(y_test, predict, labels=class_names)

fig, ax = plt.subplots()
# Tick labels come from the DataFrame's index/columns; ticks set before the
# heatmap call would be overwritten by it.
sns.heatmap(pd.DataFrame(cm, index=class_names, columns=class_names),
            annot=True, cmap='Blues', fmt='g', ax=ax)
ax.xaxis.set_label_position('top')
ax.set_title('Confusion matrix for Supported Vector Classifier')
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
fig.tight_layout()
plt.show()

### ROC Curve for Supported Vector Classifier

In [None]:
# Predicted probability of the positive class (column 1) for each test row.
y_probabilities = svm_param.predict_proba(X_test)[:,1]

In [None]:
# ROC operating points (FPR, TPR) and their thresholds; kept for the combined plot in the summary.
false_positive_rate_svc,true_positive_rate_svc,threshold_svc = roc_curve(y_test,y_probabilities)

In [None]:
# ROC curve of the tuned SVC on the test set, annotated with its AUC.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('ROC for Supported Vector Classifier')
ax.plot(false_positive_rate_svc, true_positive_rate_svc, color='red', linewidth=5)
ax.plot([0, 1], ls='--', linewidth=5)   # chance diagonal
ax.plot([0, 0], [1, 0], c='.5')         # left edge of the unit square
ax.plot([1, 1], c='.5')                 # top edge of the unit square
auc_label = 'AUC: {:.2f}'.format(roc_auc_score(y_test, y_probabilities))
ax.text(0.2, 0.6, auc_label, size=16)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()

# Decision Tree Classifier
implemented with GridSearchCV for hyperparameter tuning.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# A fixed seed makes the search reproducible: the grid below includes
# splitter="random", and tree fitting is otherwise non-deterministic across runs.
tree_clf = DecisionTreeClassifier(random_state=42)

In [None]:
# Hyperparameter grid for the decision tree.
params = {"criterion": ("gini", "entropy"),
          "splitter": ("best", "random"),
          "max_depth": list(range(1, 20)),
          "min_samples_split": [2, 3, 4],
          "min_samples_leaf": list(range(1, 20))}

# Exhaustive 3-fold cross-validated search scored by accuracy, using all CPU cores.
# The former iid=True argument is gone: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises TypeError.
tree_param = GridSearchCV(tree_clf, params, scoring="accuracy", n_jobs=-1, verbose=1, cv=3)
tree_param.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validation score;
# kept in a variable because the next cell rebuilds the tree from it.
best_params = tree_param.best_params_
#Best params selected by GridSearchCV
best_params

In [None]:
# Rebuild the tree with the selected hyperparameters. From here on
# `tree_param` names this plain classifier rather than the grid search object.
# A fixed seed makes the refit reproducible (the selected splitter may be "random").
tree_param = DecisionTreeClassifier(**best_params, random_state=42)

In [None]:
# Fit the rebuilt tree on the full training set.
tree_param.fit(X_train, y_train)

### Predictions

In [None]:
# Class predictions of the decision tree for the held-out test rows.
predict = tree_param.predict(X_test)

In [None]:
# Accuracy of the decision tree on the training and test sets, in percent.
tree_acc_train = 100 * tree_param.score(X_train, y_train)
tree_acc_test = 100 * tree_param.score(X_test, y_test)

print("Train Accuracy {:.2f}%".format(tree_acc_train))
print("Test Accuracy {:.2f}%".format(tree_acc_test))

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the decision tree test predictions, drawn as an annotated heatmap.
class_names = [0, 1]
# The result gets its own name: assigning it to `confusion_matrix` would
# shadow the imported function for the rest of the session.
# labels= pins the matrix to 2x2 even if one class is absent.
cm = confusion_matrix(y_test, predict, labels=class_names)

fig, ax = plt.subplots()
# Tick labels come from the DataFrame's index/columns; ticks set before the
# heatmap call would be overwritten by it.
sns.heatmap(pd.DataFrame(cm, index=class_names, columns=class_names),
            annot=True, cmap='Blues', fmt='g', ax=ax)
ax.xaxis.set_label_position('top')
ax.set_title('Confusion matrix for Decision Tree')
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
fig.tight_layout()
plt.show()

### ROC Curve for Decision Tree

In [None]:
# Predicted probability of the positive class (column 1) for each test row.
y_probabilities = tree_param.predict_proba(X_test)[:,1]

In [None]:
# ROC operating points (FPR, TPR) and their thresholds; kept for the combined plot in the summary.
false_positive_rate_tree, true_positive_rate_tree, threshold_tree = roc_curve(y_test,y_probabilities)

In [None]:
# ROC curve of the decision tree on the test set, annotated with its AUC.
plt.figure(figsize=(10,6))
# Title follows the 'ROC for <model>' pattern of the other ROC plots; it
# previously read 'Revceiver Operating Characterstic' (misspelled, no model name).
plt.title('ROC for Decision Tree')
plt.plot(false_positive_rate_tree, true_positive_rate_tree, linewidth=5, color='red')
plt.plot([0,1],ls='--',linewidth=5)   # chance diagonal
plt.plot([0,0],[1,0],c='.5')          # left edge of the unit square
plt.plot([1,1],c='.5')                # top edge of the unit square
plt.text(0.2,0.6,'AUC: {:.2f}'.format(roc_auc_score(y_test,y_probabilities)),size= 16)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

#  XGBoost Classifier
implemented with GridSearchCV for hyperparameter tuning.

In [None]:
from xgboost.sklearn import XGBClassifier  

# Base XGBoost classifier with default settings; passed to the grid search below.
xgb = XGBClassifier()

In [None]:
# Candidate values for each XGBoost hyperparameter.
n_estimators = [100, 500, 900, 1100, 1500]   # number of boosting rounds
max_depth = [2, 3, 5, 10, 15]                # maximum tree depth
learning_rate = [0.05, 0.1, 0.15, 0.20]      # shrinkage applied to each round
min_child_weight = [1, 2, 3, 4]
booster = ['gbtree', 'gblinear']
base_score = [0.25, 0.5, 0.75, 0.99]         # initial prediction score

# Grid passed to GridSearchCV: the Cartesian product of the lists above.
params = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    booster=booster,
    base_score=base_score,
)

In [None]:
# Exhaustive 5-fold cross-validated search over `params`, scored by accuracy, using all CPU cores.
xgb_cv = GridSearchCV(xgb, params, cv=5, scoring = 'accuracy',n_jobs =-1, verbose=0)

In [None]:
# Run the grid search on the training set.
xgb_cv.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validation score;
# kept in a variable because the next cell rebuilds the classifier from it.
best_params = xgb_cv.best_params_
#Best params selected by GridSearchCV
best_params

In [None]:
# Rebuild the classifier with the selected hyperparameters and fit it on the
# full training set.
# verbosity=0 silences XGBoost's log output; it replaces the old `silent`
# flag, which is no longer a recognised XGBoost parameter.
xgb = XGBClassifier(**best_params, verbosity=0)
xgb.fit(X_train, y_train)

### Predictions

In [None]:
# Class predictions of the XGBoost classifier for the held-out test rows.
predict = xgb.predict(X_test)

In [None]:
# Accuracy of the XGBoost classifier on the training and test sets, in percent.
xgb_acc_train = 100 * xgb.score(X_train, y_train)
xgb_acc_test = 100 * xgb.score(X_test, y_test)

print("Train Accuracy {:.2f}%".format(xgb_acc_train))
print("Test Accuracy {:.2f}%".format(xgb_acc_test))

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the XGBoost test predictions, drawn as an annotated heatmap.
class_names = [0, 1]
# The result gets its own name: assigning it to `confusion_matrix` would
# shadow the imported function for the rest of the session.
# labels= pins the matrix to 2x2 even if one class is absent.
cm = confusion_matrix(y_test, predict, labels=class_names)

fig, ax = plt.subplots()
# Tick labels come from the DataFrame's index/columns; ticks set before the
# heatmap call would be overwritten by it.
sns.heatmap(pd.DataFrame(cm, index=class_names, columns=class_names),
            annot=True, cmap='Blues', fmt='g', ax=ax)
ax.xaxis.set_label_position('top')
ax.set_title('Confusion matrix for XGB Classifier')
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
fig.tight_layout()
plt.show()

In [None]:
### ROC Curve for XGB Classifier

# Predicted probability of the positive class (column 1) for each test row.
y_probabilities = xgb.predict_proba(X_test)[:, 1]

# ROC operating points; kept for the combined plot in the summary.
false_positive_rate_xgb, true_positive_rate_xgb, threshold_xgb = roc_curve(y_test, y_probabilities)

# ROC curve annotated with its AUC.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('ROC for XGB Classifier')
ax.plot(false_positive_rate_xgb, true_positive_rate_xgb, color='red', linewidth=5)
ax.plot([0, 1], ls='--', linewidth=5)   # chance diagonal
ax.plot([0, 0], [1, 0], c='.5')         # left edge of the unit square
ax.plot([1, 1], c='.5')                 # top edge of the unit square
auc_label = 'AUC: {:.2f}'.format(roc_auc_score(y_test, y_probabilities))
ax.text(0.2, 0.6, auc_label, size=16)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()

# Neural Network

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers

In [None]:
# Small fully connected binary classifier:
# Dense(32, relu) -> Dropout(0.4) -> Dense(16, relu) -> Dense(1, sigmoid).
# Both hidden layers carry an L2 weight penalty of 0.001.
model = keras.Sequential()
model.add(layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy loss with the Adam optimiser, tracking accuracy.
# `metrics` is given as a list, which is the documented form; recent Keras
# versions reject a bare string.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 100 epochs in batches of 30, holding out 20% of the training rows
# as a validation set; `history` keeps the per-epoch metrics plotted below.
print("Fit model on training data")
history = model.fit(X_train, y_train, batch_size=30, epochs=100, validation_split = 0.2, verbose=1)

In [None]:
# Per-epoch accuracy (left) and loss (right) for the training rows and the
# validation split.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,6))

# The val_* series come from the validation split of the training data, not
# from the held-out test set, so they are labelled "Validation" (previously
# mislabelled "Test").
ax1.plot(history.history['accuracy'])
ax1.plot(history.history['val_accuracy'])
ax1.set_title('Model Accuracy')
ax1.set_ylabel('Accuracy')
ax1.set_xlabel('Epoch')
ax1.legend(['Train', 'Validation'])

ax2.plot(history.history['loss'])
ax2.plot(history.history['val_loss'])
ax2.set_title('Model Loss')
ax2.set_ylabel('Loss')
ax2.set_xlabel('Epoch')
ax2.legend(['Train', 'Validation'])
plt.suptitle("Model Accuracy & Loss",fontsize=16)

plt.show()

### Predictions

In [None]:
# Class predictions of the network for the test and training rows.
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is to threshold the predicted probability at
# 0.5; ravel() flattens the (n, 1) output to a 1-D label vector.
predict_test = (model.predict(X_test) > 0.5).astype('int32').ravel()
predict_train = (model.predict(X_train) > 0.5).astype('int32').ravel()

In [None]:
# Accuracy of the neural network on the training and test sets, in percent.
nn_acc_train = 100 * accuracy_score(y_train, predict_train)
nn_acc_test = 100 * accuracy_score(y_test, predict_test)

print("Train Accuracy {:.2f}%".format(nn_acc_train))
print("Test Accuracy {:.2f}%".format(nn_acc_test))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the neural network test predictions, drawn as an annotated heatmap.
class_names = [0, 1]
# The result gets its own name: assigning it to `confusion_matrix` would
# shadow the imported function for the rest of the session.
# labels= pins the matrix to 2x2 even if one class is absent.
cm = confusion_matrix(y_test, predict_test, labels=class_names)

fig, ax = plt.subplots()
# Tick labels come from the DataFrame's index/columns; ticks set before the
# heatmap call would be overwritten by it.
sns.heatmap(pd.DataFrame(cm, index=class_names, columns=class_names),
            annot=True, cmap='Blues', fmt='g', ax=ax)
ax.xaxis.set_label_position('top')
ax.set_title('Confusion matrix for Neural Network')
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
fig.tight_layout()
plt.show()

In [None]:
### ROC Curve for Neural Network

# Predicted probability of the positive class for each test row.
# Sequential.predict_proba was removed in TensorFlow 2.6; for a sigmoid output
# layer model.predict already returns that probability. ravel() flattens the
# (n, 1) output to 1-D.
y_probabilities = model.predict(X_test).ravel()

# ROC operating points; kept for the combined plot in the summary.
false_positive_rate_nn, true_positive_rate_nn, threshold_nn = roc_curve(y_test,y_probabilities)

#Plotting ROC Curve
plt.figure(figsize=(10,6))
plt.title('ROC for Neural Network')
plt.plot(false_positive_rate_nn, true_positive_rate_nn, linewidth=5, color='red')
plt.plot([0,1],ls='--',linewidth=5)   # chance diagonal
plt.plot([0,0],[1,0],c='.5')          # left edge of the unit square
plt.plot([1,1],c='.5')                # top edge of the unit square
plt.text(0.2,0.6,'AUC: {:.2f}'.format(roc_auc_score(y_test,y_probabilities)),size= 16)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

# Summary

## ROC Curves for all models

In [None]:
# ROC curves of all six models on one set of axes for direct comparison.
roc_curves = [
    (false_positive_rate_knn, true_positive_rate_knn, 'K-NN'),
    (false_positive_rate_log, true_positive_rate_log, 'Logistic Regression'),
    (false_positive_rate_svc, true_positive_rate_svc, 'Supported Vector Classifier'),
    (false_positive_rate_tree, true_positive_rate_tree, 'Decision Tree Classifier'),
    (false_positive_rate_xgb, true_positive_rate_xgb, 'XGB Classifier'),
    (false_positive_rate_nn, true_positive_rate_nn, 'Neural Network Classifier'),
]

fig, ax = plt.subplots(figsize=(16, 10))
ax.set_title('ROC Curves')
for fpr, tpr, model_name in roc_curves:
    ax.plot(fpr, tpr, label=model_name)
ax.plot([0, 1], ls='--')          # chance diagonal
ax.plot([0, 0], [1, 0], c='.5')   # left edge of the unit square
ax.plot([1, 1], c='.5')           # top edge of the unit square
ax.set_ylabel('True positive rate')
ax.set_xlabel('False positive rate')
ax.legend()
plt.show()

## Model scores

In [None]:
# Training and test accuracy (in percent) of every model, one row per model.
score_columns = ['Model', 'Training Accuracy %', 'Testing Accuracy %']
score_rows = [
    ["K-NN", knn_acc_train, knn_acc_test],
    ["Logistic Regression", log_acc_train, log_acc_test],
    ["Supported Vector Classifier", svc_acc_train, svc_acc_test],
    ["Decision Tree Classifier", tree_acc_train, tree_acc_test],
    ["XGB Classifier", xgb_acc_train, xgb_acc_test],
    ["Neural Network Classifier", nn_acc_train, nn_acc_test],
]
scores = pd.DataFrame(data=score_rows, columns=score_columns)
scores

# <p style="text-align:center;">**<font color='orange'>Suggestions</font>** are welcome </p>

# <p style="text-align:center;"> **<font color='orange'>Feel free</font>** to ask below </p>