In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier  
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.preprocessing import normalize
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, Perceptron
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score


import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.models import load_model

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the semicolon-separated dataset and discard the `id` column.
df = pd.read_csv(
    "../input/cardiovascular-disease-dataset/cardio_train.csv",
    sep=';',
).drop(columns='id')
df.head()

### Check missing values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

### Check duplicate values

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Remove duplicated rows, then confirm that none are left.
df = df.drop_duplicates()
remaining_duplicates = df.duplicated().sum()
print("number of duplicated data after dropping :", remaining_duplicates)

### Data Describe

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

The `ap_hi` and `ap_lo` columns contain some outliers!

### Outlier Checking and Dropping

In [None]:
# Continuous columns that are z-scored before plotting.
std_list = ["age", "height", "weight", "ap_hi", "ap_lo"]


def standartization(x, columns=None):
    """Return a copy of `x` with the selected columns z-score standardized.

    Each selected column becomes ``(value - column mean) / column std``
    (pandas' default sample standard deviation). The input frame is not
    modified; all other columns are copied through unchanged.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing at least the selected columns.
    columns : iterable of str, optional
        Columns to standardize. Defaults to the module-level `std_list`,
        which is the behaviour the notebook relied on originally.

    Returns
    -------
    pandas.DataFrame
        Deep copy of `x` with the selected columns standardized.
    """
    cols = std_list if columns is None else list(columns)
    x_std = x.copy(deep=True)
    selected = x_std[cols]
    # Vectorized over all selected columns at once instead of a Python loop.
    x_std[cols] = (selected - selected.mean()) / selected.std()
    return x_std

# Standardized copy of the data; `standartization` works on a deep copy, so `df` is unchanged.
df_std = standartization(df)
df_std.head()

In [None]:
# Reshape to long format (one row per sample/feature pair) so that all
# standardized features can share a single box plot.
df_melt = df_std.melt(
    id_vars='cardio',
    value_vars=std_list,
    var_name='features',
    value_name='value',
    col_level=None,
)
df_melt

In [None]:
# Box plot of every standardized feature, split by the `cardio` target
plt.figure(figsize=(12, 10))
sns.boxplot(x='features', y='value', hue='cardio', data=df_melt)
# Rotate the feature names so they do not overlap
plt.xticks(rotation=90)

It is clear that `ap_hi` and `ap_lo` contain some unusual outliers!

In [None]:
# Deciding the lower and upper outlier bounds with the 1.5 * IQR rule
ap_list = ['ap_hi', 'ap_lo']

bounds = {}
for col in ap_list:
    q1, q3 = df[col].quantile([0.25, 0.75])
    spread = q3 - q1
    bounds[col] = [q1 - 1.5*spread, q3 + 1.5*spread]

# One column per blood-pressure feature, one row per bound
df_ap = pd.DataFrame(bounds, index=['lower_bound', 'upper_bound'])
df_ap.head()

In [None]:
# Outlier rows: ap_hi or ap_lo above its IQR upper bound.
# The bound is looked up by label; `df_ap[col][1]` relied on positional
# indexing of a label-indexed Series, which pandas 2.x deprecates.
ap_hi_filter = df['ap_hi'] > df_ap.loc['upper_bound', 'ap_hi']
ap_lo_filter = df['ap_lo'] > df_ap.loc['upper_bound', 'ap_lo']
outlier_filter = ap_hi_filter | ap_lo_filter
df_outlier = df[outlier_filter]

# Class balance of the target among the outlier rows
sns.countplot(x='cardio', data=df_outlier, linewidth=2, edgecolor=sns.color_palette("dark", 1))


In [None]:
# Keep only plausible readings: 0 < ap_hi <= 250 and 0 < ap_lo <= 200.
# Zero or negative blood-pressure values cannot be real measurements, so they
# are discarded together with the extreme high values.
outlier_filter = (
    (df["ap_hi"] > 250) | (df["ap_lo"] > 200)
    | (df["ap_hi"] <= 0) | (df["ap_lo"] <= 0)
)
# .copy() gives an independent frame, so the column assignments in later cells
# do not write into a slice of the original (SettingWithCopyWarning).
df = df[~outlier_filter].copy()
df.shape

### Convert the age column from days to years

In [None]:
# Convert age from days to whole years (rounded to the nearest year).
# NOTE: this overwrites `age`, so re-running the cell would divide by 365 again.
df['age'] = df['age'].div(365).round().astype('int')
df.head()

### New column of weight

In [None]:
def weight_func(x):
    """Map a weight value to an ordinal class.

    1: x <= 50, 2: 50 < x <= 100, 3: 100 < x <= 150, 4: anything else
    (including values that compare false against every threshold, e.g. NaN).
    """
    # Thresholds are checked in ascending order, so each test only needs its
    # upper limit; the lower limit is implied by the previous test failing.
    if x <= 50:
        return 1
    if x <= 100:
        return 2
    if x <= 150:
        return 3
    return 4


In [None]:
# Ordinal weight bucket (1-4) derived from the raw weight column.
df['weight_class'] = df['weight'].apply(weight_func)

### New column of age

In [None]:
def age_func(x):
    """Map an age value to an ordinal class.

    1: x <= 40, 2: 40 < x <= 50, 3: 50 < x <= 60, 4: anything else
    (including values that compare false against every threshold, e.g. NaN).
    """
    # Ascending thresholds: reaching a test means all earlier ones failed.
    if x <= 40:
        return 1
    if x <= 50:
        return 2
    if x <= 60:
        return 3
    return 4


In [None]:
# Ordinal age bucket (1-4) derived from the age column.
df['age_class'] = df['age'].apply(age_func)

### New column of body mass index

In [None]:
# Body mass index: weight divided by the square of (height / 100).
# NOTE(review): presumably height is in cm and weight in kg - confirm against the dataset description.
height_scaled = df['height'] / 100
df['bmi'] = df['weight'] / height_scaled**2
df.head()

### Preprocessing gender column

In [None]:
# Recode gender as a 0/1 indicator: the value 1 becomes 0, every other value becomes 1.
# NOTE: not idempotent - running this cell a second time flips the codes again.
df['gender'] = df['gender'].ne(1).astype('int64')

In [None]:
# Preview the frame after the feature-engineering steps.
df.head()

### Correlation

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heat map.
corr = df.corr()
fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(corr, ax=ax, annot=True, fmt='.3f', linewidths=0.5)

### shuffle dataset

In [None]:
# Shuffle the rows with a fixed seed so that the row order - and every result
# that depends on it (e.g. the cross-validation folds) - is reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

### Dividing Dataset

In [None]:
# Separate the target column (`cardio`) from the feature matrix.
y = df['cardio']
X = df.drop(columns='cardio')
print("Shape of X is :", X.shape, "Shape of y is :", y.shape)

In [None]:
# Hold out 20% of the rows as the test split; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Using Machine Learning Algorithms

In [None]:
# Baseline classifiers, all with default hyperparameters.
# Every estimator that accepts `random_state` is seeded (same seed as the
# train/test split) so that the comparison table is reproducible between runs.
# GaussianNB and KNeighborsClassifier have no random_state parameter.
algs = {
    "logistic_regression": LogisticRegression(random_state=42),
    "decision_tree": DecisionTreeClassifier(random_state=42),
    "random_forest": RandomForestClassifier(random_state=42),
    "support_vector machine": SVC(random_state=42),
    "linear_svc": LinearSVC(random_state=42),
    "perceptron": Perceptron(random_state=42),
    "gaussian_naive bayes": GaussianNB(),
    "k nearest_neighbors": KNeighborsClassifier(),
    "sgdclassifier": SGDClassifier(random_state=42),
    "xgboost": XGBClassifier(random_state=42),
    "lgbm_classifier": LGBMClassifier(random_state=42),
    "gradient_boosting": GradientBoostingClassifier(random_state=42),
    "ridge_classifier": RidgeClassifier(random_state=42),
    "bagging_classifier": BaggingClassifier(random_state=42),
    "extra_tree_classifier": ExtraTreesClassifier(random_state=42),
    "adaboost_classifier": AdaBoostClassifier(random_state=42)
}

In [None]:
# Fit every baseline model and collect train accuracy, test accuracy (both in
# percent, 2 decimals) and the F1 score on the test split (0-1 scale, 2 decimals).
result = pd.DataFrame(columns=['train_accuracy', 'test_accuracy', 'f1_score'])

for name, estimator in algs.items():
    estimator.fit(x_train, y_train)
    predicted = estimator.predict(x_test)
    result.loc[name] = [
        round(estimator.score(x_train, y_train) * 100, 2),
        # Scale first, then round: rounding before multiplying by 100 leaves
        # float artifacts (e.g. 72.96000000000001) and is inconsistent with
        # the way the train accuracy is rounded.
        round(accuracy_score(y_test, predicted) * 100, 2),
        round(f1_score(y_test, predicted), 2),
    ]

# Best test accuracy first; train accuracy breaks ties
result = result.sort_values(by=['test_accuracy', 'train_accuracy'], ascending=False)
result

## Hyperparameter Tuning with Random Forest

Hyperparameter tuning takes a long time, so I did not run the search for this notebook.
You can run it yourself if you want!

In [None]:
# Base estimator for the grid search; seeded so that candidate scores are comparable between runs.
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Candidate values for each random-forest hyperparameter.
# The full grid has 5 * 5 * 5 * 4 = 500 combinations.
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

hyper_params = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

In [None]:
# Exhaustive search over `hyper_params` with 3-fold cross-validation.
# n_jobs=-1 spreads the candidate fits over all CPU cores; the search space is
# large, so running it on a single core is needlessly slow.
grid_s_cv = GridSearchCV(rfc, hyper_params, cv=3, n_jobs=-1)
grid_s_cv.fit(x_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validation score.
grid_s_cv.best_params_

In [None]:
# Training with the tuned hyperparameters.
# The grid search is slow and may be skipped; in that case `grid_s_cv` is
# undefined (NameError) or unfitted (AttributeError) and the forest falls back
# to scikit-learn's default hyperparameters.
try:
    rfc_best_params = grid_s_cv.best_params_
except (NameError, AttributeError):
    rfc_best_params = {}

rfc_hyp = RandomForestClassifier(**rfc_best_params)
rfc_hyp.fit(x_train, y_train)

In [None]:
# Random-forest class predictions for the test split.
rfc_predicted = rfc_hyp.predict(x_test)

### Classification Report

In [None]:
# Per-class precision, recall and F1 of the random forest on the test split.
print(classification_report(y_test, rfc_predicted))

### Confusion Matrix

In [None]:
# Confusion matrix of the random-forest predictions on the test split
cm = confusion_matrix(y_test, rfc_predicted)
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cm, ax=ax, annot=True, fmt='.0f', linewidths=0.2, linecolor='purple')
ax.set_xlabel('predicted value')
ax.set_ylabel('Truth value')
plt.show()

## Hyperparameter Tuning with XGBoost

In [None]:
# Base XGBoost estimator that the randomized search below clones and tunes.
xgb = XGBClassifier()

In [None]:
# Search space for the randomized XGBoost search.
hyperparameter_grid = {
    'n_estimators': [100, 500, 900, 1100, 1500],
    'max_depth':[2, 3, 5, 10, 15],
    'learning_rate':[0.05,0.1,0.15,0.20],
    'min_child_weight':[1,2,3,4],
    'booster':['gbtree','gblinear'],
    # base_score is the initial predicted probability; for the logistic
    # objective it must lie strictly between 0 and 1, so the former candidate
    # value 1 (which makes the fit fail) is not offered.
    'base_score':[0.25,0.5,0.75]
    }

In [None]:
# Randomized search: 50 sampled candidates, 3-fold CV, 4 parallel workers.
# Candidates are scored with accuracy. The previous 'neg_mean_absolute_error'
# is a regression metric; on 0/1 labels it equals minus the error rate, so the
# ranking of candidates is the same, but the reported score is now directly
# interpretable as a classification metric.
random_cv = RandomizedSearchCV(estimator=xgb,
            param_distributions=hyperparameter_grid,
            cv=3,
            n_iter=50,
            scoring='accuracy',
            n_jobs=4,
            verbose=5,
            return_train_score=True,
            random_state=42)


In [None]:
# Run the randomized search on the training split.
random_cv.fit(x_train, y_train)

In [None]:
# Best estimator found by the randomized search.
random_cv.best_estimator_

In [None]:
# Training with the tuned hyperparameters.
# The randomized search may be skipped; in that case `random_cv` is undefined
# (NameError) or unfitted (AttributeError) and XGBoost's defaults are used.
try:
    xgb_best_params = random_cv.best_params_
except (NameError, AttributeError):
    xgb_best_params = {}

xgb_hyp = XGBClassifier(**xgb_best_params)
xgb_hyp.fit(x_train, y_train)

In [None]:
# XGBoost class predictions for the test split
xgb_predicted = xgb_hyp.predict(x_test)

In [None]:
# Classification report: per-class precision, recall and F1 of XGBoost on the test split
print(classification_report(y_test, xgb_predicted))

In [None]:
# Confusion matrix of the XGBoost predictions on the test split
cm = confusion_matrix(y_test, xgb_predicted)
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cm, ax=ax, annot=True, fmt='.0f', linewidths=0.2, linecolor='purple')
ax.set_xlabel('predicted value')
ax.set_ylabel('Truth value')
plt.show()

### Using Cross_val_score

In [None]:
# 5-fold cross-validation of a default XGBoost model.
# Note that this uses the full data set (X, y), not only the training split.
xgb_cvs = XGBClassifier()
cvs = cross_val_score(xgb_cvs, X, y, cv=5)
cvs

In [None]:
# Mean score over the five folds.
cvs.mean()

### Hyperparameter tuning with LGBMClassifier

In [None]:
# Default LightGBM classifier; `lgbmc` is re-created with explicit settings in a later cell.
lgbmc = LGBMClassifier()

In [None]:
def _decayed_learning_rate(current_iter, base_learning_rate, decay, floor=1e-3):
    """Exponentially decayed learning rate, clipped from below at `floor`.

    Returns ``base_learning_rate * decay ** current_iter`` while that value is
    greater than `floor`, otherwise `floor`.
    """
    lr = base_learning_rate * np.power(decay, current_iter)
    return lr if lr > floor else floor


def learning_rate_010_decay_power_099(current_iter):
    """Schedule starting at 0.1, multiplied by 0.99 per iteration, never below 1e-3."""
    return _decayed_learning_rate(current_iter, base_learning_rate=0.1, decay=.99)


def learning_rate_010_decay_power_0995(current_iter):
    """Schedule starting at 0.1, multiplied by 0.995 per iteration, never below 1e-3."""
    return _decayed_learning_rate(current_iter, base_learning_rate=0.1, decay=.995)


def learning_rate_005_decay_power_099(current_iter):
    """Schedule starting at 0.05, multiplied by 0.99 per iteration, never below 1e-3."""
    return _decayed_learning_rate(current_iter, base_learning_rate=0.05, decay=.99)


In [None]:
# Keyword arguments forwarded to LGBMClassifier.fit.
# NOTE(review): the evaluation set used for early stopping is the test split,
# which is also used for the final report - consider a separate validation split.
fit_params = {
    "eval_metric": 'auc',
    "eval_set": [(x_test, y_test)],
    'eval_names': ['valid'],
    'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
    'categorical_feature': 'auto',
}

# Stop after 30 rounds without improvement and log every 100 rounds.
# LightGBM 4 removed the `early_stopping_rounds` and `verbose` arguments of
# fit() in favour of callbacks; older releases keep the original arguments.
if int(lgb.__version__.split('.')[0]) >= 4:
    fit_params['callbacks'] += [
        lgb.early_stopping(stopping_rounds=30),
        lgb.log_evaluation(period=100),
    ]
else:
    fit_params.update(early_stopping_rounds=30, verbose=100)


In [None]:
# Search space for the randomized LightGBM search: scipy distributions are
# sampled, plain lists are drawn from uniformly.
param_test = dict(
    num_leaves=sp_randint(6, 50),
    min_child_samples=sp_randint(100, 500),
    min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    subsample=sp_uniform(loc=0.2, scale=0.8),
    colsample_bytree=sp_uniform(loc=0.4, scale=0.6),
    reg_alpha=[0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    reg_lambda=[0, 1e-1, 1, 5, 10, 20, 50, 100],
)


In [None]:
# Base LightGBM model: many boosting rounds, to be cut short by early stopping.
# `verbose=-1` silences LightGBM's own logging; it replaces `silent=True`,
# which newer LightGBM releases no longer accept as a constructor argument.
lgbmc = LGBMClassifier(max_depth=-1, random_state=314, verbose=-1, metric='None', n_jobs=4, n_estimators=5000)

# Randomized search over `param_test` (100 candidates, 3-fold CV, ROC-AUC).
# NOTE(review): `gs` is only constructed here; no `gs.fit(...)` call is visible
# in this notebook - presumably the hard-coded `opt_parameters` come from an
# earlier run of this search. Confirm.
gs = RandomizedSearchCV(
                        estimator=lgbmc,
                        param_distributions=param_test,
                        n_iter=100,
                        scoring='roc_auc',
                        cv=3,
                        refit=True,
                        random_state=314,
                        verbose=True)


In [None]:
# Hard-coded LightGBM hyperparameters, applied to the models below via set_params.
# NOTE(review): presumably the result of an earlier randomized-search run - confirm.
opt_parameters = {'colsample_bytree': 0.9234, 'min_child_samples': 399, 'min_child_weight': 0.1, 'num_leaves': 13, 'reg_alpha': 2, 'reg_lambda': 5, 'subsample': 0.855}

In [None]:
# Base LightGBM configuration overlaid with the optimal parameters
clf_sw = LGBMClassifier(**{**lgbmc.get_params(), **opt_parameters})
clf_sw

In [None]:
# Grid search over the positive-class weight (5-fold CV, ROC-AUC).
# return_train_score=True is needed because the summary printed after the
# search reads cv_results_['mean_train_score'], which scikit-learn only
# computes when it is requested explicitly.
gs_sample_weight = GridSearchCV(estimator=clf_sw,
                                param_grid={'scale_pos_weight':[1,2,6,12]},
                                scoring='roc_auc',
                                cv=5,
                                refit=True,
                                return_train_score=True,
                                verbose=True)


In [None]:
# Run the scale_pos_weight search; `fit_params` is forwarded to the fit call of every candidate.
gs_sample_weight.fit(x_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(gs_sample_weight.best_score_, gs_sample_weight.best_params_))


In [None]:
# Table of the (up to) five best candidates, ordered by ascending mean validation score.
print("Valid+-Std     Train  :   Parameters")
cv_res = gs_sample_weight.cv_results_
for idx in np.argsort(cv_res['mean_test_score'])[-5:]:
    print('{1:.3f}+-{3:.3f}     {2:.3f}   :  {0}'.format(
        cv_res['params'][idx],
        cv_res['mean_test_score'][idx],
        cv_res['mean_train_score'][idx],
        cv_res['std_test_score'][idx],
    ))
    

### Final Model

In [None]:
# Final LightGBM model: base configuration overlaid with the optimal parameters
lgbmc_hyp = LGBMClassifier(**{**lgbmc.get_params(), **opt_parameters})
lgbmc_hyp

In [None]:
# Fit the final LightGBM model; `fit_params` supplies the evaluation set, metric and callbacks.
lgbmc_hyp.fit(x_train, y_train, **fit_params)

In [None]:
# LightGBM class predictions for the test split
lgbmc_predicted = lgbmc_hyp.predict(x_test)

In [None]:
# Classification report: per-class precision, recall and F1 of LightGBM on the test split
print(classification_report(y_test, lgbmc_predicted))

In [None]:
# Confusion matrix of the LightGBM predictions on the test split
cm = confusion_matrix(y_test, lgbmc_predicted)
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cm, ax=ax, annot=True, fmt='.0f', linewidths=0.2, linecolor='purple')
ax.set_xlabel('predicted value')
ax.set_ylabel('Truth value')
plt.show()

## Voting Classifier

In [None]:
# Hard-voting ensemble of the three models trained above.
# The estimator names now match the models they label (they previously read
# 'lr', 'rf' and 'ab' for the random forest, XGBoost and LightGBM models).
vot_cls = VotingClassifier(
    estimators=[('rf', rfc_hyp), ('xgb', xgb_hyp), ('lgbm', lgbmc_hyp)],
    voting='hard',
)

# 10-fold cross-validated accuracy of each member and of the ensemble.
# NOTE(review): no fit parameters are passed here, so the LightGBM member
# presumably trains all of its n_estimators rounds without early stopping -
# confirm that this is intended.
for clf, label in zip([rfc_hyp, xgb_hyp, lgbmc_hyp, vot_cls],
                      ['Random forest classifier', 'XGBClassifier', "LGBM classifier", "Ensemble"]):
    scores = cross_val_score(clf, x_train, y_train, cv=10, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"% (scores.mean(), scores.std(), label))
    

In [None]:
# Fit the voting classifier and report its accuracy on the training split (percent, 2 decimals)
vot_cls.fit(x_train, y_train)
train_acc = round(vot_cls.score(x_train, y_train)*100, 2) 
train_acc

In [None]:
# Accuracy of the voting classifier on the test split (percent, 2 decimals)
test_acc = round(vot_cls.score(x_test, y_test)*100, 2)

In [None]:
# Display the test accuracy computed in the previous cell.
test_acc

## Using Neural Network

In [None]:
# Shallow network: one hidden layer of 7 ReLU units and a sigmoid output unit.
# NOTE(review): `Model` is not referenced by any later cell of this notebook.
Model = Sequential([
    Dense(7, input_dim=14, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Deep Neural Network
# The input width is read from the training data instead of being hard-coded,
# so adding or removing a feature column does not silently break the model.
n_features = x_train.shape[1]


def _small_uniform():
    """Fresh uniform initializer on [-0.1, 0.1] for hidden-layer weights and biases."""
    return tf.random_uniform_initializer(minval=-0.1, maxval=0.1)


model = Sequential()
model.add(Dense(14, input_dim=n_features, activation='relu'))
# Three identical hidden blocks: 50 ReLU units followed by 20% dropout
for _ in range(3):
    model.add(Dense(50, activation='relu',
                    kernel_initializer=_small_uniform(),
                    bias_initializer=_small_uniform()))
    model.add(Dropout(0.2))
# Single sigmoid unit for the binary target
model.add(Dense(1, activation='sigmoid'))


In [None]:
# Model summary: layers, output shapes and parameter counts
model.summary()

In [None]:
# Compile with binary cross-entropy loss, the RMSprop optimizer and accuracy as the reported metric
optimizer = RMSprop(learning_rate=0.002)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# ReduceLROnPlateau: multiply the learning rate by `factor` once val_loss has
# not improved by at least `min_delta` for `patience` epochs, never going
# below `min_lr`.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    mode="auto",
    factor=0.1,
    patience=50,
    min_delta=0.0001,
    cooldown=0,
    min_lr=0.00001,
    verbose=1,
)

# EarlyStopping: stop once val_loss has not improved for 400 epochs and
# restore the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=400,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# Model training: 15% of the training split is held out for validation;
# both callbacks monitor the validation loss.
fit_options = dict(
    batch_size=1024,
    epochs=1000,
    validation_split=0.15,
    shuffle=True,
    verbose=0,
    callbacks=[learning_rate_reduction, early_stopping],
)
history = model.fit(x_train, y_train.to_numpy(), **fit_options)

In [None]:
# Evaluate on the test split: returns the loss and the compiled metric (accuracy)
model.evaluate(x_test, y_test.values, verbose=2)

### Accuracy and Loss of Training

In [None]:
def _plot_history_metric(history, metric, title, ylabel):
    """Plot the per-epoch training and validation curves of one recorded metric.

    `history` is the object returned by `model.fit`; `metric` is a key of
    `history.history` (its validation counterpart is read from 'val_' + metric).
    """
    fig, ax = plt.subplots()
    ax.plot(history.history[metric])
    ax.plot(history.history['val_' + metric])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Validation'], loc='upper left')
    plt.show()


# Plot training & validation accuracy values
_plot_history_metric(history, 'accuracy', 'Model accuracy', 'Accuracy')

# Plot training & validation loss values
_plot_history_metric(history, 'loss', 'Model loss', 'Loss')

In [None]:
# Training and validation loss per epoch, plotted through pandas
losses = pd.DataFrame(history.history)
losses.plot(y=['loss', 'val_loss'])

In [None]:
# Training and validation accuracy per epoch, plotted through pandas
losses = pd.DataFrame(history.history)
losses.plot(y=['accuracy', 'val_accuracy'])

### Model Saving and Loading

In [None]:
# Save the trained network to an HDF5 file in the working directory.
model.save("Cardiovascular_Disease.h5") 

In [None]:
# Reload the network from the file written by the previous cell.
loaded_model = load_model("Cardiovascular_Disease.h5")

### Predicting with test data

In [None]:
# Outputs of the reloaded network for the test split (its output layer is a single sigmoid unit).
ann_predicted = loaded_model.predict(x_test)
ann_predicted

### Classification Report

In [None]:
# .round() turns the network outputs into 0/1 class labels before scoring.
print(classification_report(y_test, ann_predicted.round()))

### Confusion Matrix

In [None]:
# Confusion matrix of the rounded network outputs on the test split.
cm = confusion_matrix(y_test, ann_predicted.round()) 

In [None]:
# Heat map of the confusion matrix computed in the previous cell
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cm, ax=ax, annot=True, fmt='.0f', linewidths=0.2, linecolor='purple')
ax.set_xlabel('predicted value')
ax.set_ylabel('Truth value')
plt.show()