In [100]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [101]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [102]:
# Load the heart-disease CSV from the Kaggle input directory into a DataFrame.
# NOTE: the path is absolute and Kaggle-specific; adjust it when running elsewhere.
df = pd.read_csv('/kaggle/input/heart-disease-dataset/heart.csv')

In [103]:
# (rows, columns) of the loaded frame.
df.shape

In [104]:
# Column dtypes and non-null counts.
df.info()

In [105]:
# Preview the first five rows.
df.head()

## EDA and visualisations

In [106]:
# Histogram (with KDE overlay) of patient age.
# sns.displot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) would be left blank and its size/dpi ignored.
# The axes-level sns.histplot draws into the figure created here instead.
plt.figure(figsize=(8,6),dpi=100)
sns.histplot(x='age',data=df,kde=True)
print(f"We can see that the mean age is {np.mean(df['age'])}. As expected it is higher for the patients.")

In [107]:
# Scatter of cholesterol against age, with points coloured by the target label.
fig, ax = plt.subplots(figsize=(6, 4), dpi=100)
sns.scatterplot(data=df, x='age', y='chol', hue='target', ax=ax)
print("One obvious conclusion is that people having higher age generally suffers from heart disease")

**Checking the correlation between different features**

In [108]:
# Pairwise correlation matrix of all columns (pandas default method).
corr = df.corr()

In [109]:
# Correlation of every column with the label column.
corr['target']

In [110]:
# Annotated heatmap of the correlation matrix on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 8), dpi=150)
sns.heatmap(corr, annot=True, ax=ax)

In [111]:
# Pairwise relationships between all columns, coloured by the target label.
sns.pairplot(df,hue='target')

In [112]:
# Row counts per value of 'sex', split by the target label.
sns.countplot(x='sex',data=df,hue='target')

In [113]:
# Row counts per value of 'cp', split by the target label.
sns.countplot(x='cp',data=df,hue='target')

#### The plot shows that people who do not experience chest pain generally do not have heart disease

## IMPORTING THE DIFFERENT MODELS

In [114]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,f1_score

In [115]:
# Separate the label column from the predictor columns.
y = df['target']
X = df.drop(columns=['target'])

In [116]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [117]:
from sklearn.preprocessing import MinMaxScaler

# Scale every feature to [0, 1] using the training split's min/max only.
# The scaler is fitted on the training data and then merely *applied* to the
# test data: refitting on the test split would leak test statistics and put
# the two splits on inconsistent scales.
# NOTE: this cell overwrites X_train / X_test, so it must be run exactly once
# after the train/test split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [118]:
def classify(clf, vals):
    """Fit ``clf`` on the training split and return its test accuracy.

    Parameters
    ----------
    clf : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    vals : object
        Unused. Kept only so existing calls keep working; the function reads
        the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Returns
    -------
    float
        Accuracy on the test split, rounded to two decimals.
    """
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return round(accuracy_score(y_test, y_pred), 2)


def accuracies(vals, random_state=42):
    """Train the candidate classifiers and collect their test accuracies.

    Parameters
    ----------
    vals : object
        Forwarded to ``classify`` (which ignores it).
    random_state : int, optional
        Seed given to the stochastic models (AdaBoost, random forest) so the
        comparison is reproducible from run to run.

    Returns
    -------
    dict
        Maps a model label to its rounded test accuracy.
    """
    candidates = {
        "AdaBoostClassifier": AdaBoostClassifier(random_state=random_state),
        "KNeighborsClassifier(n_neighbors=5)": KNeighborsClassifier(n_neighbors=5),
        "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
        "LogisticRegression": LogisticRegression(),
    }
    return {label: classify(estimator, vals) for label, estimator in candidates.items()}

In [119]:
# Compare the candidate models; the argument is passed through but not used by classify().
accuracies(df)

## Random Forest performs best, so we tune it individually

In [120]:
# Base estimator plus the hyper-parameter values the grid search will try.
rfc = RandomForestClassifier()
param_grid = {
    'n_estimators': [10, 20, 50, 100, 200],
    'max_depth': [2, 3, 4, 5],
}

In [121]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid, scored with 5-fold cross-validation
# on the training split.
grid = GridSearchCV(rfc, param_grid, cv=5, verbose=0)
grid.fit(X_train, y_train)

In [122]:
# Estimator instance selected by the grid search.
grid.best_estimator_

In [123]:
# Hyper-parameter combination selected by the grid search.
grid.best_params_

In [124]:
# Test-split accuracy of the grid search's predictions.
pred = grid.predict(X_test)
accuracy_score(y_test,pred)

In [125]:
# Fit the default-parameter forest (`rfc`) directly and score it on the test
# split, as an untuned point of comparison for the grid-search result.
rfc.fit(X_train,y_train)
accuracy_score(y_test,rfc.predict(X_test))

# Training an artificial neural network on this data

In [132]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [129]:
# (samples, features) of the training matrix; the feature count is the network's input width.
X_train.shape

In [None]:
#Building a simple ANN model

In [138]:
# Baseline feed-forward network: two ReLU hidden layers (100 and 25 units)
# followed by a single sigmoid unit for the binary label.
model = keras.models.Sequential()
for layer in (
    keras.layers.Flatten(),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(25, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
):
    model.add(layer)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for a fixed 100 epochs on the training split.
model.fit(X_train, y_train, epochs=100)

In [139]:
# Loss and accuracy of the first network on the test split.
model.evaluate(X_test,y_test)

In [140]:
# Second, wider network. `input_shape` is the shape of ONE sample (the batch
# dimension is excluded), i.e. a vector of n_features values, so it is taken
# from the training matrix rather than hard-coded with the sample count.
model2 = keras.models.Sequential()
model2.add(keras.layers.Dense(150, input_shape=(X_train.shape[1],), activation="relu"))  # hidden layer, 150 units, ReLU
model2.add(keras.layers.Dense(75, activation="relu"))  # hidden layer, 75 units, ReLU
model2.add(keras.layers.Dense(1, activation="sigmoid"))  # output: probability of the positive class

In [143]:
# Compile and train the second network with early stopping.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model2.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Stop when the validation loss has not improved for 10 epochs and roll back
# to the best weights seen.
early_stopping_nn = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

# Validation data is carved out of the training split (last 20%) instead of
# reusing the test split: early stopping selects weights based on validation
# loss, so validating on the test set would leak it into model selection and
# bias the final test score.
# The labels are converted to a NumPy array because `validation_split` only
# supports arrays/tensors.
heart_model = model2.fit(
    X_train,
    np.asarray(y_train),
    epochs=100,
    validation_split=0.2,
    callbacks=[early_stopping_nn],
)


In [144]:
# Learning curves: one line per metric recorded during training.
history_df = pd.DataFrame(heart_model.history)
ax = history_df.plot(figsize=(8, 5))
ax.grid(True)
ax.set_ylim(0, 1)
plt.show()

In [145]:
# Evaluate the second network (the one just trained with early stopping) on
# the test split; the first network was already evaluated right after it was trained.
model2.evaluate(X_test,y_test)