# **1.IMPORTING LIBRARIES**

In [None]:
# Libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


# **2.DATA PREPARATION**

* *2.1. IMPORTING DATA*

In [None]:
# Load the heart-failure CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='/kaggle/input/heart-failure-prediction/heart.csv')

* *2.2. DATA INFORMATION*

In [None]:
# Show the first ten rows
data.head(n=10)

In [None]:
# Show five randomly drawn rows
data.sample(n=5)

In [None]:
# Summary statistics of the DataFrame's columns
data.describe()

In [None]:
# Print the DataFrame's column summary (names, non-null counts, dtypes)
data.info()

* *2.3. DATA VISUALISATION*

In [None]:
# Histogram of the Sex column; the title follows the same
# "Distribution of <column>" pattern as the other histograms.
figure = px.histogram(
    data,
    x="Sex",
    hover_data=data.columns,
    title="Distribution of Sex",
)
figure.show()

In [None]:
# Histogram of the ChestPainType column
figure = px.histogram(
    data_frame=data,
    x='ChestPainType',
    title="Distribution of ChestPainType",
    hover_data=data.columns,
)
figure.show()

In [None]:
# Histogram of the RestingECG column
figure = px.histogram(
    data_frame=data,
    x='RestingECG',
    title="Distribution of RestingECG",
    hover_data=data.columns,
)
figure.show()

In [None]:
# Box plot of Age, one box per HeartDisease value
figure = px.box(
    data_frame=data,
    x="HeartDisease",
    y="Age",
    title='Distribution of Age',
)
figure.show()

In [None]:
# Box plot of Cholesterol, one box per HeartDisease value.
# The title names the plotted variable (Cholesterol), not the grouping column.
figure = px.box(
    data,
    x='HeartDisease',
    y='Cholesterol',
    title='Distribution of Cholesterol',
)
figure.show()

In [None]:
# Box plot of MaxHR, one box per HeartDisease value.
# Titled like the other box plots so the figure can be read on its own.
figure = px.box(
    data,
    x='HeartDisease',
    y='MaxHR',
    title='Distribution of MaxHR',
)
figure.show()

In [None]:
# Split the rows by diagnosis: yes = heart disease, no = no heart disease
yes = data.loc[data["HeartDisease"] == 1]
no = data.loc[data["HeartDisease"] == 0]

# Age vs. MaxHR, one scatter layer per group
for group, colour, tag in (
    (yes, 'red', 'has a heart disease'),
    (no, 'green', 'has not a heart disease'),
):
    plt.scatter(group["Age"], group["MaxHR"], color=colour, label=tag, alpha=0.4)
plt.xlabel("Age")
plt.ylabel("MaxHR")
plt.legend()
plt.show()

In [None]:
# Age vs. Cholesterol, one scatter layer per group
for group, colour, tag in (
    (yes, 'red', 'has a heart disease'),
    (no, 'green', 'has not a heart disease'),
):
    plt.scatter(group["Age"], group["Cholesterol"], color=colour, label=tag, alpha=0.4)
plt.xlabel("Age")
plt.ylabel("Cholesterol")
plt.legend()
plt.show()

In [None]:
# Age vs. RestingBP, one scatter layer per group
for group, colour, tag in (
    (yes, 'red', 'has a heart disease'),
    (no, 'green', 'has not a heart disease'),
):
    plt.scatter(group["Age"], group["RestingBP"], color=colour, label=tag, alpha=0.4)
plt.xlabel("Age")
plt.ylabel("RestingBP")
plt.legend()
plt.show()

* *2.4. DATA CLEANING*


In [None]:
# Count the missing values in each column
data.isna().sum()

* *2.4.1 One Hot Encoding*

In [None]:
# One Hot Encoding For Non Tree Based Algorithms

# Names of the object-dtype columns that get expanded into indicator columns
string_col = data.select_dtypes(include=["object"]).columns

df = pd.get_dummies(data=data, columns=string_col, drop_first=False)
df.head()

* *2.5. DEFINE VALUES*

In [None]:
# Target vector: the HeartDisease column as an array
y = df["HeartDisease"].values

# Feature frame: every column except the target
x_data = df.drop(columns=["HeartDisease"])
x_data.shape

* *2.6. DATA NORMALIZATION*

    Formula = (x - min(x)) / (max(x) - min(x))

In [None]:
# Min-max scale each feature column independently: (x - min) / (max - min).
# DataFrame.min()/max() reduce per column. np.min()/np.max() on a DataFrame
# reduce over the whole frame on pandas >= 2.0 and return a single scalar,
# which would scale every column by the same global range.
# Casting to float first keeps the arithmetic valid for boolean dummy columns.
x_numeric = x_data.astype(float)
col_min = x_numeric.min()
# A constant column has zero range; divide by 1 so it maps to 0 instead of NaN.
col_range = (x_numeric.max() - col_min).replace(0, 1)
x = (x_numeric - col_min) / col_range
x.shape

* *2.7. TRAIN TEST SPLIT*

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed is fixed at 25
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=25,
)

# **3 CLASSIFICATION ALGORITHMS**

# *3.1. NON TREE BASED ALGORITHMS*

**3.1.1. LOGISTIC REGRESSION**

In [None]:
# Sklearn library
from sklearn.linear_model import LogisticRegression

# Build the model and train it in one step (fit() returns the estimator itself)
lr = LogisticRegression().fit(x_train, y_train)

# Predictions on the held-out rows
y_pred = lr.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Accuracy on the test split
lr_accuracy = lr.score(x_test, y_test)
print("Accuracy of Logistic Regression: {}\n".format(lr_accuracy))

# Confusion matrix of true labels vs. predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion matrix of loigistic regression:\n {}".format(cm))

**3.1.2. K-NEAREST NEIGHBORS(KNN)**

In [None]:
# Sklearn library
from sklearn.neighbors import KNeighborsClassifier

# Build a 3-neighbour model and train it (fit() returns the estimator itself)
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)

# Predictions on the held-out rows
y_pred = knn.predict(x_test)

In [None]:
# Accuracy on the test split
knn_accuracy = knn.score(x_test, y_test)
print("Accuracy of KNN: {}\n".format(knn_accuracy))

# Confusion matrix of true labels vs. predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion matrix of KNN:\n {}".format(cm))

*3.1.2.1. Finding Best K Values*

In [None]:
# Evaluate KNN for k = 1..14 and record the train and test accuracy for each k.
k_values = range(1, 15)

# Lists for values (the misspelled name is kept so existing references still resolve)
trian_accuracy = []
test_accuracy = []

for each in k_values:

    # Fit a fresh KNN model for this k
    knn_2 = KNeighborsClassifier(n_neighbors=each)
    knn_2.fit(x_train, y_train)

    # fit() returns the estimator itself, so score() is what yields the accuracy
    trian_accuracy.append(knn_2.score(x_train, y_train))
    test_accuracy.append(knn_2.score(x_test, y_test))

# Plot both curves so over-fitting at small k is visible
plt.plot(k_values, trian_accuracy, color='blue', label='train')
plt.plot(k_values, test_accuracy, color='green', label='test')
plt.xlabel('K values')
plt.ylabel('Accuracy values')
plt.legend()
plt.show()

# First k that reaches the highest test accuracy
best_index = int(np.argmax(test_accuracy))
print("Best K values is: {} and hiegst accuracy is: {} ".format(k_values[best_index], test_accuracy[best_index]))

The plot and the printed result above show which k value gives the highest test accuracy (k = 3 in the original run).

**3.1.3. SUPPORT VECTOR MACHINE(SVM)**

In [None]:
# Sklearn library
from sklearn.svm import SVC

# Build an RBF-kernel model and train it (fit() returns the estimator itself)
svm = SVC(kernel='rbf').fit(x_train, y_train)

# Predictions on the held-out rows
y_pred = svm.predict(x_test)


In [None]:
# Accuracy on the test split
svm_accuracy = svm.score(x_test, y_test)
print("Accoracy of SVM: {}\n".format(svm_accuracy))

# Confusion matrix of true labels vs. predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion matrix of SVM:\n {}".format(cm))

**3.1.4 NAIVE BAYES**

In [None]:
# Sklearn library
from sklearn.naive_bayes import GaussianNB

# Build the model and train it in one step (fit() returns the estimator itself)
nb = GaussianNB().fit(x_train, y_train)

# Predictions on the held-out rows
y_pred = nb.predict(x_test)

In [None]:
# Accuracy on the test split
nb_accuracy = nb.score(x_test, y_test)
print("Accuracy of Naive Bayes: {}\n".format(nb_accuracy))

# Confusion matrix of true labels vs. predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion matrix of Naive Bayes:\n {}".format(cm))

# *3.2. TREE BASED ALGORITHM*

* **DATA PREPROCESSING**

1. *LABEL ENCODER*

In [None]:
# Label Encoding For Tree Based Algorithms
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the raw frame stays untouched
df_tree = data.copy()

# Encode only the text (object dtype) columns. Running LabelEncoder over
# every column would also replace the numeric features with rank codes
# and discard their real values.
categorical_cols = df_tree.select_dtypes(include="object").columns
df_tree[categorical_cols] = df_tree[categorical_cols].apply(LabelEncoder().fit_transform)
df_tree.head()

2. *DEFINE VARIABLES*

In [None]:
# Target vector: the HeartDisease column as an array
y = df_tree["HeartDisease"].values

# Feature frame: every column except the target
x_data = df_tree.drop(columns=["HeartDisease"])

3. *DATA NORMALIZATION* 

In [None]:
# Min-max scale each feature column independently: (x - min) / (max - min).
# DataFrame.min()/max() reduce per column. np.min()/np.max() on a DataFrame
# reduce over the whole frame on pandas >= 2.0 and return a single scalar,
# which would scale every column by the same global range.
x_numeric = x_data.astype(float)
col_min = x_numeric.min()
# A constant column has zero range; divide by 1 so it maps to 0 instead of NaN.
col_range = (x_numeric.max() - col_min).replace(0, 1)
x = (x_numeric - col_min) / col_range
x.shape

4. *TRAIN TEST SPLIT*

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed is fixed at 1
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

**3.2.1. DECISION TREE**

In [None]:
# Sklearn library
from sklearn.tree import DecisionTreeClassifier

# Model: the seed is fixed so the fitted tree, and the accuracy reported
# from it, are the same on every run
dt = DecisionTreeClassifier(random_state=1)

# Train
dt.fit(x_train, y_train)

# Predictions on the held-out rows
y_pred = dt.predict(x_test)

In [None]:
# Accuracy
print("Accuracy of Decision Tree: {}\n".format(dt.score(x_test,y_test)))

# Confusion matrix
cm = confusion_matrix(y_test,y_pred)
print("Confusion matrix of Decision Tree:\n {}".format(cm))

**3.2.2. RANDOM FOREST**

In [None]:
# Sklearn library
from sklearn.ensemble import RandomForestClassifier

# Build a 9-tree forest with a fixed seed and train it (fit() returns the estimator itself)
rf = RandomForestClassifier(n_estimators=9, random_state=7).fit(x_train, y_train)

# Predictions on the held-out rows
y_pred = rf.predict(x_test)

In [None]:
# Accuracy on the test split
rf_accuracy = rf.score(x_test, y_test)
print("Accuracy of Renadom Forest: {}\n".format(rf_accuracy))

# Confusion matrix of true labels vs. predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion matrix of Random Forest:\n {}".format(cm))

In [None]:
# Evaluate random forests with 1..14 trees and record train/test accuracy for each.
n_values = range(1, 15)

# Lists for accuracy values of train and test
accuracy_train = []
accuracy_test = []

for each in n_values:

    # Fit a fresh forest with this many trees
    rf_2 = RandomForestClassifier(n_estimators=each, random_state=7)
    rf_2.fit(x_train, y_train)

    # fit() returns the estimator itself, so score() is what yields the accuracy
    accuracy_train.append(rf_2.score(x_train, y_train))
    accuracy_test.append(rf_2.score(x_test, y_test))

# Plot both curves so the gap between train and test accuracy is visible
plt.plot(n_values, accuracy_train, label='train')
plt.plot(n_values, accuracy_test, label='test')
plt.xlabel("n_estimators")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

# First n_estimators value that reaches the highest test accuracy
best_index = int(np.argmax(accuracy_test))
print("Best n_estimators is {} and hiegher accuracy is {}".format(n_values[best_index], accuracy_test[best_index]))

# 4. CONCLUSION

1. Importing data
2. Cleaning data
3. Preparing data
4. Predictions of non tree based classification algorithms 
5. Predictions of tree based classifications algorithms