# HEART DISEASE PREDICTION

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz  # to export graph of decision tree to pdf
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings("ignore")

# Data Collection and Processing 

**About Dataset**  
**Context**  
This data set dates from 1988 and consists of four databases: Cleveland, Hungary, Switzerland, and Long Beach V. It contains 76 attributes, including the predicted attribute, but all published experiments refer to using a subset of 14 of them. The "target" field refers to the presence of heart disease in the patient. It is integer valued 0 = no disease and 1 = disease.  

**Content**  
Attribute Information:  

age  
sex  
chest pain type (4 values)  
resting blood pressure  
serum cholesterol in mg/dl  
fasting blood sugar > 120 mg/dl  
resting electrocardiographic results (values 0,1,2)  
maximum heart rate achieved  
exercise induced angina  
oldpeak = ST depression induced by exercise relative to rest  
the slope of the peak exercise ST segment  
number of major vessels (0-3) colored by fluoroscopy  
thal: 0 = normal; 1 = fixed defect; 2 = reversible defect  
The names and social security numbers of the patients were recently removed from the database, replaced with dummy values.  

In [None]:
# Read the heart-disease CSV into a pandas DataFrame.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
heart_csv_path = '/Users/riteshkumar/Downloads/ML projects/Heart Attack Risk Assessment/heart.csv'
heart_data = pd.read_csv(heart_csv_path)

In [None]:
# preview the first 5 rows of the dataset
heart_data.head(n=5)

In [None]:
# preview the last 5 rows of the dataset
heart_data.tail(n=5)

In [None]:
# dimensions of the dataset as a (rows, columns) tuple
heart_data.shape

In [None]:
# print a summary of the DataFrame: columns, non-null counts and dtypes
heart_data.info()

In [None]:
# count the missing values in each column
heart_data.isna().sum()

In [None]:
# descriptive statistics (count, mean, std, min, quartiles, max) per column
heart_data.describe()

# Data Scaling

In [None]:
# class balance: number of rows for each value of the 'target' column
heart_data['target'].value_counts()

1 --> Defective Heart

0 --> Healthy Heart

Splitting the Features and Target

In [None]:
# Separate the label column from the feature columns.
target_column = 'target'
Y = heart_data[target_column]
X = heart_data.drop(columns=[target_column])

In [None]:
# Optional min-max scaling of the features (currently disabled).
# NOTE(review): as written this would fit the scaler on all of X before the
# train/test split, so test-set statistics would leak into training.
# scaler = MinMaxScaler()
# X = pd.DataFrame(scaler.fit_transform(X) , columns = X.columns)
# X

In [None]:
# display the target column
Y

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on Y and
# uses a fixed seed so it is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# shapes of the full, training and test feature sets
print(*(features.shape for features in (X, X_train, X_test)))

# Model Training & Evaluation

### Logistic Regression

In [None]:
# Logistic regression baseline.
# The features are not scaled, and with the default max_iter=100 the solver
# can stop before converging. That warning is hidden by the global warnings
# filter, so give the solver a larger iteration budget.
logistic_reg = LogisticRegression(random_state=0, max_iter=1000)
logistic_reg.fit(X_train, Y_train)  # Training

In [None]:
# accuracy on training data
X_train_prediction1 = logistic_reg.predict(X_train)
# accuracy_score takes the true labels first, then the predictions
training_data_accuracy = accuracy_score(Y_train, X_train_prediction1)

print('Accuracy on Training data : ', training_data_accuracy)

In [None]:
# accuracy on test data
X_test_prediction1 = logistic_reg.predict(X_test)
# accuracy_score takes the true labels first, then the predictions
test_data_accuracy = accuracy_score(Y_test, X_test_prediction1)
print('Accuracy on Test data : ', test_data_accuracy)

In [None]:
# Classification report for the logistic regression test predictions
# (this is a per-class metrics report, not a confusion matrix)
print(classification_report(Y_test , X_test_prediction1))

### Decision Tree Classifier

In [None]:
# Depth-limited decision tree with a fixed seed for reproducibility.
dec_tree_clf = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=0,
)
dec_tree_clf.fit(X_train, Y_train)  # train on the training split

In [None]:
# accuracy on training data
X_train_prediction2 = dec_tree_clf.predict(X_train)
# accuracy_score takes the true labels first, then the predictions
training_data_accuracy = accuracy_score(Y_train, X_train_prediction2)
print('Accuracy on Training data : ', training_data_accuracy)

In [None]:
# accuracy on test data
X_test_prediction2 = dec_tree_clf.predict(X_test)
# accuracy_score takes the true labels first, then the predictions
test_data_accuracy = accuracy_score(Y_test, X_test_prediction2)
print('Accuracy on Test data : ', test_data_accuracy)

In [None]:
# Classification report for the decision tree test predictions
# (this is a per-class metrics report, not a confusion matrix)
print(classification_report(Y_test , X_test_prediction2))

### Random Forest Classifier (BEST ACCURACY)

In [None]:
# Random forest with trees capped at depth 6 and a fixed seed.
random_forest_clf = RandomForestClassifier(
    max_depth=6,
    random_state=0,
)
random_forest_clf.fit(X_train, Y_train)  # train on the training split

In [None]:
# accuracy on training data
X_train_prediction3 = random_forest_clf.predict(X_train)
# accuracy_score takes the true labels first, then the predictions
training_data_accuracy = accuracy_score(Y_train, X_train_prediction3)
print('Accuracy on Training data : ', training_data_accuracy)

In [None]:
# accuracy on test data
X_test_prediction3 = random_forest_clf.predict(X_test)
# accuracy_score takes the true labels first, then the predictions
test_data_accuracy = accuracy_score(Y_test, X_test_prediction3)
print('Accuracy on Test data : ', test_data_accuracy)

In [None]:
# Classification report for the random forest test predictions
# (this is a per-class metrics report, not a confusion matrix)
print(classification_report(Y_test , X_test_prediction3))

### Grid Search CV - Hyperparameter Tuning

In [None]:
# Cross-validated grid search over random-forest hyperparameters.
param_grid = {
    'n_estimators': [200, 500],
    # 'auto' is left out: it was deprecated and later removed from
    # scikit-learn, and for classifiers it meant the same as 'sqrt'.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [3, 4, 5, 6],
}
CV_rfc = GridSearchCV(estimator=random_forest_clf, param_grid=param_grid)
CV_rfc.fit(X_train, Y_train)
# best hyperparameter combination found by the search
CV_rfc.best_params_

In [None]:
# Refit the random forest with the hyperparameters chosen by the grid search
# rather than hand-copied values, so this cell cannot drift from the search.
random_forest_clf = RandomForestClassifier(random_state=0, **CV_rfc.best_params_)
random_forest_clf.fit(X_train, Y_train)
X_test_prediction3 = random_forest_clf.predict(X_test)
# accuracy_score takes the true labels first, then the predictions
test_data_accuracy = accuracy_score(Y_test, X_test_prediction3)
print('Accuracy on Test data : ', test_data_accuracy)