# 1. Business Understanding
- Goal: predict the likelihood that a patient has heart disease
- Target column: `target`
- Most likely, I'll use classifiers because the target is binary
- Data quality is good

# 2. Data Exploration

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import warnings

# Silence all warnings for the rest of the session.
# NOTE(review): a blanket "ignore" also hides any warnings the models emit while fitting.
warnings.simplefilter("ignore")

# Notebook environment: fixed NumPy seed, no limit on displayed columns
np.random.seed(1234)
pd.options.display.max_columns = None
# pd.set_option('display.max_rows', None)

In [None]:
# Load CSV file
file_path = 'C:/Users/User/Downloads/Python/'
file_name = 'heart.csv'

# The first header name comes back as 'ο»Ώage' when the file is decoded as
# cp1253: those three characters are the UTF-8 byte-order mark (EF BB BF)
# read with the wrong codec. 'utf-8-sig' decodes UTF-8 and strips the BOM,
# so the header is read cleanly as 'age'.
df = pd.read_csv(file_path + file_name, header=0, delimiter=',', encoding='utf-8-sig')

In [None]:
# Show the first 5 rows to check that the file parsed into the expected columns
df.head()

In [None]:
# Give the first column its intended name: the header arrives with a
# mis-decoded prefix glued in front of 'age'.
df = df.rename(columns={'ο»Ώage': 'age'})

In [None]:
# Preview again to confirm the first column is now named 'age'
df.head()

In [None]:
# Summarise the columns: names, dtypes and non-null counts
df.info()

In [None]:
# Show summary statistics for every column (include='all' does not restrict by dtype)
df.describe(include = 'all')

In [None]:
# Discard rows, then columns, that contain nothing but missing values
df = df.dropna(axis="index", how="all")
df = df.dropna(axis="columns", how="all")

In [None]:
# De-duplicate: of any set of identical rows, only the first occurrence is kept
df = df.drop_duplicates(keep='first')

## Correlation Analysis

In [None]:
# Compute the pairwise Pearson correlation matrix of df's columns.
# NOTE(review): assumes every column is numeric — confirm against df.info().
corr = df.corr(method='pearson')
# Display the matrix as the cell output
corr

In [None]:
# Render the correlation matrix as an annotated heatmap on a wide figure
plt.figure(figsize=(20, 6))
sns.heatmap(
    df.corr(),
    annot=True,      # print each coefficient inside its cell
    fmt=".2f",       # two decimal places
    linewidth=.5,
)
plt.show()

In [None]:
# Perform One-Hot Encoding to prepare the data for ML
# NOTE(review): with no `columns=` argument, get_dummies presumably converts only
# object/string/category columns — confirm that integer-coded categorical features
# are meant to stay numeric.
df_model = pd.get_dummies(df)

In [None]:
# Preview the encoded frame that will be used for modeling
df_model.head()

# 3. Modeling

In [None]:
# Separate the label column (y) from the predictor columns (X)
y = df_model['target']
X = df_model.drop(columns='target')

In [None]:
# List the feature columns that remain after dropping 'target'
X.columns

In [None]:
# Display the 'target' series
y

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

In [None]:
# Report the shape of each partition, in the order X_train, X_test, y_train, y_test
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

## Use different scikit-learn classifiers

In [None]:
# Fit five classifiers on the training partition and keep each model's
# predictions for the held-out test partition. random_state is fixed on every
# estimator that is given one here so repeated runs produce the same models.

# LogisticRegression
lr = LogisticRegression(random_state=1234).fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# SVC
svc = SVC(random_state=1234).fit(X_train, y_train)
# Predict with svc itself; using lr here would make svc_pred a copy of lr_pred
svc_pred = svc.predict(X_test)

# KNeighborsClassifier
knn = KNeighborsClassifier().fit(X_train, y_train)
knn_pred = knn.predict(X_test)

# RandomForestClassifier
rf = RandomForestClassifier(random_state=1234).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# GradientBoostingClassifier
gb = GradientBoostingClassifier(random_state=1234).fit(X_train, y_train)
gb_pred = gb.predict(X_test)

# 4. Evaluation

In [None]:
# Generate our evaluation metrics to find the best model.
# Accuracy is measured on the held-out test partition only: X and y contain the
# rows every model was fitted on, so scoring on them would overstate accuracy
# and favour models that memorise the training data.
for label, model in (("lr", lr), ("svc", svc), ("knn", knn), ("rf", rf), ("gb", gb)):
    print(label + " Accuracy: " + str(model.score(X_test, y_test)))

In [None]:
# The random forest ('rf') is taken as the best model.
# NOTE(review): X includes the rows rf was trained on, so these predictions are
# not an out-of-sample estimate for those rows.
rf_pred_new = rf.predict(X)             # Predict the target for every row of the dataset

In [None]:
# Add a new column in the original dataset with the predictions.
# NOTE(review): df is modified in place; re-running the encoding and modeling
# cells afterwards would carry 'target_pred' into the features — confirm the
# notebook is always run top to bottom.
df['target_pred'] = rf_pred_new

In [None]:
# Display the dataset together with the 'target_pred' column
df

# The End