# importing used libraries

In [None]:
import pandas as pd

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
import pickle

In [None]:
from sklearn.model_selection import cross_validate

# reading and exploring the data

In [None]:
# Load the heart_2020_cleaned.csv dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='heart_2020_cleaned.csv')

In [None]:
# Show only the rows whose HeartDisease value is 'Yes'.
has_disease = df['HeartDisease'] == 'Yes'
df.loc[has_disease]

In [None]:
# Work on a 10,000-row random subset of the data. Fixing the seed makes the
# sample, and every metric computed from it further down, repeatable across
# notebook runs.
df1 = df.sample(n=10000, random_state=42)

In [None]:
# Display the sampled frame.
df1

In [None]:
# Keep the column index of the sample for later use and display it.
colmns=df1.columns
colmns

# Statistical Description of the Features

In [None]:
# Summary statistics of the sample as produced by DataFrame.describe().
df1.describe()

# data cleaning and preprocessing

In [None]:
# Count the distinct row-wise missing-value patterns (True marks a missing cell).
df1.isna().value_counts()

In [None]:
# Count how many rows are flagged as duplicates of an earlier row.
df1.duplicated().value_counts()

In [None]:
# Keep only the first occurrence of each duplicated row.
df1 = df1.drop_duplicates()

In [None]:
# Re-count duplicate flags to confirm the drop took effect.
df1.duplicated().value_counts()

# encoding non numerical data

In [None]:
# Encode every cell that is exactly 'Yes' or 'No' as 1 / 0 in one pass over the
# frame; the previous per-column loop rebuilt the whole DataFrame once per
# column to reach the same result.
# infer_objects() then converts columns that have become all-numeric to a
# numeric dtype explicitly, instead of relying on replace()'s deprecated
# silent downcasting.
df1 = df1.replace({'Yes': 1, 'No': 0}).infer_objects()

In [None]:
# Encode Sex as Male -> 1, Female -> 0.
# infer_objects() makes the conversion to a numeric dtype explicit rather than
# depending on replace()'s deprecated automatic downcasting.
df1 = df1.replace({'Sex': {'Male': 1, 'Female': 0}}).infer_objects()

In [None]:
# Split AgeCategory on "-" into two columns: the part before the dash goes to
# 'age', the part after it to 'Last age'.
age_parts = df1['AgeCategory'].str.split("-", expand=True)
df1[['age', 'Last age']] = age_parts


In [None]:
# Remove the 'Last age' column from the frame.
df1 = df1.drop(columns=['Last age'])

In [None]:
# Fold the two remaining Diabetic answers into the binary encoding:
# 'Yes (during pregnancy)' -> 1, 'No, borderline diabetes' -> 0.
# infer_objects() makes the conversion to a numeric dtype explicit rather than
# depending on replace()'s deprecated automatic downcasting.
df1 = df1.replace(
    {'Diabetic': {'Yes (during pregnancy)': 1,
                  'No, borderline diabetes': 0}}
).infer_objects()

In [None]:
# Map the open-ended '80 or older' bucket to the number 80.
df1['age'] = df1['age'].replace('80 or older', 80)


In [None]:
# Cast the 'age' column to integers.
df1 = df1.astype({'age': int})

In [None]:
# Frequency of each Race value in the sample.
df1['Race'].value_counts()


In [None]:
# Integer codes for the Race categories.
race_codes = {
    'White': 0,
    'Hispanic': 1,
    'Black': 2,
    'Other': 3,
    'Asian': 4,
    'American Indian/Alaskan Native': 5,
}
# infer_objects() makes the conversion to a numeric dtype explicit rather than
# depending on replace()'s deprecated automatic downcasting.
df1 = df1.replace({'Race': race_codes}).infer_objects()

In [None]:
# Ordinal codes for GenHealth, from 'Poor' (0) up to 'Excellent' (4).
gen_health_codes = {
    'Poor': 0,
    'Fair': 1,
    'Good': 2,
    'Very good': 3,
    'Excellent': 4,
}
# infer_objects() makes the conversion to a numeric dtype explicit rather than
# depending on replace()'s deprecated automatic downcasting.
df1 = df1.replace({'GenHealth': gen_health_codes}).infer_objects()

# splitting the data into train and test

In [None]:
# Annotated correlation heatmap over the numeric columns of df1.
fig, ax = plt.subplots(figsize=(15, 15))
correlations = df1.corr(numeric_only=True)
dataplot = sns.heatmap(correlations, annot=True, ax=ax)

# Render the figure.
plt.show()

In [None]:
# Feature matrix: every column except the target and the columns left out of
# the model.
excluded_columns = ['HeartDisease', 'AgeCategory', 'MentalHealth', 'Race', 'SleepTime']
X = df1.drop(columns=excluded_columns)
# Target vector: the HeartDisease column.
y = df1.HeartDisease

In [None]:
# Display the feature matrix.
X

In [None]:
# Rebind colmns to the feature columns only and display them.
colmns=X.columns
colmns

In [None]:
# Horizontal box plot of every feature column, restyled so each box is drawn
# as a coloured outline on a tinted background.
fig, ax = plt.subplots(figsize=(15, 9))
background = '#CAD5E0'
ax.patch.set_facecolor(background)
fig.patch.set_facecolor(background)
mpl.rcParams['font.family'] = 'TeX Gyre Heros'

sns.boxplot(data=X, ax=ax, palette='husl', orient="h", linewidth=4)

# Recolour: every box artist passes its fill colour to its own edge and to the
# six line artists at positions 6*k .. 6*k+5 of ax.lines, then loses its fill.
# NOTE(review): this depends on boxes being listed in ax.artists; if that list
# is empty the loop does nothing - confirm against the installed seaborn.
LINES_PER_BOX = 6
for box_index, box in enumerate(ax.artists):
    box_colour = box.get_facecolor()
    box.set_edgecolor(box_colour)
    box.set_facecolor('None')
    for offset in range(LINES_PER_BOX):
        box_line = ax.lines[box_index * LINES_PER_BOX + offset]
        box_line.set_color(box_colour)
        box_line.set_mfc(box_colour)
        box_line.set_mec(box_colour)

# Hide tick marks on both axes.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticks_position('none')

# Hide the four spines.
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_visible(False)

# No grid lines.
ax.grid(False)

# Tick label size.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

# Title
ax.set_title('Outliers', fontsize=40, fontweight="bold", pad=20);


## Scaling Features

In [None]:
# Scaling features
from sklearn.preprocessing import MinMaxScaler

# Fit a single scaler over all feature columns. MinMaxScaler scales each
# column independently, so the values match the former per-column loop, and
# keeping the fitted object lets the same transform be applied to new inputs.
# NOTE(review): the scaler is fitted before the train/test split, so rows that
# end up in the test set influence the min/max; fit on the training split only
# if a strict evaluation is required.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Empty results table; one row per evaluated model is added further down.
score_columns = ['accuracy', 'f1', 'recall', "precision", "Algorithm", "Balanced-Data"]
scoreDF = pd.DataFrame(columns=score_columns)

In [None]:
# 80/20 train/test split.
# random_state makes the split repeatable; stratify=y keeps the HeartDisease
# class ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42, stratify=y
)

# Balance the training set using SMOTE

In [None]:

# Resample the training split with SMOTE; the test split is not touched.
sm = SMOTE(random_state=42)
# Series.ravel() is deprecated in recent pandas; to_numpy() hands SMOTE the
# same 1-D array of labels.
X_trainnew, y_trainnew = sm.fit_resample(X_train, y_train.to_numpy())

# Bar chart and table of the class counts after resampling.
balanced_counts = pd.Series(y_trainnew).value_counts()
class_balance = balanced_counts.plot.bar()
class_balance.set_title("Outcome ytrain (SMOTE)")
balanced_counts

# model training

Here we train the models on both balanced and unbalanced data, before and after hyperparameter tuning, to see the difference.

# LogisticRegression

In [None]:
# Logistic regression with default settings, fitted on the un-resampled
# training split.
loger = LogisticRegression()
loger.fit(X=X_train, y=y_train)

In [None]:
# Logistic regression with default settings, fitted on the SMOTE-resampled
# training data.
loger1 = LogisticRegression()
loger1.fit(X=X_trainnew, y=y_trainnew)

In [None]:
# Test-set accuracy and F1 of the model fitted on the un-resampled data.
y_pred0 = loger.predict(X_test)
for label, metric in (('Accuracy score: ', accuracy_score), ('F1 Score: ', f1_score)):
    print(label, round(metric(y_test, y_pred0), 4))

In [None]:
# Test-set accuracy and F1 of the model fitted on the SMOTE-resampled data.
y_pred1 = loger1.predict(X_test)
for label, metric in (('Accuracy score: ', accuracy_score), ('F1 Score: ', f1_score)):
    print(label, round(metric(y_test, y_pred1), 4))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 table for y_pred0.
report = classification_report(y_test, y_pred0)
print(report)


In [None]:
# Confusion matrix for y_pred1, drawn with each cell as a share of all test
# samples.
cf_matrix = confusion_matrix(y_test, y_pred1)
cf_share = cf_matrix / cf_matrix.sum()
sns.heatmap(cf_share, annot=True, fmt='.2%', cmap='Blues')

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# L1-penalised logistic regression with cross-validated regularisation
# strength (Cs=10, 4 folds), fitted on the un-resampled training split.
# Note that this rebinds y_pred1.
lr_l1 = LogisticRegressionCV(Cs=10, cv=4, penalty='l1', solver='liblinear')
lr_l1 = lr_l1.fit(X_train, y_train)
y_pred1 = lr_l1.predict(X_test)
for label, metric in (('Accuracy score: ', accuracy_score), ('F1 Score: ', f1_score)):
    print(label, round(metric(y_test, y_pred1), 4))

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# L1-penalised logistic regression with cross-validated regularisation
# strength (Cs=10, 4 folds), fitted on the SMOTE-resampled training data.
lr_l = LogisticRegressionCV(Cs=10, cv=4, penalty='l1', solver='liblinear')
lr_l = lr_l.fit(X_trainnew, y_trainnew)
y_pred00 = lr_l.predict(X_test)
for label, metric in (('Accuracy score: ', accuracy_score), ('F1 Score: ', f1_score)):
    print(label, round(metric(y_test, y_pred00), 4))

In [None]:
# Confusion matrix for y_pred1, drawn with each cell as a share of all test
# samples.
# NOTE(review): y_pred1 holds the predictions of the model fitted on
# un-resampled data; if the SMOTE-trained model was meant, use y_pred00.
cf_matrix = confusion_matrix(y_test, y_pred1)
cf_share = cf_matrix / cf_matrix.sum()
sns.heatmap(cf_share, annot=True, fmt='.2%', cmap='Blues')

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 table for y_pred1.
report = classification_report(y_test, y_pred1)
print(report)


In [None]:
# L2 regularized logistic regression, fitted on the un-resampled training
# split and scored on the test split.
lr_l2 = LogisticRegressionCV(Cs=10, cv=4, penalty='l2', solver='liblinear').fit(X_train, y_train)
y_pred2 = lr_l2.predict(X_test)
print('Accuracy score: ', round(accuracy_score(y_test, y_pred2), 4))
print('F1 Score: ', round(f1_score(y_test, y_pred2), 4))

# 10-fold cross-validation on the training split; the fold means go into the
# results table.
score = cross_validate(lr_l2, X_train, y_train, cv=10,scoring=['accuracy','f1','recall','precision'])
print("Test accuracy:{}".format(score["test_accuracy"].mean()))
data =[score["test_accuracy"].mean(), score["test_f1"].mean(), score["test_recall"].mean(),
       score["test_precision"].mean(),"logistic regression","No"]
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat. An empty table is replaced outright, which keeps the metric
# columns numeric.
new_row = pd.DataFrame([data], columns=scoreDF.columns)
scoreDF = new_row if scoreDF.empty else pd.concat([scoreDF, new_row], ignore_index=True)


In [None]:


# L2 regularized logistic regression, fitted on the SMOTE-resampled training
# data and scored on the test split.
lr_l22 = LogisticRegressionCV(Cs=10, cv=4, penalty='l2', solver='liblinear').fit(X_trainnew, y_trainnew)
y_pred3 = lr_l22.predict(X_test)
print('Accuracy score: ', round(accuracy_score(y_test, y_pred3), 4))
print('F1 Score: ', round(f1_score(y_test, y_pred3), 4))

# 10-fold cross-validation on the resampled data; the fold means go into the
# results table.
# NOTE(review): the folds are cut after SMOTE, so synthetic points derived
# from validation rows can sit in the training folds - these CV scores are
# likely optimistic compared with the test-set scores above.
score = cross_validate(lr_l22,  X_trainnew, y_trainnew, cv=10,scoring=['accuracy','f1','recall','precision'])
print("Test accuracy:{}".format(score["test_accuracy"].mean()))
data =[score["test_accuracy"].mean(), score["test_f1"].mean(), score["test_recall"].mean(),
       score["test_precision"].mean(),"logistic regression","yes"]
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat. An empty table is replaced outright, which keeps the metric
# columns numeric.
new_row = pd.DataFrame([data], columns=scoreDF.columns)
scoreDF = new_row if scoreDF.empty else pd.concat([scoreDF, new_row], ignore_index=True)


In [None]:
# Confusion matrix for y_pred3, drawn with each cell as a share of all test
# samples.
cf_matrix = confusion_matrix(y_test, y_pred3)
cf_share = cf_matrix / cf_matrix.sum()
sns.heatmap(cf_share, annot=True, fmt='.2%', cmap='Blues')

In [None]:
# Display the feature matrix.
X

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 table for y_pred2.
report = classification_report(y_test, y_pred2)
print(report)


# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# First KNN model: a single neighbour, fitted on the un-resampled training
# split.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
y_pred4 = knn.predict(X_test)
for label, metric in (('Accuracy score: ', accuracy_score), ('F1 Score: ', f1_score)):
    print(label, round(metric(y_test, y_pred4), 4))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 table for y_pred4.
report = classification_report(y_test, y_pred4)
print(report)


# hyperparameter tuning for KNN

In [None]:
# max_k = 20
# f1_scores = list()
# error_rates = list() # 1-accuracy
# accuracy_scores = list()

# for k in range(1, max_k+1):
    
#     knn = KNeighborsClassifier(n_neighbors=k)
#     knn = knn.fit(X_trainnew, y_trainnew)
    
#     y_pred5 = knn.predict(X_test)
#     f1 = f1_score(y_pred5, y_test)
#     print(f1)
#     f1_scores.append((k, round(f1_score(y_test, y_pred5), 4)))
#     error = 1-round(accuracy_score(y_test, y_pred5), 4)
#     print(error)
#     error_rates.append((k, error))
#     acc= round(accuracy_score(y_test, y_pred5), 4)
#     accuracy_scores.append((k, acc))
    
# f1_results = pd.DataFrame(f1_scores, columns=['K', 'F1 Score'])
# error_results = pd.DataFrame(error_rates, columns=['K', 'Error Rate'])
# acc_results = pd.DataFrame(accuracy_scores, columns=['K', 'acc Score'])

In [None]:
#acc_results

In [None]:

# mpl.rcParams['font.size'] = 12
# sns.set_style("whitegrid", {'grid.linestyle': '--'})  # set grid

# fig, (ax_f1,ax_accuracy) = plt.subplots(1, 2, figsize=(20, 20))

# fig.patch.set_facecolor('#F1F3F4')
# ax_f1.patch.set_facecolor('#F1F3F4')
# #ax_error.patch.set_facecolor('#F1F3F4')
# ax_accuracy.patch.set_facecolor('#F1F3F4')

# sns.lineplot(f1_results['F1 Score'], color = '#236AB9', ax=ax_f1)
# #sns.lineplot(error_results['Error Rate'], color='#B85B14', ax=ax_error)
# sns.lineplot(acc_results['acc Score'], color = '#236AB9', ax=ax_accuracy)

# ax_f1.set_title('KNN F1 Score', color='#236AB9', fontsize= 25)
# #ax_error.set_title('KNN Elbow Curve', color='#B85B14', fontsize= 25)
# ax_accuracy.set_title('KNN accuracy Score', color='#236AB9', fontsize= 25)
# # Set xticks range
# ax_f1.set_xticks(range(1,20))
# #ax_error.set_xticks(range(1,20))
# ax_accuracy.set_xticks(range(1,20))

# # Remove axes splines
# for i in ['top', 'bottom', 'left', 'right']:
#     ax_f1.spines[i].set_visible(False)

# #for i in ['top', 'bottom', 'left', 'right']:
#     #ax_error.spines[i].set_visible(False)
    
# for i in ['top', 'bottom', 'left', 'right']:
#     ax_accuracy.spines[i].set_visible(False)    

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN with 12 neighbours, fitted on the un-resampled training split and scored
# on the test split.
knn = KNeighborsClassifier(n_neighbors=12)
knn = knn.fit(X_train, y_train)
y_pred6 = knn.predict(X_test)

print('Accuracy score: ', round(accuracy_score(y_test, y_pred6), 4))
print('F1 Score: ', round(f1_score(y_test, y_pred6), 4))

# 10-fold cross-validation on the training split; the fold means go into the
# results table.
score = cross_validate(knn, X_train, y_train, cv=10,scoring=['accuracy','f1','recall','precision'])
print("Test accuracy:{}".format(score["test_accuracy"].mean()))
data =[score["test_accuracy"].mean(), score["test_f1"].mean(), score["test_recall"].mean(),
       score["test_precision"].mean(),"KNeighbors Classifier","No"]
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat. An empty table is replaced outright, which keeps the metric
# columns numeric.
new_row = pd.DataFrame([data], columns=scoreDF.columns)
scoreDF = new_row if scoreDF.empty else pd.concat([scoreDF, new_row], ignore_index=True)


In [None]:


from sklearn.neighbors import KNeighborsClassifier

# KNN with 8 neighbours, fitted on the SMOTE-resampled training data and
# scored on the test split.
knn2 = KNeighborsClassifier(n_neighbors=8)
knn2 = knn2.fit(X_trainnew, y_trainnew)
y_pred7 = knn2.predict(X_test)

print('Accuracy score: ', round(accuracy_score(y_test, y_pred7), 4))
print('F1 Score: ', round(f1_score(y_test, y_pred7), 4))

# 10-fold cross-validation on the resampled data; the fold means go into the
# results table.
# NOTE(review): the folds are cut after SMOTE, so synthetic points derived
# from validation rows can sit in the training folds - these CV scores are
# likely optimistic compared with the test-set scores above.
score = cross_validate(knn2,  X_trainnew, y_trainnew, cv=10,scoring=['accuracy','f1','recall','precision'])
print("Test accuracy:{}".format(score["test_accuracy"].mean()))
data =[score["test_accuracy"].mean(), score["test_f1"].mean(), score["test_recall"].mean(),
       score["test_precision"].mean(),"KNeighbors Classifier","yes"]
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat. An empty table is replaced outright, which keeps the metric
# columns numeric.
new_row = pd.DataFrame([data], columns=scoreDF.columns)
scoreDF = new_row if scoreDF.empty else pd.concat([scoreDF, new_row], ignore_index=True)


In [None]:
# Confusion matrix for y_pred7, drawn with each cell as a share of all test
# samples.
cf_matrix = confusion_matrix(y_test, y_pred7)
cf_share = cf_matrix / cf_matrix.sum()
sns.heatmap(cf_share, annot=True, fmt='.2%', cmap='Blues')

In [None]:
# Raw confusion-matrix counts for y_pred7.
confusion_matrix(y_test, y_pred7)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 table for the KNN predictions y_pred7,
# matching the confusion matrices shown just above. The cell previously
# reported y_pred3, which holds logistic-regression predictions.
print(classification_report(y_test, y_pred7))


# DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree fitted on the un-resampled training split and scored on the
# test split.
dt = DecisionTreeClassifier(random_state=42)
dt = dt.fit(X_train, y_train)
y_pred8 = dt.predict(X_test)

print('Accuracy score: ', round(accuracy_score(y_test, y_pred8), 4))
print('F1 Score: ', round(f1_score(y_test, y_pred8), 4))

# 10-fold cross-validation on the training split; the fold means go into the
# results table.
score = cross_validate(dt, X_train, y_train, cv=10,scoring=['accuracy','f1','recall','precision'])
print("Test accuracy:{}".format(score["test_accuracy"].mean()))
data =[score["test_accuracy"].mean(), score["test_f1"].mean(), score["test_recall"].mean(),
       score["test_precision"].mean(),"Decision Tree Classifier","No"]
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat. An empty table is replaced outright, which keeps the metric
# columns numeric.
new_row = pd.DataFrame([data], columns=scoreDF.columns)
scoreDF = new_row if scoreDF.empty else pd.concat([scoreDF, new_row], ignore_index=True)


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree fitted on the SMOTE-resampled training data and scored on the
# test split.
dt2 = DecisionTreeClassifier(random_state=42)
dt2 = dt2.fit(X_trainnew, y_trainnew)
y_pred9 = dt2.predict(X_test)

print('Accuracy score: ', round(accuracy_score(y_test, y_pred9), 4))
print('F1 Score: ', round(f1_score(y_test, y_pred9), 4))

# 10-fold cross-validation on the resampled data; the fold means go into the
# results table.
# NOTE(review): the folds are cut after SMOTE, so synthetic points derived
# from validation rows can sit in the training folds - these CV scores are
# likely optimistic compared with the test-set scores above.
score = cross_validate(dt2, X_trainnew, y_trainnew, cv=10,scoring=['accuracy','f1','recall','precision'])
print("Test accuracy:{}".format(score["test_accuracy"].mean()))
data =[score["test_accuracy"].mean(), score["test_f1"].mean(), score["test_recall"].mean(),
       score["test_precision"].mean(),"Decision Tree Classifier","yes"]
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat. An empty table is replaced outright, which keeps the metric
# columns numeric.
new_row = pd.DataFrame([data], columns=scoreDF.columns)
scoreDF = new_row if scoreDF.empty else pd.concat([scoreDF, new_row], ignore_index=True)


In [None]:
# Confusion matrix for y_pred9, drawn with each cell as a share of all test
# samples.
cf_matrix = confusion_matrix(y_test, y_pred9)
cf_share = cf_matrix / cf_matrix.sum()
sns.heatmap(cf_share, annot=True, fmt='.2%', cmap='Blues')

In [None]:
# Raw confusion-matrix counts for y_pred9.
confusion_matrix(y_test, y_pred9)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 table for y_pred9.
report = classification_report(y_test, y_pred9)
print(report)


# RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# First random forest: 100 trees, fitted on the SMOTE-resampled training data
# and scored on the test split.
RF = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_trainnew, y_trainnew)
y_pred10 = RF.predict(X_test)

for label, metric in (('Accuracy score: ', accuracy_score), ('F1 Score: ', f1_score)):
    print(label, round(metric(y_test, y_pred10), 4))

In [None]:
# Confusion matrix for y_pred10, drawn with each cell as a share of all test
# samples.
cf_matrix = confusion_matrix(y_test, y_pred10)
cf_share = cf_matrix / cf_matrix.sum()
sns.heatmap(cf_share, annot=True, fmt='.2%', cmap='Blues')

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 table for y_pred10.
report = classification_report(y_test, y_pred10)
print(report)


# hyperparameter tuning for RandomForestClassifier

In [None]:
 from sklearn.model_selection import GridSearchCV
     from tqdm import tqdm

n_estimators = [100, 150, 200]
max_depth = [15, 20, 25]
max_depth.append(None)
max_features = ['auto', 'sqrt']
min_samples_split = [5, 10, 15]
min_samples_leaf = [1, 2]
bootstrap = [True, False]

params = {'n_estimators': n_estimators, 'max_features': max_features,
           'max_depth': max_depth, 'min_samples_split': min_samples_split,
           'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap,}

 RF = RandomForestClassifier(random_state=42,)

grid_search = GridSearchCV(estimator = RF, 
                            param_grid = params,
                            scoring = 'f1',
                            cv = 5,
                            verbose=3, 
                            n_jobs=-1)

 grid_search.fit(X_trainnew, y_trainnew)
print("best score: ", grid_search.best_score_)
print("best param: ", grid_search.best_params_)


In [None]:
# Random forest with fixed hyperparameters, fitted on the un-resampled
# training split and scored on the test split.
#best_params = grid_search.best_params_
RF1 = RandomForestClassifier(random_state=42, bootstrap=False, max_depth= 20, max_features= "sqrt", min_samples_leaf= 1, min_samples_split= 5, n_estimators=100)

RF1 = RF1.fit(X_train, y_train)
y_pred11 = RF1.predict(X_test)

print('Accuracy score: ', round(accuracy_score(y_test, y_pred11), 4))
print('F1 Score: ', round(f1_score(y_test, y_pred11), 4))

# 10-fold cross-validation on the training split; the fold means go into the
# results table.
score = cross_validate(RF1, X_train, y_train, cv=10,scoring=['accuracy','f1','recall','precision'])
print("Test accuracy:{}".format(score["test_accuracy"].mean()))
data =[score["test_accuracy"].mean(), score["test_f1"].mean(), score["test_recall"].mean(),
       score["test_precision"].mean(),"Random Forest Classifier1","No"]
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat. An empty table is replaced outright, which keeps the metric
# columns numeric.
new_row = pd.DataFrame([data], columns=scoreDF.columns)
scoreDF = new_row if scoreDF.empty else pd.concat([scoreDF, new_row], ignore_index=True)


In [None]:
# Random forest with the same fixed hyperparameters, fitted on the
# SMOTE-resampled training data and scored on the test split.
RF12 = RandomForestClassifier(random_state=42, bootstrap=False, max_depth= 20, max_features= "sqrt", min_samples_leaf= 1, min_samples_split= 5, n_estimators=100)
scores=list()
RF12 = RF12.fit(X_trainnew, y_trainnew)
y_pred12= RF12.predict(X_test)

print('Accuracy score: ', round(accuracy_score(y_test, y_pred12), 4))
print('F1 Score: ', round(f1_score(y_test, y_pred12), 4))

# 10-fold cross-validation on the resampled data. This cell prints the
# per-fold accuracies rather than their mean; the means go into the results
# table.
# NOTE(review): the folds are cut after SMOTE, so synthetic points derived
# from validation rows can sit in the training folds - these CV scores are
# likely optimistic compared with the test-set scores above.
score = cross_validate(RF12, X_trainnew, y_trainnew, cv=10,scoring=['accuracy','f1','recall','precision'])
print("Test accuracy:{}".format(score["test_accuracy"]))
data =[score["test_accuracy"].mean(), score["test_f1"].mean(), score["test_recall"].mean(),
       score["test_precision"].mean(),"Random Forest Classifier1","yes"]
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat. An empty table is replaced outright, which keeps the metric
# columns numeric.
new_row = pd.DataFrame([data], columns=scoreDF.columns)
scoreDF = new_row if scoreDF.empty else pd.concat([scoreDF, new_row], ignore_index=True)
# Keep the per-fold accuracies as well.
scores.append(score["test_accuracy"])

In [None]:
# Confusion matrix for y_pred12, drawn with each cell as a share of all test
# samples.
cf_matrix = confusion_matrix(y_test, y_pred12)
cf_share = cf_matrix / cf_matrix.sum()
sns.heatmap(cf_share, annot=True, fmt='.2%', cmap='Blues')

In [None]:
# Raw confusion-matrix counts for y_pred12.
confusion_matrix(y_test, y_pred12)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 table for y_pred12, matching the
# confusion matrices shown just above. The cell previously reported y_pred11,
# whose report is already printed further down.
print(classification_report(y_test, y_pred12))


In [None]:
# Confusion matrix for y_pred11, drawn with each cell as a share of all test
# samples.
cf_matrix = confusion_matrix(y_test, y_pred11)
cf_share = cf_matrix / cf_matrix.sum()
sns.heatmap(cf_share, annot=True, fmt='.2%', cmap='Blues')

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 table for y_pred11.
report = classification_report(y_test, y_pred11)
print(report)


# model selection

In [None]:
# We select the model based on the best results for accuracy, F1, precision and recall.

In [None]:
# Put the identifying columns first, followed by the four metrics, and show
# the table.
ordered_columns = ['Algorithm', 'Balanced-Data', 'accuracy', 'f1', 'precision', 'recall']
scoreDF = scoreDF.loc[:, ordered_columns]
scoreDF

In [None]:
import seaborn as sns

sns.set_theme(style="whitegrid")

# Grouped bar chart of the f1 column: one group per Algorithm, one bar per
# Balanced-Data value.
ax = sns.catplot(
    x="Algorithm", y="f1", hue="Balanced-Data",
    data=scoreDF, kind="bar",
    palette="dark", alpha=.6, height=10,
)
ax.despine(left=True)
ax.set_axis_labels("Algorithms", "MEAN F1-Score")

In [None]:
import seaborn as sns

sns.set_theme(style="whitegrid")

# Grouped bar chart of the accuracy column: one group per Algorithm, one bar
# per Balanced-Data value.
ax = sns.catplot(
    x="Algorithm", y="accuracy", hue="Balanced-Data",
    data=scoreDF, kind="bar",
    palette="dark", alpha=.6, height=10,
)
ax.despine(left=True)
ax.set_axis_labels("Algorithms", "MEAN accuracy")

In [None]:
import seaborn as sns

sns.set_theme(style="whitegrid")

# Grouped bar chart of the precision column: one group per Algorithm, one bar
# per Balanced-Data value.
ax = sns.catplot(
    x="Algorithm", y="precision", hue="Balanced-Data",
    data=scoreDF, kind="bar",
    palette="dark", alpha=.6, height=10,
)
ax.despine(left=True)
ax.set_axis_labels("Algorithms", "MEAN precision")

In [None]:
import seaborn as sns

sns.set_theme(style="whitegrid")

# Grouped bar chart of the recall column: one group per Algorithm, one bar per
# Balanced-Data value.
ax = sns.catplot(
    x="Algorithm", y="recall", hue="Balanced-Data",
    data=scoreDF, kind="bar",
    palette="dark", alpha=.6, height=10,
)
ax.despine(left=True)
ax.set_axis_labels("Algorithms", "MEAN recall")

In [None]:
# Ask the user for one value per model feature and classify the resulting row
# with the SMOTE-trained random forest RF12.
from sklearn.preprocessing import MinMaxScaler

col_name = X.columns

# Prompt once per feature column. Iterating over the columns keeps the number
# of prompts in step with the model's inputs (this used to be a hard-coded 14).
entered_values = []
for feature in col_name:
    print('pleas enter your ',feature,' here')
    entered_values.append(float(input()))
predict_data = np.asarray(entered_values, dtype=float)

# One-row frame carrying the same column names as the training features.
raw_row = pd.DataFrame(predict_data.reshape(1, -1), columns=col_name)

# RF12 was trained on min-max scaled features, so the raw answers must go
# through the same scaling before prediction. X already holds scaled values,
# so the scaler is rebuilt from the unscaled feature columns still in df1.
# NOTE(review): assumes df1 has not been modified since X was derived from it.
input_scaler = MinMaxScaler().fit(df1[col_name])
predict_data_reshaped = pd.DataFrame(input_scaler.transform(raw_row), columns=col_name)

prediction = RF12.predict(predict_data_reshaped)
print(prediction)

if prediction[0] == 0:
    print('The Person does not have a Heart Disease')
else:
    print('The Person has Heart Disease')

In [None]:
import pickle

# Persist the fitted RF12 model to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises; the previous
# bare open() left it to the garbage collector.
# NOTE: only load this file back from a trusted location - unpickling
# executes arbitrary code.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(RF12, model_file)