## Import Libraries

In [None]:
import numpy as np
from numpy import mean, std
import pandas as pd
import matplotlib as mpl 
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import classification_report, accuracy_score

## Create custom palette
## Thanks to Subin An for this great color palette. [link](https://www.kaggle.com/subinium/dark-mode-visualization-apple-version)

In [None]:
# Apple system colours expressed as 8-bit (R, G, B) triples.
# The "light" lists are the light-mode variants, the "dark" lists the dark-mode ones.
raw_light_palette = [
    (0, 122, 255), # Blue
    (255, 149, 0), # Orange
    (52, 199, 89), # Green
    (255, 59, 48), # Red
    (175, 82, 222),# Purple
    (255, 45, 85), # Pink
    (88, 86, 214), # Indigo
    (90, 200, 250),# Teal
    (255, 204, 0)  # Yellow
]

raw_dark_palette = [
    (10, 132, 255), # Blue
    (255, 159, 10), # Orange
    (48, 209, 88),  # Green
    (255, 69, 58),  # Red
    (191, 90, 242), # Purple
    (94, 92, 230),  # Indigo
    (255, 55, 95),  # Pink
    (100, 210, 255),# Teal
    (255, 214, 10)  # Yellow
]

raw_gray_light_palette = [
    (142, 142, 147),# Gray
    (174, 174, 178),# Gray (2)
    (199, 199, 204),# Gray (3)
    (209, 209, 214),# Gray (4)
    (229, 229, 234),# Gray (5)
    (242, 242, 247),# Gray (6)
]

raw_gray_dark_palette = [
    (142, 142, 147),# Gray
    (99, 99, 102),  # Gray (2)
    (72, 72, 74),   # Gray (3)
    (58, 58, 60),   # Gray (4)
    (44, 44, 46),   # Gray (5)
    (28, 28, 30),   # Gray (6) -- was (28, 28, 39); Apple's dark systemGray6 is #1C1C1E
]


# Rescale the 0-255 channels to the 0-1 floats used when the palettes are
# handed to matplotlib / seaborn further down.
light_palette = np.array(raw_light_palette) / 255
dark_palette = np.array(raw_dark_palette) / 255
gray_light_palette = np.array(raw_gray_light_palette) / 255
gray_dark_palette = np.array(raw_gray_dark_palette) / 255

## Setting custom plot parameters

In [None]:
from cycler import cycler

# Second-darkest gray is the canvas; second-lightest gray is the "ink".
canvas_color = gray_dark_palette[-2]
white_color = gray_light_palette[-2]

# Global matplotlib defaults for a dark-mode look. They are applied one key at
# a time, in this order, through the regular rcParams item assignment.
dark_mode_settings = {
    'axes.prop_cycle': cycler('color', dark_palette),
    'figure.facecolor': canvas_color,
    'figure.edgecolor': canvas_color,
    'axes.facecolor': canvas_color,
    'text.color': white_color,
    'axes.labelcolor': white_color,
    'axes.edgecolor': white_color,
    'xtick.color': white_color,
    'ytick.color': white_color,
    'figure.dpi': 150,
    'axes.spines.top': False,
    'axes.spines.right': False,
}
for rc_key, rc_value in dark_mode_settings.items():
    mpl.rcParams[rc_key] = rc_value

In [None]:
# Preview each mode: heading first, then the colour swatch, then the gray swatch.
for heading, color_swatch, gray_swatch in (
    ('Light mode palette', light_palette, gray_light_palette),
    ('Dark mode palette', dark_palette, gray_dark_palette),
):
    print(heading)
    sns.palplot(color_swatch)
    sns.palplot(gray_swatch)

## Import dataset

In [None]:
# Location of the heart-attack CSV inside the Kaggle input directory.
heart_csv_path = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
dataset = pd.read_csv(heart_csv_path)

## EDA

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
dataset.info()

### Meta data for dataset
1. age - Age of the patient
2. sex - Sex of the patient(0=Female, 1=Male)
3. cp - Chest pain type  (0=Typical Angina, 1=Atypical Angina, 2=Non-anginal Pain, 3=Asymptomatic)
4. trtbps - Resting blood pressure (in mm Hg)
5. chol - Cholesterol in mg/dl fetched via BMI sensor
6. fbs - (fasting blood sugar > 120 mg/dl) (1=True, 0=False)
7. restecg - Resting electrocardiographic results (0=Normal, 1=ST-T wave abnormality, 2=Left ventricular hypertrophy)
8. thalachh - Maximum heart rate achieved
9. exng - Exercise induced angina (1=Yes, 0=No)
10. oldpeak - Previous peak
11. slp - Slope
12. caa - Number of major vessels(0-3)
13. thall - Thalium Stress Test result(0-3)
14. output - (0=less chance of heart attack, 1=more chance of heart attack)

In [None]:
# Count missing values per column. (DataFrame.value_counts() on the boolean
# mask would count distinct *row patterns* instead, which does not show which
# attribute, if any, contains nulls.)
dataset.isnull().sum()

### There are no null values in any attribute.

In [None]:
feature = dataset.columns

# Number of distinct values per column, in the dataset's column order.
y = np.array([dataset[name].nunique() for name in feature])

# Ascending order of the counts. A stable sort keeps tied columns in their
# original relative order, and the same index is used for both the bar heights
# and the tick labels so the two can never disagree.
sorted_idx = np.argsort(y, kind='stable')


def _draw_unique_counts(axis, labels, counts, title, label_y_shift=0):
    """Draw one bar panel of unique-value counts with a value label per bar.

    axis          -- matplotlib Axes to draw on
    labels        -- tick labels, one per bar
    counts        -- bar heights, one per bar
    title         -- left-aligned panel title
    label_y_shift -- data-unit shift applied to each value label's anchor
    """
    positions = range(len(counts))
    axis.bar(positions, counts, alpha=0.7, color=dark_palette)
    axis.set_yticks(range(0, max(counts), 10))
    axis.set_xticks(positions)
    axis.set_xticklabels(labels, rotation=45)

    for p in axis.patches:
        axis.annotate(p.get_height(),
                      (p.get_x() + p.get_width() / 2, p.get_height() + label_y_shift),
                      ha='center', va='center', xytext=(0, 8), textcoords='offset points')

    axis.margins(0.01)
    axis.grid(axis='y', linestyle='-')
    axis.set_title(title, loc='left', fontweight='bold')


fig, ax = plt.subplots(1, 2, figsize=(15, 6))

# Left: columns in dataset order (labels anchored 2 units lower, as before).
_draw_unique_counts(ax[0], feature, y,
                    '# of Unique Values on Features', label_y_shift=-2)
# Right: same counts sorted ascending, with labels derived from the data.
_draw_unique_counts(ax[1], feature[sorted_idx], y[sorted_idx],
                    '# of Unique Values on Features(Sorted)')

plt.show()

In [None]:
fig, axes = plt.subplots(1, 2)
plt.suptitle('Patient Share by Heart Attack Chance')

# Left panel: patient count per target class, each bar labelled with its height.
ax = sns.countplot(data=dataset, x='output', palette=dark_palette, ax=axes[0])
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2
    ax.annotate(bar_top, (bar_mid, bar_top), xytext=(0, 8),
                textcoords='offset points', ha='center', va='center')

ax.set_xlabel('')
ax.set_xticklabels(['Low chance', 'High chance'])
ax.set_ylabel('Number of Patient')
ax.grid(axis='y', linestyle='-')

# Right panel: the same counts as percentage shares.
output_counts = dataset['output'].value_counts().sort_index()
output_counts.plot.pie(labels=None, autopct='%.1f%%', ylabel='', ax=axes[1])
plt.show()


### The number of patients in each safe and dangerous group is not the same.  A sampling technique might be necessary to get a better prediction.

In [None]:
fig, axes = plt.subplots(1, 2)
plt.suptitle('Patient Share by Gender')

# Left panel: patient count per sex code, each bar labelled with its height.
ax = sns.countplot(data=dataset, x='sex', palette=dark_palette, ax=axes[0])
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2
    ax.annotate(bar_top, (bar_mid, bar_top), xytext=(0, 8),
                textcoords='offset points', ha='center', va='center')

ax.set_xlabel('')
ax.set_xticklabels(['Female', 'Male'])
ax.set_ylabel('Number of Patient')
ax.grid(axis='y', linestyle='-')

# Right panel: the same counts as percentage shares.
sex_counts = dataset['sex'].value_counts().sort_index()
sex_counts.plot.pie(labels=None, autopct='%.1f%%', ylabel='', ax=axes[1])

plt.show()

### The number of patients by gender is unbalanced. It would be better to create a separate model to get a better prediction.

In [None]:
fig, axes = plt.subplots(1, 2)
plt.suptitle('Patient Share by Chest Pain Types')

# Left panel: patient count per chest-pain code, each bar labelled with its height.
ax = sns.countplot(x='cp', data=dataset, palette=dark_palette, ax=axes[0])

for p in ax.patches:
    ax.annotate(p.get_height(), (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='center', xytext=(0, 8), textcoords='offset points')

ax.set_xlabel('')
# Tick text for cp codes 0..3 (third label previously misspelled 'Non-agnial').
ax.set_xticklabels(['typical', 'atypical', 'Non-anginal', 'asymptomatic'], rotation=45, size=8)
ax.set_ylabel('Number of Patient')
ax.grid(axis='y', linestyle='-')

# Right panel: the same counts as percentage shares.
dataset['cp'].value_counts().sort_index().plot(kind='pie', labels=None, autopct='%.1f%%', ylabel='', ax=axes[1])

plt.show()

In [None]:
fig, axes = plt.subplots(1, 2)
plt.suptitle('Patient Share by Fasting Blood Sugar')

# Left panel: patient count per fbs flag, each bar labelled with its height.
ax = sns.countplot(data=dataset, x='fbs', palette=dark_palette, ax=axes[0])
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2
    ax.annotate(bar_top, (bar_mid, bar_top), xytext=(0, 8),
                textcoords='offset points', ha='center', va='center')

ax.set_xlabel('')
ax.set_xticklabels(['≤120 mg/dL', '>120 mg/dL'])
ax.set_ylabel('Number of Patient')
ax.grid(axis='y', linestyle='-')

# Right panel: the same counts as percentage shares.
fbs_counts = dataset['fbs'].value_counts().sort_index()
fbs_counts.plot.pie(labels=None, autopct='%.1f%%', ylabel='', ax=axes[1])

plt.show()

In [None]:
fig, axes = plt.subplots(1, 2)
plt.suptitle('Patient Share by Resting ECG')

# Left panel: patient count per resting-ECG code, each bar labelled with its height.
ax = sns.countplot(data=dataset, x='restecg', palette=dark_palette, ax=axes[0])
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2
    ax.annotate(bar_top, (bar_mid, bar_top), xytext=(0, 8),
                textcoords='offset points', ha='center', va='center')

ax.set_xlabel('')
ax.set_xticklabels(['Normal', 'ST-T Wave ABN', 'Left Ventricular Hypertrophy'], rotation=45, size=8)
ax.set_ylabel('Number of Patient')
ax.grid(axis='y', linestyle='-')

# Right panel: the same counts as percentage shares.
restecg_counts = dataset['restecg'].value_counts().sort_index()
restecg_counts.plot.pie(labels=None, autopct='%.1f%%', ylabel='', ax=axes[1])

plt.show()

In [None]:
fig, axes = plt.subplots(1, 2)
plt.suptitle('Patient Share by Exercise Induced Angina')

# Left panel: patient count per exng flag, each bar labelled with its height.
ax = sns.countplot(data=dataset, x='exng', palette=dark_palette, ax=axes[0])
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2
    ax.annotate(bar_top, (bar_mid, bar_top), xytext=(0, 8),
                textcoords='offset points', ha='center', va='center')

ax.set_xlabel('')
ax.set_xticklabels(['No', 'Yes'])
ax.set_ylabel('Number of Patient')
ax.grid(axis='y', linestyle='-')

# Right panel: the same counts as percentage shares.
exng_counts = dataset['exng'].value_counts().sort_index()
exng_counts.plot.pie(labels=None, autopct='%.1f%%', ylabel='', ax=axes[1])

plt.show()

In [None]:
fig, axes = plt.subplots(1, 2)
plt.suptitle('Patient Share by Number of Major Vessels')

# Left panel: patient count per caa value (raw codes kept as tick labels),
# each bar labelled with its height.
ax = sns.countplot(data=dataset, x='caa', palette=dark_palette, ax=axes[0])
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2
    ax.annotate(bar_top, (bar_mid, bar_top), xytext=(0, 8),
                textcoords='offset points', ha='center', va='center')

ax.set_xlabel('')
ax.set_ylabel('Number of Patient')
ax.grid(axis='y', linestyle='-')

# Right panel: the same counts as percentage shares (small font for narrow wedges).
caa_counts = dataset['caa'].value_counts().sort_index()
caa_counts.plot.pie(labels=None, autopct='%.1f%%', ylabel='', ax=axes[1], fontsize=8)

plt.show()

In [None]:
fig, axes = plt.subplots(1, 2)
plt.suptitle('Patient Share by Slope')

# Left panel: patient count per slope code, each bar labelled with its height.
ax = sns.countplot(data=dataset, x='slp', palette=dark_palette, ax=axes[0])
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2
    ax.annotate(bar_top, (bar_mid, bar_top), xytext=(0, 8),
                textcoords='offset points', ha='center', va='center')

ax.set_xlabel('')
ax.set_xticklabels(['Down', 'Flat', 'Up'])
ax.set_ylabel('Number of Patient')
ax.grid(axis='y', linestyle='-')

# Right panel: the same counts as percentage shares.
slp_counts = dataset['slp'].value_counts().sort_index()
slp_counts.plot.pie(labels=None, autopct='%.1f%%', ylabel='', ax=axes[1], fontsize=8)

plt.show()

In [None]:
fig, axes = plt.subplots(1, 2)
plt.suptitle('Patient Share by Stress Test')

# Left panel: patient count per thall code (raw codes kept as tick labels),
# each bar labelled with its height.
ax = sns.countplot(data=dataset, x='thall', palette=dark_palette, ax=axes[0])
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2
    ax.annotate(bar_top, (bar_mid, bar_top), xytext=(0, 8),
                textcoords='offset points', ha='center', va='center')

ax.set_xlabel('')
ax.set_ylabel('Number of Patient')
ax.grid(axis='y', linestyle='-')

# Right panel: the same counts as percentage shares.
thall_counts = dataset['thall'].value_counts().sort_index()
thall_counts.plot.pie(labels=None, autopct='%.1f%%', ylabel='', ax=axes[1], fontsize=8)

plt.show()

In [None]:
# Density of patient age. The figure/axes are created explicitly rather than
# implicitly via pyplot state, and `palette` is dropped because there is no
# `hue` variable for it to apply to.
fig, ax = plt.subplots()
fig.suptitle('Patient Share by Age')

sns.kdeplot(data=dataset, x='age', bw_adjust=0.2, fill=True, ax=ax)
ax.set_xlabel('Age')
ax.grid(axis='y', linestyle='-')
plt.show()

In [None]:
# Density of resting blood pressure. The figure/axes are created explicitly,
# and `palette` is dropped because there is no `hue` variable for it to apply to.
fig, ax = plt.subplots()
fig.suptitle('Patient Share by Blood Pressure')

sns.kdeplot(data=dataset, x='trtbps', bw_adjust=0.2, fill=True, ax=ax)
ax.set_xlabel('Blood Pressure (mmHg)')
ax.grid(axis='y', linestyle='-')
plt.show()

In [None]:
# Density of cholesterol. The figure/axes are created explicitly, `palette` is
# dropped because there is no `hue` variable for it to apply to, and the
# 'Cholestoral' misspelling in the title and axis label is corrected.
fig, ax = plt.subplots()
fig.suptitle('Patient Share by Cholesterol')

sns.kdeplot(data=dataset, x='chol', bw_adjust=0.2, fill=True, ax=ax)
ax.set_xlabel('Cholesterol (mg/dL)')
ax.grid(axis='y', linestyle='-')
plt.show()

In [None]:
# Density of maximum heart rate. The figure/axes are created explicitly, and
# `palette` is dropped because there is no `hue` variable for it to apply to.
fig, ax = plt.subplots()
fig.suptitle('Patient Share by Maximum Heart Rate')

sns.kdeplot(data=dataset, x='thalachh', bw_adjust=0.2, fill=True, ax=ax)
ax.set_xlabel('Maximum Heart Rate (BPM)')
ax.grid(axis='y', linestyle='-')
plt.show()

In [None]:
# Histogram of oldpeak in 0.2-wide bins. The figure/axes are created explicitly,
# and `palette` is dropped because there is no `hue` variable for it to apply to.
fig, ax = plt.subplots()
fig.suptitle('Patient Share by Oldpeak')
sns.histplot(data=dataset, x='oldpeak', binwidth=0.2, linewidth=0.3, ax=ax)
plt.show()

In [None]:
# Layout: one tall axes on the left for the grouped counts, two stacked
# square axes on the right for the per-group pies.
fig = plt.figure()
gs = fig.add_gridspec(2, 2)
ax1 = fig.add_subplot(gs[:, 0])
ax2 = fig.add_subplot(gs[0, 1])
ax3 = fig.add_subplot(gs[1, 1])
for pie_ax in (ax2, ax3):
    pie_ax.set_aspect(1)

plt.suptitle(t='Heart Attack by Gender', ha='right')

# Grouped bars: target class within each sex code, labelled with bar heights.
ax = sns.countplot(data=dataset, x='sex', hue='output', palette=dark_palette, ax=ax1)
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2
    ax.annotate(bar_top, (bar_mid, bar_top), xytext=(0, 8),
                textcoords='offset points', ha='center', va='center')
ax.set_xlabel('Gender')
ax.set_xticklabels(['Female', 'Male'])
ax.set_ylabel('Number of Patient')
ax.grid(axis='y', linestyle='-')
ax.legend(title='Heart Attack', labels=['No', 'Yes'])

# One pie per sex code showing the target-class split inside that group.
for sex_code, pie_ax, pie_title in ((0, ax2, 'Female'), (1, ax3, 'Male')):
    class_split = dataset[['sex', 'output']].value_counts()[sex_code].sort_index()
    class_split.plot(kind='pie', autopct='%.1f%%', ylabel='', labels=None, fontsize=8, ax=pie_ax)
    pie_ax.set_title(pie_title)
plt.show()

### In this dataset, female patients have a significantly higher chance of heart attack than male patients.

In [None]:
# Layout: one tall axes on the left for the grouped counts, two stacked
# square axes on the right for the per-group pies.
fig = plt.figure()
gs = fig.add_gridspec(2, 2)
ax1 = fig.add_subplot(gs[:, 0])
ax2 = fig.add_subplot(gs[0, 1])
ax3 = fig.add_subplot(gs[1, 1])
for pie_ax in (ax2, ax3):
    pie_ax.set_aspect(1)

plt.suptitle(t='Heart Attack by Exercise Induced Angina', ha='right')

# Grouped bars: target class within each exng flag, labelled with bar heights.
ax = sns.countplot(data=dataset, x='exng', hue='output', palette=dark_palette, ax=ax1)
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2
    ax.annotate(bar_top, (bar_mid, bar_top), xytext=(0, 8),
                textcoords='offset points', ha='center', va='center')
ax.set_xlabel('Exercise Induced Angina')
ax.set_xticklabels(['No', 'Yes'])
ax.set_ylabel('Number of Patient')
ax.grid(axis='y', linestyle='-')
ax.legend(title='Heart Attack', labels=['No', 'Yes'])

# One pie per exng flag showing the target-class split inside that group.
for exng_code, pie_ax, pie_title in ((0, ax2, 'No'), (1, ax3, 'Yes')):
    class_split = dataset[['exng', 'output']].value_counts()[exng_code].sort_index()
    class_split.plot(kind='pie', autopct='%.1f%%', ylabel='', labels=None, fontsize=8, ax=pie_ax)
    pie_ax.set_title(pie_title)
plt.show()

### Patient who has no Exercise-Induced Angina has a higher chance to get a Heart Attack.

In [None]:
fig, ax = plt.subplots(2, 1)
plt.suptitle('Heart Attack by Age')
hist_ax, kde_ax = ax

# Top: stacked age histogram per target class; bottom: the matching densities.
sns.histplot(data=dataset, x='age', hue='output', multiple='stack',
             bins=20, binrange=(25, 80), ax=hist_ax)
sns.kdeplot(data=dataset, x='age', hue='output', fill=True,
            bw_adjust=1, clip=(25, 80), ax=kde_ax)
hist_ax.set_xticks(range(25, 80, 5))

# NOTE(review): labels are assigned positionally to the drawn artists; the
# 'Yes'/'No' order is kept exactly as authored -- confirm against the rendered plot.
for panel in (hist_ax, kde_ax):
    panel.legend(title='Heart Attack', labels=['Yes', 'No'])
plt.show()

### Under 55 has a higher chance to have a heart attack. Over 55 and below 70 has less chance to have a heart attack.

In [None]:
# Per chest-pain code (0..3): group size `n`, number of positive targets
# `sick`, and the two percentages plotted as stacked bars in the next cell.
total = dataset.groupby(['cp'])['output'].value_counts()
sick = dataset.groupby('cp')['output'].sum()

cp_codes = range(4)
total_cp = pd.DataFrame({'n': [total[code].sum() for code in cp_codes]})
total_cp['sick'] = sick  # aligned on the 0..3 row index
total_cp['cp'] = [str(code) for code in cp_codes]
# percent1 is the full bar (n / n); percent2 is the positive share.
total_cp['percent1'] = total_cp['n'] / total_cp['n'] * 100
total_cp['percent2'] = total_cp['sick'] / total_cp['n'] * 100

In [None]:
plt.figure()
plt.suptitle(t='Heart Attack by Chest Pain Type', ha='right')

# The full-height bar is drawn first and the positive-share bar on top of it,
# which reads as a stacked percentage bar.
bar1 = sns.barplot(data=total_cp, x='cp', y='percent1', color=dark_palette[0])
bar2 = sns.barplot(data=total_cp, x='cp', y='percent2', color=dark_palette[1])
plt.axhline(y=50, color='r', linestyle='--', linewidth='0.5')

# Manual legend: one patch per layer colour.
legend_handles = [mpatches.Patch(color=dark_palette[idx], label=text)
                  for idx, text in enumerate(('No', 'Yes'))]
plt.legend(title='Heart Attack', handles=legend_handles, ncol=2, bbox_to_anchor=(1, 1.15))
plt.ylabel('Percents (%)')
plt.xlabel('Chest Pain Type')
plt.show()

### If a patient experiences any type of chest pain other than Typical Angina, the patient has a high chance of having a heart attack.

In [None]:
# Per thall code (0..3): group size `n`, number of positive targets `sick`,
# and the two percentages plotted as stacked bars in the next cell.
total = dataset.groupby(['thall'])['output'].value_counts()
sick = dataset.groupby('thall')['output'].sum()

thall_codes = range(4)
total_cp = pd.DataFrame({'n': [total[code].sum() for code in thall_codes]})
total_cp['sick'] = sick  # aligned on the 0..3 row index
total_cp['thall'] = [str(code) for code in thall_codes]
# percent1 is the full bar (n / n); percent2 is the positive share.
total_cp['percent1'] = total_cp['n'] / total_cp['n'] * 100
total_cp['percent2'] = total_cp['sick'] / total_cp['n'] * 100

In [None]:
plt.figure()
plt.suptitle(t='Heart Attack by Thallium Stress', ha='right')

# The full-height bar is drawn first and the positive-share bar on top of it,
# which reads as a stacked percentage bar.
bar1 = sns.barplot(data=total_cp, x='thall', y='percent1', color=dark_palette[0])
bar2 = sns.barplot(data=total_cp, x='thall', y='percent2', color=dark_palette[1])
plt.axhline(y=50, color='r', linestyle='--', linewidth='0.5')

# Manual legend: one patch per layer colour.
legend_handles = [mpatches.Patch(color=dark_palette[idx], label=text)
                  for idx, text in enumerate(('No', 'Yes'))]
plt.legend(title='Heart Attack', handles=legend_handles, ncol=2, bbox_to_anchor=(1, 1.15))
plt.ylabel('Percents (%)')
plt.xlabel('Thallium Stress')
plt.show()

### Thallium Stress type 2 is the only one with an above-50% chance of indicating a heart attack.

In [None]:
fig, ax = plt.subplots(2, 1, sharey=True)
# One shared y-axis caption for both panels.
fig.text(0.02, 0.5, 'Blood Pressure (mmHg)', va='center', rotation='vertical')

# Top: quadratic regression fit of resting blood pressure on age.
plot1 = sns.regplot(x='age', y='trtbps', marker='+', label=None, order=2,
                    data=dataset, color=dark_palette[0], ax=ax[0])

# Bottom: the same points split by sex. The 0/1 codes are mapped to names in
# the data itself (0=Female, 1=Male per the metadata above) so the legend is
# generated from the encoding instead of being relabelled positionally with
# ax.legend(labels=[...]), which is not tied to the hue levels.
sex_order = ['Female', 'Male']
named_sex = dataset.assign(sex=dataset['sex'].map({0: 'Female', 1: 'Male'}))
plot2 = sns.scatterplot(x='age', y='trtbps', hue='sex', style='sex',
                        hue_order=sex_order, style_order=sex_order,
                        data=named_sex, ax=ax[1])

plot1.set(xlabel='Age', ylabel=None)
plot2.set(xlabel='Age', ylabel=None)
legend = ax[1].get_legend()
if legend is not None:
    legend.set_title('Sex')
plt.show()

In [None]:
# Pairwise correlation matrix (pandas default method) of age and resting blood pressure.
dataset[['age', 'trtbps']].corr()

### Blood pressure and Age have a weak positive correlation with each other.

In [None]:
fig, ax = plt.subplots(2, 1, sharey=True)
# One shared y-axis caption for both panels.
fig.text(0.02, 0.5, 'Cholesterol (mg/dL)', va='center', rotation='vertical')

# Top: quadratic regression fit of cholesterol on age.
plot1 = sns.regplot(x='age', y='chol', marker='+', label=None, order=2,
                    data=dataset, color=dark_palette[0], ax=ax[0])

# Bottom: the same points split by sex. The 0/1 codes are mapped to names in
# the data itself (0=Female, 1=Male per the metadata above) so the legend is
# generated from the encoding instead of being relabelled positionally with
# ax.legend(labels=[...]), which is not tied to the hue levels.
sex_order = ['Female', 'Male']
named_sex = dataset.assign(sex=dataset['sex'].map({0: 'Female', 1: 'Male'}))
plot2 = sns.scatterplot(x='age', y='chol', hue='sex', style='sex',
                        hue_order=sex_order, style_order=sex_order,
                        data=named_sex, ax=ax[1])

plot1.set(xlabel='Age', ylabel=None)
plot2.set(xlabel='Age', ylabel=None)
legend = ax[1].get_legend()
if legend is not None:
    legend.set_title('Sex')
plt.show()

In [None]:
# Pairwise correlation matrix (pandas default method) of age and cholesterol.
dataset[['age', 'chol']].corr()

### Cholesterol level and Age have a weak positive correlation with each other.

In [None]:
fig, ax = plt.subplots(2, 1, sharey=True)
# One shared y-axis caption for both panels.
fig.text(0.02, 0.5, 'Heart Rate (bpm)', va='center', rotation='vertical')

# Top: quadratic regression fit of maximum heart rate on age.
plot1 = sns.regplot(x='age', y='thalachh', marker='+', label=None, order=2,
                    data=dataset, color=dark_palette[0], ax=ax[0])

# Bottom: the same points split by sex. The 0/1 codes are mapped to names in
# the data itself (0=Female, 1=Male per the metadata above) so the legend is
# generated from the encoding instead of being relabelled positionally with
# ax.legend(labels=[...]), which is not tied to the hue levels.
sex_order = ['Female', 'Male']
named_sex = dataset.assign(sex=dataset['sex'].map({0: 'Female', 1: 'Male'}))
plot2 = sns.scatterplot(x='age', y='thalachh', hue='sex', style='sex',
                        hue_order=sex_order, style_order=sex_order,
                        data=named_sex, ax=ax[1])

plot1.set(xlabel='Age', ylabel=None)
plot2.set(xlabel='Age', ylabel=None)
legend = ax[1].get_legend()
if legend is not None:
    legend.set_title('Sex')
plt.show()

In [None]:
# Pairwise correlation matrix (pandas default method) of age and maximum heart rate.
dataset[['age', 'thalachh']].corr()

### Heart Rate and Age have a weak negative correlation with each other.
### The correlation figure is often much lower in medical fields. [link](https://www.statology.org/what-is-a-strong-correlation/#:~:text=As%20a%20rule%20of%20thumb%2C%20a%20correlation%20greater,a%20medical%20field%20compared%20to%20a%20technology%20field.)

## Prediction

### Since there are only 303 samples, it would be better to use k-fold nested cross-validation. [link](https://machinelearningmastery.com/nested-cross-validation-for-machine-learning-with-python/)

In [None]:
# Target is the `output` column; every other column is a feature.
y = dataset['output']
X = dataset.drop('output', axis=1)

In [None]:
# Outer loop of the nested cross-validation: 10 shuffled folds with a fixed seed.
cv_outer = KFold(n_splits=10, shuffle=True, random_state=0)
# Empty frame that will hold the per-fold results.
result_df = pd.DataFrame()

In [None]:
# Nested cross-validation: for every outer fold, grid-search the
# hyper-parameters on the training part (inner 3-fold CV), refit the best
# configuration, and score it on the held-out part.

# Loop-invariant pieces, built once instead of once per outer fold.
params = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [150, 200, 250, 300],
    'max_depth': [1, 2, 3],
    'colsample_bytree': [0.3, 0.5, 0.8],
}
cv_inner = KFold(n_splits=3, shuffle=True, random_state=0)

# One dict per outer fold. The frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and rebuilding from a fresh list
# also keeps this cell idempotent when it is re-run.
fold_records = []

for train_ix, test_ix in cv_outer.split(X):

    # KFold yields positional indices, so select by position (.iloc), not label.
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    model = XGBClassifier(use_label_encoder=False,  eval_metric='logloss')
    cv_search = GridSearchCV(model, params, scoring='accuracy', cv=cv_inner, refit=True)
    result = cv_search.fit(X_train, y_train)
    best_model = result.best_estimator_

    y_test_predic = best_model.predict(X_test)
    y_train_predic = best_model.predict(X_train)

    acc_test = accuracy_score(y_test, y_test_predic)
    acc_train = accuracy_score(y_train, y_train_predic)

    fold_records.append({
        'acc_test': acc_test,
        'acc_train': acc_train,
        'learning_rate': result.best_params_.get('learning_rate'),
        'colsample_bytree': result.best_params_.get('colsample_bytree'),
        'max_depth': result.best_params_.get('max_depth'),
        'n_estimators': result.best_params_.get('n_estimators'),
    })

result_df = pd.DataFrame(fold_records)

In [None]:
# Display the per-fold accuracies and selected hyper-parameters.
result_df

In [None]:
# Summarise the fold accuracies for each distinct winning hyper-parameter set.
hyper_param_cols = ['learning_rate', 'colsample_bytree', 'max_depth', 'n_estimators']
summary_stats = ['count', 'mean', 'min', 'max']
result_df.groupby(hyper_param_cols).agg(summary_stats)

In [None]:
# Overall mean / min / max of test and train accuracy across the outer folds.
accuracy_cols = ['acc_test', 'acc_train']
result_df[accuracy_cols].agg(['mean', 'min', 'max'])

1. Each subset has its own optimal parameters.
2. It would be better to use more parameter lists to find optimal parameters.
3. If you have more parameter pairs, computation time is increased accordingly.
4. I have 10 different prediction models in the above example.
5. Only two subsets share the same parameters.
6. There is no single set of optimal parameters for this dataset because of the small number of samples.
7. Therefore when you want to predict the outcome for new data, it would be better to put all prediction models and **average** them.
8. **You have to think about overfitting for dataset. Higher accuracy is not always good.**
9. **It would be not well fit for unknown data from the future.**

## Prediction (Using only 3 models)

In [None]:
# Single hold-out split (scikit-learn's default proportions) with a fixed seed;
# this rebinds the X_train / X_test / y_train / y_test names for the cells below.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Candidate 1: learning_rate 0.01, max_depth 1, 300 trees, 80% column sampling.
params_1 = dict(
    use_label_encoder=False,
    eval_metric='logloss',
    learning_rate=0.01,
    colsample_bytree=0.8,
    max_depth=1,
    n_estimators=300,
)
optimum_model_1 = XGBClassifier(**params_1)
optimum_model_1.fit(X_train, y_train)

y_test_predic_1 = optimum_model_1.predict(X_test)
y_train_predic_1 = optimum_model_1.predict(X_train)

# Report on the held-out data first, then on the training data.
for report_header, true_labels, predicted_labels in (
    ('\nTest Data classification Report', y_test, y_test_predic_1),
    ('\nTraining Data classification Report', y_train, y_train_predic_1),
):
    print(report_header)
    print(classification_report(true_labels, predicted_labels))

In [None]:
# Candidate 2: learning_rate 0.05, max_depth 1, 200 trees, 30% column sampling.
params_2 = dict(
    use_label_encoder=False,
    eval_metric='logloss',
    learning_rate=0.05,
    colsample_bytree=0.3,
    max_depth=1,
    n_estimators=200,
)
optimum_model_2 = XGBClassifier(**params_2)
optimum_model_2.fit(X_train, y_train)

y_test_predic_2 = optimum_model_2.predict(X_test)
y_train_predic_2 = optimum_model_2.predict(X_train)

# Report on the held-out data first, then on the training data.
for report_header, true_labels, predicted_labels in (
    ('\nTest Data classification Report', y_test, y_test_predic_2),
    ('\nTraining Data classification Report', y_train, y_train_predic_2),
):
    print(report_header)
    print(classification_report(true_labels, predicted_labels))

In [None]:
# Candidate 3: learning_rate 0.01, max_depth 3, 300 trees, 80% column sampling.
params_3 = dict(
    use_label_encoder=False,
    eval_metric='logloss',
    learning_rate=0.01,
    colsample_bytree=0.8,
    max_depth=3,
    n_estimators=300,
)
optimum_model_3 = XGBClassifier(**params_3)
optimum_model_3.fit(X_train, y_train)

y_test_predic_3 = optimum_model_3.predict(X_test)
y_train_predic_3 = optimum_model_3.predict(X_train)

# Report on the held-out data first, then on the training data.
for report_header, true_labels, predicted_labels in (
    ('\nTest Data classification Report', y_test, y_test_predic_3),
    ('\nTraining Data classification Report', y_train, y_train_predic_3),
):
    print(report_header)
    print(classification_report(true_labels, predicted_labels))

### Find final prediction values

In [None]:
# Element-wise mean of the three models' predicted labels.
test_votes = np.stack([y_test_predic_1, y_test_predic_2, y_test_predic_3])
train_votes = np.stack([y_train_predic_1, y_train_predic_2, y_train_predic_3])
y_test_predic_avg = test_votes.sum(axis=0) / 3
y_train_predic_avg = train_votes.sum(axis=0) / 3

In [None]:
# Majority vote: an averaged prediction above 0.5 becomes 1, everything else 0.
# The arrays are overwritten in place, so they keep their floating dtype.
y_test_predic_avg[:] = y_test_predic_avg > 0.5
y_train_predic_avg[:] = y_train_predic_avg > 0.5

In [None]:
# Convert the thresholded 0/1 values to integers so they are reported as class labels.
y_test_predic_avg = y_test_predic_avg.astype(int)
y_train_predic_avg = y_train_predic_avg.astype(int)

In [None]:
# Classification reports for the majority-vote predictions: held-out data
# first, then training data.
for report_header, true_labels, voted_labels in (
    ('\nTest Data classification Report', y_test, y_test_predic_avg),
    ('\nTraining Data classification Report', y_train, y_train_predic_avg),
):
    print(report_header)
    print(classification_report(true_labels, voted_labels))