In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
%matplotlib inline
# Render floats with two decimals in every DataFrame shown below.
pd.options.display.float_format = '{:.2f}'.format
import warnings
# Suppresses all warnings for the rest of the session, which also hides
# deprecation notices raised by the plotting and modelling libraries.
warnings.filterwarnings('ignore')

In [None]:
# Read the dataset from an absolute path (presumably the Kaggle input mount —
# adjust when running elsewhere) and preview the first rows.
data = pd.read_csv('/kaggle/input/heartdisease/heart.csv')
data.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Column labels of the loaded frame.
data.columns

In [None]:
# Per-column dtype and non-null count; a quick check for missing values.
data.info()

* Age : age of the patient [years]
* Sex : sex of the patient [M: Male, F: Female]
* ChestPainType : chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
* RestingBP : resting blood pressure [mm Hg]
* Cholesterol : serum cholesterol [mg/dl]
* FastingBS : fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
* RestingECG : resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
* MaxHR : maximum heart rate achieved [Numeric value between 60 and 202]
* ExerciseAngina : exercise-induced angina [Y: Yes, N: No]
* Oldpeak : oldpeak = ST [Numeric value measured in depression]
* ST_Slope : the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
* HeartDisease : output class [1: heart disease, 0: Normal]


In [None]:
# Pairwise correlation between every column of the raw frame, drawn as an
# annotated heatmap.
correlation_matrix = data.corr()
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='viridis', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap for Heart Disease Prediction')
plt.show()

In [None]:
# Summary statistics, transposed so that each column of the frame becomes a row.
data.describe().T

In [None]:
# Summary statistics per class (target == 1 vs target == 0), one row per feature.
yes = data[data['target'] == 1].describe().T
no = data[data['target'] == 0].describe().T
colors = ['#F93822','#FDD20E']

# Side-by-side heatmaps of the per-feature means for each class.
fig,ax = plt.subplots(nrows = 1,ncols = 2,figsize = (5,5))
heatmap_options = dict(annot = True,cmap = 'viridis',linewidths = 0.4,linecolor = 'black',cbar = False,fmt = '.2f')
for axis, summary, heading in zip(ax, (yes, no), ('target', 'No target')):
    sns.heatmap(summary[['mean']],ax = axis,**heatmap_options)
    axis.set_title(heading)

fig.tight_layout(pad = 2)

Dividing features into Numerical and Categorical:

In [None]:
# Split columns by cardinality: more than 6 distinct values is treated as
# numerical, anything else as categorical. Column order is preserved in both lists.
col = list(data.columns)
distinct_counts = {name: len(data[name].unique()) for name in col}
categorical_features = [name for name in col if not distinct_counts[name] > 6]
numerical_features = [name for name in col if distinct_counts[name] > 6]

print('Categorical Features :',*categorical_features)
print('Numerical Features :',*numerical_features)

Categorical Features:

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode on a deep copy so `data` itself is not modified by the encoding.
le = LabelEncoder()
df1 = data.copy(deep = True)

# The single encoder is re-fitted for each column in turn, so after this cell
# `le` holds the classes of 'slope' only.
for encoded_column in ['sex', 'cp', 'restecg', 'exang', 'slope']:
    df1[encoded_column] = le.fit_transform(df1[encoded_column])

Distribution of Categorical Features

In [None]:
# One histogram (with KDE overlay) per categorical feature, laid out two per row.
num_features = len(categorical_features)
cols = 2
# At least one row, so an empty feature list yields an empty figure instead of
# a subplots() error.
rows = max(1, (num_features + 1) // cols)

fig, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(10, 5 * rows))
ax = ax.flatten()

colors = ['#F93822', '#FDD20E']

for axis, feature in zip(ax, categorical_features):
    sns.histplot(df1[feature], kde=True, color=colors[0], ax=axis)
    axis.set_title(f'Distribution: {feature}')

# Remove the unused axes. Slicing from num_features avoids depending on a loop
# variable left behind by the plotting loop, which is undefined when that loop
# never runs.
for spare_axis in ax[num_features:]:
    fig.delaxes(spare_axis)

fig.tight_layout(pad=2)
plt.show()

Numerical Features :
Distribution of Numerical Features :

In [None]:
# Density histogram + KDE for every numerical feature. All but the last feature
# share one two-column grid; the last one gets its own smaller figure with a
# fixed KDE bandwidth.
# histplot(kde=True, stat='density') is the supported replacement for the
# deprecated sns.distplot, and `bw_method` replaces the deprecated `bw` key.
grid_features = numerical_features[:-1]
last_feature = numerical_features[-1]

# Two plots per row; 4.875 in per row keeps the original 10 x 9.75 figure for
# the four-feature case while still working for other feature counts.
grid_rows = max(1, (len(grid_features) + 1) // 2)
fig, ax = plt.subplots(nrows = grid_rows,ncols = 2,figsize = (10,4.875 * grid_rows),squeeze = False)
ax = ax.flatten()
for axis, feature in zip(ax, grid_features):
    sns.histplot(data[feature],kde = True,stat = 'density',color = colors[0],ax = axis)
    axis.set_title('Distribution : ' + feature)
for spare_axis in ax[len(grid_features):]:
    fig.delaxes(spare_axis)
plt.show()

plt.figure(figsize = (4.75,4.55))
sns.histplot(df1[last_feature],kde = True,stat = 'density',kde_kws = {'bw_method' : 1},color = colors[0])
title = 'Distribution : ' + last_feature
plt.title(title);

Target Variable Visualization (HeartDisease) :

In [None]:
# Class balance of the target: a pie chart of percentage shares next to a bar
# chart of raw counts.
# Counts are looked up by class label (0 = no heart disease, 1 = heart disease).
# value_counts() orders by frequency, so indexing it by position would swap the
# pie labels whenever class 0 is the more frequent class.
class_counts = data['target'].value_counts().reindex([0, 1], fill_value=0)
l = list(class_counts)
circle = [l[0] / sum(l) * 100, l[1] / sum(l) * 100]

fig, (pie_ax, count_ax) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))

pie_ax.pie(circle, labels=['No Heart Disease', 'Heart Disease'], autopct='%1.1f%%', startangle=90,
           explode=(0.1, 0), colors=colors, wedgeprops={'edgecolor': 'black', 'linewidth': 1, 'antialiased': True})
pie_ax.set_title('Heart Disease %')

ax = sns.countplot(x='target', data=data, palette='viridis', edgecolor='black', ax=count_ax)
# Write each bar's count just above it.
for rect in ax.patches:
    ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 2, rect.get_height(),
            horizontalalignment='center', fontsize=11)
ax.set_xticklabels(['No Heart Disease', 'Heart Disease'])
ax.set_title('Cases of Heart Disease')

plt.show()

Categorical Features vs Target Variable (HeartDisease) :

In [None]:
import math

# NOTE: this replaces the cardinality-derived `categorical_features` list built
# earlier in the notebook; every later cell that reads `categorical_features`
# sees this hand-written list instead.
categorical_features = ['sex', 'cp', 'restecg', 'exang', 'slope', 'thal', 'fbs']  # example: 7 features
num_plots = len(categorical_features)
cols = 2
rows = math.ceil(num_plots / cols)
fig, axs = plt.subplots(nrows=rows, ncols=cols, figsize=(12, 5 * rows))
axs = axs.flatten()

colors = ['#F93822', '#FDD20E']

for ax, feature in zip(axs, categorical_features):
    plot = sns.countplot(x=feature, data=data, hue="target", palette='viridis', edgecolor='black', ax=ax)

    for rect in plot.patches:
        height = rect.get_height()
        # Skip patches with no real bar: NaN heights (empty category/class
        # combinations) and zero heights (which may also be legend proxy
        # patches, depending on the seaborn version) would otherwise print
        # stray 'nan' / '0' labels.
        if not np.isfinite(height) or height == 0:
            continue
        plot.text(rect.get_x() + rect.get_width() / 2, height + 0.5, height,
                  ha='center', va='bottom', fontsize=10)

    ax.set_title(f'{feature} vs Target')
    # Pass the hue handles explicitly; with labels alone the legend pairs the
    # text with whatever artists come first on the axes.
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(handles=handles, labels=['No', 'Yes'], title='Heart Disease')
for spare_axis in axs[num_plots:]:
    fig.delaxes(spare_axis)

fig.tight_layout()
plt.show()

Categorical Features vs Positive Heart Disease Cases :

In [None]:
def _positive_case_shares(feature, codes):
    """Percentage of heart-disease-positive rows (target == 1) per code of `feature`.

    Parameters
    ----------
    feature : str
        Column of `data` to break down.
    codes : list
        Category codes to report, in output order.

    Returns
    -------
    list of float
        One percentage per entry of `codes`, relative to all positive rows.
        A code with no positive rows yields 0 instead of raising KeyError.
    """
    counts = data.loc[data['target'] == 1, feature].value_counts()
    total = counts.sum()
    return [counts.get(code, 0) / total * 100 for code in codes]


# Entry k of each list is the share belonging to category code k.
sex = _positive_case_shares('sex', [0, 1])
cp = _positive_case_shares('cp', [0, 1, 2, 3])
fbs = _positive_case_shares('fbs', [0, 1])
restecg = _positive_case_shares('restecg', [0, 1, 2])
exang = _positive_case_shares('exang', [0, 1])
slope = _positive_case_shares('slope', [0, 1, 2])

In [None]:
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))

colors_sex = ['#FF9999', '#66B3FF']
colors_cp = ['#FF9999', '#FFCC99', '#99FF99', '#66B3FF']
colors_fbs = ['#FFD700', '#FF6347']
colors_restecg = ['#A1C3D1', '#F4A582', '#92C5DE']
colors_exang = ['#8DD3C7', '#FB8072']
colors_slope = ['#B3DE69', '#FDB462', '#80B1D3']

# One pie per categorical feature: (shares, slice labels, explode offsets, colours, title).
# NOTE(review): slice labels are positional. Each shares list is built in code
# order (code 0 first), so label k is attached to code k — confirm that this
# matches the dataset's encoding (e.g. that sex code 0 really is 'Male').
pie_specs = [
    (sex, ['Male', 'Female'], (0.1, 0), colors_sex, 'Sex'),
    (cp, ['ASY', 'NAP', 'ATA', 'TA'], (0, 0.1, 0.1, 0.1), colors_cp, 'Chest Pain Type'),
    (fbs, ['FBS < 120 mg/dl', 'FBS > 120 mg/dl'], (0.1, 0), colors_fbs, 'Fasting Blood Sugar'),
    (restecg, ['Normal', 'ST', 'LVH'], (0, 0.1, 0.1), colors_restecg, 'Resting ECG'),
    (exang, ['Angina', 'No Angina'], (0.1, 0), colors_exang, 'Exercise Induced Angina'),
    (slope, ['Flat', 'Up', 'Down'], (0, 0.1, 0.1), colors_slope, 'ST Segment Slope'),
]

for position, (shares, slice_labels, offsets, slice_colors, heading) in enumerate(pie_specs, start=1):
    plt.subplot(3, 2, position)
    plt.pie(shares, labels=slice_labels, autopct='%1.1f%%', startangle=90, explode=offsets,
            colors=slice_colors,
            wedgeprops={'edgecolor': 'black', 'linewidth': 1, 'antialiased': True})
    plt.title(heading)

plt.tight_layout()
plt.show()


Numerical Features vs Target Variable (HeartDisease) :

In [None]:
# Bucket the continuous measurements into integer bins and store them as new
# `<name>_Group` columns on `data` (the raw frame is modified in place).
bin_widths = {'trestbps': 5, 'chol': 10, 'thalach': 5}
for source_column, width in bin_widths.items():
    data[source_column + '_Group'] = [int(value / width) for value in data[source_column]]
# oldpeak is multiplied by 10 before binning by 5.
data['oldpeak_Group'] = [int((value * 10) / 5) for value in data['oldpeak']]

group_numerical_features = ['trestbps_Group', 'chol_Group', 'thalach_Group', 'oldpeak_Group']

In [None]:
# One count plot per binned feature, split by target class. The number of rows
# follows the feature list (6.25 in per row reproduces the 10 x 25 figure for
# four features).
group_count = max(1, len(group_numerical_features))
fig, ax = plt.subplots(nrows=group_count, ncols=1, figsize=(10, 6.25 * group_count), squeeze=False)
ax = ax.flatten()

colors = ['#F93822', '#FDD20E']

for axis, group in zip(ax, group_numerical_features):
    sns.countplot(x=group, data=data, hue="target",
                  palette='viridis', edgecolor='black', ax=axis)

    axis.set_title(f'{group} vs Heart Disease')
    # Pass the hue handles explicitly; with labels alone the legend pairs the
    # text with whatever artists come first on the axes.
    handles, _ = axis.get_legend_handles_labels()
    axis.legend(handles=handles, labels=['No Heart Disease', 'Heart Disease'], title='Target')

plt.tight_layout()
plt.show()

In [None]:
color_palettes = [
    ['#1f77b4', '#ff7f0e'],  # Blue, Orange
    ['#2ca02c', '#d62728'],  # Green, Red
    ['#9467bd', '#8c564b'],  # Purple, Brown
    ['#e377c2', '#7f7f7f'],  # Pink, Gray
    ['#17becf', '#bcbd22']   # Cyan, Yellow-Green
]
target_labels = {0: 'No Heart Disease', 1: 'Heart Disease'}

# Two figures: the first three numerical features side by side, then the last
# two. Each entry is (figure size, positions in numerical_features, index of
# the first palette to use).
figure_layouts = [
    ((18, 5), [0, 1, 2], 0),
    ((12, 5), [-2, -1], 3),
]
for figure_size, positions, first_palette in figure_layouts:
    fig, axs = plt.subplots(nrows=1, ncols=len(positions), figsize=figure_size)
    for slot, position in enumerate(positions):
        feature = numerical_features[position]
        ax = axs[slot]
        plot = sns.stripplot(
            x='sex', y=feature,
            data=data, hue='target',
            palette=color_palettes[first_palette + slot], ax=ax, dodge=True
        )
        # Replace the numeric hue labels with readable class names.
        handles, labels = plot.get_legend_handles_labels()
        new_labels = [target_labels[int(raw_label)] for raw_label in labels]
        ax.legend(handles=handles, labels=new_labels, title='Heart Disease')
        ax.set_title(f"{feature} vs Sex")

    plt.tight_layout()
    plt.show()

FastingBS vs Numerical features :

In [None]:
color_palettes = [
    ['#1f77b4', '#ff7f0e'],  # Blue, Orange
    ['#2ca02c', '#d62728'],  # Green, Red
    ['#9467bd', '#8c564b'],  # Purple, Brown
    ['#e377c2', '#7f7f7f'],  # Pink, Gray
    ['#17becf', '#bcbd22']   # Cyan, Yellow-Green
]

target_labels = {0: 'No Heart Disease', 1: 'Heart Disease'}

# Two figures: the first three numerical features side by side, then the last
# two. Each entry is (figure size, positions in numerical_features, index of
# the first palette to use).
figure_layouts = [
    ((18, 5), [0, 1, 2], 0),
    ((12, 5), [-2, -1], 3),
]
for figure_size, positions, first_palette in figure_layouts:
    fig, axs = plt.subplots(nrows=1, ncols=len(positions), figsize=figure_size)
    for slot, position in enumerate(positions):
        feature = numerical_features[position]
        ax = axs[slot]
        plot = sns.stripplot(
            x='fbs', y=feature,
            data=data, hue='target',
            palette=color_palettes[first_palette + slot], dodge=True, ax=ax
        )
        # Replace the numeric hue labels with readable class names.
        handles, labels = plot.get_legend_handles_labels()
        new_labels = [target_labels[int(raw_label)] for raw_label in labels]
        ax.legend(handles=handles, labels=new_labels, title='Heart Disease')
        ax.set_title(f"{feature} vs Fasting Blood Sugar")

    plt.tight_layout()
    plt.show()

RestingECG vs Numerical Features :

In [None]:
color_palettes = [
    ['#1f77b4', '#ff7f0e'],  # Blue, Orange
    ['#2ca02c', '#d62728'],  # Green, Red
    ['#9467bd', '#8c564b'],  # Purple, Brown
    ['#e377c2', '#7f7f7f'],  # Pink, Gray
    ['#17becf', '#bcbd22']   # Cyan, Yellow-Green
]

target_labels = {0: 'No Heart Disease', 1: 'Heart Disease'}

# Two figures: the first three numerical features side by side, then the last
# two. Each entry is (figure size, positions in numerical_features, index of
# the first palette to use).
figure_layouts = [
    ((18, 5), [0, 1, 2], 0),
    ((12, 5), [-2, -1], 3),
]
for figure_size, positions, first_palette in figure_layouts:
    fig, axs = plt.subplots(nrows=1, ncols=len(positions), figsize=figure_size)
    for slot, position in enumerate(positions):
        feature = numerical_features[position]
        ax = axs[slot]
        plot = sns.stripplot(
            x='restecg', y=feature,
            data=data, hue='target',
            palette=color_palettes[first_palette + slot], dodge=True, ax=ax
        )
        # Replace the numeric hue labels with readable class names.
        handles, labels = plot.get_legend_handles_labels()
        new_labels = [target_labels[int(raw_label)] for raw_label in labels]
        ax.legend(handles=handles, labels=new_labels, title='Heart Disease')
        ax.set_title(f"{feature} vs restecg")

    plt.tight_layout()
    plt.show()

ExerciseAngina vs Numerical Features :

In [None]:
color_palettes = [
    ['#1f77b4', '#ff7f0e'],  # Blue, Orange
    ['#2ca02c', '#d62728'],  # Green, Red
    ['#9467bd', '#8c564b'],  # Purple, Brown
    ['#e377c2', '#7f7f7f'],  # Pink, Gray
    ['#17becf', '#bcbd22']   # Cyan, Yellow-Green
]

target_labels = {0: 'No Heart Disease', 1: 'Heart Disease'}

# Two figures: the first three numerical features side by side, then the last
# two. Each entry is (figure size, positions in numerical_features, index of
# the first palette to use).
figure_layouts = [
    ((18, 5), [0, 1, 2], 0),
    ((12, 5), [-2, -1], 3),
]
for figure_size, positions, first_palette in figure_layouts:
    fig, axs = plt.subplots(nrows=1, ncols=len(positions), figsize=figure_size)
    for slot, position in enumerate(positions):
        feature = numerical_features[position]
        ax = axs[slot]
        plot = sns.stripplot(
            x='exang', y=feature,
            data=data, hue='target',
            palette=color_palettes[first_palette + slot], dodge=True, ax=ax
        )
        # Replace the numeric hue labels with readable class names.
        handles, labels = plot.get_legend_handles_labels()
        new_labels = [target_labels[int(raw_label)] for raw_label in labels]
        ax.legend(handles=handles, labels=new_labels, title='Heart Disease')
        ax.set_title(f"{feature} vs exang")

    plt.tight_layout()
    plt.show()

ST_Slope vs Numerical Features :

In [None]:
color_palettes = [
    ['#1f77b4', '#ff7f0e'],  # Blue, Orange
    ['#2ca02c', '#d62728'],  # Green, Red
    ['#9467bd', '#8c564b'],  # Purple, Brown
    ['#e377c2', '#7f7f7f'],  # Pink, Gray
    ['#17becf', '#bcbd22']   # Cyan, Olive
]
target_labels = {0: 'No Heart Disease', 1: 'Heart Disease'}

# Two figures: the first three numerical features side by side, then the last
# two. Each entry is (figure size, positions in numerical_features, index of
# the first palette to use).
figure_layouts = [
    ((18, 5), [0, 1, 2], 0),
    ((12, 5), [-2, -1], 3),
]
for figure_size, positions, first_palette in figure_layouts:
    fig, axs = plt.subplots(nrows=1, ncols=len(positions), figsize=figure_size)
    for slot, position in enumerate(positions):
        feature = numerical_features[position]
        ax = axs[slot]
        plot = sns.stripplot(
            x='slope', y=feature,
            data=data, hue='target',
            palette=color_palettes[first_palette + slot], dodge=True, ax=ax
        )
        # Replace the numeric hue labels with readable class names.
        handles, labels = plot.get_legend_handles_labels()
        new_labels = [target_labels[int(raw_label)] for raw_label in labels]
        ax.legend(handles=handles, labels=new_labels, title='Heart Disease')
        ax.set_title(f"{feature} vs slope")

    plt.tight_layout()
    plt.show()

Numerical features vs Numerical features w.r.t. Target variable (HeartDisease) :

In [None]:
palette = ['#1f77b4', '#ff7f0e']  # Blue = 0 (No Heart Disease), Orange = 1 (Heart Disease)

# Custom label map for legend
target_labels = {0: 'No Heart Disease', 1: 'Heart Disease'}

# Every unordered pair of distinct numerical features, each pair listed once.
feature_pairs = [
    (first, second)
    for index, first in enumerate(numerical_features)
    for second in numerical_features[index + 1:]
]

# Size the grid from the number of pairs (two plots per row, 5 in per row)
# so a different feature count cannot overflow a fixed 5 x 2 layout.
# Five features still give the 5 x 2 grid at 15 x 25.
grid_rows = max(1, (len(feature_pairs) + 1) // 2)
fig, ax = plt.subplots(nrows=grid_rows, ncols=2, figsize=(15, 5 * grid_rows), squeeze=False)

# Flatten axes for easier access
axs = ax.flatten()

for plot_ax, (x_feature, y_feature) in zip(axs, feature_pairs):
    sns.scatterplot(
        x=x_feature,
        y=y_feature,
        data=data,
        hue='target',
        palette=palette,
        edgecolor='black',
        ax=plot_ax
    )
    # Replace the numeric hue labels with readable class names.
    handles, labels = plot_ax.get_legend_handles_labels()
    new_labels = [target_labels[int(raw_label)] for raw_label in labels]
    plot_ax.legend(handles=handles, labels=new_labels, title='Heart Disease')
    plot_ax.set_title(f"{x_feature} vs {y_feature}")

# Number of axes actually used; anything beyond it is removed.
a = len(feature_pairs)
for spare_axis in axs[a:]:
    fig.delaxes(spare_axis)

plt.tight_layout()
plt.show()

Summary
Order / Values of features for positive cases of heart disease :
Categorical Features (Order) :
Sex : Male > Female
ChestPainType : ASY > NAP > ATA > TA
FastingBS : ( FBS < 120 mg/dl ) > ( FBS > 120 mg/dl)
RestingECG : Normal > ST > LVH
ExerciseAngina : Angina > No Angina
ST_Slope : Flat > Up > Down
Numerical Features (Range) :
Age : 50+
RestingBP : 95 - 170
Cholesterol : 160 - 340
Oldpeak : 0 - 4

Feature Engineering

Data Scaling:

In [None]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler
mms = MinMaxScaler() # Normalization
ss = StandardScaler() # Standardization

# Scale the numerical columns only: oldpeak is min-max normalized, the others
# are standardized. 'trestbps' is scaled here instead of 'restecg': restecg is
# a categorical code, while trestbps is a numerical feature that would
# otherwise reach the models unscaled.
# NOTE(review): the scalers are fitted on the full frame, before the
# train/test split made later in the notebook, so test rows influence the
# scaling statistics.
df1['oldpeak'] = mms.fit_transform(df1[['oldpeak']])
for scaled_column in ['age', 'trestbps', 'chol', 'thalach']:
    df1[scaled_column] = ss.fit_transform(df1[[scaled_column]])
df1.head()

Correlation Matrix :

In [None]:
# Correlate the encoded/scaled frame. By this point `data` also carries the
# derived *_Group bin columns added for the count plots, which would otherwise
# appear in the matrix as extra features.
correlation_matrix = df1.corr()
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='viridis', linewidths=0.5)
plt.title('Correlation Heatmap for Heart Disease Prediction')
plt.show()

In [None]:
# Correlation of every df1 column with the target, strongest positive first,
# shown as a single-column heatmap.
target_correlations = df1.corrwith(df1['target'])
corr = target_correlations.sort_values(ascending = False).to_frame(name = 'Correlations')
fig, ax = plt.subplots(figsize = (5,5))
sns.heatmap(corr,annot = True,cmap = 'viridis',linewidths = 0.4,linecolor = 'black',ax = ax);
ax.set_title('Correlation w.r.t HeartDisease');

Feature Selection for Categorical Features:
Chi Squared Test:

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Label-encode every categorical column so each one holds non-negative integer
# codes before it is scored with chi2.
le = LabelEncoder()
for categorical_column in categorical_features:
    df1[categorical_column] = le.fit_transform(df1[categorical_column])

# Score the categorical predictors against the label column by name. Taking
# the label as categorical_features[-1] is only right while 'target' is the
# last entry; the list is reassigned earlier in the notebook to one ending in
# 'fbs', which made this test score features against fbs.
predictor_columns = [name for name in categorical_features if name != 'target']
features = df1.loc[:, predictor_columns]
target = df1.loc[:, 'target']

best_features = SelectKBest(score_func=chi2, k='all')
fit = best_features.fit(features, target)

featureScores = pd.DataFrame(
    data=fit.scores_,
    index=list(features.columns),
    columns=['Chi Squared Score']
)

plt.figure(figsize=(6, 6))
sns.heatmap(
    featureScores.sort_values(by='Chi Squared Score', ascending=False),
    annot=True,
    cmap='YlGnBu',
    linewidths=0.4,
    linecolor='black',
    fmt='.2f'
)
plt.title('Selection of Categorical Features (Chi-Square Test)')
plt.show()

Except for RestingECG, all the remaining categorical features are important for predicting heart disease.

Feature Selection for Numerical Features:

ANOVA Test:

In [None]:
from sklearn.feature_selection import f_classif

# ANOVA F-score of each numerical feature against the label column. The label
# is selected by name: categorical_features[-1] is not 'target' once that list
# has been reassigned earlier in the notebook.
features = df1.loc[:,numerical_features]
target = df1.loc[:,'target']

best_features = SelectKBest(score_func = f_classif,k = 'all')
fit = best_features.fit(features,target)

featureScores = pd.DataFrame(data = fit.scores_,index = list(features.columns),columns = ['ANOVA Score'])

plt.subplots(figsize = (5,5))
sns.heatmap(featureScores.sort_values(ascending = False,by = 'ANOVA Score'),annot = True,cmap ='YlGnBu',linewidths = 0.4,linecolor = 'black',fmt = '.2f');
plt.title('Selection of Numerical Features');


Modeling:

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, accuracy_score, precision_recall_curve
from sklearn.metrics import RocCurveDisplay

In [None]:
# Model inputs are every df1 column except the label and 'restecg'; the split
# holds out 20% of the rows with a fixed seed.
# NOTE(review): confirm that no other column was meant to be excluded
# alongside 'restecg'.
excluded_columns = ['target', 'restecg']
features = df1.drop(columns = excluded_columns).values
target = df1['target'].values
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state = 2)

* The features retained after the tests above are selected, and the data is split into 80% training and 20% test sets.

In [None]:
def model(classifier):
    """Fit `classifier` on the training split and report its metrics.

    Reads the globals x_train, y_train, x_test and y_test. Prints hold-out
    accuracy, the mean ROC AUC over repeated stratified 10-fold
    cross-validation on the training split, and the hold-out ROC AUC, then
    draws the ROC curve.

    NOTE: a later cell defines another function named `model`, which replaces
    this one once that cell has run.
    """
    classifier.fit(x_train,y_train)
    prediction = classifier.predict(x_test)
    cv = RepeatedStratifiedKFold(n_splits = 10,n_repeats = 3,random_state = 1)

    # ROC AUC is a ranking metric, so it is computed from continuous scores.
    # Hard 0/1 predictions would reduce the curve to a single operating point
    # and disagree with the AUC shown on the plot.
    if hasattr(classifier,'predict_proba'):
        scores = classifier.predict_proba(x_test)[:,1]
    else:
        scores = classifier.decision_function(x_test)

    print("Accuracy : ",'{0:.2%}'.format(accuracy_score(y_test,prediction)))
    print("Cross Validation Score : ",'{0:.2%}'.format(cross_val_score(classifier,x_train,y_train,cv = cv,scoring = 'roc_auc').mean()))
    print("ROC_AUC Score : ",'{0:.2%}'.format(roc_auc_score(y_test,scores)))
    # plot_roc_curve is never imported in this notebook; RocCurveDisplay is.
    RocCurveDisplay.from_estimator(classifier, x_test,y_test)
    plt.title('ROC_AUC_Plot')
    plt.show()

def model_evaluation(classifier):
    """Show an annotated confusion matrix for the test split and print the classification report.

    Reads the globals x_test, y_test and colors (used as the heatmap colormap).
    Each heatmap cell shows its name, its count and its share of all test rows.
    `classifier` must already be fitted.
    """
    predicted = classifier.predict(x_test)
    cm = confusion_matrix(y_test,predicted)

    cell_names = ['True Neg','False Pos','False Neg','True Pos']
    cell_counts = cm.flatten()
    cell_shares = cell_counts/np.sum(cm)
    labels = np.asarray([
        f'{cell_name}\n{cell_count}\n{cell_share:.2%}'
        for cell_name, cell_count, cell_share in zip(cell_names,cell_counts,cell_shares)
    ]).reshape(2,2)
    sns.heatmap(cm,annot = labels,cmap = colors,fmt ='')

    print(classification_report(y_test,predicted))

1. Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression
# L2-penalized logistic regression; C is the inverse regularization strength,
# so C=10 regularizes more weakly than the default of 1.
classifier_lr = LogisticRegression(random_state = 0,C=10,penalty= 'l2')

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import RocCurveDisplay, accuracy_score, classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt

def model(classifier):
    """Fit `classifier` on the training split and report its metrics.

    Reads the globals x_train, y_train, x_test and y_test. Prints hold-out
    accuracy, the classification report, the confusion matrix, the mean ROC
    AUC over repeated stratified 10-fold cross-validation on the training
    split, and the hold-out ROC AUC, then draws the ROC curve.
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    classifier.fit(x_train, y_train)
    prediction = classifier.predict(x_test)

    # ROC AUC is a ranking metric, so it is computed from continuous scores.
    # Hard 0/1 predictions would reduce the curve to a single operating point
    # and make the printed value disagree with the AUC shown on the plot.
    if hasattr(classifier, 'predict_proba'):
        scores = classifier.predict_proba(x_test)[:, 1]
    else:
        scores = classifier.decision_function(x_test)

    print("Accuracy Score:", '{0:.2%}'.format(accuracy_score(y_test, prediction)))
    print("Classification Report:\n", classification_report(y_test, prediction))
    print("Confusion Matrix:\n", confusion_matrix(y_test, prediction))
    print("Cross Validation Score:", '{0:.2%}'.format(cross_val_score(classifier, x_train, y_train, cv=cv, scoring='roc_auc').mean()))
    print("ROC_AUC Score:", '{0:.2%}'.format(roc_auc_score(y_test, scores)))

    disp = RocCurveDisplay.from_estimator(classifier, x_test, y_test)

    # Recolor the curve and rebuild the legend so its text matches the line.
    disp.line_.set_color("hotpink")
    plt.legend([disp.line_], [disp.line_.get_label()], loc='lower right', labelcolor='hotpink')

    plt.title('ROC_AUC Plot')
    plt.show()

model(classifier_lr)

In [None]:
# Confusion-matrix heatmap and classification report for the fitted logistic regression.
model_evaluation(classifier_lr)

2. Support Vector Classifier:

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with a linear kernel and regularization parameter C = 0.1.
classifier_svc = SVC(kernel = 'linear',C = 0.1)

In [None]:
# Fit the SVC on the training split and print/plot its metrics.
model(classifier_svc)

In [None]:
# Confusion-matrix heatmap and classification report for the fitted SVC.
model_evaluation(classifier_svc)

3. Decision Tree Classifier:

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree limited to depth 4, with a fixed seed for repeatable results.
classifier_dt = DecisionTreeClassifier(random_state = 1000,max_depth = 4,min_samples_leaf = 1)

In [None]:
# Fit the decision tree on the training split and print/plot its metrics.
model(classifier_dt)

In [None]:
# Confusion-matrix heatmap and classification report for the fitted decision tree.
model_evaluation(classifier_dt)

4. Random Forest Classifier:

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest whose trees are limited to depth 4, with a fixed seed for repeatable results.
classifier_rf = RandomForestClassifier(max_depth = 4,random_state = 0)

In [None]:
# Fit the random forest on the training split and print/plot its metrics.
model(classifier_rf)

5. K-nearest Neighbors Classifier:

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbours classifier; p = 1 selects the Manhattan (L1) distance.
classifier_knn = KNeighborsClassifier(leaf_size = 1, n_neighbors = 3,p = 1)

In [None]:
# Fit the k-NN classifier on the training split and print/plot its metrics.
model(classifier_knn)

In [None]:
# Confusion-matrix heatmap and classification report for the fitted k-NN classifier.
model_evaluation(classifier_knn)