### Import Libraries

In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  # Hyperparameyer tuning

from sklearn.preprocessing import MinMaxScaler, StandardScaler  #Feature Scaling

from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

### Data Gathering

In [4]:
df = pd.read_csv('heart.csv')
df

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   gender    303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [6]:
df.describe()

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


#### MinMax Scaler

In [5]:
age_63 = (63-29)/(77-29)
age_63

0.7083333333333334

#### Standrad Scaler

In [7]:
age_63 = (63-54.366337)/(9.082101)
age_63

0.950623980068048

In [8]:
X = df.drop('target', axis=1)
y = df['target']

#### MinMax Scaler (Normalization)

In [10]:
# Create instance of MInMax Scaler (Normalization)
minmax_scaler = MinMaxScaler()
X_scaled = minmax_scaler.fit_transform(X)
# X_scaled
X_minmax_df = pd.DataFrame(X_scaled, columns=X.columns)
X_minmax_df

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,0.708333,1.0,1.000000,0.481132,0.244292,1.0,0.0,0.603053,0.0,0.370968,0.0,0.00,0.333333
1,0.166667,1.0,0.666667,0.339623,0.283105,0.0,0.5,0.885496,0.0,0.564516,0.0,0.00,0.666667
2,0.250000,0.0,0.333333,0.339623,0.178082,0.0,0.0,0.770992,0.0,0.225806,1.0,0.00,0.666667
3,0.562500,1.0,0.333333,0.245283,0.251142,0.0,0.5,0.816794,0.0,0.129032,1.0,0.00,0.666667
4,0.583333,0.0,0.000000,0.245283,0.520548,0.0,0.5,0.702290,1.0,0.096774,1.0,0.00,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.583333,0.0,0.000000,0.433962,0.262557,0.0,0.5,0.396947,1.0,0.032258,0.5,0.00,1.000000
299,0.333333,1.0,1.000000,0.150943,0.315068,0.0,0.5,0.465649,0.0,0.193548,0.5,0.00,1.000000
300,0.812500,1.0,0.000000,0.471698,0.152968,1.0,0.5,0.534351,0.0,0.548387,0.5,0.50,1.000000
301,0.583333,1.0,0.000000,0.339623,0.011416,0.0,0.5,0.335878,1.0,0.193548,0.5,0.25,1.000000


In [11]:
X_minmax_df.describe()

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,0.528465,0.683168,0.322332,0.354941,0.274575,0.148515,0.264026,0.600358,0.326733,0.167678,0.69967,0.182343,0.771177
std,0.18921,0.466011,0.344017,0.165454,0.118335,0.356198,0.26293,0.174849,0.469794,0.18727,0.308113,0.255652,0.204092
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.385417,0.0,0.0,0.245283,0.194064,0.0,0.0,0.477099,0.0,0.0,0.5,0.0,0.666667
50%,0.541667,1.0,0.333333,0.339623,0.260274,0.0,0.5,0.625954,0.0,0.129032,0.5,0.0,0.666667
75%,0.666667,1.0,0.666667,0.433962,0.339041,0.0,0.5,0.725191,1.0,0.258065,1.0,0.25,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Standrad Scaler (Standradization)

In [12]:
std_scaler = StandardScaler()
X_scaled = std_scaler.fit_transform(X)
# X_scaled
X_std_df = pd.DataFrame(X_scaled, columns=X.columns)
X_std_df

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,0.952197,0.681005,1.973123,0.763956,-0.256334,2.394438,-1.005832,0.015443,-0.696631,1.087338,-2.274579,-0.714429,-2.148873
1,-1.915313,0.681005,1.002577,-0.092738,0.072199,-0.417635,0.898962,1.633471,-0.696631,2.122573,-2.274579,-0.714429,-0.512922
2,-1.474158,-1.468418,0.032031,-0.092738,-0.816773,-0.417635,-1.005832,0.977514,-0.696631,0.310912,0.976352,-0.714429,-0.512922
3,0.180175,0.681005,0.032031,-0.663867,-0.198357,-0.417635,0.898962,1.239897,-0.696631,-0.206705,0.976352,-0.714429,-0.512922
4,0.290464,-1.468418,-0.938515,-0.663867,2.082050,-0.417635,0.898962,0.583939,1.435481,-0.379244,0.976352,-0.714429,-0.512922
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.290464,-1.468418,-0.938515,0.478391,-0.101730,-0.417635,0.898962,-1.165281,1.435481,-0.724323,-0.649113,-0.714429,1.123029
299,-1.033002,0.681005,1.973123,-1.234996,0.342756,-0.417635,0.898962,-0.771706,-0.696631,0.138373,-0.649113,-0.714429,1.123029
300,1.503641,0.681005,-0.938515,0.706843,-1.029353,2.394438,0.898962,-0.378132,-0.696631,2.036303,-0.649113,1.244593,1.123029
301,0.290464,0.681005,-0.938515,-0.092738,-2.227533,-0.417635,0.898962,-1.515125,1.435481,0.138373,-0.649113,0.265082,1.123029


In [13]:
X_std_df.describe()

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,4.690051e-17,-1.407015e-16,2.3450260000000003e-17,-7.035077e-16,-1.113887e-16,-2.3450260000000003e-17,1.465641e-16,-6.800574e-16,-4.690051e-17,2.3450260000000003e-17,-1.407015e-16,-2.3450260000000003e-17,-1.641518e-16
std,1.001654,1.001654,1.001654,1.001654,1.001654,1.001654,1.001654,1.001654,1.001654,1.001654,1.001654,1.001654,1.001654
min,-2.797624,-1.468418,-0.9385146,-2.148802,-2.32416,-0.4176345,-1.005832,-3.439267,-0.6966305,-0.8968617,-2.274579,-0.7144289,-3.784824
25%,-0.7572802,-1.468418,-0.9385146,-0.6638668,-0.6814943,-0.4176345,-1.005832,-0.7061105,-0.6966305,-0.8968617,-0.6491132,-0.7144289,-0.5129219
50%,0.06988599,0.6810052,0.03203122,-0.09273778,-0.1210553,-0.4176345,0.8989622,0.1466343,-0.6966305,-0.2067053,-0.6491132,-0.7144289,-0.5129219
75%,0.7316189,0.6810052,1.002577,0.4783913,0.5456738,-0.4176345,0.8989622,0.7151309,1.435481,0.4834512,0.9763521,0.2650822,1.123029
max,2.49624,0.6810052,1.973123,3.905165,6.140401,2.394438,2.803756,2.289429,1.435481,4.451851,0.9763521,3.203615,1.123029


### Train Test Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_minmax_df, y,test_size=0.20, random_state=42)

In [15]:
X_test

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
179,0.583333,1.0,0.000000,0.528302,0.342466,0.0,0.0,0.312977,1.0,0.096774,0.5,0.25,0.333333
228,0.625000,1.0,1.000000,0.716981,0.369863,0.0,0.0,0.671756,0.0,0.032258,0.5,0.00,1.000000
111,0.583333,1.0,0.666667,0.528302,0.000000,1.0,0.5,0.778626,0.0,0.032258,1.0,0.25,1.000000
246,0.562500,0.0,0.000000,0.377358,0.646119,0.0,0.0,0.603053,1.0,0.306452,0.5,0.50,1.000000
60,0.875000,0.0,0.666667,0.150943,0.317352,1.0,0.0,0.450382,0.0,0.000000,1.0,0.25,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,0.833333,1.0,0.666667,0.433962,0.292237,0.0,0.0,0.572519,0.0,0.322581,0.5,0.75,1.000000
104,0.437500,1.0,0.666667,0.330189,0.159817,0.0,0.5,0.702290,0.0,0.000000,1.0,0.00,0.666667
300,0.812500,1.0,0.000000,0.471698,0.152968,1.0,0.5,0.534351,0.0,0.548387,0.5,0.50,1.000000
193,0.645833,1.0,0.000000,0.481132,0.356164,0.0,0.0,0.541985,1.0,0.451613,0.5,0.50,1.000000


### Train Model

In [17]:
knn_class = KNeighborsClassifier(n_neighbors=10, p=2)
knn_class.fit(X_train, y_train)

In [18]:
y_pred = knn_class.predict(X_test)
y_pred

array([0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [19]:
y_test[:5]

179    0
228    0
111    1
246    0
60     1
Name: target, dtype: int64

In [21]:
y_pred[:5]

array([0, 0, 1, 0, 1])

#### Evaluation

In [22]:
#Confusion Matrix
con_matrix = confusion_matrix(y_test, y_pred)
con_matrix

array([[27,  2],
       [ 9, 23]])

In [23]:
#Accuracy
accuracy= accuracy_score(y_test, y_pred)
accuracy

0.819672131147541

In [24]:
# Classification Report
class_report = classification_report(y_test, y_pred)
print(class_report)

              precision    recall  f1-score   support

           0       0.75      0.93      0.83        29
           1       0.92      0.72      0.81        32

    accuracy                           0.82        61
   macro avg       0.83      0.82      0.82        61
weighted avg       0.84      0.82      0.82        61



### Hyperparameter Tuning

#### Grid SearchCV

In [27]:
k = np.arange(2,15)
p = [1,2]
hyper = {'n_neighbors': k, "p":p}

In [28]:
knn = KNeighborsClassifier()
gscv_knn_model = GridSearchCV(knn,hyper,cv=5)
gscv_knn_model.fit(X_train, y_train)

In [29]:
gscv_knn_model.best_params_

{'n_neighbors': np.int64(5), 'p': 1}

#### Randomized Search CV

In [30]:
knn = KNeighborsClassifier()
rscv_knn_model = RandomizedSearchCV(knn,hyp,cv=5)
rscv_knn_model.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [31]:
rscv_knn_model.best_params_

{'p': 1, 'n_neighbors': np.int64(5)}

In [32]:
knn_class_gs = KNeighborsClassifier(n_neighbors=5, p=1)
knn_class_gs.fit(X_train, y_train)

In [33]:
y_pred_gs = knn_class_gs.predict(X_test)
y_pred_gs

array([0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [35]:
#Accuracy
accuracy_gs= accuracy_score(y_test, y_pred_gs)
accuracy_gs

0.819672131147541

In [36]:
knn_class_rs = KNeighborsClassifier(n_neighbors=4, p=1)
knn_class_rs.fit(X_train, y_train)

In [37]:
y_pred_rs = knn_class_rs.predict(X_test)
y_pred_rs

array([0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [38]:
#Accuracy
accuracy_rs= accuracy_score(y_test, y_pred_rs)
accuracy_rs

0.7704918032786885

In [39]:
from sklearn.metrics import roc_auc_score, roc_curve

In [41]:
roc_auc_score(y_test, y_pred)

np.float64(0.8248922413793103)