In [35]:
!pip install pandas numpy scikit-learn



In [1]:
import pickle
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
# Read the raw heart-disease table from disk and preview its first rows.
raw_csv_path = "../../data/raw/heart-disease.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Check whether the classes are balanced: with a 0/1 target, the column sum
# divided by the row count is the share of positive (target == 1) rows.
positive_count = df["target"].sum()
print(positive_count / len(df))

0.5445544554455446


In [4]:
# Count missing cells per column, then total them over the whole frame.
missing_per_column = df.isna().sum()
print(missing_per_column.sum())

0


In [5]:
# Separate the label column (y) from the predictor columns (X).
y = df['target']
X = df.drop(columns='target')

In [6]:
# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features. The scaler's statistics are learned from the
# training split only and then applied to both splits, so no information from
# the test rows enters the scaling.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# Hyperparameter search space for the KNN classifier.
# np.arange(1, 10) yields k = 1..9 (the upper bound is exclusive).
# NOTE(review): `algorithm` selects the neighbour-search backend; it presumably
# does not change predictions, so this axis may only add search cost — confirm.
param_grid = dict(
    n_neighbors=np.arange(1, 10),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

# Base estimator; the search overrides the hyperparameters listed above.
knn = KNeighborsClassifier()

# Exhaustive cross-validated search over the grid, ranked by accuracy.
# `fit` returns the search object itself, so construction and fitting chain.
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
).fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best Hyperparameters:", grid.best_params_)
print("Best Accuracy:", grid.best_score_)

Best Hyperparameters: {'algorithm': 'auto', 'n_neighbors': np.int64(5), 'weights': 'uniform'}
Best Accuracy: 0.8141156462585034


In [8]:
# Evaluate the tuned model on the held-out test split.
# `grid` was built with GridSearchCV's default refit=True, so `best_estimator_`
# is already a KNeighborsClassifier refit on the full (X_train, y_train) with
# the winning hyperparameters. Reusing it avoids a redundant retrain and keeps
# this cell valid if keys are added to or removed from `param_grid`.
best_knn = grid.best_estimator_
y_pred = best_knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9016393442622951


In [9]:
# Per-class precision, recall and F1 for the test-split predictions.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.93      0.90        29
           1       0.93      0.88      0.90        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61



In [11]:
# Save the tuned KNN model.
# The parent directory is created first so a fresh checkout (without a
# pre-existing models/ folder) does not fail with FileNotFoundError.
model_path = Path('../../models/model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as file:
    pickle.dump(best_knn, file)

In [12]:
# Save the fitted scaler next to the model: the classifier was trained on
# standardised features, so the same transform is needed on new inputs.
# The parent directory is created first so this cell also works on its own.
scaler_path = Path('../../models/scaler.pkl')
scaler_path.parent.mkdir(parents=True, exist_ok=True)
with scaler_path.open('wb') as file:
    pickle.dump(scaler, file)