In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('realheart.csv')

In [3]:
df.sample()

Unnamed: 0,age,sex,cp,tresbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
284,61.0,1.0,4.0,148.0,203.0,0.0,0.0,161.0,0.0,0.0,1.0,1,7,1


In [4]:
X = df.drop(columns = 'target')
y = df['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2, random_state = 42)

In [6]:
sc = StandardScaler()

In [7]:
sc.fit_transform(X_train)

array([[-2.83850353,  0.72250438, -1.10468954, ..., -0.96343165,
        -0.71586852, -0.87575498],
       [ 0.24135234,  0.72250438, -0.09205746, ..., -0.96343165,
         0.40181007,  1.20712172],
       [ 1.56129057,  0.72250438, -2.11732162, ...,  0.65566876,
         0.40181007, -0.87575498],
       ...,
       [ 0.46134205,  0.72250438,  0.92057462, ..., -0.96343165,
         0.40181007,  1.20712172],
       [ 0.68133175,  0.72250438,  0.92057462, ..., -0.96343165,
         0.40181007,  1.20712172],
       [ 0.24135234, -1.38407465,  0.92057462, ..., -0.96343165,
         0.40181007, -0.87575498]])

In [8]:
sc.transform(X_test)

array([[-1.98627069e-01,  7.22504380e-01, -9.20574618e-02,
        -1.25981747e-01,  7.89327927e-03,  2.51661148e+00,
         1.01249171e+00,  1.02124161e+00, -7.15891053e-01,
        -8.73572989e-01, -9.63431646e-01,  2.63716726e+00,
        -8.75754976e-01],
       [-8.86322160e-02,  7.22504380e-01,  9.20574618e-01,
        -1.22661650e+00, -8.22616975e-01, -3.97359707e-01,
         1.01249171e+00, -1.87737451e+00,  1.39686059e+00,
        -8.73572989e-01,  6.55668759e-01,  4.01810072e-01,
        -8.75754976e-01],
       [ 1.31357489e-01,  7.22504380e-01,  9.20574618e-01,
        -4.01140435e-01,  7.01815483e-02,  2.51661148e+00,
         1.01249171e+00, -2.71987120e-01,  1.39686059e+00,
         1.38739844e-01,  6.55668759e-01,  4.01810072e-01,
        -8.75754976e-01],
       [ 3.51347195e-01,  7.22504380e-01,  9.20574618e-01,
        -1.77693388e+00, -2.41259797e-01, -3.97359707e-01,
        -9.95893490e-01,  2.63142010e-01, -7.15891053e-01,
        -7.89213586e-01, -9.63431646e

In [27]:
knn = KNeighborsClassifier(n_neighbors = 5, algorithm = 'kd_tree', leaf_size =10 , p = 1, weights = 'distance'
                          , metric = 'manhattan')


In [28]:
knn.fit(X_train, y_train)

In [29]:
y_pred = knn.predict(X_test)

In [30]:
print("Accuracy Score on Test Data is ", accuracy_score(y_pred,y_test))

Accuracy Score on Test Data is  0.7704918032786885


In [13]:
from sklearn.pipeline import Pipeline

scaler = StandardScaler()

knn = KNeighborsClassifier()

pipeline = Pipeline([
    ('scaler', scaler),
    ('knn', knn)
])

param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__weights': ['uniform', 'distance'],
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'knn__leaf_size': [10, 30, 50],
    'knn__p': [1, 2]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.3f}")

best_model = grid_search.best_estimator_
training_accuracy = best_model.score(X_train, y_train)
print(f"Accuracy on the training dataset: {training_accuracy:.3f}")

test_accuracy = best_model.score(X_test, y_test)
print(f"Accuracy on the test dataset: {test_accuracy:.3f}")


Best parameters found: {'knn__algorithm': 'auto', 'knn__leaf_size': 10, 'knn__n_neighbors': 9, 'knn__p': 1, 'knn__weights': 'distance'}
Best cross-validation accuracy: 0.818
Accuracy on the training dataset: 1.000
Accuracy on the test dataset: 0.902


In [14]:

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform


scaler = StandardScaler()

knn = KNeighborsClassifier()

pipeline = Pipeline([
    ('scaler', scaler),
    ('knn', knn)
])

param_dist = {
    'knn__n_neighbors': randint(3, 12),  # Random integers from 3 to 11
    'knn__weights': ['uniform', 'distance'],
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'knn__leaf_size': randint(10, 51),  # Random integers from 10 to 50
    'knn__p': [1, 2]
}

random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)

random_search.fit(X_train, y_train)

print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validation accuracy: {random_search.best_score_:.3f}")

best_model = random_search.best_estimator_
training_accuracy = best_model.score(X_train, y_train)
print(f"Accuracy on the training dataset: {training_accuracy:.3f}")

test_accuracy = best_model.score(X_test, y_test)
print(f"Accuracy on the test dataset: {test_accuracy:.3f}")


Best parameters found: {'knn__algorithm': 'kd_tree', 'knn__leaf_size': 38, 'knn__n_neighbors': 10, 'knn__p': 1, 'knn__weights': 'uniform'}
Best cross-validation accuracy: 0.822
Accuracy on the training dataset: 0.847
Accuracy on the test dataset: 0.902
