In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the heart-attack risk dataset from a CSV located relative to the
# current working directory, then display the resulting frame.
data = pd.read_csv(filepath_or_buffer='heart_attack_risk_prediction_1.csv')
data

Unnamed: 0,Age,Gender,Diabetes,Hypertension,Obesity,Smoking,Alcohol_Consumption,Physical_Activity,Diet_Score,Cholesterol_Level,Triglyceride_Level,Systolic_BP,Diastolic_BP,Air_Pollution_Exposure,Family_History,Stress_Level,Heart_Attack_History,Heart_Attack_Risk
0,42,0,0,0,1,1,0,0,9,248,125,93,119,1,0,4,0,0
1,26,1,0,0,0,0,1,1,4,272,51,134,115,0,0,7,0,0
2,78,1,0,0,1,0,0,1,6,268,213,104,117,0,1,10,0,0
3,58,1,1,0,1,0,0,1,9,224,250,91,65,0,0,1,1,0
4,22,1,0,0,0,0,0,1,5,277,129,141,109,0,0,9,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,20,0,0,0,0,1,0,1,6,160,238,133,74,1,1,10,0,0
9996,34,0,0,0,0,0,0,0,5,257,241,124,78,0,1,1,0,1
9997,76,1,0,0,0,0,0,0,2,205,69,151,98,1,0,1,1,0
9998,52,1,0,0,0,0,0,1,5,155,288,127,60,1,0,7,0,0


In [5]:
# Separate the target column from the remaining feature columns.
y = data['Heart_Attack_Risk']
x = data.drop('Heart_Attack_Risk', axis=1)

# Display both shapes to confirm the row counts match.
(x.shape, y.shape)

((10000, 17), (10000,))

In [7]:
# The features are divided into two groups: binary columns and continuous
# numerical columns, so that scaling can be applied to the numerical group only.

# A feature is treated as binary when it holds exactly two distinct values.
unique_counts = x.nunique()
binary_columns = unique_counts[unique_counts == 2].index.tolist()
print("Binary columns:\n", binary_columns)

Binary columns:
 ['Gender', 'Diabetes', 'Hypertension', 'Obesity', 'Smoking', 'Alcohol_Consumption', 'Physical_Activity', 'Air_Pollution_Exposure', 'Family_History', 'Heart_Attack_History']


In [9]:
# Select feature columns having continuous (more than two distinct) numerical values.
# The candidates are taken from the feature frame `x`, not the raw `data`, so the
# target column can never end up in the list of columns to be scaled; every name
# collected here is therefore guaranteed to exist in the feature frame.

numerical_columns = [col for col in x.columns if x[col].nunique() > 2]
print("Numerical columns:\n", numerical_columns)

Numerical columns:
 ['Age', 'Diet_Score', 'Cholesterol_Level', 'Triglyceride_Level', 'Systolic_BP', 'Diastolic_BP', 'Stress_Level']


In [11]:
# Hold out 30% of the rows as a test set. The split happens BEFORE scaling so
# that the scaler can be fitted on the training rows alone.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,  # fixed seed so the same split is produced on every run
)

In [13]:
# Shapes of the four pieces produced by the train/test split
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((7000, 17), (3000, 17), (7000,), (3000,))

In [15]:
# Scale only the numerical columns, AFTER the split
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# The scaling parameters are learned from the training rows only; the test
# rows are transformed with those same parameters.
X_train_scaled_numerical = scaler.fit_transform(X_train[numerical_columns])
X_test_scaled_numerical = scaler.transform(X_test[numerical_columns])

# Rebuild the final feature frames: scaled numerical columns first, followed by
# the untouched binary columns, each keeping its split's original row index.
X_train_final = pd.concat(
    [
        pd.DataFrame(X_train_scaled_numerical, columns=numerical_columns, index=X_train.index),
        X_train[binary_columns],
    ],
    axis=1,
)

X_test_final = pd.concat(
    [
        pd.DataFrame(X_test_scaled_numerical, columns=numerical_columns, index=X_test.index),
        X_test[binary_columns],
    ],
    axis=1,
)

In [17]:
# Shapes of the rebuilt feature frames alongside the label vectors
tuple(part.shape for part in (X_train_final, X_test_final, y_train, y_test))

((7000, 17), (3000, 17), (7000,), (3000,))

In [19]:
# Preview the first five rows of the rebuilt test features
X_test_final.head(n=5)

Unnamed: 0,Age,Diet_Score,Cholesterol_Level,Triglyceride_Level,Systolic_BP,Diastolic_BP,Stress_Level,Gender,Diabetes,Hypertension,Obesity,Smoking,Alcohol_Consumption,Physical_Activity,Air_Pollution_Exposure,Family_History,Heart_Attack_History
6252,0.898305,0.9,0.590604,0.610442,0.078652,0.0,0.0,1,1,0,0,1,0,0,1,0,0
4684,0.423729,0.3,0.268456,0.052209,0.314607,0.898305,1.0,0,0,0,0,0,0,0,0,0,0
1731,0.847458,0.5,0.744966,0.184739,0.685393,0.694915,0.0,1,0,0,1,0,0,1,1,0,1
4742,0.355932,0.7,0.899329,0.783133,0.404494,0.779661,0.666667,0,0,0,0,0,1,0,1,0,0
4521,0.305085,1.0,0.724832,0.971888,0.314607,0.779661,0.555556,1,0,0,0,0,0,1,0,0,0


In [21]:
# Preview the first five rows of the rebuilt training features
X_train_final.head(n=5)

Unnamed: 0,Age,Diet_Score,Cholesterol_Level,Triglyceride_Level,Systolic_BP,Diastolic_BP,Stress_Level,Gender,Diabetes,Hypertension,Obesity,Smoking,Alcohol_Consumption,Physical_Activity,Air_Pollution_Exposure,Family_History,Heart_Attack_History
9069,0.237288,0.9,0.825503,0.646586,0.685393,0.491525,0.111111,0,1,1,0,0,1,0,0,0,0
2603,0.440678,1.0,0.624161,0.048193,0.730337,0.033898,0.666667,1,1,0,0,0,0,0,0,0,0
7738,0.644068,0.0,0.590604,0.136546,0.449438,0.762712,0.444444,0,0,0,0,0,0,0,1,1,0
1579,0.305085,0.7,1.0,0.24498,0.179775,0.152542,1.0,0,0,0,0,0,1,1,1,0,0
5058,0.627119,0.0,0.463087,0.883534,0.662921,0.813559,0.666667,1,1,1,0,0,1,1,0,0,0


In [23]:
## Import KNN classification model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [25]:
# Create a 3-nearest-neighbours classifier and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_final, y_train)

# TODO: performance on the training data is not computed in this cell.

# Predict labels for the held-out test split
y_pred = knn.predict(X_test_final)

# Compare the predictions with the true test labels
class_report = classification_report(y_test, y_pred)  # precision, recall, F1 per class
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [27]:
# Print the test-set metrics, one item per line
print(
    f"Accuracy: {accuracy}",
    "Confusion Matrix:",
    conf_matrix,
    "Classification Report:",
    class_report,
    sep="\n",
)

Accuracy: 0.614
Confusion Matrix:
[[1652  445]
 [ 713  190]]
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.79      0.74      2097
           1       0.30      0.21      0.25       903

    accuracy                           0.61      3000
   macro avg       0.50      0.50      0.49      3000
weighted avg       0.58      0.61      0.59      3000



**Observations from KNN Classifier:**

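Accuracy: 61.4% — but this is misleading due to class imbalance: always predicting class 0 would already score 2097/3000 ≈ 69.9%, so the model is actually below the majority-class baseline.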

Recall for Class 1 (Risk): 21% — which is quite low, meaning the model is missing most of the actual "risky" patients.

Precision for Class 1: 30% — low, meaning that about 70% of the patients predicted as "risky" (445 of 635) are false positives.

Confusion Matrix:

445 false positives and 713 false negatives — indicating poor sensitivity toward the minority class (which is often more critical in health-related use cases).

F1-Score (class 1): 0.25 — low because both precision and recall for the "risky" class are poor.