In [2]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

df = pd.read_csv("data.csv")
profile = ProfileReport(df, title="My Data Report")



In [3]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

np.int64(0)

In [4]:
print("Columns in dataframe:")
print(df.columns.tolist())
print(f"\nShape: {df.shape}")
print(f"\nFirst few rows:")

Columns in dataframe:
['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num       ']

Shape: (293, 14)

First few rows:


In [5]:
df.columns = df.columns.str.strip()

# Replace '?' with NaN across entire dataframe
df = df.replace('?', 0)

# Convert numeric columns to proper numeric type
numeric_cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Fill missing values with median or 0
df = df.fillna(0)

target_name = "num"

x = df.drop(columns="num")
y = df[target_name]
x.head()
df["oldpeak"].value_counts()

oldpeak
0.0    188
1.0     41
2.0     31
1.5     16
3.0      9
2.5      3
0.5      2
0.8      1
4.0      1
5.0      1
Name: count, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import  f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
print("accuracy_score:",accuracy_score(y_test,y_pred))
print("\n F1:\n",f1_score(y_test,y_pred))
print("\n Precision score is:\n",precision_score(y_test,y_pred))
print("\n Recall score is:\n",recall_score(y_test,y_pred))
print( "confusion matrix is:\n",confusion_matrix(y_test,y_pred))

accuracy_score: 0.6610169491525424

 F1:
 0.4117647058823529

 Precision score is:
 0.4375

 Recall score is:
 0.3888888888888889
confusion matrix is:
 [[32  9]
 [11  7]]


In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix

# 1. Change the model to KNN
knn = KNeighborsClassifier()

# 2. Change the parameter grid to KNN-specific settings
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11, 15], # The 'k' value
    'weights': ['uniform', 'distance'],   # How neighbors influence the result
    'metric': ['euclidean', 'manhattan']  # How distance is calculated
}

# 3. Apply GridSearchCV using the knn estimator
grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid_knn,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# Fit the model
grid_search_knn.fit(x_train, y_train)

# Results
print("Best KNN Parameters:", grid_search_knn.best_params_)
best_knn = grid_search_knn.best_estimator_
knn_pred = best_knn.predict(x_test)

# Evaluation
print("accuracy_score:", accuracy_score(y_test, knn_pred))
print("\nF1:", f1_score(y_test, knn_pred))
print("\nPrecision score:", precision_score(y_test, knn_pred))
print("\nRecall score:", recall_score(y_test, knn_pred))
print("confusion matrix is:\n", confusion_matrix(y_test, knn_pred))

Best KNN Parameters: {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'uniform'}
accuracy_score: 0.7627118644067796

F1: 0.5333333333333333

Precision score: 0.6666666666666666

Recall score: 0.4444444444444444
confusion matrix is:
 [[37  4]
 [10  8]]


### ðŸ“ˆ Optimized K-Nearest Neighbors (KNN) Results

After using `GridSearchCV` to tune the hyperparameters, the KNN model showed significant improvement over the baseline.

#### 1. Optimal Hyperparameters
* **Metric:** Manhattan (L1 Distance)
* **Number of Neighbors (k):** 15
* **Weights:** Uniform (all neighbors count equally)

#### 2. Evaluation Metrics
| Metric | Value |
| :--- | :--- |
| **Accuracy Score** | **76.27%** |
| **Precision Score** | **66.67%** |
| **F1 Score** | **53.33%** |
| **Recall Score** | **44.44%** |

#### 3. Confusion Matrix Breakdown
| | Predicted Negative | Predicted Positive |
| :--- | :---: | :---: |
| **Actual Negative** | 37 (TN) | 4 (FP) |
| **Actual Positive** | 10 (FN) | 8 (TP) |

**Key Observation:** The optimized KNN model is very conservative. While it has a low **False Positive** rate (only 4 healthy people were misdiagnosed), its **Recall** is quite low (44.4%). In a medical context, this model might be considered "risky" because it failed to detect 10 out of 18 actual cases of heart disease (False Negatives). 

**Comparison:** While this KNN model has better **Precision** than the Logistic Regression model, the **Logistic Regression** remains the superior choice for this specific task due to its much higher **Recall** (88.8%).