In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
# References: DataCamp and Towards Data Science
# Load the labelled training data from its local CSV path.
train_data_path = '/Users/bahar/Downloads/heart_train.csv'
train_data = pd.read_csv(train_data_path)

# Define categorical and numerical feature columns.
# Categorical columns are inferred from the feature columns only: the label
# ('HeartDisease') is dropped first so it can never end up in the list handed
# to the ColumnTransformer, which is fitted on frames that no longer contain
# the label. With a numeric label the resulting list is unchanged.
categorical_cols = (
    train_data.drop(columns='HeartDisease')
    .select_dtypes(include=['object', 'category'])
    .columns.tolist()
)
# Numerical columns are listed explicitly rather than inferred from dtypes.
numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

# Preprocessing stage shared by every model pipeline below.

# Numeric features: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode. handle_unknown='ignore' is set so that categories not
# seen at fit time do not make transform fail.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its transformer; `remainder` is left at the
# library default.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Separate the label from the features, then hold out 30% of the rows as a
# validation set (fixed seed so the split is reproducible).
y = train_data['HeartDisease']
X = train_data.drop(columns=['HeartDisease'])
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

# Hyper-parameter search for the distance-weighted KNN classifier.
# Every (p, k) pair is scored with k-fold cross-validation on the training
# split only, so the validation split stays untouched for the final check.
distance_metrics = [1, 2]  # Minkowski power p: 1 for Manhattan, 2 for Euclidean
neighbor_counts = range(1, 31)  # candidate values of k (1..30)
cv_folds = 10  # number of cross-validation folds

# Sentinels: -inf / None instead of 0, so that "nothing was selected" is
# distinguishable from a real result and cannot leak into the final model as
# n_neighbors=0.
best_overall_score = -np.inf
best_overall_k = None
best_distance_metric = None

for p in distance_metrics:
    for k in neighbor_counts:
        knn = KNeighborsClassifier(n_neighbors=k, weights='distance', p=p)
        knn_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('knn', knn)])
        scores = cross_val_score(knn_pipeline, X_train, y_train, cv=cv_folds)
        average_score = np.mean(scores)
        print(f"K: {k}, Distance Metric (p): {p}, Cross-Val Accuracy: {average_score}")

        # Strict '>' keeps the first of several tied configurations. A NaN
        # mean (e.g. from a fold that failed to fit) compares False and is
        # therefore never selected.
        if average_score > best_overall_score:
            best_overall_score = average_score
            best_overall_k = k
            best_distance_metric = p

# Fail here with a clear message rather than later with an obscure
# "n_neighbors" error when no configuration yielded a usable score.
if best_overall_k is None:
    raise ValueError(
        "No (k, p) configuration produced a valid cross-validation score; "
        "check the training data and the preprocessing pipeline."
    )

print(f"Best K: {best_overall_k}, Best Distance Metric (p): {best_distance_metric}, Best Score: {best_overall_score}")

# Refit the distance-weighted KNN on the training split using the
# hyper-parameters selected by the cross-validation search.
knn_best = KNeighborsClassifier(
    p=best_distance_metric,
    weights='distance',
    n_neighbors=best_overall_k,
)
knn_best_pipeline = Pipeline(
    [('preprocessor', preprocessor), ('knn', knn_best)]
).fit(X_train, y_train)

# Measure accuracy on the held-out validation split.
knn_preds = knn_best_pipeline.predict(X_valid)
knn_accuracy = accuracy_score(y_valid, knn_preds)
print(f"Weighted KNN Validation Accuracy: {knn_accuracy}")


K: 1, Distance Metric (p): 1, Cross-Val Accuracy: 0.8222222222222222
K: 2, Distance Metric (p): 1, Cross-Val Accuracy: 0.8222222222222222
K: 3, Distance Metric (p): 1, Cross-Val Accuracy: 0.8333333333333334
K: 4, Distance Metric (p): 1, Cross-Val Accuracy: 0.8466666666666667
K: 5, Distance Metric (p): 1, Cross-Val Accuracy: 0.8466666666666667
K: 6, Distance Metric (p): 1, Cross-Val Accuracy: 0.8488888888888889
K: 7, Distance Metric (p): 1, Cross-Val Accuracy: 0.8488888888888889
K: 8, Distance Metric (p): 1, Cross-Val Accuracy: 0.8488888888888889
K: 9, Distance Metric (p): 1, Cross-Val Accuracy: 0.86
K: 10, Distance Metric (p): 1, Cross-Val Accuracy: 0.8666666666666666
K: 11, Distance Metric (p): 1, Cross-Val Accuracy: 0.8711111111111111
K: 12, Distance Metric (p): 1, Cross-Val Accuracy: 0.8799999999999999
K: 13, Distance Metric (p): 1, Cross-Val Accuracy: 0.8666666666666666
K: 14, Distance Metric (p): 1, Cross-Val Accuracy: 0.8644444444444443
K: 15, Distance Metric (p): 1, Cross-Val Ac

In [5]:
# Load the unlabelled test set from its local CSV path.
test_data_path = '/Users/bahar/Downloads/heart_test.csv'
test_data = pd.read_csv(test_data_path)

# Remove bookkeeping columns before prediction. errors='ignore' keeps this
# from raising KeyError when the file was saved without the 'Unnamed: 0'
# column (presumably a leftover CSV index column; confirm against the data).
# 'id' itself is still required for the output frame below.
X_test = test_data.drop(columns=['Unnamed: 0', 'id'], errors='ignore')

# Predictions on the test data using the Weighted KNN model
knn_predictions = knn_best_pipeline.predict(X_test)
predictions_df = pd.DataFrame({
    "id": test_data['id'],
    "output": knn_predictions
})

# NOTE(review): the file name mentions "PCA", but the fitted pipeline consists
# only of the preprocessor and the KNN step; rename the file if that label is
# stale. The path is left unchanged here.
output_path = '/Users/bahar/Desktop/heart_disease_predictions-WKNN-PCA.csv'

# Write one row per test id, without the DataFrame index.
predictions_df.to_csv(output_path, index=False)