In [None]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings

# NOTE(review): this silences *every* warning for the rest of the session,
# including scikit-learn convergence warnings. Kept for backward compatibility;
# consider narrowing it to the specific categories that are known to be noisy.
warnings.filterwarnings('ignore')


# --- Load and impute -------------------------------------------------------
df = pd.read_csv('dataset.csv')
# Fill gaps with the per-column median. numeric_only=True keeps this working
# if the CSV ever contains a non-numeric column.
df = df.fillna(df.median(numeric_only=True))


# --- Feature definition ----------------------------------------------------
# Continuous features that get standardised; the prediction cell reuses this list.
columns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
# Integer-coded categorical features that get one-hot encoded.
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

dataset = pd.get_dummies(df, columns=categorical_columns)


# X keeps the *unscaled* feature frame; the prediction cell only relies on
# X.columns to align new records with the training layout.
X = dataset.drop('target', axis=1)
y = dataset['target']


# --- Hold-out split --------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both splits.
# Fitting before the split would leak test-set statistics into training.
# Explicit copies so the assignments never write into a view of X.
scaler = StandardScaler()
X_train = X_train.copy()
X_test = X_test.copy()
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])


# --- Candidate models ------------------------------------------------------
models = {
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=12),
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

best_model = None
best_model_name = None
best_accuracy = 0
best_cv_score = float('-inf')  # any real score beats this, so a model is always selected

for model_name, model in models.items():
    # Cross-validate on the training split only so the hold-out rows never
    # influence model selection. cross_val_score works on clones, so `model`
    # itself is still unfitted afterwards.
    # NOTE(review): the scaler was fitted on the whole training split, so the
    # CV folds still share scaling statistics; wrap scaler + model in a
    # Pipeline if fold-level isolation is required.
    cv_scores = cross_val_score(model, X_train, y_train, cv=10)
    mean_cv_score = cv_scores.mean()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)

    print(f"{model_name} - Test Accuracy: {accuracy * 100:.2f}%, Cross-Validation Score: {mean_cv_score * 100:.2f}%")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Classification Report:\n{cr}")

    # Select on the cross-validation score, not on test accuracy: choosing the
    # winner by its test score would make the reported test accuracy
    # optimistically biased.
    if mean_cv_score > best_cv_score:
        best_model = model
        best_model_name = model_name
        best_accuracy = accuracy
        best_cv_score = mean_cv_score

# Persist the selected estimator. It expects inputs that are already one-hot
# encoded, ordered like X.columns and scaled with `scaler`.
joblib.dump(best_model, 'best_model.pkl')

print("\nBest Model:")
print(f"Model: {best_model_name}")
print(f"Test Accuracy: {best_accuracy * 100:.2f}%")
print(f"Mean Cross-Validation Score: {best_cv_score * 100:.2f}%")


K-Nearest Neighbors - Test Accuracy: 90.16%, Cross-Validation Score: 84.48%
Confusion Matrix:
[[26  3]
 [ 3 29]]
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90        29
           1       0.91      0.91      0.91        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61

Logistic Regression - Test Accuracy: 88.52%, Cross-Validation Score: 85.45%
Confusion Matrix:
[[26  3]
 [ 4 28]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.90      0.88        29
           1       0.90      0.88      0.89        32

    accuracy                           0.89        61
   macro avg       0.88      0.89      0.89        61
weighted avg       0.89      0.89      0.89        61

Random Forest - Test Accuracy: 86.89%, Cross-Validation Score: 83.14%
Conf

In [None]:
def predict(input_data, model_path='best_model.pkl'):
    """Predict the target class for a single record.

    Relies on the notebook-level names ``X``, ``scaler`` and
    ``columns_to_scale`` created by the training cell, so that cell must have
    been run in the current kernel first.

    Parameters
    ----------
    input_data : dict
        Mapping of raw feature name -> value for one record. Every categorical
        feature listed below must be present as a key (``pandas.get_dummies``
        raises otherwise).
    model_path : str, optional
        Path of the joblib file to load. Defaults to ``'best_model.pkl'``,
        the file written by the training cell.

    Returns
    -------
    The first (and only) element of ``model.predict`` for the record.
    """
    categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

    # Wrap the single record in a one-row frame.
    record = pd.DataFrame([input_data])

    # One-hot encode the categorical variables. A single row only yields the
    # dummy column for the value it actually holds.
    # NOTE(review): dummy names embed the value's repr (e.g. 'ca_0' vs
    # 'ca_0.0'); confirm the input dtypes match those of the training data.
    encoded = pd.get_dummies(record, columns=categorical_columns)

    # Align with the training layout in one step: columns absent from this
    # record are added as 0, columns unknown to the model are dropped, and the
    # order follows X.columns. reindex returns a new frame, so the assignments
    # below never write into a slice of another frame.
    features = encoded.reindex(columns=X.columns, fill_value=0)

    # Apply the same scaling as the training data.
    features[columns_to_scale] = scaler.transform(features[columns_to_scale])

    # StandardScaler passes NaN through unchanged. Filling with the median of
    # this one-row frame would be a no-op (the median of a missing value is
    # itself missing), so fall back to 0.0, which is the fitted mean of each
    # feature in standardised space.
    features[columns_to_scale] = features[columns_to_scale].fillna(0.0)

    # Load the persisted model and predict.
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files from a trusted source.
    model = joblib.load(model_path)
    prediction = model.predict(features)
    return prediction[0]

# Example record to classify. Keys are the raw feature names that predict()
# expects; the trailing comments name each feature.
# NOTE(review): the meaning of the individual categorical codes depends on the
# dataset's encoding, which is not visible in this notebook — confirm against
# the data dictionary before interpreting them.
user_input = {
    'age': 20,        # Age
    'sex': 1,         # Sex (1 = male)
    'cp': 0,          # Chest pain type (category code)
    'trestbps': 140,  # Resting blood pressure, mm Hg
    'chol': 228,      # Cholesterol level, mg/dl
    'fbs': 0,         # Fasting blood sugar > 120 mg/dl (flag)
    'restecg': 1,     # Resting electrocardiographic results (category code)
    'thalach': 138,   # Maximum heart rate achieved, bpm
    'exang': 0,       # Exercise induced angina (flag)
    'oldpeak': 1,     # Depression induced by exercise relative to rest
    'slope': 0,       # Slope of the peak exercise ST segment (0 = upward sloping)
    'ca': 0,          # Number of major vessels colored by fluoroscopy
    'thal': 1,        # 'thal' category code — TODO confirm meaning
}

# Run the prediction and display a human-readable label.
prediction = predict(user_input)
diagnosis = 'Heart disease present' if prediction == 1 else 'No heart disease'
print(f"Prediction: {diagnosis}")

Prediction: Heart disease present
