<a href="https://colab.research.google.com/github/SHEETALDHARASHAN/Heart-Disease-prediction-ml-model/blob/main/HDP_ml_model%20with%20bp%20col.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = 'heart_disease_data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())

# Preprocess the data
# Fill missing values with each column's mean. numeric_only=True keeps the
# mean computation from raising if the CSV contains a non-numeric column.
data.fillna(data.mean(numeric_only=True), inplace=True)

# Select specified features
selected_features = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs']
X = data[selected_features]
y = data['target']

# Split the data into training and test sets BEFORE scaling, so the held-out
# rows never contribute to the scaler's mean/variance (avoids data leakage
# into the test metrics).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize/scale the features: fit on the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix (using the training-set statistics),
# kept for code that operates on the whole dataset.
X_scaled = scaler.transform(X)

# Candidate classifiers, keyed by the display name used when reporting
# results. Insertion order determines the order in which they are evaluated.
models = {}
models['Logistic Regression'] = LogisticRegression()
models['Random Forest'] = RandomForestClassifier(random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)
models['k-NN'] = KNeighborsClassifier(n_neighbors=5)

# Train and evaluate each model.
# best_model ends up as the estimator (fitted on X_train) whose mean 5-fold
# cross-validation accuracy is highest.
best_model = None
# Start below any possible accuracy so the first scored model is always
# selected and best_model cannot stay None.
best_score = float('-inf')
evaluation_results = {}

for model_name, model in models.items():
    # Train the model on the (scaled) training split
    model.fit(X_train, y_train)

    # Make predictions on the held-out split
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Cross-validation on the RAW features with scaling inside the pipeline,
    # so the scaler is re-fitted on each fold's training part only and the
    # validation part never leaks into preprocessing. cross_val_score clones
    # the pipeline, so the `model` fitted above is left untouched.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_scores = cross_val_score(cv_pipeline, X, y, cv=5, scoring='accuracy')
    cv_score_mean = cv_scores.mean()

    evaluation_results[model_name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': conf_matrix,
        'cv_accuracy': cv_score_mean,
    }

    print(f"\n{model_name} Evaluation Metrics:")
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')
    print(f'Confusion Matrix:\n{conf_matrix}')
    print(f'Cross-Validation Accuracy: {cv_score_mean}')

    # Update the best model based on cross-validation score
    if cv_score_mean > best_score:
        best_score = cv_score_mean
        best_model = model

print("\nBest model based on cross-validation accuracy:")
print(best_model)

# Function to predict heart disease probability for new data input
def predict_heart_disease(input_data, model, scaler, features):
    """Return the predicted probability of heart disease for one patient.

    Args:
        input_data: Feature values for a single patient, either a dict keyed
            by feature name or a sequence ordered like ``features``.
        model: Fitted classifier exposing ``predict_proba``.
        scaler: Fitted transformer exposing ``transform``; applied to the
            input before it is passed to ``model``.
        features: Feature names, in the column order the scaler and model
            expect.

    Returns:
        The value in column 1 of the first row returned by
        ``model.predict_proba``.

    Raises:
        ValueError: If ``input_data`` is a dict that lacks one or more of
            ``features``.
    """
    # A dict with a missing key would otherwise become a NaN column silently
    # when the DataFrame is built with an explicit column list.
    if isinstance(input_data, dict):
        missing = [name for name in features if name not in input_data]
        if missing:
            raise ValueError(
                f"input_data is missing required feature(s): {missing}"
            )

    # Convert the input data to a one-row DataFrame with columns in the
    # order given by `features` (extra dict keys are dropped).
    input_df = pd.DataFrame([input_data], columns=features)

    # Scale the input data
    input_scaled = scaler.transform(input_df)

    # Get the probability of heart disease
    probability = model.predict_proba(input_scaled)[0][1]

    return probability

# Example usage with the best model: a single hypothetical patient described
# by the same six features the models were trained on.
new_patient_data = dict(
    age=45,
    sex=1,         # 1: male, 0: female
    cp=2,          # chest pain type
    trestbps=130,  # resting blood pressure
    chol=250,      # serum cholesterol
    fbs=0,         # fasting blood sugar > 120 mg/dl
)

# Predict the probability with the best model and report it as a percentage
probability = predict_heart_disease(
    input_data=new_patient_data,
    model=best_model,
    scaler=scaler,
    features=selected_features,
)
print('The probability of having heart disease is {:.2f}%'.format(probability * 100))


   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   48    1   0       130   256    1        0      150      1      0.0      2   
1   61    1   0       148   203    0        1      161      0      0.0      2   
2   44    0   2       118   242    0        1      149      0      0.3      1   
3   47    1   0       110   275    0        0      118      1      1.0      1   
4   56    1   3       120   193    0        0      162      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   1     3       0  
2   1     2       1  
3   1     2       0  
4   0     3       1  

Logistic Regression Evaluation Metrics:
Accuracy: 0.7704918032786885
Precision: 0.8142857142857143
Recall: 0.7916666666666666
F1 Score: 0.8028169014084506
Confusion Matrix:
[[37 13]
 [15 57]]
Cross-Validation Accuracy: 0.763961522828885

Random Forest Evaluation Metrics:
Accuracy: 0.9508196721311475
Precision: 0.9459459459459459
Recall: 0.9722222222222222
F1 Score: 0.9589041095