In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Read the heart-disease CSV into a DataFrame.
file_path = '/content/heart disease prediction.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# 'HeartDisease' is the label to predict; every remaining column is a feature.
y = data['HeartDisease']
X = data.drop(columns='HeartDisease')

# Partition the feature columns by dtype so each group gets suitable
# preprocessing. 'number' matches every numeric dtype (int32, float32, ...),
# not only int64/float64, and the categorical group is its exact complement.
# A column that belonged to neither group would be discarded without warning
# by ColumnTransformer's default remainder='drop'.
# NOTE(review): non-numeric columns that are not really categorical (e.g.
# datetimes) would now be one-hot encoded instead of dropped -- convert or
# remove such columns before this step if the dataset ever gains them.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

# Numerical features: fill missing values with the column mean, then
# standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps prediction from failing
# on a category that was not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own transformer; columns are selected
# by name.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Random forest of 100 trees; random_state is pinned so reruns are repeatable.
model = RandomForestClassifier(random_state=42, n_estimators=100)

# Chain preprocessing and the classifier so that fit/predict always apply the
# same transformations before the model sees the data.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Hold out 20% of the rows for evaluation; the pinned seed keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit the preprocessing steps and the classifier on the training portion only.
print("Training the model...")
pipeline.fit(X_train, y_train)
print("Model training completed.")

# Predict labels for the held-out rows.
y_pred = pipeline.predict(X_test)

# Score the predictions against the true test labels.
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the metrics: accuracy on one line, then each multi-line result
# under its own heading.
print(f"Accuracy: {accuracy}")
for heading, body in (
    ("Confusion Matrix:", conf_matrix),
    ("Classification Report:", class_report),
):
    print(heading)
    print(body)

# Predict heart disease for a new, unseen patient.
# The values below are placeholders -- replace each one with the actual
# measurement. The keys must use the same column names as the training
# features.
new_patient = {
    'Age': 45,
    'Sex': 'M',
    'ChestPainType': 'ATA',
    'RestingBP': 130,
    'Cholesterol': 237,
    'FastingBS': 0,
    'RestingECG': 'Normal',
    'MaxHR': 170,
    'ExerciseAngina': 'N',
    'Oldpeak': 0,
    'ST_Slope': 'Up',
}

# A single record becomes a one-row DataFrame, the input format the pipeline
# was fitted on.
new_data = pd.DataFrame([new_patient])

# Run the record through the fitted preprocessing + model pipeline.
predicted_status = pipeline.predict(new_data)

print(f"Predicted Heart Disease Status: {predicted_status[0]}")


Training the model...
Model training completed.
Accuracy: 0.8804347826086957
Confusion Matrix:
[[67 10]
 [12 95]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.87      0.86        77
           1       0.90      0.89      0.90       107

    accuracy                           0.88       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.88      0.88      0.88       184

Predicted Heart Disease Status: 0
