In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

In [7]:
# Load dataset
df = pd.read_csv('HeartDiseaseTrain-Test.csv')

In [9]:
# Separate features and target
X = df.drop('target', axis=1)
y = df['target']


In [11]:
# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

In [13]:
# Preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # fill missing with mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [15]:
# Combine preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [17]:
# Choose a model (Random Forest or Logistic Regression)
rf_model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))])

lr_model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(max_iter=1000, random_state=42))])


In [19]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
# Train models
rf_model.fit(X_train, y_train)
lr_model.fit(X_train, y_train)


In [22]:
# Predict probabilities
rf_pred_proba = rf_model.predict_proba(X_test)[:, 1]
lr_pred_proba = lr_model.predict_proba(X_test)[:, 1]


In [25]:
# Evaluate
print("Random Forest ROC-AUC:", roc_auc_score(y_test, rf_pred_proba))
print("Logistic Regression ROC-AUC:", roc_auc_score(y_test, lr_pred_proba))


Random Forest ROC-AUC: 1.0
Logistic Regression ROC-AUC: 0.9055777650866171


In [27]:
# Example: Risk score for a single patient
sample_patient = X_test.iloc[0:1]
rf_risk_score = rf_model.predict_proba(sample_patient)[0][1]
lr_risk_score = lr_model.predict_proba(sample_patient)[0][1]


In [29]:
print("Random Forest risk score:", rf_risk_score)
print("Logistic Regression risk score:", lr_risk_score)

Random Forest risk score: 0.99
Logistic Regression risk score: 0.9863440945603338
