In [None]:
# Basic libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report

In [None]:
# Read the dataset from a CSV file (path is relative to the working directory)
csv_path = "heart_cleveland_upload.csv"
df = pd.read_csv(csv_path)

In [None]:
# View the first rows of the loaded frame as a quick sanity check of the columns and values
df.head()

In [None]:
# Check shape: a (number of rows, number of columns) tuple
df.shape

In [None]:
# Check datatypes: prints each column's dtype and non-null count
df.info()

In [None]:
# Count the missing entries in every column
df.isna().sum()

In [None]:
# Fill missing numerical values with the median of their own column.
# The filled column is assigned back to df instead of calling
# fillna(..., inplace=True) on the df[col] selection: the chained in-place form
# is deprecated in pandas 2.x and does not update df under Copy-on-Write.
# NOTE(review): the medians are computed on the full frame, i.e. before the
# train/test split performed later in the notebook — confirm this is acceptable.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Target distribution: number of rows for each value of the 'condition' column
ax = sns.countplot(data=df, x='condition')
ax.set_title("Target Distribution (0 = No Disease, 1 = Disease)")
plt.show()

In [None]:
# Correlation heatmap: pairwise correlations between the columns of df,
# drawn on an explicitly created 12x8 figure
fig, ax = plt.subplots(figsize=(12, 8))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Separate the target column from the remaining feature columns
target_col = "condition"
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
#Train-Test Split
# Hold out 20% of the rows for testing. stratify=y keeps the proportion of each
# target class the same in the training and test splits, so the evaluation is
# not skewed by an unlucky shuffle; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Preview the first rows of the training features instead of rendering the whole frame
X_train.head()

In [None]:
# Preview the first rows of the test features instead of rendering the whole frame
X_test.head()

In [None]:
# Preview the first training labels instead of rendering the whole series
y_train.head()

In [None]:
# Preview the first test labels instead of rendering the whole series
y_test.head()

In [None]:
#Scaling (important for Logistic Regression)
# StandardScaler is already imported in the notebook's import cell, so it is
# not imported a second time here.
scaler = StandardScaler()

# Learn the scaling statistics from the training split only, then apply the
# same transformation to the test split so no test data influences the scaler.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic Regression: train on the scaled training features.
# fit() returns the estimator itself, so construction and fitting are chained;
# the trailing expression shows the fitted estimator as the cell output.
log_reg = LogisticRegression().fit(X_train_scaled, y_train)
log_reg

In [None]:
# Decision Tree: train on the unscaled training features with a fixed seed.
# fit() returns the estimator itself, so construction and fitting are chained;
# the trailing expression shows the fitted estimator as the cell output.
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_clf

In [None]:
# Logistic Regression: predict on the scaled test features and report the
# fraction of test labels predicted correctly
y_pred_lr = log_reg.predict(X_test_scaled)
acc_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print("Logistic Regression Accuracy:", acc_lr)

In [None]:
# Text report of the per-class metrics for the logistic-regression predictions
report_lr = classification_report(y_test, y_pred_lr)
print(report_lr)

In [None]:
# Decision Tree: predict on the (unscaled) test features and report the
# fraction of test labels predicted correctly
y_pred_dt = dt_clf.predict(X_test)
acc_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print("Decision Tree Accuracy:", acc_dt)

In [None]:
# Text report of the per-class metrics for the decision-tree predictions
report_dt = classification_report(y_test, y_pred_dt)
print(report_dt)

In [None]:
# Confusion matrix of the logistic-regression predictions against the test
# labels, drawn as an annotated heatmap with integer cell counts
cm = confusion_matrix(y_test, y_pred_lr)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix — Logistic Regression")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# ROC curve and AUC for logistic regression.
# Column 1 of predict_proba is used as the score, i.e. the probability of the
# second entry in log_reg.classes_.
y_proba_lr = log_reg.predict_proba(X_test_scaled)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_proba_lr)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line
ax.plot([0,1],[0,1],'--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve — Logistic Regression")
ax.legend()
plt.show()

In [None]:
# Decision-tree feature importances, labelled by feature column name and
# plotted as horizontal bars in ascending order of importance
feat_imp = pd.Series(dt_clf.feature_importances_, index=X.columns)
ax = feat_imp.sort_values().plot.barh()
ax.set_title("Decision Tree Feature Importance")
plt.show()

In [None]:
import pickle

# Persist the trained logistic-regression model.
with open("heart_disease_model.pkl", "wb") as file:
    pickle.dump(log_reg, file)

# log_reg was fitted on features transformed by `scaler`, so the fitted scaler
# is saved alongside it: anything that loads the model must apply
# scaler.transform() to raw features before calling predict()/predict_proba().
# The model file above is unchanged; the scaler goes into a separate file.
# Only unpickle these files from a trusted location.
with open("heart_disease_scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)