In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib
import numpy as np
import os

In [11]:
# Read the heart-disease CSV into `data`; if the file is missing, report it
# and leave `data` as None so the guarded cells below do nothing.
DATA_PATH = '../datasets/heart_disease_uci.csv'

try:
    data = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    # Sentinel value: later cells test `data is not None` before running.
    data = None
    print(f"Error: '{DATA_PATH}' not found.")
    print("Please download the dataset and place it in the '../datasets' directory.")
else:
    print("Dataset loaded successfully.")
    print(data.head())

Dataset loaded successfully.
   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable defect    1  
3             normal    0  
4      

In [4]:
# Show the dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(920, 16)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,861.0,890.0,865.0,858.0,309.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.676375,0.995652
std,265.725422,9.424685,19.06607,110.78081,25.926276,1.091226,0.935653,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,268.0,157.0,1.5,1.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


In [6]:
# Count the missing entries in each column.
data.isna().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [13]:
# Handle missing values and non-predictive columns
if data is not None:
    # Label corrected: the original message read "after before NA".
    print(f"Dataset shape before dropping NA: {data.shape}")

    # dropna() discards every row that has at least one missing field.
    # NOTE(review): the recorded run shrinks the frame from 920 to 299 rows,
    # mostly because 'ca', 'thal' and 'slope' are sparsely populated; confirm
    # that losing those rows is acceptable versus imputing or dropping columns.
    data = data.dropna()
    print(f"Dataset shape after dropping NA: {data.shape}")

    # Drop the row identifier 'id'; errors='ignore' lets the cell be re-run
    # after the column is already gone.
    # NOTE(review): the previous comment also mentioned dropping 'origin', but
    # only 'id' is removed here, so the 'dataset' column is still passed on to
    # the later cells - confirm whether that is intended.
    data = data.drop(['id'], axis=1, errors='ignore')

Dataset shape after before NA: (920, 15)
Dataset shape after dropping NA: (299, 15)


In [14]:
# Separate the predictors (X_raw) from the label column 'num' and derive the
# binary target y from it.
if data is not None:
    try:
        y_raw = data['num']
        X_raw = data.drop(columns='num')
    except KeyError:
        print("Error: Could not find 'num' column. Please check your CSV.")
        data = None  # Sentinel so the remaining guarded cells are skipped
    else:
        # Collapse 'num' (0 = no, 1-4 = yes) into a 0/1 label.
        y = y_raw.astype(float).apply(lambda severity: int(severity > 0))

In [15]:
# One-hot encode the categorical predictors with pandas.get_dummies;
# the result is stored in X for the split/scale/train cells.
if data is not None:
    X = pd.get_dummies(X_raw)
    print(
        "Categorical features converted.",
        f"New feature shape: {X.shape}",
        sep="\n",
    )

Categorical features converted.
New feature shape: (299, 28)


In [16]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so
# the split is repeatable.
if data is not None:
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )
    print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

Training samples: 239, Test samples: 60


In [17]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
if data is not None:
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Data scaled.")

Data scaled.


In [18]:
# Fit a logistic-regression classifier on the scaled training features.
if data is not None:
    print("Training the heart disease model...")
    # max_iter is raised to 1000 iterations for the solver.
    model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
    print("Model trained.")

Training the heart disease model...
Model trained.


In [19]:
# Score the fitted model on the held-out test split and report its accuracy.
if data is not None:
    predictions = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
    print(f"Model Accuracy on Test Data: {accuracy * 100:.2f}%")

Model Accuracy on Test Data: 90.00%


In [20]:
# Persist the trained model, the fitted scaler, and the feature column list
# as joblib files under MODEL_DIR.
if data is not None:
    MODEL_DIR = '../models'
    os.makedirs(MODEL_DIR, exist_ok=True)  # no error if the folder already exists

    # Build the three output paths inside MODEL_DIR.
    model_filename, scaler_filename, columns_filename = (
        os.path.join(MODEL_DIR, basename)
        for basename in (
            'heart_disease_model.pkl',
            'heart_disease_scaler.pkl',
            'heart_disease_model_columns.pkl',
        )
    )

    # Write the classifier, then the scaler.
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)

    # Write the encoded feature column names in training order
    # (noted in the notebook as critical for the API).
    model_columns = list(X.columns)
    joblib.dump(model_columns, columns_filename)

    print(f"Model saved to {model_filename}")
    print(f"Scaler saved to {scaler_filename}")
    print(f"Model columns saved to {columns_filename}")

Model saved to ../models\heart_disease_model.pkl
Scaler saved to ../models\heart_disease_scaler.pkl
Model columns saved to ../models\heart_disease_model_columns.pkl
