In [32]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, f1_score,precision_score, recall_score, mean_squared_error

#https://www.kaggle.com/datasets/oktayrdeki/heart-disease
df = pd.read_csv('Data/heart_disease.csv')


In [3]:
print(f"Dataset Shape {df.shape}")
print(f"Total Records: {df.shape[0]:,}")
print(f"Total Features: {df.shape[1]}")
print(f"Dataset Info: {df.info()}")
print(f"First five rows: {df.head()}")
print(f"Missing values: ${df.isnull().sum()}")
print(f"Basic Stats: {df.describe()}")



Dataset Shape (10000, 21)
Total Records: 10,000
Total Features: 21
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   9971 non-null   float64
 1   Gender                9981 non-null   object 
 2   Blood Pressure        9981 non-null   float64
 3   Cholesterol Level     9970 non-null   float64
 4   Exercise Habits       9975 non-null   object 
 5   Smoking               9975 non-null   object 
 6   Family Heart Disease  9979 non-null   object 
 7   Diabetes              9970 non-null   object 
 8   BMI                   9978 non-null   float64
 9   High Blood Pressure   9974 non-null   object 
 10  Low HDL Cholesterol   9975 non-null   object 
 11  High LDL Cholesterol  9974 non-null   object 
 12  Alcohol Consumption   7414 non-null   object 
 13  Stress Level          9978 non-null   object 
 14  Slee

In [4]:
#You can use obj_df = df.select_dtypes(include=['object']).copy()
# Guide to refrence for encoding valueshttps://pbpython.com/categorical-encoding.html 

print(df.columns)
categroical_values = []
for colum in df.columns:
    if not(pd.api.types.is_numeric_dtype(df[colum])):
        categroical_values.append(colum)
        
print("Categorical Values:",categroical_values)


Index(['Age', 'Gender', 'Blood Pressure', 'Cholesterol Level',
       'Exercise Habits', 'Smoking', 'Family Heart Disease', 'Diabetes', 'BMI',
       'High Blood Pressure', 'Low HDL Cholesterol', 'High LDL Cholesterol',
       'Alcohol Consumption', 'Stress Level', 'Sleep Hours',
       'Sugar Consumption', 'Triglyceride Level', 'Fasting Blood Sugar',
       'CRP Level', 'Homocysteine Level', 'Heart Disease Status'],
      dtype='object')
Categorical Values: ['Gender', 'Exercise Habits', 'Smoking', 'Family Heart Disease', 'Diabetes', 'High Blood Pressure', 'Low HDL Cholesterol', 'High LDL Cholesterol', 'Alcohol Consumption', 'Stress Level', 'Sugar Consumption', 'Heart Disease Status']


In [5]:
for feature in categroical_values:
    print(f"\n{feature.upper()} Distribution:")
    distribution = df[feature].value_counts()
    percentage = df[feature].value_counts(normalize=True) * 100
    for value, count in distribution.items():
        print(f"  {value}: {count:,} ({percentage[value]:.1f}%)")



GENDER Distribution:
  Male: 5,003 (50.1%)
  Female: 4,978 (49.9%)

EXERCISE HABITS Distribution:
  High: 3,372 (33.8%)
  Medium: 3,332 (33.4%)
  Low: 3,271 (32.8%)

SMOKING Distribution:
  Yes: 5,123 (51.4%)
  No: 4,852 (48.6%)

FAMILY HEART DISEASE Distribution:
  No: 5,004 (50.1%)
  Yes: 4,975 (49.9%)

DIABETES Distribution:
  No: 5,018 (50.3%)
  Yes: 4,952 (49.7%)

HIGH BLOOD PRESSURE Distribution:
  Yes: 5,022 (50.4%)
  No: 4,952 (49.6%)

LOW HDL CHOLESTEROL Distribution:
  Yes: 5,000 (50.1%)
  No: 4,975 (49.9%)

HIGH LDL CHOLESTEROL Distribution:
  No: 5,036 (50.5%)
  Yes: 4,938 (49.5%)

ALCOHOL CONSUMPTION Distribution:
  Medium: 2,500 (33.7%)
  Low: 2,488 (33.6%)
  High: 2,426 (32.7%)

STRESS LEVEL Distribution:
  Medium: 3,387 (33.9%)
  Low: 3,320 (33.3%)
  High: 3,271 (32.8%)

SUGAR CONSUMPTION Distribution:
  Low: 3,390 (34.0%)
  High: 3,330 (33.4%)
  Medium: 3,250 (32.6%)

HEART DISEASE STATUS Distribution:
  No: 8,000 (80.0%)
  Yes: 2,000 (20.0%)


In [6]:
encoded_df = df.copy()
encoded_df=encoded_df.dropna()

mapping = {'Low': 0, 'Medium': 1, 'High': 2}

encoded_df["Sugar Consumption"] = encoded_df['Sugar Consumption'].map(mapping)
encoded_df["Exercise Habits"] = encoded_df['Exercise Habits'].map(mapping)
encoded_df["Stress Level"] = encoded_df["Stress Level"].map(mapping)
encoded_df["Alcohol Consumption"] = encoded_df["Alcohol Consumption"].map(mapping)


encoded_df["Gender"] = (df['Gender']=="Male").astype(int)
encoded_df["Smoking"] = (df['Smoking'] == 'Yes').astype(int)
encoded_df["Family Heart Disease"] = (df["Family Heart Disease"] == 'Yes').astype(int)
encoded_df["Diabetes"] = (df["Diabetes"]=="Yes").astype(int)
encoded_df["High Blood Pressure"] = (df["High Blood Pressure"]=="Yes").astype(int)
encoded_df["Low HDL Cholesterol"] = (df["Low HDL Cholesterol"]=="Yes").astype(int)
encoded_df["Heart Disease Status"] = (df["Heart Disease Status"]=="Yes").astype(int)
encoded_df["High LDL Cholesterol"] = (df["High LDL Cholesterol"]=="Yes").astype(int)

print(encoded_df)


       Age  Gender  Blood Pressure  Cholesterol Level  Exercise Habits  \
1     69.0       0           146.0              286.0                2   
2     46.0       1           126.0              216.0                0   
3     32.0       0           122.0              293.0                2   
4     60.0       1           166.0              242.0                0   
5     25.0       1           152.0              257.0                0   
...    ...     ...             ...                ...              ...   
9992  68.0       0           169.0              291.0                1   
9994  73.0       0           144.0              191.0                1   
9995  25.0       0           136.0              243.0                1   
9998  23.0       1           142.0              299.0                0   
9999  38.0       0           128.0              193.0                1   

      Smoking  Family Heart Disease  Diabetes        BMI  High Blood Pressure  \
1           0                 

In [7]:
#Normalize features
encoded_df.describe()

scaler = StandardScaler()
features = ['Age', 'Gender', 'Blood Pressure', 'Cholesterol Level',
       'Exercise Habits', 'Smoking', 'Family Heart Disease', 'Diabetes', 'BMI',
       'High Blood Pressure', 'Low HDL Cholesterol', 'High LDL Cholesterol',
       'Alcohol Consumption', 'Stress Level', 'Sleep Hours',
       'Sugar Consumption', 'Triglyceride Level', 'Fasting Blood Sugar',
       'CRP Level', 'Homocysteine Level']
X=encoded_df[features]
y=encoded_df["Heart Disease Status"].copy()



In [8]:
X_train, x_, y_train, y_ = train_test_split(X, y, test_size=0.40, random_state=1)

# Split the 40% subset above into two: one half for cross validation and the other for the test set
X_cv, X_test, y_cv, y_test = train_test_split(x_, y_, test_size=0.50, random_state=1)

# Delete temporary variables
del x_, y_



In [14]:
scaler_linear = StandardScaler()

x_train_scaled = scaler_linear.fit_transform(X_train)
x_cv_scaled = scaler_linear.transform(X_cv)
x_test_scaled = scaler_linear.transform(X_test)
print("Feature correlations with target:")
print(X_train.corrwith(y_train))
print("Heart Disease Status Distribution:")
print(y_train.value_counts(normalize=True))

Feature correlations with target:
Age                     0.002056
Gender                 -0.013527
Blood Pressure         -0.039485
Cholesterol Level       0.005571
Exercise Habits        -0.015064
Smoking                -0.015775
Family Heart Disease    0.016094
Diabetes               -0.001333
BMI                     0.027830
High Blood Pressure    -0.004309
Low HDL Cholesterol    -0.012282
High LDL Cholesterol    0.005539
Alcohol Consumption    -0.002566
Stress Level            0.019401
Sleep Hours            -0.013204
Sugar Consumption       0.033851
Triglyceride Level     -0.000106
Fasting Blood Sugar    -0.001236
CRP Level              -0.001019
Homocysteine Level      0.005500
dtype: float64
Heart Disease Status Distribution:
Heart Disease Status
0    0.796698
1    0.203302
Name: proportion, dtype: float64


In [36]:
#define 3 Neural Networks 
model_1 = Sequential([
    tf.keras.Input(shape=(20,)),
    Dense(128,activation="relu"),
    Dense(64,activation="relu"),
    Dense(32,activation="relu"),
    Dense(16,activation="relu"),
    Dense(8,activation="relu"),
    Dense(4,activation="relu"),
    Dense(2,activation="relu"),
    Dense(1,activation="linear"),

])

model_2 = Sequential([
    tf.keras.Input(shape=(20,)),
    Dense(32,activation="relu"),
    Dense(16,activation="relu"),
    Dense(8,activation="relu"),
    Dense(1,activation="linear")
])

model_3 = Sequential([
    tf.keras.Input(shape=(20,)),
    Dense(16,activation="relu"),
    Dense(8,activation="relu"),
    Dense(1,activation="linear")
])

model_4 = Sequential([
    tf.keras.Input(shape=(20,)),
    Dense(8,activation="relu"),
    Dense(1,activation="linear")
])

model_5 = Sequential([
    tf.keras.Input(shape=(20,)),
    Dense(8, activation="leaky_relu"),
    Dense(1, activation="linear")
])
models = [model_1,model_2,model_3,model_4,model_5]


nn_train_error = []
nn_cv_error=[]
train_predictions = []  # Store training predictions
cv_predictions = [] 
for model in models:
    
    # Setup the loss and optimizer
    model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    )

    print(f"Training {model.name}...")

    # Train the model
    model.fit(
        X_train, y_train,
        epochs=100,
        verbose=0
    )
    
    print("Done!\n")
    
    # Set the threshold for classification
    threshold = 0.5
    
    # Record the fraction of misclassified examples for the training set
    yhat = model.predict(X_train)
   
    #flatten yhat to make it a 1d arr
    yhat=yhat.flatten()
   
    yhat = tf.math.sigmoid(yhat)
    yhat = np.where(yhat >= threshold, 1, 0)
    train_error = np.mean(yhat != y_train)
    nn_train_error.append(train_error)
    train_predictions.append(yhat)


    # Record the fraction of misclassified examples for the cross validation set
    yhat = model.predict(x_cv_scaled)
    yhat=yhat.flatten()
    yhat = tf.math.sigmoid(yhat)
    yhat = np.where(yhat >= threshold, 1, 0)
    cv_error = np.mean(yhat != y_cv)
    nn_cv_error.append(cv_error)
    cv_predictions.append(yhat)

    
    
  
#Try different learning rate

# Print the result
for model_num in range(len(nn_train_error)):
    print(
        f"Model {model_num+1}: Training Set Classification Error: {nn_train_error[model_num]:.10f}, " +
        f"CV Set Classification Error: {nn_cv_error[model_num]:.10f}"
        )
    


Training sequential_94...
Done!

[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Training sequential_95...
Done!

[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Training sequential_96...
Done!

[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Training sequential_97...
Done!

[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 990us/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Training sequential_98...
Done!

[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Model 1: Training Set Classification Error: 0.2033018868, CV Set Classification Er

In [None]:
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)  # Convert to binary


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1]
0
