## Neural Network Model

In [1]:
# Importing dependencies
import pandas as pd
from pathlib import Path
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf
from sqlalchemy import create_engine

In [2]:
# Connecting to PostgreSQL
engine = create_engine('postgresql://postgres:postgres@localhost:5432/proj4_db')

query = "SELECT * FROM cleaned_data"
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totCHOL,sysBP,diaBP,BMI,heartRate,glucose,CHDRisk
0,1.0,39,4,0.0,0,False,False,False,False,195,106.0,70.0,26.97,80,77,False
1,0.0,46,2,0.0,0,False,False,False,False,250,121.0,81.0,28.73,95,76,False
2,1.0,48,1,1.0,20,False,False,False,False,245,127.5,80.0,25.34,75,70,False
3,0.0,61,3,1.0,30,False,False,True,False,225,150.0,95.0,28.58,65,103,True
4,0.0,46,3,1.0,23,False,False,False,False,285,130.0,84.0,23.1,85,85,False


## Optimization Attempt 1
For the first optimization attempt, we drop features with low PCA loadings from our PCA analysis (sex, education, diabetes, totChol)

In [3]:
df_op1 = df.copy()
df_op1.drop(columns=['sex', 'education', 'diabetes', 'totCHOL'], inplace=True)

In [4]:
# Separating target variables and features
y = df_op1['CHDRisk']
X = df_op1.drop(columns='CHDRisk')

In [5]:
# Splitting the data into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [6]:
y_train.value_counts()

CHDRisk
False    2312
True      415
Name: count, dtype: int64

In [7]:
# Using synthetic minority over-sampling technique to balance the target variable conditions in the training data
smote = SMOTE(random_state=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [8]:
y_train_resampled.value_counts()

CHDRisk
False    2312
True     2312
Name: count, dtype: int64

In [9]:
# Scaling the feature variables
scaler = StandardScaler()
X_scaler = scaler.fit(X_train_resampled)

X_train_scaled = X_scaler.transform(X_train_resampled)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Defining original model
number_input_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# Input layer
nn.add(
    tf.keras.layers.Dense(units=64, input_dim=number_input_features, activation='relu')
)

# Hidden layer 1
nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Hidden layer 2
nn.add(tf.keras.layers.Dense(units=16, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Checking the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [11]:
from tensorflow.keras import metrics

# Compiling the model
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', metrics.Recall()]
)

In [12]:
# Training model
fit_model = nn.fit(X_train_scaled, y_train_resampled, epochs=100)

Epoch 1/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.6523 - loss: 0.6224 - recall: 0.5424
Epoch 2/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7079 - loss: 0.5644 - recall: 0.6974
Epoch 3/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7069 - loss: 0.5577 - recall: 0.6964
Epoch 4/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7222 - loss: 0.5441 - recall: 0.7015
Epoch 5/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7120 - loss: 0.5493 - recall: 0.7101
Epoch 6/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 934us/step - accuracy: 0.7258 - loss: 0.5256 - recall: 0.6982
Epoch 7/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7304 - loss: 0.5238 - recall: 0.7009
Epoch 8/100
[1m145/145[0m [32

In [13]:
# Evaluate the model using the test data
model_loss, model_accuracy, model_recall = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}, Recall: {model_recall}")


29/29 - 0s - 8ms/step - accuracy: 0.7495 - loss: 1.3470 - recall: 0.2464
Loss: 1.3470481634140015, Accuracy: 0.7494505643844604, Recall: 0.24637681245803833


This neural network model was slightly more accurate than the original NN model, however recall was lower.

## Optimization Attempt 2
For the second optimization attempt, we're dropping the binary features with high correlation to other non-binary features (smokingStatus, prevalentHyp, diabetes)

In [14]:
df_op2 = df.copy()
df_op2.drop(columns=['smokingStatus', 'prevalentHyp', 'diabetes'], inplace=True)

In [15]:
df_op2.head()

Unnamed: 0,sex,age,education,cigsPerDay,BPMeds,prevalentStroke,totCHOL,sysBP,diaBP,BMI,heartRate,glucose,CHDRisk
0,1.0,39,4,0,False,False,195,106.0,70.0,26.97,80,77,False
1,0.0,46,2,0,False,False,250,121.0,81.0,28.73,95,76,False
2,1.0,48,1,20,False,False,245,127.5,80.0,25.34,75,70,False
3,0.0,61,3,30,False,False,225,150.0,95.0,28.58,65,103,True
4,0.0,46,3,23,False,False,285,130.0,84.0,23.1,85,85,False


In [16]:
# Separating target variables and features
y = df_op2['CHDRisk']
X = df_op2.drop(columns='CHDRisk')

In [17]:
# Splitting the data into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [18]:
# Using synthetic minority over-sampling technique to balance the target variable conditions in the training data
smote = SMOTE(random_state=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [19]:
# Scaling the feature variables
scaler = StandardScaler()
X_scaler = scaler.fit(X_train_resampled)

X_train_scaled = X_scaler.transform(X_train_resampled)
X_test_scaled = X_scaler.transform(X_test)

In [20]:
# Defining original model
number_input_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# Input layer
nn.add(
    tf.keras.layers.Dense(units=64, input_dim=number_input_features, activation='relu')
)

# Hidden layer 1
nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Hidden layer 2
nn.add(tf.keras.layers.Dense(units=16, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Checking the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [21]:
from tensorflow.keras import metrics

# Compiling the model
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', metrics.Recall()]
)

In [22]:
# Training model
fit_model = nn.fit(X_train_scaled, y_train_resampled, epochs=100)

Epoch 1/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 922us/step - accuracy: 0.6435 - loss: 0.6301 - recall_1: 0.7432
Epoch 2/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 948us/step - accuracy: 0.7197 - loss: 0.5584 - recall_1: 0.7399
Epoch 3/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 921us/step - accuracy: 0.7348 - loss: 0.5361 - recall_1: 0.7410
Epoch 4/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 904us/step - accuracy: 0.7384 - loss: 0.5277 - recall_1: 0.7419
Epoch 5/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7573 - loss: 0.4896 - recall_1: 0.7623
Epoch 6/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 919us/step - accuracy: 0.7666 - loss: 0.4836 - recall_1: 0.7774
Epoch 7/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7892 - loss: 0.4679 - recall_1: 0.7930
Epoch 8/10

In [23]:
# Evaluate the model using the test data
model_loss, model_accuracy, model_recall = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}, Recall: {model_recall}")


29/29 - 0s - 8ms/step - accuracy: 0.7505 - loss: 1.5013 - recall_1: 0.2754
Loss: 1.501336693763733, Accuracy: 0.7505494356155396, Recall: 0.2753623127937317


This nn model increased in both accuracy and recall from the previous optimization attempt, and is currently the most accurate NN model.

## Optimization attempt 3
In this optimization attempt, we drop the features with the lowest feature importances (as calculated by the RF model): diabetes, BPMeds, and prevalentStroke.

In [24]:
df_op3 = df.copy()
df_op3.drop(columns=['diabetes', 'BPMeds', 'prevalentStroke'], inplace=True)

In [25]:
df_op3.head()

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,prevalentHyp,totCHOL,sysBP,diaBP,BMI,heartRate,glucose,CHDRisk
0,1.0,39,4,0.0,0,False,195,106.0,70.0,26.97,80,77,False
1,0.0,46,2,0.0,0,False,250,121.0,81.0,28.73,95,76,False
2,1.0,48,1,1.0,20,False,245,127.5,80.0,25.34,75,70,False
3,0.0,61,3,1.0,30,True,225,150.0,95.0,28.58,65,103,True
4,0.0,46,3,1.0,23,False,285,130.0,84.0,23.1,85,85,False


In [26]:
# Separating target variables and features
y = df_op3['CHDRisk']
X = df_op3.drop(columns='CHDRisk')

In [27]:
# Splitting the data into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [28]:
# Using synthetic minority over-sampling technique to balance the target variable conditions in the training data
smote = SMOTE(random_state=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [29]:
# Scaling the feature variables
scaler = StandardScaler()
X_scaler = scaler.fit(X_train_resampled)

X_train_scaled = X_scaler.transform(X_train_resampled)
X_test_scaled = X_scaler.transform(X_test)

In [30]:
# Defining original model
number_input_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# Input layer
nn.add(
    tf.keras.layers.Dense(units=64, input_dim=number_input_features, activation='relu')
)

# Hidden layer 1
nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Hidden layer 2
nn.add(tf.keras.layers.Dense(units=16, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Checking the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [31]:
from tensorflow.keras import metrics

# Compiling the model
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', metrics.Recall()]
)

In [32]:
# Training model
op3_model = nn.fit(X_train_scaled, y_train_resampled, epochs=100)

Epoch 1/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.6113 - loss: 0.6663 - recall_2: 0.4529  
Epoch 2/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7150 - loss: 0.5562 - recall_2: 0.7510
Epoch 3/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 849us/step - accuracy: 0.7227 - loss: 0.5397 - recall_2: 0.7258
Epoch 4/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7365 - loss: 0.5206 - recall_2: 0.7575
Epoch 5/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7537 - loss: 0.4985 - recall_2: 0.7821
Epoch 6/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7673 - loss: 0.4850 - recall_2: 0.7855
Epoch 7/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 908us/step - accuracy: 0.7794 - loss: 0.4612 - recall_2: 0.7978
Epoch 8/100
[

In [33]:
# Evaluate the model using the test data
model_loss, model_accuracy, model_recall = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}, Recall: {model_recall}")

29/29 - 0s - 9ms/step - accuracy: 0.7527 - loss: 1.7766 - recall_2: 0.2899
Loss: 1.7766382694244385, Accuracy: 0.7527472376823425, Recall: 0.28985506296157837
