## Neural Network Model optimization
This notebook attempts to optimize the NN model using the data with Mean Aerterial Pressure (MAP) instead of systolic and diasystolic blood pressure.

In [1]:
# Importing dependencies
import pandas as pd
from pathlib import Path
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf
from sqlalchemy import create_engine

In [2]:
# Connecting to PostgreSQL
engine = create_engine('postgresql://postgres:postgres@localhost:5432/proj4_db')

query = "SELECT * FROM cleaned_data_map"
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totCHOL,BMI,heartRate,glucose,CHDRisk,MAP
0,1.0,39,4,0.0,0,False,False,False,False,195,26.97,80,77,False,82.0
1,0.0,46,2,0.0,0,False,False,False,False,250,28.73,95,76,False,94.333333
2,1.0,48,1,1.0,20,False,False,False,False,245,25.34,75,70,False,95.833333
3,0.0,61,3,1.0,30,False,False,True,False,225,28.58,65,103,True,113.333333
4,0.0,46,3,1.0,23,False,False,False,False,285,23.1,85,85,False,99.333333


In [3]:
# Separating target variables and features
y = df['CHDRisk']
X = df.drop(columns='CHDRisk')

In [4]:
# Splitting the data into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [5]:
# Using synthetic minority over-sampling technique to balance the target variable conditions in the training data
smote = SMOTE(random_state=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [6]:
# Scaling the feature variables
scaler = StandardScaler()
X_scaler = scaler.fit(X_train_resampled)

X_train_scaled = X_scaler.transform(X_train_resampled)
X_test_scaled = X_scaler.transform(X_test)

## Original NN model (with MAP)

In [7]:
# Defining original model
number_input_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# Input layer
nn.add(
    tf.keras.layers.Dense(units=64, input_dim=number_input_features, activation='relu')
)

# Hidden layer 1
nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Hidden layer 2
nn.add(tf.keras.layers.Dense(units=16, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Checking the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [8]:
from tensorflow.keras import metrics

# Compiling the model
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', metrics.Recall()]
)

In [9]:
# Training model
fit_model = nn.fit(X_train_scaled, y_train_resampled, epochs=100)

Epoch 1/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.6206 - loss: 0.6374 - recall: 0.6556
Epoch 2/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 994us/step - accuracy: 0.7173 - loss: 0.5471 - recall: 0.7143
Epoch 3/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7358 - loss: 0.5213 - recall: 0.7706
Epoch 4/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7375 - loss: 0.5207 - recall: 0.7602
Epoch 5/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7513 - loss: 0.5008 - recall: 0.7769
Epoch 6/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 885us/step - accuracy: 0.7553 - loss: 0.4880 - recall: 0.7632
Epoch 7/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 994us/step - accuracy: 0.7618 - loss: 0.4816 - recall: 0.7750
Epoch 8/100
[1m145/145[0m 

In [10]:
# Evaluate the model using the test data
model_loss, model_accuracy, model_recall = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}, Recall: {model_recall}")


29/29 - 0s - 8ms/step - accuracy: 0.7484 - loss: 1.5945 - recall: 0.2536
Loss: 1.5944875478744507, Accuracy: 0.7483516335487366, Recall: 0.25362318754196167


## Optimization Attempt 1
For the first optimization attempt, we drop features with low PCA loadings from our PCA analysis (sex, BPMeds, prevalentStroke, heartRate)

In [11]:
df_op1 = df.copy()
df_op1.drop(columns=['sex', 'BPMeds', 'prevalentStroke', 'heartRate'], inplace=True)

In [12]:
# Separating target variables and features
y = df_op1['CHDRisk']
X = df_op1.drop(columns='CHDRisk')

In [13]:
# Splitting the data into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [14]:
y_train.value_counts()

CHDRisk
False    2312
True      415
Name: count, dtype: int64

In [15]:
# Using synthetic minority over-sampling technique to balance the target variable conditions in the training data
smote = SMOTE(random_state=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [16]:
y_train_resampled.value_counts()

CHDRisk
False    2312
True     2312
Name: count, dtype: int64

In [17]:
# Scaling the feature variables
scaler = StandardScaler()
X_scaler = scaler.fit(X_train_resampled)

X_train_scaled = X_scaler.transform(X_train_resampled)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# Defining original model
number_input_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# Input layer
nn.add(
    tf.keras.layers.Dense(units=64, input_dim=number_input_features, activation='relu')
)

# Hidden layer 1
nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Hidden layer 2
nn.add(tf.keras.layers.Dense(units=16, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Checking the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [19]:
from tensorflow.keras import metrics

# Compiling the model
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', metrics.Recall()]
)

In [20]:
# Training model
fit_model = nn.fit(X_train_scaled, y_train_resampled, epochs=100)

Epoch 1/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 981us/step - accuracy: 0.6459 - loss: 0.6365 - recall_1: 0.7967
Epoch 2/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 965us/step - accuracy: 0.7109 - loss: 0.5575 - recall_1: 0.7276
Epoch 3/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 884us/step - accuracy: 0.7214 - loss: 0.5442 - recall_1: 0.7693
Epoch 4/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 888us/step - accuracy: 0.7327 - loss: 0.5346 - recall_1: 0.7809
Epoch 5/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 905us/step - accuracy: 0.7355 - loss: 0.5210 - recall_1: 0.7830
Epoch 6/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 909us/step - accuracy: 0.7461 - loss: 0.5161 - recall_1: 0.7757
Epoch 7/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7549 - loss: 0.5080 - recall_1: 0.8086
Epoch 8/

In [21]:
# Evaluate the model using the test data
model_loss, model_accuracy, model_recall = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}, Recall: {model_recall}")


29/29 - 0s - 9ms/step - accuracy: 0.7055 - loss: 1.2852 - recall_1: 0.3188
Loss: 1.2852474451065063, Accuracy: 0.7054945230484009, Recall: 0.3188405930995941


This neural network model was slightly more accurate than the original NN model, however recall was lower.

## Optimization Attempt 2
For the second optimization attempt, we're dropping the binary features with high correlation to other non-binary features (smokingStatus, prevalentHyp, diabetes)

In [22]:
df_op2 = df.copy()
df_op2.drop(columns=['smokingStatus', 'prevalentHyp', 'diabetes'], inplace=True)

In [23]:
df_op2.head()

Unnamed: 0,sex,age,education,cigsPerDay,BPMeds,prevalentStroke,totCHOL,BMI,heartRate,glucose,CHDRisk,MAP
0,1.0,39,4,0,False,False,195,26.97,80,77,False,82.0
1,0.0,46,2,0,False,False,250,28.73,95,76,False,94.333333
2,1.0,48,1,20,False,False,245,25.34,75,70,False,95.833333
3,0.0,61,3,30,False,False,225,28.58,65,103,True,113.333333
4,0.0,46,3,23,False,False,285,23.1,85,85,False,99.333333


In [24]:
# Separating target variables and features
y = df_op2['CHDRisk']
X = df_op2.drop(columns='CHDRisk')

In [25]:
# Splitting the data into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [26]:
# Using synthetic minority over-sampling technique to balance the target variable conditions in the training data
smote = SMOTE(random_state=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [27]:
# Scaling the feature variables
scaler = StandardScaler()
X_scaler = scaler.fit(X_train_resampled)

X_train_scaled = X_scaler.transform(X_train_resampled)
X_test_scaled = X_scaler.transform(X_test)

In [28]:
# Defining original model
number_input_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# Input layer
nn.add(
    tf.keras.layers.Dense(units=64, input_dim=number_input_features, activation='relu')
)

# Hidden layer 1
nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Hidden layer 2
nn.add(tf.keras.layers.Dense(units=16, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Checking the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [29]:
from tensorflow.keras import metrics

# Compiling the model
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', metrics.Recall()]
)

In [30]:
# Training model
fit_model = nn.fit(X_train_scaled, y_train_resampled, epochs=100)

Epoch 1/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 991us/step - accuracy: 0.6384 - loss: 0.6264 - recall_2: 0.6184
Epoch 2/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 983us/step - accuracy: 0.7364 - loss: 0.5484 - recall_2: 0.7597
Epoch 3/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 976us/step - accuracy: 0.7273 - loss: 0.5371 - recall_2: 0.7519
Epoch 4/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 910us/step - accuracy: 0.7537 - loss: 0.5082 - recall_2: 0.7736
Epoch 5/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 950us/step - accuracy: 0.7626 - loss: 0.4953 - recall_2: 0.7782
Epoch 6/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 936us/step - accuracy: 0.7657 - loss: 0.4824 - recall_2: 0.7907
Epoch 7/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 911us/step - accuracy: 0.7754 - loss: 0.4749 - recall_2: 0.7881
Epoch 

In [31]:
# Evaluate the model using the test data
model_loss, model_accuracy, model_recall = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}, Recall: {model_recall}")


29/29 - 0s - 9ms/step - accuracy: 0.7549 - loss: 1.3742 - recall_2: 0.2536
Loss: 1.374248743057251, Accuracy: 0.7549450397491455, Recall: 0.25362318754196167


This nn model increased in both accuracy and recall from the previous optimization attempt, and is currently the most accurate NN model.

## Optimization attempt 3
In this optimization attempt, we drop the features with the lowest feature importances (as calculated by the RF model): diabetes, BPMeds, and prevalentStroke.

In [32]:
df_op3 = df.copy()
df_op3.drop(columns=['diabetes', 'BPMeds', 'prevalentStroke'], inplace=True)

In [33]:
df_op3.head()

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,prevalentHyp,totCHOL,BMI,heartRate,glucose,CHDRisk,MAP
0,1.0,39,4,0.0,0,False,195,26.97,80,77,False,82.0
1,0.0,46,2,0.0,0,False,250,28.73,95,76,False,94.333333
2,1.0,48,1,1.0,20,False,245,25.34,75,70,False,95.833333
3,0.0,61,3,1.0,30,True,225,28.58,65,103,True,113.333333
4,0.0,46,3,1.0,23,False,285,23.1,85,85,False,99.333333


In [34]:
# Separating target variables and features
y = df_op3['CHDRisk']
X = df_op3.drop(columns='CHDRisk')

In [35]:
# Splitting the data into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [36]:
# Using synthetic minority over-sampling technique to balance the target variable conditions in the training data
smote = SMOTE(random_state=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [37]:
# Scaling the feature variables
scaler = StandardScaler()
X_scaler = scaler.fit(X_train_resampled)

X_train_scaled = X_scaler.transform(X_train_resampled)
X_test_scaled = X_scaler.transform(X_test)

In [38]:
# Defining original model
number_input_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# Input layer
nn.add(
    tf.keras.layers.Dense(units=64, input_dim=number_input_features, activation='relu')
)

# Hidden layer 1
nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Hidden layer 2
nn.add(tf.keras.layers.Dense(units=16, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Checking the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [39]:
from tensorflow.keras import metrics

# Compiling the model
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', metrics.Recall()]
)

In [40]:
# Training model
op3_model = nn.fit(X_train_scaled, y_train_resampled, epochs=100)

Epoch 1/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5911 - loss: 0.6563 - recall_3: 0.6921
Epoch 2/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 952us/step - accuracy: 0.7167 - loss: 0.5576 - recall_3: 0.7296
Epoch 3/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 904us/step - accuracy: 0.7319 - loss: 0.5373 - recall_3: 0.7668
Epoch 4/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 909us/step - accuracy: 0.7362 - loss: 0.5191 - recall_3: 0.7682
Epoch 5/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7351 - loss: 0.5110 - recall_3: 0.7336
Epoch 6/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7619 - loss: 0.4950 - recall_3: 0.7722
Epoch 7/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7738 - loss: 0.4729 - recall_3: 0.8192
Epoch 8/100
[

In [41]:
# Evaluate the model using the test data
model_loss, model_accuracy, model_recall = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}, Recall: {model_recall}")

29/29 - 0s - 8ms/step - accuracy: 0.7637 - loss: 1.4508 - recall_3: 0.2029
Loss: 1.4508092403411865, Accuracy: 0.7637362480163574, Recall: 0.2028985470533371
