## Neural Network Model Optimization

In [1]:
# Importing dependencies
import pandas as pd
from pathlib import Path
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf
from tensorflow.keras import metrics
from sqlalchemy import create_engine

In [2]:
# Connecting to PostgreSQL
engine = create_engine('postgresql://postgres:postgres@localhost:5432/proj4_db')

query = "SELECT * FROM cleaned_data"
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHDRisk
0,1.0,39,4,0.0,0,False,False,False,False,195,106.0,70.0,26.97,80,77,False
1,0.0,46,2,0.0,0,False,False,False,False,250,121.0,81.0,28.73,95,76,False
2,1.0,48,1,1.0,20,False,False,False,False,245,127.5,80.0,25.34,75,70,False
3,0.0,61,3,1.0,30,False,False,True,False,225,150.0,95.0,28.58,65,103,True
4,0.0,46,3,1.0,23,False,False,False,False,285,130.0,84.0,23.1,85,85,False


## Optimization Attempt 1
For the first optimization attempt, we drop features with low PCA loadings from our PCA analysis (sex, BPMeds, prevalentStroke, diabetes)

In [3]:
df_op1 = df.copy()
df_op1.drop(columns=['sex', 'BPMeds', 'prevalentStroke', 'diabetes'], inplace=True)

In [4]:
# Separating target variables and features
y = df_op1['CHDRisk']
X = df_op1.drop(columns='CHDRisk')

In [5]:
# Splitting the data into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [6]:
y_train.value_counts()

CHDRisk
False    2312
True      415
Name: count, dtype: int64

In [7]:
# Using synthetic minority over-sampling technique to balance the target variable conditions in the training data
smote = SMOTE(random_state=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [8]:
y_train_resampled.value_counts()

CHDRisk
False    2312
True     2312
Name: count, dtype: int64

In [9]:
# Scaling the feature variables
scaler = StandardScaler()
X_scaler = scaler.fit(X_train_resampled)

X_train_scaled = X_scaler.transform(X_train_resampled)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Defining original model
number_input_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# Input layer
nn.add(
    tf.keras.layers.Dense(units=64, input_dim=number_input_features, activation='relu')
)

# Hidden layer 1
nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Hidden layer 2
nn.add(tf.keras.layers.Dense(units=16, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Checking the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [11]:
# Compiling the model
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', metrics.Recall()]
)

In [12]:
# Training model
fit_model = nn.fit(X_train_scaled, y_train_resampled, epochs=100)

Epoch 1/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 967us/step - accuracy: 0.6441 - loss: 0.6291 - recall: 0.5762 
Epoch 2/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 785us/step - accuracy: 0.7033 - loss: 0.5620 - recall: 0.7269
Epoch 3/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 761us/step - accuracy: 0.7144 - loss: 0.5569 - recall: 0.7338
Epoch 4/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 771us/step - accuracy: 0.7235 - loss: 0.5401 - recall: 0.7575
Epoch 5/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 756us/step - accuracy: 0.7372 - loss: 0.5248 - recall: 0.7728
Epoch 6/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 750us/step - accuracy: 0.7592 - loss: 0.5117 - recall: 0.7910
Epoch 7/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 769us/step - accuracy: 0.7520 - loss: 0.5002 - recall: 0.7773
Epoch 8/100
[1m145

In [13]:
# Evaluate the model using the test data
model_loss, model_accuracy, model_recall = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}, Recall: {model_recall}")


29/29 - 0s - 7ms/step - accuracy: 0.7033 - loss: 1.4461 - recall: 0.2681
Loss: 1.446116328239441, Accuracy: 0.7032967209815979, Recall: 0.26811593770980835


This neural network model was slightly more accurate than the original NN model, however recall was lower.

## Optimization Attempt 2
For the second optimization attempt, we're dropping the binary features with high correlation to other non-binary features (smokingStatus, prevalentHyp, diabetes)

In [14]:
df_op2 = df.copy()
df_op2.drop(columns=['smokingStatus', 'prevalentHyp', 'diabetes'], inplace=True)

In [15]:
df_op2.head()

Unnamed: 0,sex,age,education,cigsPerDay,BPMeds,prevalentStroke,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHDRisk
0,1.0,39,4,0,False,False,195,106.0,70.0,26.97,80,77,False
1,0.0,46,2,0,False,False,250,121.0,81.0,28.73,95,76,False
2,1.0,48,1,20,False,False,245,127.5,80.0,25.34,75,70,False
3,0.0,61,3,30,False,False,225,150.0,95.0,28.58,65,103,True
4,0.0,46,3,23,False,False,285,130.0,84.0,23.1,85,85,False


In [16]:
# Separating target variables and features
y = df_op2['CHDRisk']
X = df_op2.drop(columns='CHDRisk')

In [17]:
# Splitting the data into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [18]:
# Using synthetic minority over-sampling technique to balance the target variable conditions in the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [19]:
# Scaling the feature variables
X_scaler = scaler.fit(X_train_resampled)

X_train_scaled = X_scaler.transform(X_train_resampled)
X_test_scaled = X_scaler.transform(X_test)

In [20]:
# Defining original model
number_input_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# Input layer
nn.add(
    tf.keras.layers.Dense(units=64, input_dim=number_input_features, activation='relu')
)

# Hidden layer 1
nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Hidden layer 2
nn.add(tf.keras.layers.Dense(units=16, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Checking the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [21]:
# Compiling the model
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', metrics.Recall()]
)

In [22]:
# Training model
fit_model = nn.fit(X_train_scaled, y_train_resampled, epochs=100)

Epoch 1/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 920us/step - accuracy: 0.6219 - loss: 0.6364 - recall_1: 0.7848
Epoch 2/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 778us/step - accuracy: 0.7058 - loss: 0.5688 - recall_1: 0.7165
Epoch 3/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 750us/step - accuracy: 0.7222 - loss: 0.5454 - recall_1: 0.7393
Epoch 4/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 750us/step - accuracy: 0.7389 - loss: 0.5254 - recall_1: 0.7631
Epoch 5/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 752us/step - accuracy: 0.7269 - loss: 0.5234 - recall_1: 0.7396
Epoch 6/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 765us/step - accuracy: 0.7647 - loss: 0.4941 - recall_1: 0.7743
Epoch 7/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 756us/step - accuracy: 0.7726 - loss: 0.4755 - recall_1: 0.7823
Epoch 

In [23]:
# Evaluate the model using the test data
model_loss, model_accuracy, model_recall = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}, Recall: {model_recall}")


29/29 - 0s - 8ms/step - accuracy: 0.7319 - loss: 1.6564 - recall_1: 0.2464
Loss: 1.656411051750183, Accuracy: 0.7318681478500366, Recall: 0.24637681245803833


This nn model increased in both accuracy and recall from the previous optimization attempt, and is currently the most accurate NN model.

## Optimization attempt 3
In this optimization attempt, we drop the features with the lowest feature importances (as calculated by the RF model): diabetes, BPMeds, and prevalentStroke.

In [24]:
df_op3 = df.copy()
df_op3.drop(columns=['diabetes', 'BPMeds', 'prevalentStroke'], inplace=True)

In [25]:
df_op3.head()

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,prevalentHyp,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHDRisk
0,1.0,39,4,0.0,0,False,195,106.0,70.0,26.97,80,77,False
1,0.0,46,2,0.0,0,False,250,121.0,81.0,28.73,95,76,False
2,1.0,48,1,1.0,20,False,245,127.5,80.0,25.34,75,70,False
3,0.0,61,3,1.0,30,True,225,150.0,95.0,28.58,65,103,True
4,0.0,46,3,1.0,23,False,285,130.0,84.0,23.1,85,85,False


In [26]:
# Separating target variables and features
y = df_op3['CHDRisk']
X = df_op3.drop(columns='CHDRisk')

In [27]:
# Splitting the data into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [28]:
# Using synthetic minority over-sampling technique to balance the target variable conditions in the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [29]:
# Scaling the feature variables
scaler = StandardScaler()
X_scaler = scaler.fit(X_train_resampled)

X_train_scaled = X_scaler.transform(X_train_resampled)
X_test_scaled = X_scaler.transform(X_test)

In [30]:
# Defining original model
number_input_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# Input layer
nn.add(
    tf.keras.layers.Dense(units=64, input_dim=number_input_features, activation='relu')
)

# Hidden layer 1
nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Hidden layer 2
nn.add(tf.keras.layers.Dense(units=16, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Checking the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [31]:
# Compiling the model
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', metrics.Recall()]
)

In [32]:
# Training model
op3_model = nn.fit(X_train_scaled, y_train_resampled, epochs=100)

Epoch 1/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 871us/step - accuracy: 0.6275 - loss: 0.6367 - recall_2: 0.5674
Epoch 2/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 770us/step - accuracy: 0.7202 - loss: 0.5577 - recall_2: 0.7478
Epoch 3/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 801us/step - accuracy: 0.7304 - loss: 0.5419 - recall_2: 0.7396
Epoch 4/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 743us/step - accuracy: 0.7388 - loss: 0.5275 - recall_2: 0.7548
Epoch 5/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 754us/step - accuracy: 0.7534 - loss: 0.5046 - recall_2: 0.7806
Epoch 6/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 750us/step - accuracy: 0.7667 - loss: 0.4934 - recall_2: 0.7917
Epoch 7/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 758us/step - accuracy: 0.7719 - loss: 0.4826 - recall_2: 0.8025
Epoch 

In [33]:
# Evaluate the model using the test data
model_loss, model_accuracy, model_recall = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}, Recall: {model_recall}")

29/29 - 0s - 8ms/step - accuracy: 0.7319 - loss: 1.5531 - recall_2: 0.2971
Loss: 1.5530589818954468, Accuracy: 0.7318681478500366, Recall: 0.2971014380455017


## Optimization Attempt 4
In this optimization attempt, we use Mean Aerterial Pressure (MAP) in place of systolic and diasystolic pressure (as calculated in our data cleaning)

In [34]:
# Connecting to PostgreSQL
engine = create_engine('postgresql://postgres:postgres@localhost:5432/proj4_db')

query = "SELECT * FROM cleaned_data_map"
df_map = pd.read_sql(query, engine)
df_map.head()

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,BMI,heartRate,glucose,CHDRisk,MAP
0,1.0,39,4,0.0,0,False,False,False,False,195,26.97,80,77,False,82.0
1,0.0,46,2,0.0,0,False,False,False,False,250,28.73,95,76,False,94.333333
2,1.0,48,1,1.0,20,False,False,False,False,245,25.34,75,70,False,95.833333
3,0.0,61,3,1.0,30,False,False,True,False,225,28.58,65,103,True,113.333333
4,0.0,46,3,1.0,23,False,False,False,False,285,23.1,85,85,False,99.333333


In [35]:
# Separating target variables and features
y = df_map['CHDRisk']
X = df_map.drop(columns='CHDRisk')

In [36]:
# Splitting the data into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [37]:
# Using synthetic minority over-sampling technique to balance the target variable conditions in the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [38]:
# Scaling the feature variables
X_scaler = scaler.fit(X_train_resampled)

X_train_scaled = X_scaler.transform(X_train_resampled)
X_test_scaled = X_scaler.transform(X_test)

In [39]:
# Defining original model
number_input_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# Input layer
nn.add(
    tf.keras.layers.Dense(units=64, input_dim=number_input_features, activation='relu')
)

# Hidden layer 1
nn.add(tf.keras.layers.Dense(units=32, activation='relu'))

# Hidden layer 2
nn.add(tf.keras.layers.Dense(units=16, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Checking the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [40]:
# Compiling the model
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', metrics.Recall()]
)

In [41]:
# Training model
fit_model = nn.fit(X_train_scaled, y_train_resampled, epochs=100)

Epoch 1/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 912us/step - accuracy: 0.6356 - loss: 0.6227 - recall_3: 0.5970
Epoch 2/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 675us/step - accuracy: 0.7201 - loss: 0.5534 - recall_3: 0.7468
Epoch 3/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 679us/step - accuracy: 0.7328 - loss: 0.5373 - recall_3: 0.7614
Epoch 4/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 689us/step - accuracy: 0.7423 - loss: 0.5135 - recall_3: 0.7830
Epoch 5/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 669us/step - accuracy: 0.7638 - loss: 0.4947 - recall_3: 0.7842
Epoch 6/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 729us/step - accuracy: 0.7691 - loss: 0.4768 - recall_3: 0.7837
Epoch 7/100
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 688us/step - accuracy: 0.7827 - loss: 0.4605 - recall_3: 0.7846
Epoch 

In [42]:
# Evaluate the model using the test data
model_loss, model_accuracy, model_recall = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}, Recall: {model_recall}")

29/29 - 0s - 7ms/step - accuracy: 0.7275 - loss: 1.5690 - recall_3: 0.2464
Loss: 1.5689667463302612, Accuracy: 0.7274725437164307, Recall: 0.24637681245803833
