In [None]:
# Import necessary libraries for Neural Network
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Import TensorFlow for Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Verify GPU availability
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Read the data file
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Full%20Dataset%20(RAW).csv")

# Reversing some Binary Columns (Changing column names and reversing values for specific columns)
columns_to_reverse = {
    'DiffWalk': 'NoDiffWalk',
    'HighBP': 'NoHighBP',
    'HighChol': 'NoHighChol',
    'HeartDiseaseorAttack': 'NoHeartDiseaseorAttack',
    'Stroke': 'NoStroke',
    'Smoker': 'NoSmoker',
    'HvyAlcoholConsump': 'NoHvyAlcoholConsump'
}

for old_col, new_col in columns_to_reverse.items():
    df[new_col] = 1 - df[old_col]
    df.drop(columns=[old_col], inplace=True)

# Scaling adjustments

# PhysHlth scaling
def phys_ment_hlth_scale(days):
    if 1 <= days <= 6:
        return 1
    elif 7 <= days <= 12:
        return 0.75
    elif 13 <= days <= 18:
        return 0.5
    elif 19 <= days <= 24:
        return 0.25
    elif 25 <= days <= 30:
        return 0
    return days

df['PhysHlth'] = df['PhysHlth'].apply(phys_ment_hlth_scale)

# MentHlth scaling
df['MentHlth'] = df['MentHlth'].apply(phys_ment_hlth_scale)

# Adding the NotObese column
df['NotObese'] = df['BMI'].apply(lambda x: 1 if x < 30 else 0)

# Age scaling
age_scale = {1: 1, 2: 1, 3: 1, 4: 0.75, 5: 0.75, 6: 0.75, 7: 0.5, 8: 0.5, 9: 0.5, 10: 0.25, 11: 0, 12: 0, 13: 0}
df['Age'] = df['Age'].map(age_scale)

# Feature Engineered Columns
df['PhysicalCondition'] = (df['GenHlth'] + df['PhysHlth']) / 2
df['NoDisease'] = (df['NoHighBP'] + df['NoHighChol']) / 2
df['Lifestyle'] = (df['NoSmoker'] + df['Fruits']) / 2

# Splitting the dataset into features and target variable
X = df[['Age', 'MentHlth', 'NotObese', 'NoDocbcCost', 'PhysicalCondition', 'NoDisease', 'PhysHlth', 'Lifestyle', 'GenHlth', 'Income', 'Fruits']]
y = df['Diabetes_binary']

# Split off 20% of the data for validation later on
X_temp, X_val, y_temp, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_temp = scaler.fit_transform(X_temp)
X_val = scaler.transform(X_val)

# Applying SMOTE to the remaining 80% of the data to oversample the minority class significantly
smote = SMOTE(sampling_strategy=1.0, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_temp, y_temp)

# Splitting the resampled dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define the Neural Network model
def build_nn_model(input_dim):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build the model
input_dim = X_train.shape[1]
nn_model = build_nn_model(input_dim)

# Set up early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the Neural Network model
history = nn_model.fit(X_train, y_train, validation_data=(X_val, y_val), 
                       epochs=100, batch_size=32, callbacks=[early_stopping])

# Evaluate the model on the validation set
val_loss, val_accuracy = nn_model.evaluate(X_val, y_val)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

# Predictions and evaluation metrics
y_val_pred = (nn_model.predict(X_val) > 0.5).astype("int32")

print("Classification Report:")
print(classification_report(y_val, y_val_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

In [3]:
# Import necessary libraries for Neural Network
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Import TensorFlow for Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Verify GPU availability
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Read the data file
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Full%20Dataset%20(RAW).csv")

# Reversing some Binary Columns (Changing column names and reversing values for specific columns)
columns_to_reverse = {
    'DiffWalk': 'NoDiffWalk',
    'HighBP': 'NoHighBP',
    'HighChol': 'NoHighChol',
    'HeartDiseaseorAttack': 'NoHeartDiseaseorAttack',
    'Stroke': 'NoStroke',
    'Smoker': 'NoSmoker',
    'HvyAlcoholConsump': 'NoHvyAlcoholConsump'
}

for old_col, new_col in columns_to_reverse.items():
    df[new_col] = 1 - df[old_col]
    df.drop(columns=[old_col], inplace=True)

# Scaling adjustments

# PhysHlth scaling
def phys_ment_hlth_scale(days):
    if 1 <= days <= 6:
        return 1
    elif 7 <= days <= 12:
        return 0.75
    elif 13 <= days <= 18:
        return 0.5
    elif 19 <= days <= 24:
        return 0.25
    elif 25 <= days <= 30:
        return 0
    return days

df['PhysHlth'] = df['PhysHlth'].apply(phys_ment_hlth_scale)

# MentHlth scaling
df['MentHlth'] = df['MentHlth'].apply(phys_ment_hlth_scale)

# Adding the NotObese column
df['NotObese'] = df['BMI'].apply(lambda x: 1 if x < 30 else 0)

# Age scaling
age_scale = {1: 1, 2: 1, 3: 1, 4: 0.75, 5: 0.75, 6: 0.75, 7: 0.5, 8: 0.5, 9: 0.5, 10: 0.25, 11: 0, 12: 0, 13: 0}
df['Age'] = df['Age'].map(age_scale)

# Feature Engineered Columns
df['PhysicalCondition'] = (df['GenHlth'] + df['PhysHlth']) / 2
df['NoDisease'] = (df['NoHighBP'] + df['NoHighChol']) / 2
df['Lifestyle'] = (df['NoSmoker'] + df['Fruits']) / 2

# Splitting the dataset into features and target variable
X = df[['Age', 'MentHlth', 'NotObese', 'NoDocbcCost', 'PhysicalCondition', 'NoDisease', 'PhysHlth', 'Lifestyle', 'GenHlth', 'Income', 'Fruits']]
y = df['Diabetes_binary']

# Split off 20% of the data for validation later on
X_temp, X_val, y_temp, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_temp = scaler.fit_transform(X_temp)
X_val = scaler.transform(X_val)

# Applying SMOTE to the remaining 80% of the data to oversample the minority class significantly
smote = SMOTE(sampling_strategy=1.0, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_temp, y_temp)

# Splitting the resampled dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define the Neural Network model
def build_nn_model(input_dim):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build the model
input_dim = X_train.shape[1]
nn_model = build_nn_model(input_dim)

# Set up early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the Neural Network model
history = nn_model.fit(X_train, y_train, validation_data=(X_val, y_val), 
                       epochs=100, batch_size=32, callbacks=[early_stopping])

# Evaluate the model on the validation set
val_loss, val_accuracy = nn_model.evaluate(X_val, y_val)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

# Predictions and evaluation metrics
y_val_pred = (nn_model.predict(X_val) > 0.5).astype("int32")

print("Classification Report:")
print(classification_report(y_val, y_val_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))


Num GPUs Available:  1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Validation Loss: 0.516944169998169
Validation Accuracy: 0.7269394397735596
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.72      0.82     43739
           1       0.31      0.78      0.44      6997

    accuracy                           0.73     50736
   macro avg       0.63      0.75      0.63     50736
weighted avg       0.86      0.73      0.77     50736

Confusion Matrix:
[[31446 12293]
 [ 1561  5436]]


Same Model vs Raw Data

In [4]:
# Import necessary libraries for Neural Network
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Import TensorFlow for Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Read the data file
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Full%20Dataset%20(RAW).csv")

# Dropping the ID column
df.drop(columns=['ID'], inplace=True)

# Splitting the dataset into features and target variable
X = df.drop(columns=['Diabetes_binary'])
y = df['Diabetes_binary']

# Split off 20% of the data for validation later on
X_temp, X_val, y_temp, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_temp = scaler.fit_transform(X_temp)
X_val = scaler.transform(X_val)

# Applying SMOTE to the remaining 80% of the data to oversample the minority class significantly
smote = SMOTE(sampling_strategy=1.0, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_temp, y_temp)

# Splitting the resampled dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define the Neural Network model
def build_nn_model(input_dim):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build the model
input_dim = X_train.shape[1]
nn_model = build_nn_model(input_dim)

# Set up early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the Neural Network model
history = nn_model.fit(X_train, y_train, validation_data=(X_val, y_val), 
                       epochs=100, batch_size=32, callbacks=[early_stopping])

# Evaluate the model on the validation set
val_loss, val_accuracy = nn_model.evaluate(X_val, y_val)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

# Predictions and evaluation metrics
y_val_pred = (nn_model.predict(X_val) > 0.5).astype("int32")

print("Classification Report:")
print(classification_report(y_val, y_val_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Validation Loss: 0.5030717253684998
Validation Accuracy: 0.7396917343139648
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.73      0.83     43739
           1       0.32      0.77      0.45      6997

    accuracy                           0.74     50736
   macro avg       0.64      0.75      0.64     50736
weighted avg       0.87      0.74      0.78     50736

Confusion Matrix:
[[32120 11619]
 [ 1588  5409]]


Same model vs Raw Data and no SMOTE

In [5]:
# Import necessary libraries for Neural Network
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score

# Import TensorFlow for Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Read the data file
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Full%20Dataset%20(RAW).csv")

# Dropping the ID column
df.drop(columns=['ID'], inplace=True)

# Splitting the dataset into features and target variable
X = df.drop(columns=['Diabetes_binary'])
y = df['Diabetes_binary']

# Split off 20% of the data for validation later on
X_temp, X_val, y_temp, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_temp = scaler.fit_transform(X_temp)
X_val = scaler.transform(X_val)

# Splitting the remaining 80% of the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

# Define the Neural Network model
def build_nn_model(input_dim):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build the model
input_dim = X_train.shape[1]
nn_model = build_nn_model(input_dim)

# Set up early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the Neural Network model
history = nn_model.fit(X_train, y_train, validation_data=(X_val, y_val), 
                       epochs=100, batch_size=32, callbacks=[early_stopping])

# Evaluate the model on the validation set
val_loss, val_accuracy = nn_model.evaluate(X_val, y_val)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

# Predictions and evaluation metrics
y_val_pred = (nn_model.predict(X_val) > 0.5).astype("int32")

print("Classification Report:")
print(classification_report(y_val, y_val_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

# Evaluate the model on the test set
test_loss, test_accuracy = nn_model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Predictions and evaluation metrics for test set
y_test_pred = (nn_model.predict(X_test) > 0.5).astype("int32")

print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

print("Test Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

# ROC-AUC score
y_val_proba = nn_model.predict(X_val)
roc_auc = roc_auc_score(y_val, y_val_proba)
print(f"Validation ROC-AUC Score: {roc_auc}")

y_test_proba = nn_model.predict(X_test)
roc_auc_test = roc_auc_score(y_test, y_test_proba)
print(f"Test ROC-AUC Score: {roc_auc_test}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Validation Loss: 0.3086070716381073
Validation Accuracy: 0.8680621385574341
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.98      0.93     43739
           1       0.58      0.15      0.24      6997

    accuracy                           0.87     50736
   macro avg       0.73      0.57      0.59     50736
weighted avg       0.84      0.87      0.83     50736

Confusion Matrix:
[[42966   773]
 [ 5921  1076]]
Test Loss: 0.31263267993927
Test Accuracy: 0.8671561479568481
Test Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.98      0.93     34929
           1       0.60      0.14      0.23      5660

    accuracy                           0.87     40589
   macro avg       0.74      0.56      0.58 

In [7]:
# Import necessary libraries for Neural Network
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.utils import resample

# Import TensorFlow for Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Read the data file
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Full%20Dataset%20(RAW).csv")

# Dropping the ID column
df.drop(columns=['ID'], inplace=True)

# Splitting the dataset into features and target variable
X = df.drop(columns=['Diabetes_binary'])
y = df['Diabetes_binary']

# Split off 20% of the data for the final test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Combine the remaining data
df_train_val = pd.concat([X_train_val, y_train_val], axis=1)

# Separate majority and minority classes
df_majority = df_train_val[df_train_val.Diabetes_binary == 0]
df_minority = df_train_val[df_train_val.Diabetes_binary == 1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=42)  # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Split the upsampled data into features and target variable
X_upsampled = df_upsampled.drop(columns=['Diabetes_binary'])
y_upsampled = df_upsampled['Diabetes_binary']

# Split the upsampled data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Define the Neural Network model
def build_nn_model(input_dim):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build the model
input_dim = X_train.shape[1]
nn_model = build_nn_model(input_dim)

# Set up early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the Neural Network model
history = nn_model.fit(X_train, y_train, validation_data=(X_val, y_val), 
                       epochs=100, batch_size=32, callbacks=[early_stopping])

# Evaluate the model on the validation set
val_loss, val_accuracy = nn_model.evaluate(X_val, y_val)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

# Predictions and evaluation metrics on validation set
y_val_pred = (nn_model.predict(X_val) > 0.5).astype("int32")

print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))

print("Validation Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

# ROC-AUC score for validation set
y_val_proba = nn_model.predict(X_val)
roc_auc_val = roc_auc_score(y_val, y_val_proba)
print(f"Validation ROC-AUC Score: {roc_auc_val}")

# Evaluate the model on the test set
test_loss, test_accuracy = nn_model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Predictions and evaluation metrics on test set
y_test_pred = (nn_model.predict(X_test) > 0.5).astype("int32")

print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

print("Test Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

# ROC-AUC score for test set
y_test_proba = nn_model.predict(X_test)
roc_auc_test = roc_auc_score(y_test, y_test_proba)
print(f"Test ROC-AUC Score: {roc_auc_test}")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Validation Loss: 0.5011608600616455
Validation Accuracy: 0.753085732460022
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.70      0.74     35002
           1       0.73      0.81      0.76     34836

    accuracy                           0.75     69838
   macro avg       0.76      0.75      0.75     69838
weighted avg       0.76      0.75      0.75     69838

Validation Confusion Matrix:
[[24528 10474]
 [ 6770 28066]]
Validation ROC-AUC Score: 0.830986121938645
Test Loss: 0.5254552960395813
Test Accuracy: 0.7159610390663147
Test Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.70      0.81     43739
           1       0.30      0.81      0.44      6997

    accuracy                           0.72     50736
   macro avg       0.63      0.76      0.6