In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import pandas as pd
from collections import Counter
import findspark
from pyspark.sql import SparkSession

In [3]:
def neuralNetworkModel(dataFrame, tag, weight_question, epochs=80, random_state=31):
    """Train a dense binary classifier on ``dataFrame`` and log its test metrics.

    Every column except ``Obesity_Level`` is used as a feature and
    ``Obesity_Level`` is the target. Because the network ends in a single
    sigmoid unit trained with binary cross-entropy, the target is expected to
    hold two classes encoded as 0 and 1.

    Side effects:
        * prints the Keras model summary, the training log and the test metrics;
        * appends one value to each list of the module-level
          ``nn_outcome_dictionary`` (keys ``'Network_Model'``,
          ``'With Height & Weight?'``, ``'Accuracy'`` and ``'Loss'``), which
          must exist before this function is called.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Pre-processed data containing an ``Obesity_Level`` column.
    tag : str
        Label stored under ``'Network_Model'`` to identify this run.
    weight_question : str
        Value stored under ``'With Height & Weight?'`` for this run.
    epochs : int, optional
        Number of training epochs (default 80, the value previously hard-coded).
    random_state : int, optional
        Seed passed to ``train_test_split`` (default 31, the value previously
        hard-coded).

    Returns
    -------
    None
    """
    # Split the preprocessed data into feature and target arrays
    X = dataFrame.drop(['Obesity_Level'], axis=1).values
    y = dataFrame['Obesity_Level'].values

    # Split the preprocessed data into a training and a testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    # Fit the scaler on the training split only, then apply it to both splits
    # so that no statistics from the test data leak into training
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    ## Compile, Train and Evaluate the Model
    input_features = X_train_scaled.shape[1]

    nn = tf.keras.models.Sequential()

    # First hidden layer
    nn.add(tf.keras.layers.Dense(units=28, activation='relu', input_dim=input_features))

    # Second and third hidden layers
    nn.add(tf.keras.layers.Dense(units=20, activation='relu'))
    nn.add(tf.keras.layers.Dense(units=20, activation='relu'))

    # Output layer: one sigmoid unit for the binary target
    nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

    # Check the structure of the model
    nn.summary()

    # Compile the model
    nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model (the returned History object is not needed here)
    nn.fit(X_train_scaled, y_train, epochs=epochs)

    # Evaluate the model using the test data
    model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

    # Record this run in the shared results accumulator
    nn_outcome_dictionary['Network_Model'].append(tag)
    nn_outcome_dictionary["With Height & Weight?"].append(weight_question)
    nn_outcome_dictionary['Accuracy'].append(model_accuracy)
    nn_outcome_dictionary['Loss'].append(model_loss)

def neuralNetworkModelNoHeightWeight(dataFrame, tag, weight_question):
    """Train and log the same model as ``neuralNetworkModel`` without body-size columns.

    The ``Height``, ``Weight`` and ``BMI`` columns are removed from a copy of
    ``dataFrame`` (the caller's frame is not modified) and the remaining data
    is handed to ``neuralNetworkModel``, which performs the split, scaling,
    training, evaluation and the append to ``nn_outcome_dictionary``.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Pre-processed data containing ``Obesity_Level``, ``Height``,
        ``Weight`` and ``BMI`` columns.
    tag : str
        Label stored under ``'Network_Model'`` to identify this run.
    weight_question : str
        Value stored under ``'With Height & Weight?'`` for this run.

    Returns
    -------
    None
    """
    # Dropping these three columns here and 'Obesity_Level' inside
    # neuralNetworkModel yields the same feature matrix as dropping all four
    # at once, so the duplicated training code is no longer needed.
    features_without_body_size = dataFrame.drop(['Height', 'Weight', 'BMI'], axis=1)
    neuralNetworkModel(features_without_body_size, tag, weight_question)
    

In [4]:
# Initialise findspark before the Spark session is built
findspark.init()

# Create (or reuse) a SparkSession with 2g of driver memory
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Register the CSV stored in the S3 bucket with the Spark context
from pyspark import SparkFiles
url = "https://project-4-weight-data.s3.ap-southeast-2.amazonaws.com/processed_obesity_data_YES_WEIGHT.csv"
spark.sparkContext.addFile(url)

# Read the registered file with Spark, then convert it to pandas.
# The resulting DataFrame is called obesity_df.
# NOTE(review): no schema is passed to the reader, so columns presumably
# arrive as strings — confirm before relying on numeric dtypes.
csv_local_path = SparkFiles.get("processed_obesity_data_YES_WEIGHT.csv")
obesity_spark_df = spark.read.csv(csv_local_path, sep=",", header=True)
obesity_df = obesity_spark_df.toPandas()



In [5]:
# Preview the first rows of the loaded DataFrame
obesity_df.head()

Unnamed: 0,Age,Height,Weight,Overweight_Family_History,High_Caloric_Food_Frequency,Mmain_Meals_per_Day,Smoking_Status,Daily_Water_Intake_L,Monitor_Calorie_Intake,Physical_Activity,...,Alcohol_Frequency_Always,Alcohol_Frequency_Frequently,Alcohol_Frequency_Sometimes,Alcohol_Frequency_no,Mode_of_Transportaion_Automobile,Mode_of_Transportaion_Bike,Mode_of_Transportaion_Motorbike,Mode_of_Transportaion_Public_Transportation,Mode_of_Transportaion_Walking,Obesity_Level
0,21.0,1.62,64.0,1,0,0.0,0,2.0,0,0.0,...,0,0,0,1,0,0,0,1,0,1
1,21.0,1.52,56.0,1,0,0.0,1,3.0,1,3.0,...,0,0,1,0,0,0,0,1,0,1
2,23.0,1.8,77.0,1,0,0.0,0,2.0,0,2.0,...,0,1,0,0,0,0,0,1,0,1
3,27.0,1.8,87.0,0,0,0.0,0,2.0,0,2.0,...,0,1,0,0,0,0,0,0,1,2
4,22.0,1.78,89.8,0,0,0.0,0,2.0,0,0.0,...,0,0,1,0,0,0,0,1,0,2


In [6]:
# Results accumulator: one independent list per reported field. The model
# functions append one value to each list per training run.
nn_outcome_dictionary = {
    field: []
    for field in ("Network_Model", "With Height & Weight?", "Accuracy", "Loss")
}


In [7]:
# Map the string labels '0'..'3' to integers.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the column selection: that chained in-place form
# emits a warning in pandas 2.x and leaves obesity_df unchanged once
# Copy-on-Write is active. astype('int64') guarantees a numeric dtype for the
# later comparisons and for model training, and fails loudly if an unexpected
# non-numeric label is present.
obesity_df['Obesity_Level'] = (
    obesity_df['Obesity_Level']
    .replace({'0': 0,
              '1': 1,
              '2': 2,
              '3': 3})
    .astype('int64')
)

In [8]:
# Work on a copy so obesity_df keeps the original four-level labels
weight_target_sort = obesity_df.copy()

# Collapse the four levels into a binary target: 0 and 1 -> 0, 2 and 3 -> 1.
# The result is assigned back rather than using replace(inplace=True) on the
# column selection, which warns in pandas 2.x and is a no-op under
# Copy-on-Write.
weight_target_sort['Obesity_Level'] = weight_target_sort['Obesity_Level'].replace(
    {0: 0,
     1: 0,
     2: 1,
     3: 1}
)

# Show how many rows fall into each binary class
print(Counter(weight_target_sort['Obesity_Level']))

Counter({1: 1552, 0: 559})


In [9]:
# Levels 0/1 vs. levels 2/3, trained on every feature column; logged with "Yes"
neuralNetworkModel(weight_target_sort, "obese/overweight_or_not", "Yes")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 28)                868       
                                                                 
 dense_1 (Dense)             (None, 20)                580       
                                                                 
 dense_2 (Dense)             (None, 20)                420       
                                                                 
 dense_3 (Dense)             (None, 1)                 21        
                                                                 
Total params: 1,889
Trainable params: 1,889
Non-trainable params: 0
_________________________________________________________________
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epo

In [10]:
# Same target, but with the Height, Weight and BMI columns excluded; logged with "No"
neuralNetworkModelNoHeightWeight(weight_target_sort, "obese/overweight_or_not", "No")

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 28)                784       
                                                                 
 dense_5 (Dense)             (None, 20)                580       
                                                                 
 dense_6 (Dense)             (None, 20)                420       
                                                                 
 dense_7 (Dense)             (None, 1)                 21        
                                                                 
Total params: 1,805
Trainable params: 1,805
Non-trainable params: 0
_________________________________________________________________
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
E

In [11]:
# Keep only the rows whose level is above 1 (levels 2 and 3). The explicit
# .copy() makes obesity_sort an independent frame, so the assignment below
# neither touches obesity_df nor triggers SettingWithCopyWarning.
obesity_sort = obesity_df.loc[obesity_df['Obesity_Level'] > 1].copy()

# Re-encode the two remaining levels as a binary target: 2 -> 0, 3 -> 1.
# The result is assigned back instead of using replace(inplace=True) on the
# column selection, which is a no-op under pandas Copy-on-Write.
obesity_sort['Obesity_Level'] = obesity_sort['Obesity_Level'].replace(
    {2: 0,
     3: 1}
)

# Show how many rows fall into each binary class
print(Counter(obesity_sort['Obesity_Level']))

Counter({1: 972, 0: 580})


In [12]:
# Level 2 vs. level 3 (rows with level > 1 only), trained on every feature column; logged with "Yes"
neuralNetworkModel(obesity_sort, "obese_or_overweight", "Yes")

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 28)                868       
                                                                 
 dense_9 (Dense)             (None, 20)                580       
                                                                 
 dense_10 (Dense)            (None, 20)                420       
                                                                 
 dense_11 (Dense)            (None, 1)                 21        
                                                                 
Total params: 1,889
Trainable params: 1,889
Non-trainable params: 0
_________________________________________________________________
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
E

In [13]:
# Same target, but with the Height, Weight and BMI columns excluded; logged with "No"
neuralNetworkModelNoHeightWeight(obesity_sort, "obese_or_overweight", "No")

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 28)                784       
                                                                 
 dense_13 (Dense)            (None, 20)                580       
                                                                 
 dense_14 (Dense)            (None, 20)                420       
                                                                 
 dense_15 (Dense)            (None, 1)                 21        
                                                                 
Total params: 1,805
Trainable params: 1,805
Non-trainable params: 0
_________________________________________________________________
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
E

In [14]:
# Turn the accumulated results into a table: one row per model run,
# one column per key of nn_outcome_dictionary
outcome_df = pd.DataFrame(data=nn_outcome_dictionary)

# Display the comparison table
outcome_df.head()

Unnamed: 0,Network_Model,With Height & Weight?,Accuracy,Loss
0,obese/overweight_or_not,Yes,0.99053,0.032908
1,obese/overweight_or_not,No,0.88447,0.525794
2,obese_or_overweight,Yes,0.987113,0.043065
3,obese_or_overweight,No,0.837629,0.619294


In [16]:
# Save the comparison table next to the other model evaluations.
# Forward slashes are used because the previous backslash-separated literal
# contained "\n", which Python interprets as a newline character and so
# corrupted the file name; forward slashes are accepted on Windows as well.
outcome_df.to_csv('../model_comparison/nn_evaluation.csv')