In [26]:
import numpy as np
import tensorflow as tf
import pandas as pd
from keras import layers

In [27]:
all_columns = [
    "Career",  # str
    "Citizenship",  # str
    "Nationality",  # str
    "Year since Matriculation",  # int
    "Year of Study",  # int
    "Primary Programme",  # str
    "Gender",  # str
    "Department",  # str
    "Housing Type",  # str
    "Q1-How many events have you Volunteered in ?",  # int
    "Q2-How many events have you Participated in ?",  # int
    "Q3-How many activities are you Interested in ?",  # int
    "Q4-How many activities are you Passionate about ?",  # int
    "Q5-What are your levels of stress ?",  # int
    "Q6-How Satisfied You are with your Student Life ?",  # int
    "Q7-How much effort do you make to interact with others ?",  # float
    "Q8-About How events are you aware about ?",  # float
    "Q9-What is an ideal student life ?",  # long str
    "response_id",  # int
]

numerical_columns = [
    "Q1-How many events have you Volunteered in ?",  # int
    "Q2-How many events have you Participated in ?",  # int
    "Q3-How many activities are you Interested in ?",  # int
    "Q4-How many activities are you Passionate about ?",  # int
    "Q7-How much effort do you make to interact with others ?",  # float
    "Q8-About How events are you aware about ?",  # float
]

categorical_columns = [
    "Career",  # str
    "Citizenship",  # str
    "Nationality",  # str
    "Primary Programme",  # str
    "Gender",  # str
    "Department",  # str
    "Housing Type",  # str
]

numerical_categorical_columns = [
    "Year since Matriculation",  # int
    "Year of Study",  # int
]

unused_columns = [
    "response_id",  # int
    "Q9-What is an ideal student life ?",  # long str
]

missing_columns = [
    "Q7-How much effort do you make to interact with others ?",  # float
    "Q8-About How events are you aware about ?",  # float
]

In [28]:
# TARGET_COLUMN = "Q5-What are your levels of stress ?"
# TARGET_MAX = 10
TARGET_COLUMN = "Q6-How Satisfied You are with your Student Life ?"  # int
TARGET_MAX = 3

In [29]:

df = pd.read_csv("data/survey_responses.csv")

df["target"] = df[TARGET_COLUMN] / TARGET_MAX

df = df.drop(unused_columns, axis=1)

for col in missing_columns:
    df[col] = df[col].fillna(0)

In [30]:
train, val, test = np.split(df.sample(frac=1), [int(0.8 * len(df)), int(0.9 * len(df))])


def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    df = dataframe.copy()
    labels = df.pop("target")
    df = {key: value[:, tf.newaxis] for key, value in df.items()}
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))

    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds.prefetch(batch_size)

    return ds


BATCH_SIZE = 32

train_ds = df_to_dataset(train, batch_size=BATCH_SIZE)
val_ds = df_to_dataset(val, shuffle=False, batch_size=BATCH_SIZE)
test_ds = df_to_dataset(test, shuffle=False, batch_size=BATCH_SIZE)

  df = {key: value[:, tf.newaxis] for key, value in df.items()}
  df = {key: value[:, tf.newaxis] for key, value in df.items()}
  df = {key: value[:, tf.newaxis] for key, value in df.items()}


In [31]:
def get_numerical_normalization_layer(name, dataset):
    # Create a Normalization layer for the feature.
    normalizer = layers.Normalization(axis=None)

    # Prepare a Dataset that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the statistics of the data.
    normalizer.adapt(feature_ds)

    return normalizer


def category_encoding_layer_normalization(name, dataset, dtype, max_tokens=None):
    # Create a layer that turns strings into integer indices.
    if dtype == "string":
        index = layers.StringLookup(max_tokens=max_tokens)
    # Otherwise, create a layer that turns integer values into integer indices.
    else:
        index = layers.IntegerLookup(max_tokens=max_tokens)

    # Prepare a `tf.data.Dataset` that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Encode the integer indices.
    encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

    # Apply multi-hot encoding to the indices. The lambda function captures the
    # layer, so you can use them, or include them in the Keras Functional model later.
    return lambda feature: encoder(index(feature))

all_inputs = []
encoded_features = []

In [32]:
# Numerical features
for header in numerical_columns:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    normalization_layer = get_numerical_normalization_layer(header, train_ds)
    encoded_numeric_col = normalization_layer(numeric_col)

    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)


# Numerical Categorical features
for header in numerical_categorical_columns:
    categorical_column = layers.Input(shape=(1,), name=header, dtype="int64")
    test_layer = category_encoding_layer_normalization(header, train_ds, "int64")
    encoded_categorical_column = test_layer(categorical_column)

    all_inputs.append(categorical_column)
    encoded_features.append(encoded_categorical_column)


# Categorical features
for header in categorical_columns:
    categorical_column = layers.Input(shape=(1,), name=header, dtype="string")
    test_layer = category_encoding_layer_normalization(header, train_ds, "string")
    encoded_categorical_column = test_layer(categorical_column)

    all_inputs.append(categorical_column)
    encoded_features.append(encoded_categorical_column)

In [33]:
all_features = layers.Concatenate()(encoded_features)

x = layers.Dense(512, activation="relu")(all_features)
x = layers.Dropout(0.2)(x)
x = layers.Dense(256)(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(64, activation="relu")(x)
output = layers.Dense(1)(x)

model = tf.keras.Model(inputs=all_inputs, outputs=output)

loss = tf.keras.losses.MeanSquaredError()

model.compile(
    optimizer="adam",
    loss=loss,
)

In [34]:
EPOCHS = 10

hist = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS).history

Epoch 1/10


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
# creates an evenly filled array of the epochs
hist["epoch"] = np.linspace(1, EPOCHS, EPOCHS)

model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Year since Matriculation (Inpu  [(None, 1)]         0           []                               
 tLayer)                                                                                          
                                                                                                  
 Year of Study (InputLayer)     [(None, 1)]          0           []                               
                                                                                                  
 Career (InputLayer)            [(None, 1)]          0           []                               
                                                                                                  
 Citizenship (InputLayer)       [(None, 1)]          0           []                         

In [36]:
loss = model.evaluate(test_ds)
print("Test Loss: ", loss)

 1/10 [==>...........................] - ETA: 0s - loss: 0.0307

Test Loss:  0.03320734202861786


In [37]:
student_1 = {
    "Career": "GRAD",
    "Citizenship": "Country Citzen",
    "Nationality": "China",
    "Year since Matriculation": 1,
    "Year of Study": 1,
    "Primary Programme": "Bachelor of Arts",
    "Gender": "M",
    "Department": "School of Arts & Social Science",
    "Housing Type": "Out of Campus",
    "Q1-How many events have you Volunteered in ?": 1,
    "Q2-How many events have you Participated in ?": 1,
    "Q3-How many activities are you Interested in ?": 5,
    "Q4-How many activities are you Passionate about ?": 5,
    "Q7-How much effort do you make to interact with others ?": 1.0,
    "Q8-About How events are you aware about ?": 4.0,
}

student_2 = {
    "Career": "UGRD",
    "Citizenship": "Country Citzen",
    "Nationality": "Singapore",
    "Year since Matriculation": 4,
    "Year of Study": 4,
    "Primary Programme": "Bachelor of Computing",
    "Gender": "F",
    "Department": "School of Science",
    "Housing Type": "Halls",
    "Q1-How many events have you Volunteered in ?": 4,
    "Q2-How many events have you Participated in ?": 4,
    "Q3-How many activities are you Interested in ?": 1,
    "Q4-How many activities are you Passionate about ?": 2,
    "Q7-How much effort do you make to interact with others ?": 3.0,
    "Q8-About How events are you aware about ?": 1.0,
}

low_input_dict = {name: tf.convert_to_tensor([value]) for name, value in student_1.items()}
low_sample_percentage = model.predict(low_input_dict)[0][0]

high_input_dict = {name: tf.convert_to_tensor([value]) for name, value in student_2.items()}
high_sample_percentage = model.predict(high_input_dict)[0][0]


print("Student 1 predicted satisfaction level", low_sample_percentage, "%") # 0.15236866 %
print("Student 2 predicted satisfaction level", high_sample_percentage, "%") # 0.70740527 %

predicted satisfaction level 0.15236866 %
predicted satisfaction level 0.70740527 %
