In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# Locations of the train and test CSV files, relative to the notebook.
TRAIN_CSV = "./data/processed_train.csv"
TEST_CSV = "./data/processed_test.csv"

# Load each split into its own DataFrame.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [3]:
# Hold out 20% of the training rows for validation (fixed seed, so the split is
# reproducible) and remove those rows from the training frame.
# NOTE: `train_df` is rebound here, so re-running this cell without reloading
# the CSVs carves a further 20% out of the already-reduced training set.
val_df = train_df.sample(frac=0.2, random_state=1337)
train_df = train_df.drop(val_df.index)

# Shuffle the test rows. Seeded like the validation split so that a fresh run
# always produces the same row order.
test_df = test_df.sample(frac=1, random_state=1337)

In [4]:
# Row counts of the train / validation / test frames, in that order.
print(*map(len, (train_df, val_df, test_df)))

26049 6512 16281


In [5]:
def dataframe_to_dataset(dataframe, label_column="education-num", num_classes=16):
    """Turn a DataFrame into a shuffled ``tf.data.Dataset`` of (features, one-hot label).

    Args:
        dataframe: Frame holding the feature columns plus ``label_column``.
            It is copied first, so the caller's frame is left untouched.
        label_column: Name of the column that holds the integer class labels.
        num_classes: Number of classes; labels must lie in ``1..num_classes``.

    Returns:
        A dataset whose elements are ``(dict of column name -> value,
        float64 one-hot vector of length num_classes)``, shuffled with a
        buffer as large as the frame.

    Raises:
        ValueError: If any label falls outside ``1..num_classes``.
    """
    dataframe = dataframe.copy()
    labels = dataframe.pop(label_column).to_numpy()

    # Labels are 1-based. Reject out-of-range values explicitly: a 0 would
    # otherwise become index -1 and silently mark the *last* class.
    if labels.size and (labels.min() < 1 or labels.max() > num_classes):
        raise ValueError(
            f"labels in {label_column!r} must be between 1 and {num_classes}, "
            f"got range [{labels.min()}, {labels.max()}]"
        )

    # Vectorised one-hot encoding: row i gets a 1 in column labels[i] - 1.
    labels_onehot = np.zeros((len(labels), num_classes))
    labels_onehot[np.arange(len(labels)), labels - 1] = 1

    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels_onehot))
    ds = ds.shuffle(buffer_size=len(dataframe))
    return ds

# Wrap every split in its own tf.data.Dataset (train, then validation, then test).
train_ds, val_ds, test_ds = (
    dataframe_to_dataset(split_df) for split_df in (train_df, val_df, test_df)
)

In [6]:
# Peek at a single (features, label) element to check the dataset structure.
for features, target in train_ds.take(1):
    print("Input:", features)
    print("Target:", target)


Input: {'native-country': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'marital-status': <tf.Tensor: shape=(), dtype=int64, numpy=2>, 'race': <tf.Tensor: shape=(), dtype=int64, numpy=4>, 'sex': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'relationship': <tf.Tensor: shape=(), dtype=int64, numpy=2>, 'occupation': <tf.Tensor: shape=(), dtype=int64, numpy=7>, 'workclass': <tf.Tensor: shape=(), dtype=int64, numpy=2>}
Target: tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.], shape=(16,), dtype=float64)


In [7]:
# Group every split into mini-batches of 32 elements.
# NOTE: each name is rebound to its batched version, so running this cell a
# second time would batch the already-batched datasets again.
BATCH_SIZE = 32
train_ds, val_ds, test_ds = (
    split_ds.batch(BATCH_SIZE) for split_ds in (train_ds, val_ds, test_ds)
)

In [8]:
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
def encode_integer_categorical_feature(feature, name, dataset):
    """Encode an integer categorical input with a ``CategoryEncoding`` layer.

    Args:
        feature: The model input (or tensor) to encode.
        name: Key of the feature inside the dataset's feature dict.
        dataset: Dataset of ``(features_dict, label)`` elements that the
            encoder is adapted on.

    Returns:
        The result of applying the adapted encoder to ``feature``.
    """
    # Keep only the requested feature and append a trailing axis to it.
    column_ds = dataset.map(
        lambda features, label: tf.expand_dims(features[name], axis=-1)
    )

    # Fit the encoder on the values seen in `dataset`, then apply it.
    category_encoder = CategoryEncoding(output_mode="binary")
    category_encoder.adapt(column_ds)
    return category_encoder(feature)

In [9]:
from tensorflow.keras import backend as K

def recall_m(y_true, y_pred):
    """Recall: true positives divided by actual positives.

    Both counts are summed over every element of the tensors passed in. An
    element counts as a true positive when ``y_true * y_pred``, clipped to
    [0, 1], rounds to 1; it counts as an actual positive when the clipped
    ``y_true`` rounds to 1. ``K.epsilon()`` is added to the denominator so it
    is never zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision: true positives divided by predicted positives.

    Both counts are summed over every element of the tensors passed in. An
    element counts as a true positive when ``y_true * y_pred``, clipped to
    [0, 1], rounds to 1; it counts as a predicted positive when the clipped
    ``y_pred`` rounds to 1. ``K.epsilon()`` is added to the denominator so it
    is never zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator so the result is 0 rather
    than a division error when precision and recall are both 0.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

In [10]:
# Categorical feature columns, in the order they are fed to the model.
feature_names = [
    "workclass",
    "sex",
    "race",
    "marital-status",
    "native-country",
    "relationship",
    "occupation",
]

# One int64 input of shape (1,) per feature, named after its column.
all_inputs = [
    keras.Input(shape=(1,), name=feature_name, dtype="int64")
    for feature_name in feature_names
]
# Keep the individual input names available as well.
(
    workclass,
    sex,
    race,
    marital_status,
    native_country,
    relationship,
    occupation,
) = all_inputs

# Encode each input with an encoder adapted on the training dataset.
encoded_features = [
    encode_integer_categorical_feature(input_tensor, feature_name, train_ds)
    for input_tensor, feature_name in zip(all_inputs, feature_names)
]
(
    workclass_encoded,
    sex_encoded,
    race_encoded,
    marital_status_encoded,
    native_country_encoded,
    relationship_encoded,
    occupation_encoded,
) = encoded_features

all_features = layers.concatenate(encoded_features)

# One hidden layer of 32 ReLU units followed by a 16-way softmax output.
x = layers.Dense(units=32, activation="relu")(all_features)
output = layers.Dense(units=16, activation="softmax")(x)

model = keras.Model(inputs=all_inputs, outputs=output)
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy", f1_m, precision_m, recall_m],
)

In [11]:
# Stop once validation accuracy has not improved for 30 consecutive epochs.
# restore_best_weights=True rolls the model back to its best-scoring epoch when
# training is stopped early; without it the model keeps the weights of the
# final epoch (30 epochs past its validation peak), and those are the weights
# any later evaluate() call would score.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', patience=30, restore_best_weights=True
)

history = model.fit(train_ds, epochs=200, validation_data=val_ds, callbacks=[callback])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200


In [12]:
# Score the current model on the test dataset. evaluate() returns the loss
# followed by the metrics in the order they were given to compile().
test_scores = model.evaluate(test_ds)
loss, accuracy, f1_score, precision, recall = test_scores



In [17]:
# Categorical feature columns, in the order they are fed to the model.
feature_names = [
    "workclass",
    "sex",
    "race",
    "marital-status",
    "native-country",
    "relationship",
    "occupation",
]

# One int64 input of shape (1,) per feature, named after its column.
all_inputs = [
    keras.Input(shape=(1,), name=feature_name, dtype="int64")
    for feature_name in feature_names
]
# Keep the individual input names available as well.
(
    workclass,
    sex,
    race,
    marital_status,
    native_country,
    relationship,
    occupation,
) = all_inputs

# Encode each input with an encoder adapted on the training dataset.
encoded_features = [
    encode_integer_categorical_feature(input_tensor, feature_name, train_ds)
    for input_tensor, feature_name in zip(all_inputs, feature_names)
]
(
    workclass_encoded,
    sex_encoded,
    race_encoded,
    marital_status_encoded,
    native_country_encoded,
    relationship_encoded,
    occupation_encoded,
) = encoded_features

all_features = layers.concatenate(encoded_features)

# Two hidden ReLU layers (16 units, then 32) followed by a 16-way softmax output.
x = layers.Dense(units=16, activation="relu")(all_features)
x = layers.Dense(units=32, activation="relu")(x)
output = layers.Dense(units=16, activation="softmax")(x)

model = keras.Model(inputs=all_inputs, outputs=output)
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy", f1_m, precision_m, recall_m],
)

In [18]:
# Stop once validation accuracy has not improved for 30 consecutive epochs.
# restore_best_weights=True rolls the model back to its best-scoring epoch when
# training is stopped early; without it the model keeps the weights of the
# final epoch (30 epochs past its validation peak), and those are the weights
# any later evaluate() call would score.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', patience=30, restore_best_weights=True
)

history = model.fit(train_ds, epochs=200, validation_data=val_ds, callbacks=[callback])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200


In [19]:
# Score the current model on the training dataset. evaluate() returns the loss
# followed by the metrics in the order they were given to compile().
train_scores = model.evaluate(train_ds)
loss, accuracy, f1_score, precision, recall = train_scores



In [20]:
# Score the current model on the test dataset. evaluate() returns the loss
# followed by the metrics in the order they were given to compile().
test_scores = model.evaluate(test_ds)
loss, accuracy, f1_score, precision, recall = test_scores

