In [4]:
import pandas as pd

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers
from keras import backend as K

import time

In [5]:
HEADERS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

train_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
)
train_data = pd.read_csv(train_data_url, header=None, names=HEADERS)

test_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
)
test_data = pd.read_csv(test_data_url, header=None, names=HEADERS)

print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")

Train dataset shape: (32561, 15)
Test dataset shape: (16282, 15)


In [6]:
train_data.sample(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
26926,33,Private,312881,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Black,Male,0,0,40,United-States,>50K
10266,56,Self-emp-inc,184598,9th,5,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,99,United-States,<=50K
24742,24,State-gov,197731,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,49,United-States,>50K
16560,24,Private,196332,HS-grad,9,Never-married,Other-service,Own-child,Black,Female,0,0,40,United-States,<=50K
29642,45,Self-emp-inc,204196,Bachelors,13,Divorced,Exec-managerial,Unmarried,White,Male,0,0,50,United-States,>50K
25827,40,Local-gov,105862,Bachelors,13,Divorced,Prof-specialty,Unmarried,White,Female,5455,0,40,United-States,<=50K
5870,51,Private,320513,7th-8th,4,Married-spouse-absent,Craft-repair,Not-in-family,Black,Male,0,0,50,Dominican-Republic,<=50K
18818,25,Private,52536,Assoc-acdm,12,Divorced,Tech-support,Own-child,White,Female,0,1594,25,United-States,<=50K
13825,54,Self-emp-not-inc,205066,10th,6,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,36,United-States,<=50K
3164,41,Private,173938,Prof-school,15,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,50,?,>50K


In [7]:
test_data = test_data[1:]
test_data.income_bracket = test_data.income_bracket.apply(
    lambda value: value.replace(".", "")
)

In [8]:
train_data_file = "adult/train_data.csv"
test_data_file = "adult/test_data.csv"

train_data.to_csv(train_data_file, index=False, header=False)
test_data.to_csv(test_data_file, index=False, header=False)

In [9]:
train_data_description = train_data.describe()

In [10]:
def sort_none_last(xs):
    return sorted(xs, key=lambda x: (x is None, x))

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "workclass": sort_none_last(list(train_data["workclass"].unique())),
    "education": sort_none_last(list(train_data["education"].unique())),
    "marital_status": sort_none_last(list(train_data["marital_status"].unique())),
    "occupation": sort_none_last(list(train_data["occupation"].unique())),
    "relationship": sort_none_last(list(train_data["relationship"].unique())),
    "race": sort_none_last(list(train_data["race"].unique())),
    "gender": sort_none_last(list(train_data["gender"].unique())),
    "native_country": sort_none_last(list(train_data["native_country"].unique()))
}

NUMERIC_FEATURE_NAMES = [
    "age",
    "fnlwgt",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
COLUMN_DEFAULTS = [
    train_data_description[feature_name]["mean"] if feature_name in NUMERIC_FEATURE_NAMES else ["NA"]
    for feature_name in HEADERS
]
TARGET_FEATURE_NAME = "income_bracket"
TARGET_LABELS = [" <=50K", " >50K"]

In [11]:
counts = train_data["income_bracket"].value_counts()
class_weight = {
    0: (1 / counts[0]) * (train_data.shape[0] / 2.0),
    1: (1 / counts[1]) * (train_data.shape[0] / 2.0)
}

In [12]:
class_weight

{0: 0.6585962783171521, 1: 2.0763295498023213}

In [13]:
LEARNING_RATE = 0.0006
WEIGHT_DECAY = 0.00002
DROPOUT_RATE = 0.012
BATCH_SIZE = 265
NUM_EPOCHS = 15

NUM_TRANSFORMER_BLOCKS = 4
NUM_HEADS = 4
EMBEDDING_DIMS = 16
MLP_HIDDEN_UNITS_FACTORS = [
    5,
    3,
    1,
]

In [14]:
target_label_lookup = layers.StringLookup(
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)

def create_instance(features, target):
    target_index = target_label_lookup(target)
    return features, target_index


def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=False):
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=HEADERS,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    ).map(create_instance, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
    return dataset.cache()

  return bool(asarray(a1 == a2).all())


In [15]:
def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.float32
            )
        else:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.string
            )
    return inputs

In [16]:
def encode_inputs(inputs, embedding_dims):
    encoded_categorical_feature_list = []
    numerical_feature_list = []

    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            lookup = layers.StringLookup(
                vocabulary=vocabulary,
                mask_token=None,
                num_oov_indices=0,
                output_mode="int",
            )
            encoded_feature = lookup(inputs[feature_name])
            embedding = layers.Embedding(
                input_dim=len(vocabulary), output_dim=embedding_dims
            )
            encoded_categorical_feature = embedding(encoded_feature)
            encoded_categorical_feature_list.append(encoded_categorical_feature)
        else:
            numerical_feature = tf.expand_dims(inputs[feature_name], -1)
            numerical_feature_list.append(numerical_feature)

    return encoded_categorical_feature_list, numerical_feature_list

In [17]:
def create_mlp(hidden_units, dropout_rate, activation, normalization_layer, name=None):

    mlp_layers = []
    for units in hidden_units:
        mlp_layers.append(normalization_layer),
        mlp_layers.append(layers.Dense(units, activation=activation))
        mlp_layers.append(layers.Dropout(dropout_rate))

    return keras.Sequential(mlp_layers, name=name)

In [18]:
def create_tabtransformer_classifier(
    num_transformer_blocks,
    num_heads,
    embedding_dims,
    mlp_hidden_units_factors,
    dropout_rate,
    use_column_embedding=False,
):

    inputs = create_model_inputs()
    encoded_categorical_feature_list, numerical_feature_list = encode_inputs(
        inputs, embedding_dims
    )
    encoded_categorical_features = tf.stack(encoded_categorical_feature_list, axis=1)
    numerical_features = layers.concatenate(numerical_feature_list)

    if use_column_embedding:
        num_columns = encoded_categorical_features.shape[1]
        column_embedding = layers.Embedding(
            input_dim=num_columns, output_dim=embedding_dims
        )
        column_indices = tf.range(start=0, limit=num_columns, delta=1)
        encoded_categorical_features = encoded_categorical_features + column_embedding(
            column_indices
        )

    for block_idx in range(num_transformer_blocks):
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embedding_dims,
            dropout=dropout_rate,
            name=f"multihead_attention_{block_idx}",
        )(encoded_categorical_features, encoded_categorical_features)
        x = layers.Add(name=f"skip_connection1_{block_idx}")(
            [attention_output, encoded_categorical_features]
        )
        x = layers.LayerNormalization(name=f"layer_norm1_{block_idx}", epsilon=1e-6)(x)
        feedforward_output = create_mlp(
            hidden_units=[embedding_dims],
            dropout_rate=dropout_rate,
            activation=keras.activations.gelu,
            normalization_layer=layers.LayerNormalization(epsilon=1e-6),
            name=f"feedforward_{block_idx}",
        )(x)
        x = layers.Add(name=f"skip_connection2_{block_idx}")([feedforward_output, x])
        encoded_categorical_features = layers.LayerNormalization(
            name=f"layer_norm2_{block_idx}", epsilon=1e-6
        )(x)

    categorical_features = layers.Flatten()(encoded_categorical_features)
    numerical_features = layers.LayerNormalization(epsilon=1e-6)(numerical_features)
    features = layers.concatenate([categorical_features, numerical_features])

    mlp_hidden_units = [
        factor * features.shape[-1] for factor in mlp_hidden_units_factors
    ]
    features = create_mlp(
        hidden_units=mlp_hidden_units,
        dropout_rate=dropout_rate,
        activation=keras.activations.selu,
        normalization_layer=layers.BatchNormalization(),
        name="MLP",
    )(features)

    outputs = layers.Dense(units=1, activation="sigmoid", name="sigmoid")(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

In [19]:
def recall_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_metric(y_true, y_pred):
    precision = precision_metric(y_true, y_pred)
    recall = recall_metric(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [20]:
def fit_model(
    model,
    train_data_file,
    test_data_file,
    num_epochs,
    learning_rate,
    weight_decay,
    batch_size,
):
    optimizer = tfa.optimizers.AdamW(
        learning_rate=learning_rate,
        weight_decay=weight_decay
    )

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.BinaryCrossentropy(),
        metrics=[keras.metrics.BinaryAccuracy(name="accuracy"),
                 f1_metric,
                 precision_metric,
                 recall_metric],
    )

    train_dataset = get_dataset_from_csv(
        train_data_file, batch_size, shuffle=True
    )
    validation_dataset = get_dataset_from_csv(
        test_data_file, batch_size
    )

    callback = keras.callbacks.EarlyStopping(
        monitor='loss', patience=3
    )

    history = model.fit(
        train_dataset,
        epochs=num_epochs,
        validation_data=validation_dataset,
        class_weight=class_weight,
        callbacks=[callback]
    )

    _, accuracy, f1, precision, recall = model.evaluate(validation_dataset, verbose=0)

    print(f"Validation accuracy: {round(accuracy * 100, 2)}%")
    print(f"Validation F1: {f1}")

    return history

In [21]:
tabtransformer_model = create_tabtransformer_classifier(
    num_transformer_blocks=NUM_TRANSFORMER_BLOCKS,
    num_heads=NUM_HEADS,
    embedding_dims=EMBEDDING_DIMS,
    mlp_hidden_units_factors=MLP_HIDDEN_UNITS_FACTORS,
    dropout_rate=DROPOUT_RATE,
)

print("Total model weights:", tabtransformer_model.count_params())

Total model weights: 435197


In [22]:
start = time.time()
history = fit_model(
    model=tabtransformer_model,
    train_data_file=train_data_file,
    test_data_file=test_data_file,
    num_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    batch_size=BATCH_SIZE,
)
end = time.time()
print(f"Training time: {end - start}s")

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Validation accuracy: 77.0%
Validation F1: 0.6385642886161804
Training time: 113.62454104423523s
