In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import StringLookup
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
import math
import numpy as np
import pandas as pd
import pickle
from scipy import spatial

In [2]:
tf.device("/gpu:0")

<tensorflow.python.eager.context._EagerDeviceContext at 0x1bd685695c0>

In [3]:
"""
## Prepare the data
This example uses the
[United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/census+income)
provided by the
[UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php).
The task is binary classification
to predict whether a person is likely to be making over USD 50,000 a year.
The dataset includes 48,842 instances with 14 input features: 5 numerical features and 9 categorical features.
First, let's load the dataset from the UCI Machine Learning Repository into a Pandas
DataFrame:
"""

CSV_HEADER = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

train_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
)
train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)

test_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
)
test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)

"""
Remove the first record (because it is not a valid data example) and a trailing 'dot' in the class labels.
"""
n = len(train_data[1:])
train_data = train_data[1:].sample(int(n/12)).reset_index(drop=True).copy()
train_data.income_bracket = train_data.income_bracket.apply(
    lambda value: value.replace(".", "")
)

print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")

"""
Now we store the training and test data in separate CSV files.
"""

train_data_file = "train_data.csv"
test_data_file = "test_data.csv"

train_data.to_csv(train_data_file, index=False, header=False)
test_data.to_csv(test_data_file, index=False, header=False)

"""
## Define dataset metadata
Here, we define the metadata of the dataset that will be useful for reading and parsing
the data into input features, and encoding the input features with respect to their types.
"""

# A list of the numerical feature names.
NUMERIC_FEATURE_NAMES = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]
# A dictionary of the categorical features and their vocabulary.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "workclass": sorted(list(train_data["workclass"].unique())),
    "education": sorted(list(train_data["education"].unique())),
    "marital_status": sorted(list(train_data["marital_status"].unique())),
    "occupation": sorted(list(train_data["occupation"].unique())),
    "relationship": sorted(list(train_data["relationship"].unique())),
    "race": sorted(list(train_data["race"].unique())),
    "gender": sorted(list(train_data["gender"].unique())),
    "native_country": sorted(list(train_data["native_country"].unique())),
}
# Name of the column to be used as instances weight.
WEIGHT_COLUMN_NAME = "fnlwgt"
# A list of the categorical feature names.
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())
# A list of all the input features.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
# A list of column default values for each feature.
COLUMN_DEFAULTS = [
    [0.0] if feature_name in NUMERIC_FEATURE_NAMES + [WEIGHT_COLUMN_NAME] else ["NA"]
    for feature_name in CSV_HEADER
]
# The name of the target feature.
TARGET_FEATURE_NAME = "income_bracket"
# A list of the labels of the target features.
TARGET_LABELS = [" <=50K", " >50K"]

Train dataset shape: (1356, 15)
Test dataset shape: (32561, 15)


In [4]:
with open('unsupervised_trained_embeddings.dictionary', 'rb') as config_dictionary_file:
    unsupervised_embeddings = pickle.load(config_dictionary_file)
 
    # After config_dictionary is read from file
    print(unsupervised_embeddings)

{'workclass': {' ?': array([ 0.0209779 , -0.01017089,  0.05762693, -0.02040695, -0.06742756,
       -0.04062385,  0.08188439,  0.01881742], dtype=float32), ' Federal-gov': array([-0.00238482,  0.02157347,  0.03422296,  0.03657641,  0.0712916 ,
       -0.03283327, -0.02137609, -0.05952976], dtype=float32), ' Local-gov': array([ 0.0074615 , -0.07071084, -0.00371071,  0.00851991, -0.02765904,
        0.05101799, -0.00939276, -0.04100754], dtype=float32), ' Never-worked': array([ 0.03243618,  0.04083101,  0.00466387, -0.01333358, -0.04312501,
       -0.04420219,  0.07556188,  0.02350184], dtype=float32), ' Private': array([ 0.07735993, -0.06224407, -0.06876317,  0.0360746 ,  0.03068237,
       -0.02371984, -0.01185691,  0.02789628], dtype=float32), ' Self-emp-inc': array([-0.02888541,  0.0220619 , -0.04513967, -0.07610089,  0.03344824,
       -0.02240068, -0.01810209,  0.03336848], dtype=float32), ' Self-emp-not-inc': array([ 0.00490372,  0.00060253, -0.10424689, -0.04242617,  0.0332447 ,


In [5]:
"""
## Configure the hyperparameters
The hyperparameters includes model architecture and training configurations.
"""

LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
DROPOUT_RATE = 0.2
BATCH_SIZE = 265
NUM_EPOCHS = 20

NUM_TRANSFORMER_BLOCKS = 1  # Number of transformer blocks.
NUM_HEADS = 2  # Number of attention heads.
EMBEDDING_DIMS = 8  # Embedding dimensions of the categorical features.
MLP_HIDDEN_UNITS_FACTORS = [
    2,
    1,
]  # MLP hidden layer units, as factors of the number of inputs.
NUM_MLP_BLOCKS = 2  # Number of MLP blocks in the baseline model.

"""
## Implement data reading pipeline
We define an input function that reads and parses the file, then converts features
and labels into a[`tf.data.Dataset`](https://www.tensorflow.org/guide/datasets)
for training or evaluation.
"""

target_label_lookup = layers.StringLookup(
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)


def prepare_example(features, target):
    target_index = target_label_lookup(target)
    weights = features.pop(WEIGHT_COLUMN_NAME)
    return features, target_index, weights


def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=False):
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    ).map(prepare_example, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
    return dataset.cache()

  return bool(asarray(a1 == a2).all())


In [6]:
"""
## Implement a training and evaluation procedure
"""


def run_experiment(
    model,
    train_data_file,
    test_data_file,
    num_epochs,
    learning_rate,
    weight_decay,
    batch_size,
):

    optimizer = tfa.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=weight_decay
    )

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.BinaryCrossentropy(),
        metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
    )

    train_dataset = get_dataset_from_csv(train_data_file, batch_size, shuffle=True)
    validation_dataset = get_dataset_from_csv(test_data_file, batch_size)

    print("Start training the model...")
    history = model.fit(
        train_dataset, epochs=num_epochs, validation_data=validation_dataset
    )
    print("Model training finished")

    _, accuracy = model.evaluate(validation_dataset, verbose=0)

    print(f"Validation accuracy: {round(accuracy * 100, 2)}%")

    return history, model

In [7]:
"""
## Create model inputs
Now, define the inputs for the models as a dictionary, where the key is the feature name,
and the value is a `keras.layers.Input` tensor with the corresponding feature shape
and data type.
"""


def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.float32
            )
        else:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.string
            )
    return inputs


"""
## Encode features
The `encode_inputs` method returns `encoded_categorical_feature_list` and `numerical_feature_list`.
We encode the categorical features as embeddings, using a fixed `embedding_dims` for all the features,
regardless their vocabulary sizes. This is required for the Transformer model.
"""


def encode_inputs(inputs, embedding_dims):

    encoded_categorical_feature_list = []
    numerical_feature_list = []

    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:

            # Get the vocabulary of the categorical feature.
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]

            # Create a lookup to convert string values to an integer indices.
            # Since we are not using a mask token nor expecting any out of vocabulary
            # (oov) token, we set mask_token to None and  num_oov_indices to 0.
            lookup = layers.StringLookup(
                vocabulary=vocabulary,
                mask_token=None,
                num_oov_indices=1,
                output_mode="int",
                name = feature_name+"_string_lookup"
            )

            # Convert the string input values into integer indices.
            encoded_feature = lookup(inputs[feature_name])

            # Create an embedding layer with the specified dimensions.
            embedding = layers.Embedding(
                input_dim=len(vocabulary), output_dim=embedding_dims,
                name=feature_name+"_embedding"
            )

            # Convert the index values to embedding representations.
            encoded_categorical_feature = embedding(encoded_feature)
            encoded_categorical_feature_list.append(encoded_categorical_feature)

        else:

            # Use the numerical features as-is.
            numerical_feature = tf.expand_dims(inputs[feature_name], -1)
            numerical_feature_list.append(numerical_feature)

    return encoded_categorical_feature_list, numerical_feature_list


"""
## Implement an MLP block
"""


def create_mlp(hidden_units, dropout_rate, activation, normalization_layer, name=None):

    mlp_layers = []
    for units in hidden_units:
        mlp_layers.append(normalization_layer),
        mlp_layers.append(layers.Dense(units, activation=activation))
        mlp_layers.append(layers.Dropout(dropout_rate))

    return keras.Sequential(mlp_layers, name=name)

In [8]:
def create_tabtransformer_classifier(
    num_transformer_blocks,
    num_heads,
    embedding_dims,
    mlp_hidden_units_factors,
    dropout_rate,
    use_column_embedding=False,
):

    # Create model inputs.
    inputs = create_model_inputs()
    # encode features.
    encoded_categorical_feature_list, numerical_feature_list = encode_inputs(
        inputs, embedding_dims
    )
    # Stack categorical feature embeddings for the Tansformer.
    encoded_categorical_features = tf.stack(encoded_categorical_feature_list, axis=1)
    # Concatenate numerical features.
    numerical_features = layers.concatenate(numerical_feature_list)

    # Add column embedding to categorical feature embeddings.
    if use_column_embedding:
        num_columns = encoded_categorical_features.shape[1]
        column_embedding = layers.Embedding(
            input_dim=num_columns, output_dim=embedding_dims
        )
        column_indices = tf.range(start=0, limit=num_columns, delta=1)
        encoded_categorical_features = encoded_categorical_features + column_embedding(
            column_indices
        )

    # Create multiple layers of the Transformer block.
    for block_idx in range(num_transformer_blocks):
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embedding_dims,
            dropout=dropout_rate,
            name=f"multihead_attention_{block_idx}",
        )(encoded_categorical_features, encoded_categorical_features)
        # Skip connection 1.
        x = layers.Add(name=f"skip_connection1_{block_idx}")(
            [attention_output, encoded_categorical_features]
        )
        # Layer normalization 1.
        x = layers.LayerNormalization(name=f"layer_norm1_{block_idx}", epsilon=1e-6)(x)
        # Feedforward.
        feedforward_output = create_mlp(
            hidden_units=[embedding_dims],
            dropout_rate=dropout_rate,
            activation=keras.activations.gelu,
            normalization_layer=layers.LayerNormalization(epsilon=1e-6),
            name=f"feedforward_{block_idx}",
        )(x)
        # Skip connection 2.
        x = layers.Add(name=f"skip_connection2_{block_idx}")([feedforward_output, x])
        # Layer normalization 2.
        encoded_categorical_features = layers.LayerNormalization(
            name=f"layer_norm2_{block_idx}", epsilon=1e-6
        )(x)

    # Flatten the "contextualized" embeddings of the categorical features.
    categorical_features = layers.Flatten(name="dyanmic_embedding")(encoded_categorical_features)
    # Apply layer normalization to the numerical features.
    numerical_features = layers.LayerNormalization(epsilon=1e-6)(numerical_features)
    # Prepare the input for the final MLP block.
    features = layers.concatenate([categorical_features, numerical_features])

    # Compute MLP hidden_units.
    mlp_hidden_units = [
        factor * features.shape[-1] for factor in mlp_hidden_units_factors
    ]
    # Create final MLP.
    features = create_mlp(
        hidden_units=mlp_hidden_units,
        dropout_rate=dropout_rate,
        activation=keras.activations.selu,
        normalization_layer=layers.BatchNormalization(),
        name="MLP",
    )(features)

    # Add a sigmoid as a binary classifer.
    outputs = layers.Dense(units=1, activation="sigmoid", name="sigmoid")(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


tabtransformer_model = create_tabtransformer_classifier(
    num_transformer_blocks=NUM_TRANSFORMER_BLOCKS,
    num_heads=NUM_HEADS,
    embedding_dims=EMBEDDING_DIMS,
    mlp_hidden_units_factors=MLP_HIDDEN_UNITS_FACTORS,
    dropout_rate=DROPOUT_RATE,
)

In [9]:
untrained_embeddings = {}

for layer in tabtransformer_model.layers: 
    if "_embedding" in  layer.get_config()["name"]:
        col_name = layer.get_config()["name"].split("_embedding")[0]
        if col_name not in CATEGORICAL_FEATURES_WITH_VOCABULARY:
            continue
        untrained_embeddings[col_name] = {}
        for idx, cat in enumerate(CATEGORICAL_FEATURES_WITH_VOCABULARY[col_name]):
            if "mask" in cat:
                continue
            untrained_embeddings[col_name][cat] = layer.get_weights()[0][idx]
            
untrained_embeddings

{'workclass': {' ?': array([-0.04354287,  0.02074732, -0.04730244,  0.03600276, -0.04636401,
          0.04750747,  0.00964289, -0.0054745 ], dtype=float32),
  ' Federal-gov': array([ 0.0055702 ,  0.04772575, -0.00171905,  0.00347515,  0.01405672,
          0.03492272,  0.03634921,  0.04504477], dtype=float32),
  ' Local-gov': array([-0.02846133,  0.04619798,  0.03202354,  0.04549927, -0.04690081,
          0.02081705,  0.0249604 , -0.04179724], dtype=float32),
  ' Never-worked': array([ 0.00805221, -0.0265083 ,  0.0190127 ,  0.01106777,  0.02155116,
         -0.01036626, -0.04524335, -0.00415348], dtype=float32),
  ' Private': array([-0.00123805,  0.01310295, -0.04280411,  0.04442531, -0.00173043,
         -0.04205872, -0.04842116,  0.03834835], dtype=float32),
  ' Self-emp-inc': array([-0.02300087, -0.00731345,  0.0463074 ,  0.03262572,  0.02881159,
         -0.0023679 , -0.04150233, -0.02519195], dtype=float32),
  ' Self-emp-not-inc': array([ 0.01601264, -0.00620093,  0.01010405, -0

In [10]:
print("Total model weights:", tabtransformer_model.count_params())
keras.utils.plot_model(tabtransformer_model, show_shapes=True, rankdir="LR")

"""
Let's train and evaluate the TabTransformer model:
"""

history, model = run_experiment(
    model=tabtransformer_model,
    train_data_file=train_data_file,
    test_data_file=test_data_file,
    num_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    batch_size=BATCH_SIZE,
)

Total model weights: 21039
Start training the model...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20


Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model training finished
Validation accuracy: 83.07%


In [11]:
embeddings = {}

for layer in model.layers: 
    if "_embedding" in  layer.get_config()["name"]:
        col_name = layer.get_config()["name"].split("_embedding")[0]
        if col_name not in CATEGORICAL_FEATURES_WITH_VOCABULARY:
            continue
        embeddings[col_name] = {}
        for idx, cat in enumerate(CATEGORICAL_FEATURES_WITH_VOCABULARY[col_name]):
            if "mask" in cat:
                continue
            embeddings[col_name][cat] = layer.get_weights()[0][idx]
            
embeddings

{'workclass': {' ?': array([-0.04354287,  0.02074732, -0.04730244,  0.03600276, -0.04636401,
          0.04750747,  0.00964289, -0.0054745 ], dtype=float32),
  ' Federal-gov': array([-0.01153058,  0.03211899,  0.01593072, -0.00492078,  0.02463038,
          0.01761208,  0.05955955,  0.04191079], dtype=float32),
  ' Local-gov': array([-0.00433735,  0.06308363,  0.0039512 ,  0.03242684, -0.04304394,
          0.01680886,  0.00039976, -0.06596198], dtype=float32),
  ' Never-worked': array([-0.00775914, -0.03458345, -0.0051456 ,  0.03379905,  0.02265622,
         -0.00402073, -0.03586866, -0.01199169], dtype=float32),
  ' Private': array([ 0.01545795,  0.02377933, -0.03693162,  0.01036145, -0.01041461,
         -0.03039907, -0.06768843,  0.04208777], dtype=float32),
  ' Self-emp-inc': array([-0.02688174, -0.00860837,  0.06264772,  0.01677634,  0.03599747,
         -0.01340182, -0.02871622, -0.02705963], dtype=float32),
  ' Self-emp-not-inc': array([ 0.0412167 ,  0.01876653,  0.00560171, -0

In [12]:
tabtransformer_model = create_tabtransformer_classifier(
    num_transformer_blocks=NUM_TRANSFORMER_BLOCKS,
    num_heads=NUM_HEADS,
    embedding_dims=EMBEDDING_DIMS,
    mlp_hidden_units_factors=MLP_HIDDEN_UNITS_FACTORS,
    dropout_rate=DROPOUT_RATE,
)

for layer in tabtransformer_model.layers: 
    if "_embedding" in  layer.get_config()["name"]:
        col_name = layer.get_config()["name"].split("_embedding")[0]
        if col_name not in CATEGORICAL_FEATURES_WITH_VOCABULARY:
            continue
        layer.set_weights([np.array(list([unsupervised_embeddings[col_name][c] for c in CATEGORICAL_FEATURES_WITH_VOCABULARY[col_name]]))])

"""
Let's train and evaluate the TabTransformer model:
"""

history, model = run_experiment(
    model=tabtransformer_model,
    train_data_file=train_data_file,
    test_data_file=test_data_file,
    num_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    batch_size=BATCH_SIZE,
)

Start training the model...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20


Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model training finished
Validation accuracy: 81.72%


# Embedding Comparison

In [13]:
categories_count = 0
simialrity = 0
for col in CATEGORICAL_FEATURES_WITH_VOCABULARY:
    for categories in CATEGORICAL_FEATURES_WITH_VOCABULARY[col]:
        simialrity += 1-spatial.distance.cosine(untrained_embeddings[col][categories], 
                        embeddings[col][categories])
        categories_count += 1
print(simialrity/categories_count)

0.8724631994001327


In [14]:
categories_count = 0
simialrity = 0
for col in CATEGORICAL_FEATURES_WITH_VOCABULARY:
    for categories in CATEGORICAL_FEATURES_WITH_VOCABULARY[col]:
        simialrity += 1-spatial.distance.cosine(untrained_embeddings[col][categories], 
                        unsupervised_embeddings[col][categories])
        categories_count += 1
print(simialrity/categories_count)

-0.026406694914624895


In [15]:
categories_count = 0
simialrity = 0
for col in CATEGORICAL_FEATURES_WITH_VOCABULARY:
    for categories in CATEGORICAL_FEATURES_WITH_VOCABULARY[col]:
        simialrity += 1-spatial.distance.cosine(embeddings[col][categories], 
                        unsupervised_embeddings[col][categories])
        categories_count += 1
print(simialrity/categories_count)

-0.023265469814300218
