In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import StringLookup
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from numpy import random
import pickle

In [2]:
tf.device("/gpu:0")

<tensorflow.python.eager.context._EagerDeviceContext at 0x24f70263f80>

In [3]:
"""
## Prepare the data
This example uses the
[United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/census+income)
provided by the
[UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php).
The task is binary classification
to predict whether a person is likely to be making over USD 50,000 a year.
The dataset includes 48,842 instances with 14 input features: 5 numerical features and 9 categorical features.
First, let's load the dataset from the UCI Machine Learning Repository into a Pandas
DataFrame:
"""

CSV_HEADER = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

train_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
)
train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)

print(f"Train dataset shape: {train_data.shape}")

# A list of the numerical feature names.
NUMERIC_FEATURE_NAMES = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

CATEGORICAL_FEATURE_NAMES = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "native_country",
]

train_data.drop(columns = NUMERIC_FEATURE_NAMES, inplace=True)
train_data.drop(columns = "income_bracket", inplace=True)

Train dataset shape: (32561, 15)


In [4]:
le=LabelEncoder()
all_data = []
for f in CATEGORICAL_FEATURE_NAMES:
    all_data = all_data + train_data[f].apply(lambda x: f+x).values.tolist()
    
n = len(train_data)
cat_data = le.fit_transform(all_data).reshape(len(CATEGORICAL_FEATURE_NAMES),n).T
cat_count = len(np.unique(cat_data))
print("Total token count", cat_count)
cat_data[:5,]

Total token count 102


array([[100,   9,  22,  68,  88,  86,  17,  64],
       [ 99,   9,  20,  71,  87,  86,  17,  64],
       [ 97,  11,  18,  73,  88,  86,  17,  64],
       [ 97,   1,  20,  73,  87,  84,  17,  64],
       [ 97,   9,  20,  77,  92,  84,  16,  30]], dtype=int64)

In [5]:
masks_pos = random.randint(len(CATEGORICAL_FEATURE_NAMES), size=(n))
masked_data = train_data[CATEGORICAL_FEATURE_NAMES].copy()
mask_label = []
for i in range(n):
    mask_label = mask_label + [cat_data[i, masks_pos[i]]]
    masked_data.iloc[i, masks_pos[i]] = "mask"+str(masks_pos[i])

le2=LabelEncoder()
mask_label = le2.fit_transform(mask_label)    
masked_data["fnlwgt"]=train_data["fnlwgt"]
train_data = masked_data
masked_data["label"] = mask_label
CSV_HEADER = train_data.columns
masked_data.head(10)

"""
Now we store the training and test data in separate CSV files.
"""

train_data_file = "train_data.csv"

train_data.to_csv(train_data_file, index=False, header=True)

In [6]:
masked_data.head(10)

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,gender,native_country,fnlwgt,label
0,State-gov,Bachelors,Never-married,mask3,Not-in-family,White,Male,United-States,77516,64
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,mask3,Husband,White,Male,United-States,83311,67
2,Private,mask1,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,215646,11
3,mask0,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,234721,92
4,Private,Bachelors,mask2,Prof-specialty,Wife,Black,Female,Cuba,338409,20
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,mask6,United-States,284582,16
6,Private,mask1,Married-spouse-absent,Other-service,Not-in-family,Black,Female,Jamaica,160187,6
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,mask4,White,Male,United-States,209642,83
8,Private,Masters,Never-married,Prof-specialty,mask4,White,Female,United-States,45781,84
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,mask7,159449,60


In [7]:
"""
## Define dataset metadata
Here, we define the metadata of the dataset that will be useful for reading and parsing
the data into input features, and encoding the input features with respect to their types.
"""
# A dictionary of the categorical features and their vocabulary.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "workclass": sorted(list(train_data["workclass"].unique())),
    "education": sorted(list(train_data["education"].unique())),
    "marital_status": sorted(list(train_data["marital_status"].unique())),
    "occupation": sorted(list(train_data["occupation"].unique())),
    "relationship": sorted(list(train_data["relationship"].unique())),
    "race": sorted(list(train_data["race"].unique())),
    "gender": sorted(list(train_data["gender"].unique())),
    "native_country": sorted(list(train_data["native_country"].unique())),
}
# Name of the column to be used as instances weight.
WEIGHT_COLUMN_NAME = "fnlwgt"
# A list of the categorical feature names.
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())

In [8]:
# A list of all the input features.
FEATURE_NAMES = CATEGORICAL_FEATURE_NAMES

# The name of the target feature.
TARGET_FEATURE_NAME = "label"
# A list of the labels of the target features.
TARGET_LABELS = masked_data["label"].unique()

# A list of column default values for each feature.
COLUMN_DEFAULTS = [
    [0.0] if feature_name in [WEIGHT_COLUMN_NAME] else ["NA"]
    for feature_name in CSV_HEADER
]

In [9]:
"""
## Configure the hyperparameters
The hyperparameters includes model architecture and training configurations.
"""

LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
DROPOUT_RATE = 0.2
BATCH_SIZE = 265
NUM_EPOCHS = 20

NUM_TRANSFORMER_BLOCKS = 1  # Number of transformer blocks.
NUM_HEADS = 2  # Number of attention heads.
EMBEDDING_DIMS = 8  # Embedding dimensions of the categorical features.
MLP_HIDDEN_UNITS_FACTORS = [
    2,
    1,
]  # MLP hidden layer units, as factors of the number of inputs.
NUM_MLP_BLOCKS = 2  # Number of MLP blocks in the baseline model.

"""
## Implement data reading pipeline
We define an input function that reads and parses the file, then converts features
and labels into a[`tf.data.Dataset`](https://www.tensorflow.org/guide/datasets)
for training or evaluation.
"""

# target_label_lookup = layers.StringLookup(
#     vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
# )


def prepare_example(features, target):
    # target_index = target_label_lookup(target)
    target = int(target)
    weights = features.pop(WEIGHT_COLUMN_NAME)
    return features, target, weights


def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=False):
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        #column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=True,
        na_value="?",
        shuffle=shuffle,
    ).map(prepare_example, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
    return dataset.cache()

In [10]:
"""
## Implement a training and evaluation procedure
"""


def run_experiment(
    model,
    train_data_file,
    num_epochs,
    learning_rate,
    weight_decay,
    batch_size,
):

    optimizer = tfa.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=weight_decay
    )

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
    )

    train_dataset = get_dataset_from_csv(train_data_file, batch_size, shuffle=True)

    print("Start training the model...")
    history = model.fit(
        train_dataset, epochs=num_epochs
    )
    print("Model training finished")

    return history, model

In [11]:
"""
## Create model inputs
Now, define the inputs for the models as a dictionary, where the key is the feature name,
and the value is a `keras.layers.Input` tensor with the corresponding feature shape
and data type.
"""


def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.string
            )
    return inputs


"""
## Encode features
The `encode_inputs` method returns `encoded_categorical_feature_list` and `numerical_feature_list`.
We encode the categorical features as embeddings, using a fixed `embedding_dims` for all the features,
regardless their vocabulary sizes. This is required for the Transformer model.
"""


def encode_inputs(inputs, embedding_dims):

    encoded_categorical_feature_list = []
    numerical_feature_list = []

    for feature_name in inputs:
        # Get the vocabulary of the categorical feature.
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]

        # Create a lookup to convert string values to an integer indices.
        # Since we are not using a mask token nor expecting any out of vocabulary
        # (oov) token, we set mask_token to None and  num_oov_indices to 0.
        lookup = layers.StringLookup(
            vocabulary=vocabulary,
            mask_token=None,
            num_oov_indices=1,
            output_mode="int",
        )

        # Convert the string input values into integer indices.
        encoded_feature = lookup(inputs[feature_name])-1

        # Create an embedding layer with the specified dimensions.
        embedding = layers.Embedding(
                input_dim=len(vocabulary), output_dim=embedding_dims,
                name=feature_name+"_embedding"
            )

        # Convert the index values to embedding representations.
        encoded_categorical_feature = embedding(encoded_feature)
        encoded_categorical_feature_list.append(encoded_categorical_feature)

    return encoded_categorical_feature_list


"""
## Implement an MLP block
"""


def create_mlp(hidden_units, dropout_rate, activation, normalization_layer, name=None):

    mlp_layers = []
    for units in hidden_units:
        mlp_layers.append(normalization_layer),
        mlp_layers.append(layers.Dense(units, activation=activation))
        mlp_layers.append(layers.Dropout(dropout_rate))

    return keras.Sequential(mlp_layers, name=name)

In [12]:
def create_tabtransformer_classifier(
    num_transformer_blocks,
    num_heads,
    embedding_dims,
    mlp_hidden_units_factors,
    dropout_rate,
    use_column_embedding=False,
):

    # Create model inputs.
    inputs = create_model_inputs()
    # encode features.
    encoded_categorical_feature_list = encode_inputs(
        inputs, embedding_dims
    )
    print(encoded_categorical_feature_list)
    # Stack categorical feature embeddings for the Tansformer.
    encoded_categorical_features = tf.stack(encoded_categorical_feature_list, axis=1)

    # Add column embedding to categorical feature embeddings.
    if use_column_embedding:
        num_columns = encoded_categorical_features.shape[1]
        column_embedding = layers.Embedding(
            input_dim=num_columns, output_dim=embedding_dims
        )
        column_indices = tf.range(start=0, limit=num_columns, delta=1)
        encoded_categorical_features = encoded_categorical_features + column_embedding(
            column_indices
        )

    # Create multiple layers of the Transformer block.
    for block_idx in range(num_transformer_blocks):
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embedding_dims,
            dropout=dropout_rate,
            name=f"multihead_attention_{block_idx}",
        )(encoded_categorical_features, encoded_categorical_features)
        # Skip connection 1.
        x = layers.Add(name=f"skip_connection1_{block_idx}")(
            [attention_output, encoded_categorical_features]
        )
        # Layer normalization 1.
        x = layers.LayerNormalization(name=f"layer_norm1_{block_idx}", epsilon=1e-6)(x)
        # Feedforward.
        feedforward_output = create_mlp(
            hidden_units=[embedding_dims],
            dropout_rate=dropout_rate,
            activation=keras.activations.gelu,
            normalization_layer=layers.LayerNormalization(epsilon=1e-6),
            name=f"feedforward_{block_idx}",
        )(x)
        # Skip connection 2.
        x = layers.Add(name=f"skip_connection2_{block_idx}")([feedforward_output, x])
        # Layer normalization 2.
        encoded_categorical_features = layers.LayerNormalization(
            name=f"layer_norm2_{block_idx}", epsilon=1e-6
        )(x)

    # Flatten the "contextualized" embeddings of the categorical features.
    categorical_features = layers.Flatten(name="dyanmic_embedding")(encoded_categorical_features)

    # Compute MLP hidden_units.
    mlp_hidden_units = [
        factor * categorical_features.shape[-1] for factor in mlp_hidden_units_factors
    ]
    # Create final MLP.
    features = create_mlp(
        hidden_units=mlp_hidden_units,
        dropout_rate=dropout_rate,
        activation=keras.activations.selu,
        normalization_layer=layers.BatchNormalization(),
        name="MLP",
    )(categorical_features)

    # Add a sigmoid as a binary classifer.
    outputs = layers.Dense(units=len(TARGET_LABELS), activation="softmax", name="softmax")(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


tabtransformer_model = create_tabtransformer_classifier(
    num_transformer_blocks=NUM_TRANSFORMER_BLOCKS,
    num_heads=NUM_HEADS,
    embedding_dims=EMBEDDING_DIMS,
    mlp_hidden_units_factors=MLP_HIDDEN_UNITS_FACTORS,
    dropout_rate=DROPOUT_RATE,
)

print("Total model weights:", tabtransformer_model.count_params())
keras.utils.plot_model(tabtransformer_model, show_shapes=True, rankdir="LR")

"""
Let's train and evaluate the TabTransformer model:
"""

history, model = run_experiment(
    model=tabtransformer_model,
    train_data_file=train_data_file,
    num_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    batch_size=BATCH_SIZE,
)

[<KerasTensor: shape=(None, 8) dtype=float32 (created by layer 'workclass_embedding')>, <KerasTensor: shape=(None, 8) dtype=float32 (created by layer 'education_embedding')>, <KerasTensor: shape=(None, 8) dtype=float32 (created by layer 'marital_status_embedding')>, <KerasTensor: shape=(None, 8) dtype=float32 (created by layer 'occupation_embedding')>, <KerasTensor: shape=(None, 8) dtype=float32 (created by layer 'relationship_embedding')>, <KerasTensor: shape=(None, 8) dtype=float32 (created by layer 'race_embedding')>, <KerasTensor: shape=(None, 8) dtype=float32 (created by layer 'gender_embedding')>, <KerasTensor: shape=(None, 8) dtype=float32 (created by layer 'native_country_embedding')>]
Total model weights: 24705
Start training the model...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model train

In [13]:
pd.Series(masked_data["label"]).value_counts(normalize=True)

60    0.112558
82    0.107153
92    0.088910
17    0.082430
20    0.056233
        ...   
41    0.000061
65    0.000061
58    0.000061
47    0.000061
96    0.000031
Name: label, Length: 97, dtype: float64

In [14]:
embeddings = {}

for layer in model.layers: 
    if "_embedding" in  layer.get_config()["name"]:
        col_name = layer.get_config()["name"].split("_embedding")[0]
        if col_name not in CATEGORICAL_FEATURES_WITH_VOCABULARY:
            continue
        embeddings[col_name] = {}
        for idx, cat in enumerate(CATEGORICAL_FEATURES_WITH_VOCABULARY[col_name]):
            if "mask" in cat:
                continue
            embeddings[col_name][cat] = layer.get_weights()[0][idx]
            
embeddings

{'workclass': {' ?': array([ 0.0209779 , -0.01017089,  0.05762693, -0.02040695, -0.06742756,
         -0.04062385,  0.08188439,  0.01881742], dtype=float32),
  ' Federal-gov': array([-0.00238482,  0.02157347,  0.03422296,  0.03657641,  0.0712916 ,
         -0.03283327, -0.02137609, -0.05952976], dtype=float32),
  ' Local-gov': array([ 0.0074615 , -0.07071084, -0.00371071,  0.00851991, -0.02765904,
          0.05101799, -0.00939276, -0.04100754], dtype=float32),
  ' Never-worked': array([ 0.03243618,  0.04083101,  0.00466387, -0.01333358, -0.04312501,
         -0.04420219,  0.07556188,  0.02350184], dtype=float32),
  ' Private': array([ 0.07735993, -0.06224407, -0.06876317,  0.0360746 ,  0.03068237,
         -0.02371984, -0.01185691,  0.02789628], dtype=float32),
  ' Self-emp-inc': array([-0.02888541,  0.0220619 , -0.04513967, -0.07610089,  0.03344824,
         -0.02240068, -0.01810209,  0.03336848], dtype=float32),
  ' Self-emp-not-inc': array([ 0.00490372,  0.00060253, -0.10424689, -0

In [15]:
with open('unsupervised_trained_embeddings.dictionary', 'wb') as config_dictionary_file:
    pickle.dump(embeddings, config_dictionary_file)