In [21]:
config = {
    'model': {
        'embedding_dims': 16,
        'num_transformer_blocks': 3,
        'num_heads': 4,
        'mlp_hidden_units_factors': [2, 1],
        'dropout_rate': 0.2,
        'use_column_embedding': False
    },
    'training': {
        'learning_rate': 0.001,
        'weight_decay': 0.0001,
        'batch_size': 265,
        'num_epochs': 15
    },
    'data': {
        'train_data_file': "train_data.csv",
        'test_data_file': "test_data.csv",
        'csv_header': [
            "age", "workclass", "fnlwgt", "education", "education_num", 
            "marital_status", "occupation", "relationship", "race", 
            "gender", "capital_gain", "capital_loss", "hours_per_week", 
            "native_country", "income_bracket"
        ],
        'target_labels': [" <=50K", " >50K"],
        'weight_column_name': "fnlwgt",
        'numerical_feature_names': ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week"],
        'categorical_feature_names': ["workclass", "education", "marital_status", "occupation", "relationship", "race", "gender", "native_country"]
    }
}


In [22]:
import keras
from keras import layers
from keras import ops
import pandas as pd
from tensorflow import data as tf_data
from functools import partial

class TabTransformer:
    def __init__(self, config):
        self.config = config
        self._prepare_metadata()

    def _prepare_metadata(self):
        self.csv_header = self.config['data']['csv_header']
        self.target_labels = self.config['data']['target_labels']
        self.weight_column_name = self.config['data']['weight_column_name']
        self.numerical_feature_names = self.config['data']['numerical_feature_names']
        self.categorical_feature_names = self.config['data']['categorical_feature_names']
        self.categorical_features_with_vocabulary = self._get_categorical_features_vocab()
        self.feature_names = self.numerical_feature_names + self.categorical_feature_names

    def _get_categorical_features_vocab(self):
        train_data = pd.read_csv(self.config['data']['train_data_file'], header=None, names=self.csv_header)
        vocab = {}
        for feature_name in self.categorical_feature_names:
            vocab[feature_name] = sorted(train_data[feature_name].unique())
        return vocab

    def create_model_inputs(self):
        inputs = {}
        for feature_name in self.feature_names:
            if feature_name in self.numerical_feature_names:
                inputs[feature_name] = layers.Input(name=feature_name, shape=(), dtype="float32")
            else:
                inputs[feature_name] = layers.Input(name=feature_name, shape=(), dtype="int32")
        return inputs

    def encode_inputs(self, inputs, embedding_dims):
        encoded_categorical_feature_list = []
        numerical_feature_list = []

        for feature_name in inputs:
            if feature_name in self.categorical_feature_names:
                vocabulary = self.categorical_features_with_vocabulary[feature_name]
                embedding = layers.Embedding(input_dim=len(vocabulary), output_dim=embedding_dims)
                encoded_categorical_feature = embedding(inputs[feature_name])
                encoded_categorical_feature_list.append(encoded_categorical_feature)
            else:
                numerical_feature = ops.expand_dims(inputs[feature_name], -1)
                numerical_feature_list.append(numerical_feature)

        return encoded_categorical_feature_list, numerical_feature_list

    def create_mlp(self, hidden_units, dropout_rate, activation, normalization_layer, name=None):
        mlp_layers = []
        for units in hidden_units:
            mlp_layers.append(normalization_layer())
            mlp_layers.append(layers.Dense(units, activation=activation))
            mlp_layers.append(layers.Dropout(dropout_rate))

        return keras.Sequential(mlp_layers, name=name)

    def create_tabtransformer_classifier(self):
        embedding_dims = self.config['model']['embedding_dims']
        num_transformer_blocks = self.config['model']['num_transformer_blocks']
        num_heads = self.config['model']['num_heads']
        mlp_hidden_units_factors = self.config['model']['mlp_hidden_units_factors']
        dropout_rate = self.config['model']['dropout_rate']
        use_column_embedding = self.config['model']['use_column_embedding']

        inputs = self.create_model_inputs()
        encoded_categorical_feature_list, numerical_feature_list = self.encode_inputs(inputs, embedding_dims)
        encoded_categorical_features = ops.stack(encoded_categorical_feature_list, axis=1)
        numerical_features = layers.concatenate(numerical_feature_list)

        if use_column_embedding:
            num_columns = encoded_categorical_features.shape[1]
            column_embedding = layers.Embedding(input_dim=num_columns, output_dim=embedding_dims)
            column_indices = ops.arange(start=0, stop=num_columns, step=1)
            encoded_categorical_features = encoded_categorical_features + column_embedding(column_indices)

        for block_idx in range(num_transformer_blocks):
            attention_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dims, dropout=dropout_rate, name=f"multihead_attention_{block_idx}")(encoded_categorical_features, encoded_categorical_features)
            x = layers.Add(name=f"skip_connection1_{block_idx}")([attention_output, encoded_categorical_features])
            x = layers.LayerNormalization(name=f"layer_norm1_{block_idx}", epsilon=1e-6)(x)
            feedforward_output = self.create_mlp(hidden_units=[embedding_dims], dropout_rate=dropout_rate, activation=keras.activations.gelu, normalization_layer=partial(layers.LayerNormalization, epsilon=1e-6), name=f"feedforward_{block_idx}")(x)
            x = layers.Add(name=f"skip_connection2_{block_idx}")([feedforward_output, x])
            encoded_categorical_features = layers.LayerNormalization(name=f"layer_norm2_{block_idx}", epsilon=1e-6)(x)

        categorical_features = layers.Flatten()(encoded_categorical_features)
        numerical_features = layers.LayerNormalization(epsilon=1e-6)(numerical_features)
        features = layers.concatenate([categorical_features, numerical_features])

        mlp_hidden_units = [factor * features.shape[-1] for factor in mlp_hidden_units_factors]
        features = self.create_mlp(hidden_units=mlp_hidden_units, dropout_rate=dropout_rate, activation=keras.activations.selu, normalization_layer=layers.BatchNormalization, name="MLP")(features)

        outputs = layers.Dense(units=1, activation="sigmoid", name="sigmoid")(features)
        model = keras.Model(inputs=inputs, outputs=outputs)
        return model


In [23]:
import keras
import pandas as pd
from tensorflow import data as tf_data

def prepare_example(features, target, weight_column_name):
    target_label_lookup = layers.StringLookup(vocabulary=config['data']['target_labels'], mask_token=None, num_oov_indices=0)
    target_index = target_label_lookup(target)
    weights = features.pop(weight_column_name)
    return features, target_index, weights

def get_dataset_from_csv(csv_file_path, batch_size, shuffle, config):
    CSV_HEADER = config['data']['csv_header']
    COLUMN_DEFAULTS = [[0.0] if feature_name in config['data']['numerical_feature_names'] + [config['data']['weight_column_name']] else ["NA"] for feature_name in CSV_HEADER]

    dataset = (
        tf_data.experimental.make_csv_dataset(
            csv_file_path,
            batch_size=batch_size,
            column_names=CSV_HEADER,
            column_defaults=COLUMN_DEFAULTS,
            label_name='income_bracket',
            num_epochs=1,
            header=False,
            na_value="?",
            shuffle=shuffle,
        )
        .map(lambda features, target: prepare_example(features, target, config['data']['weight_column_name']), num_parallel_calls=tf_data.AUTOTUNE, deterministic=False)
        .map(lambda features, target, weights: encode_categorical(features, target, weights, config))
    )
    return dataset.cache()

def encode_categorical(batch_x, batch_y, weights, config):
    lookup_dict = {}
    for feature_name in config['data']['categorical_feature_names']:
        vocabulary = config['data']['categorical_features_with_vocabulary'][feature_name]
        lookup = layers.StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)
        lookup_dict[feature_name] = lookup

    for feature_name in config['data']['categorical_feature_names']:
        batch_x[feature_name] = lookup_dict[feature_name](batch_x[feature_name])

    return batch_x, batch_y, weights

def run_experiment(model, train_data_file, test_data_file, num_epochs, learning_rate, weight_decay, batch_size, config):
    optimizer = keras.optimizers.AdamW(learning_rate=learning_rate, weight_decay=weight_decay)

    model.compile(optimizer=optimizer, loss=keras.losses.BinaryCrossentropy(), metrics=[keras.metrics.BinaryAccuracy(name="accuracy")])
    train_dataset = get_dataset_from_csv(train_data_file, batch_size, shuffle=True, config=config)
    validation_dataset = get_dataset_from_csv(test_data_file, batch_size, shuffle=False, config=config)
    
    print("Start training the model...")
    history = model.fit(train_dataset, epochs=num_epochs, validation_data=validation_dataset)
    print("Model training finished")
    
    _, accuracy = model.evaluate(validation_dataset, verbose=0)
    print(f"Validation accuracy: {round(accuracy * 100, 2)}%")
    
    return history

# Instantiate and run the TabTransformer
tab_transformer = TabTransformer(config)
model = tab_transformer.create_tabtransformer_classifier()

history = run_experiment(
    model=model,
    train_data_file=config['data']['train_data_file'],
    test_data_file=config['data']['test_data_file'],
    num_epochs=config['training']['num_epochs'],
    learning_rate=config['training']['learning_rate'],
    weight_decay=config['training']['weight_decay'],
    batch_size=config['training']['batch_size'],
    config=config
)


KeyError: in user code:

    File "/tmp/ipykernel_18218/3085553066.py", line 28, in None  *
        lambda features, target, weights: encode_categorical(features, target, weights, config)
    File "/tmp/ipykernel_18218/3085553066.py", line 35, in encode_categorical  *
        vocabulary = config['data']['categorical_features_with_vocabulary'][feature_name]

    KeyError: 'categorical_features_with_vocabulary'


In [24]:
# tab_transformer = TabTransformer(config)
# model = tab_transformer.create_tabtransformer_classifier()

# run_experiment(
#     model=model,
#     train_data_file=config['data']['train_data_file'],
#     test_data_file=config['data']['test_data_file'],
#     num_epochs=config['training']['num_epochs'],
#     learning_rate=config['training']['learning_rate'],
#     weight_decay=config['training']['weight_decay'],
#     batch_size=config['training']['batch_size'],
#     config=config
# )


In [25]:
!head data/train_data.csv

59,Self-emp-inc,12,9th,Divorced,Exec-managerial,Other-relative,Black,Male,5985,869,86,France, <=50K
25,Local-gov,9,9th,Divorced,Machine-op-inspct,Wife,Black,Female,9447,625,5,Cambodia, <=50K
38,Private,13,1st-4th,Widowed,Exec-managerial,Unmarried,Other,Female,1550,2555,65,Portugal, <=50K
48,Federal-gov,7,11th,Never-married,Machine-op-inspct,Other-relative,Other,Male,7001,1240,92,Honduras, >50K
53,Local-gov,11,9th,Married-spouse-absent,Transport-moving,Own-child,Black,Female,7420,3793,62,India, <=50K
18,Local-gov,8,Prof-school,Widowed,Armed-Forces,Unmarried,Other,Female,4868,2581,87,Greece, >50K
43,Private,13,Prof-school,Never-married,Machine-op-inspct,Wife,Amer-Indian-Eskimo,Male,4911,3946,36,Thailand, >50K
59,Federal-gov,1,Preschool,Married-civ-spouse,Adm-clerical,Unmarried,White,Female,6431,2795,29,Nicaragua, <=50K
25,Federal-gov,7,Assoc-voc,Separated,Protective-serv,Other-relative,Other,Male,3916,111,88,France, >50K
41,Federal-gov,2,Masters,Separated,Sales,Unmarried,Other,Female,130