In [None]:
import math
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing
import tensorflow_addons as tfa

from tensorflow.keras.callbacks import EarlyStopping

from tabtransformertf.models.fttransformer import FTTransformerEncoder, FTTransformer
from tabtransformertf.utils.preprocessing import df_to_dataset

import catboost as cb
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns

In [2]:
%matplotlib inline
plt.rcParams["figure.figsize"] = (20,10)
plt.rcParams.update({'font.size': 15})

In [53]:
import random

# Seed every RNG the notebook relies on. Seeding only Python's `random` leaves
# NumPy's global generator (which scikit-learn falls back to when no random_state
# is given) and TensorFlow's generator unseeded, so results would still vary run to run.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

## Download Data

In [36]:
# Load the California housing dataset through scikit-learn's fetcher; the following
# cell reads its 'data', 'target', 'target_names' and 'feature_names' entries.
dset = fetch_california_housing()

In [37]:
# Name of the regression target column: the first entry of the dataset's target names.
LABEL = dset['target_names'][0]

# Columns fed to the models as numerical features.
NUMERIC_FEATURES = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Longitude',
    'Latitude',
]

# Assemble a single frame: the feature matrix under its published column names,
# with the target vector appended as the LABEL column.
y = dset['target']
data = pd.DataFrame(dset['data'], columns=dset['feature_names'])
data[LABEL] = y

In [38]:
# Hold out 20% of the rows as the final test set. The fixed random_state makes the
# split reproducible across kernel restarts; random.seed() alone does not affect
# scikit-learn's shuffling.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Report (rows, columns) of both splits, one per line.
print(
    f"Train dataset shape: {train_data.shape}",
    f"Test dataset shape: {test_data.shape}",
    sep="\n",
)


## Data Processing

In [40]:
# Carve a validation split (20% of the training rows) for early stopping.
# NOTE: despite the X_ prefix, both frames still contain the LABEL column.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_val = train_test_split(train_data, test_size=0.2, random_state=42)

In [41]:
# Work on explicit copies: the frames produced by train_test_split are derived from
# other frames, and assigning into such frames in place can trigger pandas'
# SettingWithCopyWarning.
X_train, X_val, test_data = X_train.copy(), X_val.copy(), test_data.copy()

# Fit the scaler on the training rows only, then reuse its statistics for the
# validation and test rows so nothing from those splits leaks into the scaling.
# NOTE: the frames are rescaled in place under the same names, so re-running this
# cell without re-running the split cells scales the data a second time.
sc = StandardScaler()
X_train.loc[:, NUMERIC_FEATURES] = sc.fit_transform(X_train[NUMERIC_FEATURES])
X_val.loc[:, NUMERIC_FEATURES] = sc.transform(X_val[NUMERIC_FEATURES])
test_data.loc[:, NUMERIC_FEATURES] = sc.transform(test_data[NUMERIC_FEATURES])

In [42]:
# Full model input list. This is an alias of NUMERIC_FEATURES (the same list object),
# not a copy, so mutating one mutates the other.
FEATURES = NUMERIC_FEATURES

In [None]:
# Overlay the target distribution of each split to check that the splits look alike.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is seaborn's
# documented replacement and draws the same histogram-plus-KDE view.
fig, ax = plt.subplots()
for idx, (split_name, split_frame) in enumerate(
    [("Train", X_train), ("Validation", X_val), ("Test", test_data)]
):
    sns.histplot(
        split_frame[LABEL],
        kde=True,
        stat="density",
        kde_kws=dict(cut=3),
        alpha=0.4,
        color=f"C{idx}",  # one colour per split so the overlays can be told apart
        label=split_name,
        ax=ax,
    )
ax.set(title="Target distribution per split", xlabel=LABEL, ylabel="Density")
ax.legend()
plt.show()

# Baselines

In [None]:
# Random-forest baseline. random_state pins the bootstrap sampling and feature
# sub-sampling so the baseline RMSE is reproducible between runs.
rf = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42)
rf.fit(X_train[FEATURES], X_train[LABEL])

In [None]:
rf_preds = rf.predict(test_data[FEATURES])
# RMSE = sqrt(MSE). The `squared=False` flag of mean_squared_error is deprecated in
# recent scikit-learn releases and later removed, so take the root explicitly;
# this works on every version.
rf_rms = np.sqrt(mean_squared_error(test_data[LABEL], rf_preds))
print(rf_rms)

## CatBoost

In [46]:
# Wrap each split in a CatBoost Pool: feature columns as data, LABEL column as label.
catb_train_dataset = cb.Pool(data=X_train[FEATURES], label=X_train[LABEL])
catb_val_dataset = cb.Pool(data=X_val[FEATURES], label=X_val[LABEL])
catb_test_dataset = cb.Pool(data=test_data[FEATURES], label=test_data[LABEL])

In [None]:
# CatBoost baseline with library-default hyper-parameters; training is cut short
# once the validation pool has not improved for 50 rounds.
tuned_catb = cb.CatBoostRegressor()
tuned_catb.fit(
    catb_train_dataset,
    eval_set=catb_val_dataset,
    early_stopping_rounds=50,
)

In [89]:
# Predict on the held-out test pool with the fitted CatBoost baseline.
catb_preds = tuned_catb.predict(catb_test_dataset)

In [90]:
# RMSE = sqrt(MSE). `squared=False` is deprecated in recent scikit-learn releases
# and later removed, so the root is taken explicitly.
catb_rms = np.sqrt(mean_squared_error(test_data[LABEL], catb_preds))

## Modelling Prep

In [None]:
# To TF Dataset
train_dataset = df_to_dataset(X_train[FEATURES + [LABEL]], LABEL, shuffle=True)
val_dataset = df_to_dataset(X_val[FEATURES + [LABEL]], LABEL, shuffle=False)  # No shuffle
test_dataset = df_to_dataset(test_data[FEATURES + [LABEL]], shuffle=False) # No target, no shuffle

# FTTransformer

## FT Transformer - Linear Numerical Encoding

In [None]:
# Training hyper-parameters for this variant.
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
NUM_EPOCHS = 1000

# Encoder with the 'linear' numerical embedding; the dataset has no categorical columns.
ft_linear_encoder = FTTransformerEncoder(
    numerical_features=NUMERIC_FEATURES,
    numerical_data=X_train[NUMERIC_FEATURES].values,
    numerical_embedding_type='linear',
    categorical_features=[],
    categorical_data=None,  # No categorical data
    y=None,
    embedding_dim=64,
    depth=3,
    heads=6,
    attn_dropout=0.3,
    ff_dropout=0.3,
    explainable=True,
)

# Pass the encoder to the model (single regression output).
ft_linear_transformer = FTTransformer(
    encoder=ft_linear_encoder,
    out_dim=1,
    out_activation="relu",
)

optimizer = tfa.optimizers.AdamW(
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# Only the "output" head is trained and scored; "importances" gets no loss or metric.
ft_linear_transformer.compile(
    optimizer=optimizer,
    loss={
        "output": tf.keras.losses.MeanSquaredError(name='mse'),
        "importances": None,
    },
    metrics={
        "output": [tf.keras.metrics.RootMeanSquaredError(name='rmse')],
        "importances": None,
    },
)

# Stop after 16 epochs without validation improvement and roll back to the best weights.
early = EarlyStopping(
    monitor="val_output_loss",
    mode="min",
    patience=16,
    restore_best_weights=True,
)
callback_list = [early]

ft_linear_history = ft_linear_transformer.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=NUM_EPOCHS,
    callbacks=callback_list,
)

## FT Transformer - Periodic Numerical Encoding

In [None]:
# Training hyper-parameters for this variant.
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
NUM_EPOCHS = 1000

# Encoder with the 'periodic' numerical embedding (128 bins); no categorical columns.
ft_periodic_encoder = FTTransformerEncoder(
    numerical_features=NUMERIC_FEATURES,
    numerical_data=X_train[NUMERIC_FEATURES].values,
    numerical_embedding_type='periodic',
    numerical_bins=128,
    categorical_features=[],
    categorical_data=None,
    y=None,
    embedding_dim=64,
    depth=3,
    heads=6,
    attn_dropout=0.3,
    ff_dropout=0.3,
    explainable=True,
)

# Pass the encoder to the model (single regression output).
ft_periodic_transformer = FTTransformer(
    encoder=ft_periodic_encoder,
    out_dim=1,
    out_activation="relu",
)

optimizer = tfa.optimizers.AdamW(
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# Only the "output" head is trained and scored; "importances" gets no loss or metric.
ft_periodic_transformer.compile(
    optimizer=optimizer,
    loss={
        "output": tf.keras.losses.MeanSquaredError(name='mse'),
        "importances": None,
    },
    metrics={
        "output": [tf.keras.metrics.RootMeanSquaredError(name='rmse')],
        "importances": None,
    },
)

# Stop after 16 epochs without validation improvement and roll back to the best weights.
early = EarlyStopping(
    monitor="val_output_loss",
    mode="min",
    patience=16,
    restore_best_weights=True,
)
callback_list = [early]

ft_periodic_history = ft_periodic_transformer.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=NUM_EPOCHS,
    callbacks=callback_list,
)

## FT Transformer - PLE Quantile

In [None]:
# Encoder with the 'ple' numerical embedding. No target (y=None) is supplied here,
# unlike the "PLE Target" variant, which passes y to the encoder.
ft_pleq_encoder = FTTransformerEncoder(
    numerical_features = NUMERIC_FEATURES,
    categorical_features = [],
    numerical_data = X_train[NUMERIC_FEATURES].values,
    categorical_data = None,
    y = None,
    numerical_embedding_type='ple',
    numerical_bins=128,
    embedding_dim=64,
    depth=3,
    heads=6,
    attn_dropout=0.3,
    ff_dropout=0.3,
    explainable=True
)

# Pass the encoder to the model
ft_pleq_transformer = FTTransformer(
    encoder=ft_pleq_encoder,
    out_dim=1,
    out_activation="relu",
)

LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
NUM_EPOCHS = 1000

optimizer = tfa.optimizers.AdamW(
        learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY
    )

# Only the "output" head is trained and scored; "importances" gets no loss or metric.
ft_pleq_transformer.compile(
    optimizer = optimizer,
    loss = {"output": tf.keras.losses.MeanSquaredError(name='mse'), "importances": None},
    metrics= {"output": [tf.keras.metrics.RootMeanSquaredError(name='rmse')], "importances": None},
)

# Monitor the same quantity as the other three FT-Transformer variants
# ("val_output_loss") so early stopping is applied consistently across the comparison.
early = EarlyStopping(monitor="val_output_loss", mode="min", patience=20, restore_best_weights=True)
callback_list = [early]

ft_pleq_history = ft_pleq_transformer.fit(
    train_dataset, 
    epochs=NUM_EPOCHS, 
    validation_data=val_dataset,
    callbacks=callback_list
)

## FT Transformer - PLE Target

In [None]:
# Training hyper-parameters for this variant.
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
NUM_EPOCHS = 1000

# Encoder with the 'ple' numerical embedding; this variant also hands the training
# target and a regression task flag to the encoder, plus tree parameters for the binning.
ft_plet_encoder = FTTransformerEncoder(
    numerical_features=NUMERIC_FEATURES,
    numerical_data=X_train[NUMERIC_FEATURES].values,
    numerical_embedding_type='ple',
    numerical_bins=128,
    categorical_features=[],
    categorical_data=None,
    y=X_train[LABEL].values,
    task='regression',
    ple_tree_params={"min_samples_leaf": 20},
    embedding_dim=64,
    depth=3,
    heads=6,
    attn_dropout=0.3,
    ff_dropout=0.3,
    explainable=True,
)

# Pass the encoder to the model.
# NOTE(review): the other three variants use out_activation="relu"; confirm that
# the missing activation here is intentional, since it affects the comparison.
ft_plet_transformer = FTTransformer(
    encoder=ft_plet_encoder,
    out_dim=1,
    out_activation=None,
)

optimizer = tfa.optimizers.AdamW(
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# Only the "output" head is trained and scored; "importances" gets no loss or metric.
ft_plet_transformer.compile(
    optimizer=optimizer,
    loss={
        "output": tf.keras.losses.MeanSquaredError(name='mse'),
        "importances": None,
    },
    metrics={
        "output": [tf.keras.metrics.RootMeanSquaredError(name='rmse')],
        "importances": None,
    },
)

# Stop after 20 epochs without validation improvement and roll back to the best weights.
early = EarlyStopping(
    monitor="val_output_loss",
    mode="min",
    patience=20,
    restore_best_weights=True,
)
callback_list = [early]

ft_plet_history = ft_plet_transformer.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=NUM_EPOCHS,
    callbacks=callback_list,
)

## Compare

In [None]:
# One validation-loss curve per encoding: (training history, max epochs drawn, legend label).
val_loss_curves = [
    (ft_linear_history, 72, 'Linear Val Loss'),
    (ft_periodic_history, 100, 'Periodic Val Loss'),
    (ft_pleq_history, 100, 'PLE Quantile Val Loss'),
    (ft_plet_history, 100, 'PLE Target Val Loss'),
]
for run_history, max_epochs, curve_label in val_loss_curves:
    plt.plot(run_history.history['val_loss'][:max_epochs], label=curve_label)

plt.title('Model Validation Loss')
plt.legend()
plt.show()

In [92]:
# Score every FT-Transformer variant on the held-out test set.
# Each predict() result is indexed by "output" to get the regression head; it is
# flattened to 1-D before being compared with the test labels.
# RMSE is computed as sqrt(MSE): `squared=False` is deprecated in recent scikit-learn
# releases and later removed. A stray "." after .ravel() in the periodic line
# (a syntax error) has also been removed.
linear_test_preds = ft_linear_transformer.predict(test_dataset)
linear_rms = np.sqrt(mean_squared_error(test_data[LABEL], linear_test_preds['output'].ravel()))

periodic_test_preds = ft_periodic_transformer.predict(test_dataset)
periodic_rms = np.sqrt(mean_squared_error(test_data[LABEL], periodic_test_preds['output'].ravel()))

pleq_test_preds = ft_pleq_transformer.predict(test_dataset)
pleq_rms = np.sqrt(mean_squared_error(test_data[LABEL], pleq_test_preds['output'].ravel()))

plet_test_preds = ft_plet_transformer.predict(test_dataset)
plet_rms = np.sqrt(mean_squared_error(test_data[LABEL], plet_test_preds['output'].ravel()))

In [None]:
# Final comparison table. The built-in round() is used instead of the `.round`
# method so the report works whether the scores are NumPy scalars or plain floats.
print("-" * 28 + " FT Transformer " + "-" * 27)
print("Linear Encoding RMSE:", round(float(linear_rms), 4))
print("Periodic Encoding RMSE:", round(float(periodic_rms), 4))
print("PLE Encoding with Quantile Binning RMSE:", round(float(pleq_rms), 4))
print("PLE Encoding with Target Binning RMSE:", round(float(plet_rms), 4))
print("")
print("-" * 30 + " Baselines " + "-" * 30)
print("Random Forest RMSE:", round(float(rf_rms), 4))
print("Catboost RMSE:", round(float(catb_rms), 4))


## Tuning

In [66]:
# NOTE(review): disabled Optuna search over the PLE (target-binned) FT-Transformer,
# kept for reference only. Before re-enabling it:
#   * `housing_act` (passed as out_activation) is not defined in the visible notebook;
#   * the objective scores each trial on test_dataset / test_data, so the "best" RMSE
#     would be tuned on the test split -- score on the validation split instead.
# import optuna
# import gc

# def objective(trial):
#     ft_encoder = FTTransformerEncoder(
#         numerical_features = NUMERIC_FEATURES,
#         categorical_features = [],
#         numerical_data = X_train[NUMERIC_FEATURES].values,
#         categorical_data = None,
#         y = X_train[LABEL].values,
#         task='regression',
#         numerical_embedding_type= 'ple',
#         numerical_bins=trial.suggest_int('numerical_bins', 20, 200),
#         embedding_dim=trial.suggest_int('embedding_dim', 8, 100),
#         depth=trial.suggest_int('depth', 1, 6),
#         heads=trial.suggest_int('heads', 2, 8),
#         attn_dropout=trial.suggest_float('attn_dropout', 0., 0.5),
#         ff_dropout=trial.suggest_float('ff_dropout', 0., 0.5),
#         explainable=True
#     )


#     # Pass the encoder to the model
#     ft_transformer = FTTransformer(
#         encoder=ft_encoder,
#         out_dim=1,
#         out_activation=housing_act
#     )

#     LEARNING_RATE = 0.001
#     WEIGHT_DECAY = 0.00001
#     NUM_EPOCHS = 1000

#     optimizer = tfa.optimizers.AdamW(
#             learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY
#         )

#     ft_transformer.compile(
#         optimizer = optimizer,
#         loss = {"output": tf.keras.losses.MeanSquaredError(name='mse'), "importances": None},
#         metrics= {"output": [tf.keras.metrics.RootMeanSquaredError(name='rmse')], "importances": None},
#     )

#     early = EarlyStopping(monitor="val_output_loss", mode="min", patience=20, restore_best_weights=True)
#     callback_list = [early]

#     ft_history = ft_transformer.fit(
#         train_dataset, 
#         epochs=NUM_EPOCHS, 
#         validation_data=val_dataset,
#         callbacks=callback_list
#     ) 
    
#     preds = ft_transformer.predict(test_dataset)
    
#     rmse = mean_squared_error(test_data[LABEL], preds['output'].ravel().clip(0, 5),squared=False)
#     gc.collect()
    
#     return rmse

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=50)

# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)