<a href="https://colab.research.google.com/github/RonRichman/CredibilityTransformer/blob/main/The_Credibility_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Credibility Transformer: A Novel Approach for Tabular Data Prediction
## Ronald Richman, Salvatore Scognamiglio, Mario V. Wüthrich

https://arxiv.org/pdf/2409.16653

Inspired by the success of Transformer architectures in natural language processing, the Credibility Transformer extends their application to tabular data. This method introduces a novel *credibility mechanism* that integrates prior information with observed data. By embedding tabular data into low-dimensional spaces, the model enables attention mechanisms similar to those in time-series data processing.

The core innovation lies in the use of a special "CLS token" representing prior information and its combination with learned data through a credibility-weighted averaging process. This structure enhances predictive performance, stabilizes training, and outperforms state-of-the-art models on tasks like motor insurance claims prediction.

This notebook implements a simplified version of the Credibility Transformer for tabular data, including steps for tokenization, embedding, and integration of the credibility mechanism. The goal is to demonstrate how the proposed method works without overcomplicating the code.

In [1]:
import os
!pip install --upgrade keras



In [2]:
import keras
from keras.layers import Layer
import pandas as pd
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [3]:
# Fixed seed so that the row shuffles below give the same row order on every run.
SEED = 1337

# Turn the Google Drive share link into a direct-download link: the file id is
# the second-to-last path segment of the share URL.
url = 'https://drive.google.com/file/d/1xIQJINshspnP2VshVrCVEl8blzOaxkMX/view?usp=sharing'
url = 'https://drive.google.com/uc?id=' + url.split('/')[-2]

# pandas performs the download/parsing; everything downstream works on polars frames.
dat = pl.from_pandas(pd.read_csv(url))

# The "set" column assigns every row to the train or the test split. Each split is
# shuffled (all rows are kept) with a fixed seed; without a seed the row order, and
# therefore everything that depends on it, changed from run to run.
train = dat.filter(pl.col("set") == "train").sample(fraction=1.0, shuffle=True, seed=SEED)
test = dat.filter(pl.col("set") == "test").sample(fraction=1.0, shuffle=True, seed=SEED)

In [4]:
### check empirical claim frequencies (total claims / total exposure) per split
### note: the ratio column keeps the name "ClaimNb" in the result

dat.group_by("set").agg(
    pl.sum("ClaimNb") / pl.sum("Exposure"))

set,ClaimNb
str,f64
"""train""",0.073631
"""test""",0.07354


In [5]:
# Observed claim frequency (total claims / total exposure) of the training split.
# The split is selected by name instead of by row position: polars' group_by does not
# guarantee the order of the groups, so indexing its result with [1] could return the
# frequency of either split.
# Despite the name, the value is on the original (not log) scale; shape is (1,).
intercept_log = (
    dat.filter(pl.col("set") == "train")
    .select(pl.sum("ClaimNb") / pl.sum("Exposure"))
    .to_numpy()[0]
)

In [6]:
var_names = dat.columns

# Feature columns are selected by name rather than by position, so a change in the
# column order of the CSV cannot silently swap features.
cat_vars = ["Area", "VehGas", "VehBrand", "Region"]

cont_vars = ["VehPower", "VehAge", "DrivAge", "BonusMalus", "Density"]

# Fail early, with a clear message, if the data does not contain an expected feature.
missing_vars = [name for name in cat_vars + cont_vars if name not in var_names]
if missing_vars:
    raise KeyError(f"Columns missing from the data: {missing_vars}")

In [7]:
### scale continuous and convert categorical to integers
def scale_data(col, method="min_max"):
    """Scale a numeric column and return the scaled values with the fitted statistics.

    Parameters
    ----------
    col : series-like
        Column exposing ``mean``/``std``/``min``/``max`` and element-wise arithmetic
        (e.g. a polars or pandas Series).
    method : {"min_max", "normalize"}
        ``"normalize"`` standardises to ``(col - mean) / std``;
        ``"min_max"`` rescales to ``(col - min) / (max - min)``.

    Returns
    -------
    dict
        ``{"mean", "std", "normalized"}`` for ``"normalize"`` and
        ``{"min", "max", "normalized"}`` for ``"min_max"``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously the function
        silently returned ``None``, including for its own default).
    """
    if method == "normalize":
        mean_val = col.mean()
        std_val = col.std()
        normalized = (col - mean_val) / std_val
        return {"mean": mean_val, "std": std_val, "normalized": normalized}
    if method == "min_max":
        min_val = col.min()
        max_val = col.max()
        normalized = (col - min_val) / (max_val - min_val)
        return {"min": min_val, "max": max_val, "normalized": normalized}
    raise ValueError(f"Unknown scaling method: {method!r}")

# Standardise every continuous feature. Mean and std are fitted on the training
# split only and then reused to transform the test split.
for col in cont_vars:
    stats = scale_data(train.select(col).to_series(), method="normalize")
    scaled_name = f"{col}_input"
    test_scaled = (pl.col(col) - stats["mean"]) / stats["std"]
    train = train.with_columns(stats["normalized"].alias(scaled_name))
    test = test.with_columns(test_scaled.alias(scaled_name))

def cat_to_int(col):
    """Build a level -> integer code lookup table for a categorical column.

    Parameters
    ----------
    col : polars.DataFrame
        Single-column frame holding the categorical values.

    Returns
    -------
    dict
        ``{"map": frame}`` where ``frame`` holds the sorted unique levels (under the
        original column name) and their 0-based codes in ``"col_name_input"``.
    """
    levels = col.to_series().unique().sort().to_frame()
    # pl.len() replaces the deprecated pl.count() for counting rows.
    mapping = levels.with_columns(pl.arange(0, pl.len()).alias("col_name_input"))
    return {"map": mapping}

# Integer-encode every categorical feature with the levels found in the training split.
# NOTE(review): join() defaults to an inner join, so test rows whose level does not
# occur in the training split would be dropped here - confirm this is intended.
for col in cat_vars:
    lookup = cat_to_int(train.select(col))["map"]
    maps = lookup.rename({"col_name_input": f"{col}_input"})
    train = train.join(maps, on=col)
    test = test.join(maps, on=col)

train

  mapping = col.to_series().unique().sort().to_frame().with_columns(pl.arange(0, pl.count()).alias("col_name_input"))


IDpol,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region,ClaimTotal,ClaimNb,id,set,VehPower_input,VehAge_input,DrivAge_input,BonusMalus_input,Density_input,Area_input,VehGas_input,VehBrand_input,Region_input
f64,f64,str,i64,i64,i64,i64,str,str,f64,str,f64,i64,i64,str,f64,f64,f64,f64,f64,i64,i64,i64,i64
4.018108e6,0.4,"""D""",7,0,35,67,"""B12""","""Regular""",7.280008,"""R31""",0.0,0,496370,"""train""",0.265784,-1.241702,-0.742593,0.462921,0.693577,3,1,3,7
6.11418e6,0.05,"""F""",4,0,76,50,"""B12""","""Regular""",10.203592,"""R11""",0.0,0,677857,"""train""",-1.196788,-1.241702,2.157636,-0.624469,2.255798,5,1,3,0
1.148748e6,1.0,"""C""",6,2,67,50,"""B5""","""Regular""",6.159095,"""R53""",0.0,0,165430,"""train""",-0.22174,-0.889183,1.521,-0.624469,0.094616,2,1,9,12
3.244114e6,0.07,"""C""",9,13,37,50,"""B2""","""Regular""",4.934474,"""R22""",0.0,0,478468,"""train""",1.240832,1.049675,-0.601119,-0.624469,-0.559763,2,1,6,2
1.10415e6,1.0,"""D""",7,2,41,50,"""B1""","""Regular""",7.18007,"""R24""",0.0,0,139891,"""train""",0.265784,-0.889183,-0.31817,-0.624469,0.640175,3,1,0,4
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
4.161371e6,1.0,"""E""",7,8,23,85,"""B2""","""Regular""",8.257645,"""R72""",0.0,0,570188,"""train""",0.265784,0.168376,-1.591441,1.614276,1.215978,4,1,6,14
2.192244e6,1.0,"""C""",10,5,48,54,"""B11""","""Regular""",4.875197,"""R53""",0.0,0,290223,"""train""",1.728356,-0.360403,0.176992,-0.368613,-0.591437,2,1,2,12
73120.0,1.0,"""C""",7,5,54,64,"""B5""","""Regular""",6.165418,"""R93""",0.0,0,35942,"""train""",0.265784,-0.360403,0.601415,0.271029,0.097994,2,1,9,20
5.109401e6,0.03,"""E""",4,0,22,80,"""B12""","""Regular""",8.395477,"""R11""",0.0,0,640376,"""train""",-1.196788,-1.241702,-1.662178,1.294456,1.289629,4,1,3,0


In [8]:
### get data in format for keras
def get_keras_data(dat):
    """Collect the model inputs from ``dat`` as a dict of NumPy arrays.

    Keys are ``"<feature>_input"`` for every name in ``cat_vars`` followed by every
    name in ``cont_vars``, and finally ``"Exposure"``; each value is whatever
    ``dat.select(<key>).to_numpy()`` returns for that column.
    """
    input_names = [f"{col}_input" for col in cat_vars + cont_vars]
    input_names.append("Exposure")
    return {name: dat.select(name).to_numpy() for name in input_names}

# Model inputs for each split
train_x, test_x = (get_keras_data(split) for split in (train, test))

# Model inputs for the combined data (train rows followed by test rows)
all_data = pl.concat([train, test])
all_x = get_keras_data(all_data)

# Targets: observed claim counts of each split
train_y, test_y = (split.select("ClaimNb").to_numpy() for split in (train, test))
train_x

{'Area_input': array([[3],
        [5],
        [2],
        ...,
        [2],
        [4],
        [0]]),
 'VehGas_input': array([[1],
        [1],
        [1],
        ...,
        [1],
        [1],
        [0]]),
 'VehBrand_input': array([[3],
        [3],
        [9],
        ...,
        [9],
        [3],
        [0]]),
 'Region_input': array([[ 7],
        [ 0],
        [12],
        ...,
        [20],
        [ 0],
        [ 1]]),
 'VehPower_input': array([[ 0.26578424],
        [-1.1967877 ],
        [-0.22173974],
        ...,
        [ 0.26578424],
        [-1.1967877 ],
        [-0.22173974]]),
 'VehAge_input': array([[-1.24170232],
        [-1.24170232],
        [-0.88918271],
        ...,
        [-0.36040328],
        [-1.24170232],
        [ 1.04967519]]),
 'DrivAge_input': array([[-0.74259339],
        [ 2.15763594],
        [ 1.52100023],
        ...,
        [ 0.60141532],
        [-1.6621783 ],
        [ 0.53067802]]),
 'BonusMalus_input': array([[ 0.46292139],
     

In [9]:
class PositionalEmbedding(Layer):
    """Learned per-position embedding that is identical for every row of the batch.

    Only the shape of the input is used: the layer returns its trainable table of
    shape ``(input_shape[1], embedding_dim)`` broadcast along the batch axis.
    """

    def __init__(self, embedding_dim, **kwargs):
        super().__init__(**kwargs)
        self.embedding_dim = embedding_dim

    def build(self, input_shape):
        # One trainable vector per position along axis 1 of the input.
        table_shape = (input_shape[1], self.embedding_dim)
        self.pos_emb = self.add_weight(
            name='pos_emb',
            shape=table_shape,
            initializer='uniform',
            trainable=True,
        )

    def call(self, inputs):
        n_rows = keras.ops.shape(inputs)[0]
        n_positions = keras.ops.shape(self.pos_emb)[0]

        # Repeat the table along the batch dimension to match the batch size.
        target_shape = [n_rows, n_positions, self.embedding_dim]
        return keras.ops.broadcast_to(self.pos_emb, target_shape)

    def compute_output_shape(self, input_shape):
        batch, positions = input_shape[0], input_shape[1]
        return (batch, positions, self.embedding_dim)


## add a single embedding of size embedding dim. This is a learned embedding that is the same for all rows.
### we will append this to the embeddings later, apply the transformer, and then strip this out as input into the LocalGLMnet's FFN.
### See the BERT paper for more details; arxiv link: https://arxiv.org/pdf/1810.04805.pdf

class ClsEmbedding(Layer):
    """Single learned token of width ``embedding_dim`` shared by all rows.

    Only the batch size of the input is used: the layer returns its trainable
    ``(1, 1, embedding_dim)`` weight broadcast to ``(batch, 1, embedding_dim)``.
    """

    def __init__(self, embedding_dim, **kwargs):
        super().__init__(**kwargs)
        self.embedding_dim = embedding_dim

    def build(self, input_shape):
        token_shape = (1, 1, self.embedding_dim)
        self.cls_emb = self.add_weight(
            name='cls_emb',
            shape=token_shape,
            initializer='uniform',
            trainable=True,
        )

    def call(self, inputs):
        n_rows = keras.ops.shape(inputs)[0]
        target_shape = [n_rows, 1, self.embedding_dim]
        return keras.ops.broadcast_to(self.cls_emb, target_shape)

    def compute_output_shape(self, input_shape):
        batch = input_shape[0]
        return (batch, 1, self.embedding_dim)

### the aim of this next layer is to strip out the cls embedding, which now contains all the transformer information

class ClsTokenLayer(Layer):
    """Extract the last token along axis 1 of a 3-D tensor.

    For an input of shape ``(batch, tokens, dim)`` the output is
    ``inputs[:, -1, :]``, i.e. a 2-D tensor of shape ``(batch, dim)``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        # Integer indexing on axis 1 removes that axis from the result.
        return inputs[:, -1, :]

    def compute_output_shape(self, input_shape):
        # Must match call(): the token axis is dropped, so the result is rank 2
        # (the previous (batch, 1, dim) declaration disagreed with the actual output).
        return (input_shape[0], input_shape[2])


class TransformerLayer(Layer):
    """Single-head attention block that also returns an attention-free CLS vector.

    ``call`` returns a list of two tensors:

    1. the transformed token sequence: layer norm -> q/k/v projections ->
       ``keras.layers.Attention`` -> residual add -> layer norm -> two-layer FFN ->
       residual add; same shape as the input;
    2. the value projection of the *last* token, passed through the same second
       layer norm and FFN but without attention; shape ``(batch, v_dim)``.

    Parameters
    ----------
    q_dim, k_dim, v_dim : int
        Widths of the query, key and value projections. Because the attention
        output is added to the normalised input, ``v_dim`` has to equal the width
        of the input tokens.
    ffn_dim : int
        Hidden width of the feed-forward network.
    dropout_rate : float
        Rate shared by the three dropout layers.
    """

    def __init__(self, q_dim=10, k_dim=10, v_dim=10, ffn_dim=32, dropout_rate=0.01, **kwargs):
        super().__init__(**kwargs)
        self.q_dim = q_dim
        self.k_dim = k_dim
        self.v_dim = v_dim
        self.ffn_dim = ffn_dim
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        self.layer_norm1 = keras.layers.TimeDistributed(keras.layers.LayerNormalization())
        self.q_dense = keras.layers.TimeDistributed(keras.layers.Dense(units=self.q_dim, activation="gelu"))
        self.k_dense = keras.layers.TimeDistributed(keras.layers.Dense(units=self.k_dim, activation="gelu"))
        self.v_dense = keras.layers.TimeDistributed(keras.layers.Dense(units=self.v_dim, activation="gelu"))
        self.attention = keras.layers.Attention(use_scale=True)
        self.dropout1 = keras.layers.Dropout(rate=self.dropout_rate)
        self.layer_norm2 = keras.layers.TimeDistributed(keras.layers.LayerNormalization())
        self.ffn_dense1 = keras.layers.TimeDistributed(keras.layers.Dense(units=self.ffn_dim, activation="gelu"))
        self.dropout2 = keras.layers.Dropout(rate=self.dropout_rate)
        self.ffn_dense2 = keras.layers.TimeDistributed(keras.layers.Dense(units=self.v_dim, activation="gelu"))
        self.dropout3 = keras.layers.Dropout(rate=self.dropout_rate)
        self.cls_token_layer = ClsTokenLayer()

    def call(self, inputs):
        embeds_norm = self.layer_norm1(inputs)
        q = self.q_dense(embeds_norm)
        k = self.k_dense(embeds_norm)
        v = self.v_dense(embeds_norm)
        # keras.layers.Attention takes its inputs in the order [query, value, key].
        attended = self.dropout1(self.attention([q, v, k]))
        # Residual connections use plain ops instead of creating new Add layers on
        # every call.
        skip1 = keras.ops.add(embeds_norm, attended)
        ffn = self.dropout3(self.ffn_dense2(self.dropout2(self.ffn_dense1(self.layer_norm2(skip1)))))
        skip2 = keras.ops.add(skip1, ffn)

        # Attention-free path: value projection of the last token through the same
        # second layer norm and FFN (a token axis of length 1 is added for the
        # TimeDistributed wrappers and removed again afterwards).
        cls_token = self.cls_token_layer(v)
        cls_token = keras.ops.expand_dims(cls_token, axis=1)
        cls_token = self.layer_norm2(cls_token)
        cls_token = self.ffn_dense1(cls_token)
        cls_token = self.dropout2(cls_token)
        cls_token = self.ffn_dense2(cls_token)
        cls_token = keras.ops.squeeze(cls_token, axis=1)

        return [skip2, cls_token]

    def compute_output_shape(self, input_shape):
        # First output keeps the input shape; the second one is a single vector of
        # width v_dim per row (it was previously declared as input_shape as well).
        return [input_shape, (input_shape[0], self.v_dim)]

In [10]:
### define NN
keras.backend.clear_session()
input_list = []
embedding_list = []
cardinalities = {}
count = 0  # number of feature tokens created below

# Width of every feature embedding. A positional embedding of the same width is
# concatenated to it, so each token fed to the transformer has width TOKEN_DIM.
EMBED_DIM = 5
TOKEN_DIM = 2 * EMBED_DIM

### loop over categorical. for each, define an input layer and an embedding layer

### the embedding dimension is EMBED_DIM

for column in [f"{col}_input" for col in cat_vars]:
    count += 1
    # codes are 0-based, so the largest code + 1 is the number of embedding rows needed
    cardinalities[column] = train[column].max()
    print(cardinalities[column])
    input_list.append(keras.layers.Input(shape=(1,), dtype="int32", name=column))
    embedding_list.append(keras.layers.Embedding(input_dim=cardinalities[column]+1, output_dim=EMBED_DIM)(input_list[-1]))
    embedding_list[-1] = keras.layers.Flatten(name=f"{column}_embed")(embedding_list[-1])

### loop over continuous, for each, define an input layer and an output layer
### here we apply a small neural network to get the continuous variables into the same dimension as the categorical variables

for column in [f"{col}_input" for col in cont_vars]:
    count += 1
    input_list.append(keras.layers.Input(shape=(1,), dtype="float32", name=column))
    embedding_list.append(keras.layers.Dense(units=EMBED_DIM)(input_list[-1]))
    embedding_list[-1] = keras.layers.Dense(units=EMBED_DIM, activation="tanh")(embedding_list[-1])

input_list.append(keras.layers.Input(shape=(1,), dtype="float32", name="Exposure"))

### stack the per-feature embeddings into one token per feature: (batch, count, EMBED_DIM)
embeds = keras.layers.Concatenate(axis=1)(embedding_list)
embeds = keras.layers.Reshape((count, EMBED_DIM))(embeds)

### here we concatenate a positional embedding to every feature embedding; this is one embedding for each column that
### is the same for all the rows

positional_embedding_layer = PositionalEmbedding(embedding_dim=EMBED_DIM)
positional_embeds = positional_embedding_layer(embeds)
embeds = keras.layers.Concatenate()([embeds, positional_embeds])

# Create an instance of the cls layer
cls_embedding_layer = ClsEmbedding(embedding_dim=TOKEN_DIM)

# get the cls embedding and append it as the last token
cls_embed = cls_embedding_layer(embeds)
embeds = keras.layers.Concatenate(axis=1)([embeds, cls_embed])

### Custom single-head attention Transformer layer

transformer_layer = TransformerLayer(q_dim=TOKEN_DIM, k_dim=TOKEN_DIM, v_dim=TOKEN_DIM, ffn_dim=40, dropout_rate=0.01)
transformer_output = transformer_layer(embeds)

class VectorSelectionLayer(keras.layers.Layer):
    """Row-wise random choice between two vectors.

    ``call`` receives ``[vector1, vector2]``. When ``training`` is truthy (or
    ``apply_during_test`` is set) every row independently keeps ``vector1`` with
    probability ``rate`` and ``vector2`` otherwise; in all other cases ``vector1``
    is returned unchanged.
    """

    def __init__(self, rate, apply_during_test=False, **kwargs):
        super().__init__(**kwargs)
        self.rate = rate
        self.apply_during_test = apply_during_test
        self.seed_generator = keras.random.SeedGenerator(seed=1337)

    def call(self, inputs, training=None):
        vector1, vector2 = inputs

        randomise = training or self.apply_during_test
        if not randomise:
            return vector1

        # One uniform draw per row; rows with a draw below `rate` keep vector1.
        n_rows = keras.ops.shape(vector1)[0]
        draws = keras.random.uniform(shape=(n_rows,), seed=self.seed_generator)
        keep_first = keras.ops.cast(draws < self.rate, dtype='float32')
        # Trailing axis added so the 0/1 mask broadcasts over the feature axis.
        keep_first = keras.ops.expand_dims(keep_first, axis=-1)
        return vector1 * keep_first + vector2 * (1 - keep_first)

    def compute_output_shape(self, input_shape):
        first_shape = input_shape[0]
        return first_shape

### take the cls token (last position) out of the transformed token sequence

cls_token_layer = ClsTokenLayer()
cls_token = cls_token_layer(transformer_output[0])

# During training each row uses the attended cls token with probability 0.9 and the
# second transformer output otherwise; outside training the attended token is always used.
vector_selection_layer = VectorSelectionLayer(rate=0.9, apply_during_test=False)
transformer_output2 = vector_selection_layer([cls_token, transformer_output[1]])

### rest of the network
### we use a very simple FFN
# Zero kernel + bias log(intercept_log): the exponential output unit starts at the
# constant value intercept_log for every row.
initializer = keras.initializers.Constant(0)
initializer_bias = keras.initializers.Constant(np.log(intercept_log))

flat_features = keras.layers.Flatten()(transformer_output2)
hidden = keras.layers.Dense(units=16, activation="gelu")(flat_features)
output_rescale = keras.layers.Dense(
    units=1,
    activation="exponential",
    kernel_initializer=initializer,
    bias_initializer=initializer_bias,
)(hidden)

# The last model input is "Exposure": the network output is multiplied by it.
exposure_input = input_list[-1]
output = keras.layers.Multiply(name="output")([output_rescale, exposure_input])

### define model

model = keras.Model(inputs=input_list, outputs=output)

5
1
10
21


In [11]:
# Print the layer-by-layer summary of the assembled model
model.summary()

In [12]:
### see paper for source on beta_2

adamw = keras.optimizers.AdamW(
    learning_rate=0.001,
    weight_decay=0.01,
    beta_2=0.95,
)

# Train with the Poisson loss
model.compile(loss="poisson", optimizer=adamw)


In [13]:
# Write the weights to disk whenever the monitored quantity improves
model_write = keras.callbacks.ModelCheckpoint(
    "frmtpl_cred_transformer.weights.h5",
    save_best_only=True,
    verbose=1,
    save_weights_only=True,
)
# Multiply the learning rate by 0.9 when val_loss has plateaued for 20 epochs
learn_rate = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.9,
    patience=20,
    verbose=1,
)
# Stop once val_loss has not improved for 10 epochs
early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    verbose=1,
)
# fit model.

fit = model.fit(
    x=train_x,
    y=train_y,
    batch_size=1024,
    epochs=250,
    callbacks=[model_write, learn_rate, early_stop],
    validation_split=0.1,
    verbose=1,
    shuffle=True,
)

Epoch 1/250
[1m535/537[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 28ms/step - loss: 0.1596
Epoch 1: val_loss improved from inf to 0.15534, saving model to frmtpl_cred_transformer.weights.h5
[1m537/537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 32ms/step - loss: 0.1596 - val_loss: 0.1553 - learning_rate: 0.0010
Epoch 2/250
[1m536/537[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 27ms/step - loss: 0.1576
Epoch 2: val_loss improved from 0.15534 to 0.15403, saving model to frmtpl_cred_transformer.weights.h5
[1m537/537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 28ms/step - loss: 0.1576 - val_loss: 0.1540 - learning_rate: 0.0010
Epoch 3/250
[1m535/537[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 26ms/step - loss: 0.1571
Epoch 3: val_loss improved from 0.15403 to 0.15382, saving model to frmtpl_cred_transformer.weights.h5
[1m537/537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 28ms/step - loss: 0.1571 - val_loss: 0.1538 - lea

KeyboardInterrupt: 

In [14]:
# Restore the weights from the file written by the ModelCheckpoint callback
model.load_weights("frmtpl_cred_transformer.weights.h5")

In [15]:
import numpy as np

# Model predictions for the training and test inputs
train_pred, test_pred = (
    model.predict(inputs, batch_size=32000) for inputs in (train_x, test_x)
)

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 125ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step


In [16]:
import polars as pl

# Ensure train_pred and test_pred are flattened to match the DataFrame structure
# Collapse the prediction arrays to 1-D so that each fits a single DataFrame column
train_pred_flat, test_pred_flat = train_pred.flatten(), test_pred.flatten()

# Store the predictions next to the data they were computed from
train = train.with_columns(pl.Series(name="pred_NN", values=train_pred_flat))
test = test.with_columns(pl.Series(name="pred_NN", values=test_pred_flat))

### scoring
def score_model(train, test, pred_col):
    """Compute the mean Poisson deviance of ``pred_col`` on both splits.

    Parameters
    ----------
    train, test : frame-like
        Objects whose ``select(name).to_numpy()`` yields the ``"ClaimNb"`` column
        and the prediction column as arrays.
    pred_col : str
        Name of the column holding the predictions.

    Returns
    -------
    dict
        ``{"results": {"pois_dev": ..., "pois_dev_test": ..., "model": pred_col}}``.
        The intermediate sums are printed for each split (train first).
    """

    def poisson_deviance(true, pred):
        # A tiny constant keeps the logarithm finite for zero counts/predictions.
        eps = 1e-15
        sum_pred = np.sum(pred)
        sum_true = np.sum(true)
        shifted_true = true + eps
        log_term = np.sum(shifted_true * np.log(shifted_true / (pred + eps)))
        print("sum_pred:", sum_pred)
        print("sum_true:", sum_true)
        print("log_term:", log_term)
        return 2 * (sum_pred - sum_true + log_term) / len(true)

    scores = {}
    for key, frame in (("pois_dev", train), ("pois_dev_test", test)):
        observed = frame.select("ClaimNb").to_numpy()
        predicted = frame.select(pred_col).to_numpy()
        scores[key] = poisson_deviance(observed, predicted)
    scores["model"] = pred_col

    return {"results": scores}

# Score the network predictions stored in the "pred_NN" column of both splits
score_model(train, test, "pred_NN")

sum_pred: 22883.371
sum_true: 23738
log_term: 73351.84859226522
sum_pred: 2562.9553
sum_true: 2645
log_term: 8145.217765981433


{'results': {'pois_dev': 0.23761555830658898,
  'pois_dev_test': 0.2378482054319865,
  'model': 'pred_NN'}}