# Intro

This notebook trains Banksformer models and generates synthetic data.   
The parameters for data generation (`seq_len`, number of sequences) are set near the bottom of the notebook, under "Generate Full dataset".

In [1]:
import logging
import os
import sys
import time
import datetime

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import tensorflow as tf
import seaborn as sns

In [2]:
# Raise the 'tensorflow' logger threshold to ERROR so lower-severity records are not shown.
logging.getLogger('tensorflow').setLevel(logging.ERROR)  # suppress warnings

# SETUP DATA

### Set input dataset and nb_id

In [3]:
from field_config import CLOCK_DIMS, get_field_info, DATA_KEY_ORDER, LOSS_TYPES

DATA_KEY_ORDER is ['tcode_num', 'dow', 'month', 'day', 'dtme', 'td_sc', 'log_amount_sc']
LOSS_TYPES are: day - scce, dtme - scce, dow - scce, month - scce, td_sc - pdf, log_amount_sc - pdf, tcode_num - scce
If this is not correct, edit field_config.py and re-run notebook


In [4]:
# Dataset identifier: interpolated into the stored_data/ file names, passed to
# load_data_encoder() and get_field_info(), and appended to the checkpoint folder name.
ds_suffix = "-uk"
# Notebook/run identifier: appended to the checkpoint folder name after ds_suffix.
nb_id = "cond"

### Load training data

In [5]:
# Load the pre-built arrays for this dataset from stored_data/.
# NOTE(review): ds_suffix already starts with "-", so the resolved file names contain a
# double dash (e.g. "inp_tensor--uk.npy") — confirm this matches what the preprocessing
# step writes.
inp_tensor = np.load(f"stored_data/inp_tensor-{ds_suffix}.npy")
tar_tensor = np.load(f"stored_data/tar_tensor-{ds_suffix}.npy")
attributes = np.load(f"stored_data/attributes-{ds_suffix}.npy")

# Display the three shapes as a sanity check (the recorded run shows one more step in
# the input tensor than in the target tensor).
inp_tensor.shape, tar_tensor.shape, attributes.shape

((6983, 21, 54), (6983, 20, 7), (6983,))

In [6]:
# Unpack the three axes of the input tensor; going by the names these are
# (sequences, steps per sequence, input features per step).
n_seqs, n_steps, n_feat_inp = inp_tensor.shape
# Size of the third axis of the target tensor (target features per step).
n_feat_tar = tar_tensor.shape[2]

In [7]:
from my_lib.encoding import load_data_encoder
# Load the data encoder associated with ds_suffix.
# NOTE(review): data_encoder is not referenced again in the visible part of the notebook;
# presumably it is needed later, when generating data — confirm.
data_encoder = load_data_encoder(ds_suffix)

### Split and create tf dataset

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sequences for validation. The row indices are split alongside the
# tensors so each subset can be traced back to its rows in the original arrays.
# random_state pins the shuffle so the split is identical after a kernel restart: the
# training loop restores weights from checkpoints written by earlier runs, and a fresh
# random split would then validate on sequences those weights were already trained on.
x_tr, x_cv, inds_tr, inds_cv, targ_tr, targ_cv = train_test_split(
    inp_tensor, np.arange(n_seqs), tar_tensor, test_size=0.2, random_state=42)

In [9]:
# Wrap the (input, target) pairs in tf.data datasets, casting both arrays to float32.
ds_tr = tf.data.Dataset.from_tensor_slices((x_tr.astype(np.float32), targ_tr.astype(np.float32)))
# NOTE(review): in the visible cells validation is driven from x_cv / targ_cv directly,
# and ds_cv itself is not referenced again — confirm whether it is still needed.
ds_cv = tf.data.Dataset.from_tensor_slices((x_cv.astype(np.float32), targ_cv.astype(np.float32)))

# Display the training dataset's element spec.
ds_tr

2022-06-29 11:56:27.712714: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<TensorSliceDataset element_spec=(TensorSpec(shape=(21, 54), dtype=tf.float32, name=None), TensorSpec(shape=(20, 7), dtype=tf.float32, name=None))>

In [10]:
from my_lib.transformer_core import make_batches

# Number of elements in the training dataset; handed to make_batches() as its second
# argument. Presumably this is the shuffle buffer size (i.e. a full shuffle) — confirm in
# my_lib.transformer_core.
BUFFER_SIZE = ds_tr.cardinality().numpy()


# Training

## Loss

In [11]:
def log_normal_pdf(sample, mean, logvar, raxis=1):
    """Element-wise log-density of ``sample`` under a normal distribution.

    The distribution is parameterized by ``mean`` and the log-variance ``logvar``:
    ``-0.5 * ((sample - mean)^2 / exp(logvar) + logvar + log(2*pi))``.

    ``raxis`` is accepted but never used: no reduction over any axis is performed,
    so the result has the broadcast shape of the inputs.
    """
    log_two_pi = tf.math.log(2. * np.pi)
    squared_error = (sample - mean) ** 2.
    inverse_variance = tf.exp(-logvar)
    return -.5 * (squared_error * inverse_variance + logvar + log_two_pi)


In [12]:
def log_normal_pdf_gen(sample, mean, logvar, raxis=1):
    """Element-wise normal log-density with the ``log(2*pi)`` constant cast to float64.

    Same formula as ``log_normal_pdf``:
    ``-0.5 * ((sample - mean)^2 / exp(logvar) + logvar + log(2*pi))``.
    Because the constant is a float64 tensor, the other operands are presumably
    expected to be float64 as well — confirm against the caller.

    ``raxis`` is accepted but never used; no reduction is performed.
    """
    log_two_pi = tf.cast(tf.math.log(2. * np.pi), tf.float64)
    squared_error = (sample - mean) ** 2.
    inverse_variance = tf.exp(-logvar)
    return -.5 * (squared_error * inverse_variance + logvar + log_two_pi)


In [13]:

from tensorflow.keras.losses import CategoricalCrossentropy, MeanSquaredError, SparseCategoricalCrossentropy


# All three losses are built with reduction='none': loss_function() multiplies the
# result by its own padding mask and does the reduction itself.

# Sparse categorical cross-entropy on raw logits; used for "scce" fields.
loss_scce_logit = SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

# Sparse categorical cross-entropy on probabilities; used for "clock" fields.
loss_scce_probit = SparseCategoricalCrossentropy(
    from_logits=False, reduction='none')

# Mean squared error; used for "mse" fields.
loss_mse = MeanSquaredError(reduction='none')



def create_padding_mask(seq):
    """Flag the padded positions of ``seq``.

    A position is treated as padding when its values along axis 2 sum to exactly zero.
    The float32 flags (1.0 = padding, 0.0 = real) are returned with two singleton axes
    inserted after the batch axis, i.e. shaped (batch_size, 1, 1, seq_len), so they can
    be added to the attention logits.
    """
    is_padding = tf.math.equal(tf.reduce_sum(seq, axis=2), 0)
    padding_flags = tf.cast(is_padding, tf.float32)
    return padding_flags[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    """Return a (size, size) mask with ones strictly above the main diagonal.

    Entry (i, j) is 1 when j > i and 0 otherwise: the lower triangle (diagonal included)
    of an all-ones matrix is subtracted from 1.
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle



def loss_function(real, pred):
    """Compute the weighted, padding-masked loss over all predicted fields.

    Parameters
    ----------
    real : tensor
        Target tensor. Each field's columns are selected as ``real[:, :, st:end]``
        using the offsets in ``FIELD_STARTS_TAR`` / ``FIELD_DIMS_TAR``.
    pred : dict
        Maps a field name to the model output for that field. For ``"pdf"`` fields,
        channel 0 is used as the mean and channel 1 as the log-variance.

    Returns
    -------
    tuple
        ``(total, parts)``: ``total`` is the sum of the per-field losses scaled by
        ``LOSS_WEIGHTS``; ``parts`` is the list of unweighted per-field losses in
        ``pred`` iteration order.

    Raises
    ------
    ValueError
        If ``LOSS_TYPES`` holds an unsupported loss type for one of the fields.
    """
    # A step whose target features sum to exactly zero is treated as padding.
    # This depends only on `real`, so it is computed once instead of once per field.
    is_real_step = tf.math.logical_not(tf.math.equal(tf.reduce_sum(real, axis=2), 0))

    loss_parts = []
    loss_parts_weighted = []

    for k, k_pred in pred.items():
        st = FIELD_STARTS_TAR[k]
        end = st + FIELD_DIMS_TAR[k]
        target = real[:, :, st:end]
        loss_type = LOSS_TYPES[k]

        if loss_type == "scce":
            loss_ = loss_scce_logit(target, k_pred)
        elif loss_type == "clock":
            loss_ = loss_scce_probit(target, clock_to_onehot(k, k_pred))
        elif loss_type == "mse":
            loss_ = loss_mse(target, k_pred)
        elif loss_type == "pdf":
            # Negative log-likelihood; the trailing singleton axis is dropped.
            loss_ = -log_normal_pdf(target, k_pred[:, :, 0:1], k_pred[:, :, 1:2])[:, :, 0]
        else:
            raise ValueError(f"Invalid loss type! Got loss type = {loss_type} with key = {k}. Check field_config.py for loss types")

        # Average over real (non-padding) steps only. divide_no_nan returns 0 rather
        # than NaN when a batch contains no real steps, so one empty batch cannot
        # poison the running loss or the gradients.
        mask = tf.cast(is_real_step, dtype=loss_.dtype)
        loss_ = tf.math.divide_no_nan(tf.reduce_sum(loss_ * mask), tf.reduce_sum(mask))

        loss_parts.append(loss_)
        loss_parts_weighted.append(loss_ * LOSS_WEIGHTS[k])

    return tf.reduce_sum(loss_parts_weighted), loss_parts




In [14]:
from my_lib.encoding import bulk_encode_time_value

# Offset added to every coordinate difference in clock_to_probs() before squaring.
EPS_CLOCKP = 0.01

# For each clock field in CLOCK_DIMS, pre-compute the encoding of every value
# 0 .. val-1 (as returned by bulk_encode_time_value) and keep it as a float32 constant.
# These act as the reference points that predictions are compared against.
CLOCKS = {}
for k, val in CLOCK_DIMS.items():
    CLOCKS[k] = tf.constant(bulk_encode_time_value(np.arange(val), val), dtype=tf.float32)

def clock_to_probs(pt, pts):
    """Score ``pt`` against each reference point in ``pts`` by inverse squared distance.

    For every row of ``pts`` the coordinate differences to ``pt`` are shifted by
    ``EPS_CLOCKP``, squared and summed over axis 1; the reciprocals are then
    normalized so that they sum to 1.

    NOTE(review): the epsilon is added to each coordinate difference *before* squaring,
    not to the squared distance, so it does not strictly rule out a zero denominator
    (a difference of exactly -EPS_CLOCKP in every coordinate) — confirm this is intended.
    NOTE(review): the reductions use ``np.sum`` on TensorFlow values, so the result is
    presumably a NumPy array and gradients would not flow through this function.
    """
    
    # Per-coordinate difference between every reference point and pt
    ds = tf.constant(pts) - pt
    sq_ds = np.sum(tf.square(ds+EPS_CLOCKP), axis=1)
    # Closer reference points get larger raw scores
    raw_ps = 1/ sq_ds   
    
    return raw_ps / np.sum(raw_ps)



def clock_to_onehot(k, vals):
    """Convert clock predictions for field ``k`` into a distribution over its reference points.

    The leading axes of ``vals`` are flattened so that each row is one prediction; every
    row is scored against ``CLOCKS[k]`` with ``clock_to_probs``; the scores are then
    reshaped back to the original leading axes with a new last axis holding one entry
    per reference point.

    NOTE(review): despite the name, the output is a normalized soft distribution rather
    than a strict one-hot vector.
    NOTE(review): rows are processed in a Python loop and collected with ``np.array``,
    so this presumably requires eager execution — confirm before using the "clock" loss.
    """
    orig_shape = vals.shape

    # Collapse all leading axes: one row per prediction
    vals = tf.reshape(vals, (-1, orig_shape[-1]))

    return np.array([clock_to_probs(p, CLOCKS[k]) for p in vals]).reshape(*orig_shape[:-1], -1)   


# Display the size configured for each clock field.
CLOCK_DIMS

{'day': 31, 'dtme': 31, 'dow': 7, 'month': 12}

## Set Banksformer configs

In [15]:
# Activation name per field; stored under config["ACTIVATIONS"], which is passed to the
# Transformer constructor. How the model applies it is defined in my_lib.BanksformerGen.
ACTIVATIONS = {
    "td_sc": "relu",
    "log_amount_sc": "relu"
}

In [16]:
# Field layout (sizes and start offsets) for the input, target and network tensors of
# this dataset. The target layout is read by loss_function().
(FIELD_DIMS_IN, FIELD_STARTS_IN,
 FIELD_DIMS_TAR, FIELD_STARTS_TAR,
 FIELD_DIMS_NET, FIELD_STARTS_NET) = get_field_info(ds_suffix)

# Settings handed to the Transformer constructor: field order, the input and network
# layouts, and the per-field activations.
config = {
    "ORDER": DATA_KEY_ORDER,
    "FIELD_STARTS_IN": FIELD_STARTS_IN,
    "FIELD_DIMS_IN": FIELD_DIMS_IN,
    "FIELD_STARTS_NET": FIELD_STARTS_NET,
    "FIELD_DIMS_NET": FIELD_DIMS_NET,
    "ACTIVATIONS": ACTIVATIONS,
}



## Training Loop 

In [None]:
from my_lib.BanksformerGen import Transformer
import pickle 


# Every Transformer created by the training loop is appended here.
all_models = []
# One summary tuple per trained model; turned into the results DataFrame afterwards.
for_df = []


def to_num(x):
    """Convert ``x`` to an ``int`` when possible, otherwise to a ``float``.

    Only conversion failures from ``int(x)`` trigger the float fallback. The previous
    bare ``except:`` also swallowed ``KeyboardInterrupt`` and ``SystemExit``.

    Raises:
        ValueError / TypeError: propagated from ``float(x)`` when ``x`` is not numeric.
    """
    try:
        return int(x)
    except (ValueError, TypeError, OverflowError):
        # e.g. "1.5" (ValueError), None (TypeError), float("inf") (OverflowError)
        return float(x)

    
def id_str_to_folder(id_str):
    """Return ``id_str`` with every "." replaced by "__", for use in file and folder names."""
    dot_free_parts = id_str.split(".")
    return "__".join(dot_free_parts)
# Only recorded in the results table; it does not influence training in the visible code.
beta = 1


# Preset labelled "moredate" in `lws`: larger weights on the date fields.
# NOTE(review): it contains a 'year' key that is absent from the other presets.
LOSS_WEIGHTS_OLD = {
 'td_sc':1.,
 'year': 0.5,
 'month': 0.15,
 'day': 0.25,
 'dow': 0.1,
 'tcode_num': 1.,
 'log_amount_sc': 2.}


# Preset labelled "0": date-field weights are a tenth of the "moredate" values.
LOSS_WEIGHTS_0 = {
 'td_sc':1.,
 'month': 0.015,
 'day': 0.025,
 'dow': 0.01,
 'tcode_num': 1.,
 'log_amount_sc': 2.}



# Preset with date-field weights in between the two above (only used by the
# commented-out `lws` alternative below).
LOSS_WEIGHTS_MID = {
 'td_sc':1.,
 'month': 0.07,
 'day': 0.1,
 'dow': 0.04,
 'tcode_num': 1.,
 'log_amount_sc': 2.}



# Candidate (weights, label) pairs. Only the first pair is selected further down.
lws = [(LOSS_WEIGHTS_0, "0"), (LOSS_WEIGHTS_OLD, "moredate")] 

# lws = [(LOSS_WEIGHTS_MID, "mid")]

# NOTE(review): td_loss_fns is not referenced anywhere else in the visible notebook.
td_loss_fns = [(loss_mse, "loss_mse")]


# EPOCHS is passed to fit() as `epochs`. EARLY_STOP is passed to fit() as `early_stop`
# and is also used as the number of checkpoints the CheckpointManager keeps.
EPOCHS = 1
EARLY_STOP = 2

num_layers_enc = None
dropout_rate = 0.1
dr = dropout_rate
# Only used in the run id and the results table; the optimizer itself is always built
# as tf.keras.optimizers.Adam() in the training loop.
opt_name = "adam"
# td_loss_fn = loss_mse


## Tuning these ! 
d_model = 128
num_layers_dec = 4
num_heads = 2
bs = 64
# lws # above


# Select the active weights and their label (used in the run id).
# LOSS_WEIGHTS is bound to the preset dict itself, not a copy, so any in-place change
# made through LOSS_WEIGHTS also changes the preset.
LOSS_WEIGHTS, lwi = lws[0]


# `dff` passed to the Transformer is tied to d_model.
dff = d_model



import re


def _sanitize_item(item):
    """Convert one element of a results list into something pickle can store.

    Objects exposing ``.numpy()`` are converted through it (and through ``.tolist()``
    when that works); anything else is kept unchanged. If the conversion raises, fall
    back to ``float(item)`` and finally to ``str(item)``.
    """
    try:
        if not hasattr(item, "numpy"):
            return item
        array = item.numpy()
        try:
            return array.tolist()
        except Exception:
            return array
    except Exception:
        try:
            return float(item)
        except Exception:
            return str(item)


def _sanitize_result(value):
    """Convert one ``transformer.results`` entry into a picklable value.

    Lists/tuples are sanitized element-wise (always returned as a list); objects
    exposing ``.numpy()`` become nested Python lists; other values are stored as
    ``float`` when possible and as ``str`` otherwise.
    """
    try:
        if isinstance(value, (list, tuple)):
            return [_sanitize_item(item) for item in value]
        if hasattr(value, "numpy"):
            return value.numpy().tolist()
        try:
            return float(value)
        except Exception:
            return str(value)
    except Exception:
        return str(value)


# Derived loss weights: 'dtme' shares the 'day' weight and the extra categorical fields
# share the 'tcode_num' weight. They are loop-invariant, so they are built once here, on
# a copy, instead of mutating the selected preset dict on every iteration.
# loss_function() reads the module-level LOSS_WEIGHTS, so the name is rebound here.
LOSS_WEIGHTS = {
    **LOSS_WEIGHTS,
    "dtme": LOSS_WEIGHTS["day"],
    "k_symbol_num": LOSS_WEIGHTS["tcode_num"],
    "operation_num": LOSS_WEIGHTS["tcode_num"],
    "type_num": LOSS_WEIGHTS["tcode_num"],
}


# Train three models with identical hyper-parameters; `i` only distinguishes the runs.
for i in range(3):

    start = time.time()
    print(datetime.datetime.now().strftime("%H:%M"))

    transformer = Transformer(
        num_layers_enc=num_layers_enc, num_layers_dec=num_layers_dec,
        d_model=d_model,
        num_heads=num_heads,
        dff=dff,
        maximum_position_encoding=256,
        net_info=None,
        inp_dim=n_feat_inp,
        final_dim=None,
        config=config,
        rate=dr)

    optimizer = tf.keras.optimizers.Adam()
    transformer.optimizer = optimizer

    train_batches = make_batches(ds_tr, BUFFER_SIZE, bs)

    transformer.loss_function = loss_function
    transformer.LOSS_WEIGHTS = LOSS_WEIGHTS

    id_str = f"v2b__nld_{num_layers_dec}-dm_{d_model}-nh_{num_heads}-i_{i}-dr_{dr}-opt_{opt_name}-lwi_{lwi}-bs_{bs}"

    print("Begin running", id_str)
    transformer.id_str = id_str

    all_models.append(transformer)
    transformer.compile()

    transformer.checkpoint_path = f"./checkpoints/{id_str_to_folder(transformer.id_str)}-{ds_suffix}-{nb_id}"
    # Best effort: make sure the checkpoint directory exists (helpful on Colab). A
    # failure is reported instead of silently ignored; TF will still try to save.
    try:
        os.makedirs(transformer.checkpoint_path, exist_ok=True)
    except OSError as err:
        print(f"Warning: could not create checkpoint directory {transformer.checkpoint_path}: {err}")

    transformer.ckpt = tf.train.Checkpoint(transformer=transformer,
                                           optimizer=optimizer)
    transformer.ckpt_manager = tf.train.CheckpointManager(transformer.ckpt,
                                                          transformer.checkpoint_path,
                                                          max_to_keep=EARLY_STOP)

    fit_kwargs = dict(epochs=EPOCHS, early_stop=EARLY_STOP, print_every=50, ckpt_every=1)

    latest_ckpt = transformer.ckpt_manager.latest_checkpoint
    if latest_ckpt:
        transformer.ckpt.restore(latest_ckpt)
        print(f'Latest checkpoint restored from {latest_ckpt}!!')
        # Checkpoint paths end in 'ckpt-<n>'; resume from that index when it can be parsed.
        ckpt_match = re.search(r'ckpt-(\d+)$', latest_ckpt)
        if ckpt_match:
            start_epoch = int(ckpt_match.group(1))
            print(f"Resuming from checkpoint index {start_epoch}")
        else:
            start_epoch = 0
            print("Could not parse checkpoint index; starting from 0")
        # initial_epoch is only passed when resuming, as before.
        fit_kwargs["initial_epoch"] = start_epoch

    transformer.fit(train_batches, x_cv, targ_cv, **fit_kwargs)

    transformer.fit_time = time.time() - start
    transformer.results["fit_time"] = transformer.fit_time

    # Best effort: make sure the history folder exists before writing. If this fails,
    # the open() below raises with the underlying reason.
    try:
        os.makedirs("training_history", exist_ok=True)
    except OSError as err:
        print(f"Warning: could not create training_history directory: {err}")

    # Pickle a sanitized copy of the results (tensors / arrays -> plain Python values).
    safe_results = {k: _sanitize_result(v) for k, v in transformer.results.items()}

    with open(f"training_history/{id_str_to_folder(transformer.id_str)}.pickle", "wb") as f:
        pickle.dump(safe_results, f)
        print("Wrote transformer.results to", f.name)

    # Guard against an empty val_loss history when summarizing the run.
    val_losses = transformer.results.get("val_loss", [])
    best_val = np.min(val_losses) if len(val_losses) > 0 else np.nan

    for_df.append((num_layers_dec, d_model, num_heads, i, dr, beta, dff,
                   best_val, opt_name, transformer.id_str))


11:56
Begin running v2b__nld_4-dm_128-nh_2-i_0-dr_0.1-opt_adam-lwi_0-bs_64
Epoch 1 Batch 0 Loss 12.0507
Epoch 1 Batch 50 Loss 10.5404
Epoch 1 Loss 10.1559
** on validation data loss is 9.3292
Not recording acc: 'Transformer' object has no attribute 'acc_function'
Time taken for 1 epoch: 98.31 secs

Saving checkpoint for epoch 1 at ./checkpoints/v2b__nld_4-dm_128-nh_2-i_0-dr_0__1-opt_adam-lwi_0-bs_64--uk-cond/ckpt-1
Epoch 2 Batch 0 Loss 9.3089
Epoch 2 Batch 50 Loss 9.2354
Epoch 2 Loss 8.9652
** on validation data loss is 7.0959
Time taken for 1 epoch: 100.06 secs

Saving checkpoint for epoch 2 at ./checkpoints/v2b__nld_4-dm_128-nh_2-i_0-dr_0__1-opt_adam-lwi_0-bs_64--uk-cond/ckpt-2
Epoch 3 Batch 0 Loss 7.0371
Epoch 3 Batch 50 Loss 7.2459
Epoch 3 Loss 7.1521
** on validation data loss is 6.8766
Time taken for 1 epoch: 114.98 secs

Saving checkpoint for epoch 3 at ./checkpoints/v2b__nld_4-dm_128-nh_2-i_0-dr_0__1-opt_adam-lwi_0-bs_64--uk-cond/ckpt-3
Epoch 4 Batch 0 Loss 6.7960
Epoch 4 Batch

### Results

In [18]:
# One row per trained model, ordered from lowest to highest best validation loss.
# The column order mirrors the tuples appended to for_df by the training loop.
df = pd.DataFrame.from_records(
    for_df,
    columns=[
        'num_layers_dec', 'd_model', 'num_heads', 'i', "dr", "beta", "dff",
        "val loss", "opt name", "id_str",
    ],
)
df = df.sort_values("val loss")

In [19]:
# Temporarily lift the pandas display limits so the long id_str values and every
# row/column are shown in full. (df was already sorted by "val loss" when it was built,
# so the sort here does not change the order.)
with pd.option_context('display.max_colwidth', None, "display.max_rows", None, "display.max_columns", None):
    display(df.sort_values("val loss"))

Unnamed: 0,num_layers_dec,d_model,num_heads,i,dr,beta,dff,val loss,opt name,id_str
2,4,128,2,2,0.1,1,128,5.25968,adam,v2b__nld_4-dm_128-nh_2-i_2-dr_0.1-opt_adam-lwi_0-bs_64
0,4,128,2,0,0.1,1,128,5.291912,adam,v2b__nld_4-dm_128-nh_2-i_0-dr_0.1-opt_adam-lwi_0-bs_64
1,4,128,2,1,0.1,1,128,5.424337,adam,v2b__nld_4-dm_128-nh_2-i_1-dr_0.1-opt_adam-lwi_0-bs_64
