In [1]:
import os
# Move the working directory up one level before importing transtab.
# The path is relative, so re-running this cell moves up one more level each
# time; restart the kernel before re-running.
# NOTE(review): presumably this makes the `transtab` package in the parent
# directory importable and anchors the relative './checkpoint' path used in
# later cells — confirm against the repository layout.
os.chdir('../')

import transtab

# Fix the random seed to 42 through transtab's helper.
# NOTE(review): which random number generators this seeds is not visible here.
transtab.random_seed(42)

In [2]:
# Pretraining data: a list of dataset names loads several datasets in one call.
(
    allset, trainset, valset, testset,
    cat_cols, num_cols, bin_cols,
) = transtab.load_data(['credit-g', 'credit-approval'])

# Options for the contrastive learner; supervised=True selects supervised VPCL.
contrastive_options = {
    'supervised': True,    # take the supervised variant of contrastive learning
    'num_partition': 4,    # number of column partitions for pos/neg sampling
    'overlap_ratio': 0.5,  # overlap ratio of column partitions during CL
}

# Returns the model together with the collate function used during training.
model, collate_fn = transtab.build_contrastive_learner(
    cat_cols, num_cols, bin_cols, **contrastive_options
)

########################################
openml data index: 31
load data from credit-g
# data: 1000, # feat: 20, # cate: 11,  # bin: 2, # numerical: 7, pos rate: 0.70
########################################
openml data index: 29
load data from credit-approval
# data: 690, # feat: 15, # cate: 9,  # bin: 0, # numerical: 6, pos rate: 0.56


In [3]:
# Start contrastive pretraining; validation loss is the evaluation metric.
training_arguments = dict(
    num_epoch=50,
    batch_size=64,
    lr=1e-4,
    eval_metric='val_loss',
    eval_less_is_better=True,  # a lower val_loss counts as better
    output_dir='./checkpoint',  # the pretrained checkpoint is saved here
)

transtab.train(
    model,
    trainset,
    valset,
    collate_fn=collate_fn,
    **training_arguments
)

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test val_loss: 5.794664
epoch: 0, train loss: 105.4182, lr: 0.000100, spent: 1.1 secs
epoch: 1, test val_loss: 5.786065
epoch: 1, train loss: 104.5511, lr: 0.000100, spent: 2.0 secs
epoch: 2, test val_loss: 5.781867
epoch: 2, train loss: 104.5076, lr: 0.000100, spent: 3.0 secs
epoch: 3, test val_loss: 5.777907
epoch: 3, train loss: 104.4728, lr: 0.000100, spent: 4.1 secs
epoch: 4, test val_loss: 5.775703
epoch: 4, train loss: 104.4284, lr: 0.000100, spent: 5.0 secs
epoch: 5, test val_loss: 5.772933
epoch: 5, train loss: 104.4126, lr: 0.000100, spent: 6.0 secs
epoch: 6, test val_loss: 5.771537
epoch: 6, train loss: 104.3681, lr: 0.000100, spent: 6.9 secs
epoch: 7, test val_loss: 5.768374
epoch: 7, train loss: 104.3112, lr: 0.000100, spent: 7.8 secs
epoch: 8, test val_loss: 5.766492
epoch: 8, train loss: 104.3186, lr: 0.000100, spent: 8.8 secs
epoch: 9, test val_loss: 5.763317
epoch: 9, train loss: 104.2437, lr: 0.000100, spent: 9.7 secs
epoch: 10, test val_loss: 5.763273
epoch

2022-08-31 10:56:45.227 | INFO     | transtab.trainer:train:132 - load best at last from ./checkpoint
2022-08-31 10:56:45.242 | INFO     | transtab.trainer:save_model:239 - saving model checkpoint to ./checkpoint
2022-08-31 10:56:45.379 | INFO     | transtab.trainer:train:137 - training complete, cost 27.2 secs.


epoch: 24, test val_loss: 5.751015
EarlyStopping counter: 5 out of 5
early stopped


In [4]:
# Load the target dataset that the pretrained model is finetuned on.
allset, trainset, valset, testset, cat_cols, num_cols, bin_cols \
     = transtab.load_data('credit-approval')

# Build the transtab classifier, initialised from the pretraining directory.
model = transtab.build_classifier(checkpoint='./checkpoint')

# Update the model's categorical/numerical/binary column dict for this dataset.
model.update({'cat':cat_cols,'num':num_cols,'bin':bin_cols})

# Finetune into a separate directory. Training saves its checkpoint to
# `output_dir`, so writing to './checkpoint' would overwrite the pretrained
# weights loaded above; re-running this cell would then start from the
# finetuned classifier instead of the contrastive-pretrained model.
finetune_dir = './checkpoint_finetuned'
os.makedirs(finetune_dir, exist_ok=True)  # make sure the target directory exists

training_arguments = {
    'num_epoch':50,
    'eval_metric':'val_loss',
    'eval_less_is_better':True,  # a lower val_loss counts as better
    'output_dir':finetune_dir,
    }
transtab.train(model, trainset, valset, **training_arguments)

########################################


2022-08-31 10:56:48.527 | INFO     | transtab.modeling_transtab:load:782 - missing keys: ['clf.fc.weight', 'clf.fc.bias', 'clf.norm.weight', 'clf.norm.bias']
2022-08-31 10:56:48.528 | INFO     | transtab.modeling_transtab:load:783 - unexpected keys: ['projection_head.dense.weight']
2022-08-31 10:56:48.528 | INFO     | transtab.modeling_transtab:load:784 - load model from ./checkpoint
2022-08-31 10:56:48.542 | INFO     | transtab.modeling_transtab:load:222 - load feature extractor from ./checkpoint/extractor/extractor.json


openml data index: 29
load data from credit-approval
# data: 690, # feat: 15, # cate: 9,  # bin: 0, # numerical: 6, pos rate: 0.56


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test val_loss: 0.683971
epoch: 0, train loss: 5.4453, lr: 0.000100, spent: 0.3 secs
epoch: 1, test val_loss: 0.646593
epoch: 1, train loss: 5.2291, lr: 0.000100, spent: 0.6 secs
epoch: 2, test val_loss: 0.598986
epoch: 2, train loss: 4.9122, lr: 0.000100, spent: 0.8 secs
epoch: 3, test val_loss: 0.571086
epoch: 3, train loss: 4.6084, lr: 0.000100, spent: 1.1 secs
epoch: 4, test val_loss: 0.500248
epoch: 4, train loss: 4.2688, lr: 0.000100, spent: 1.3 secs
epoch: 5, test val_loss: 0.461829
epoch: 5, train loss: 3.8759, lr: 0.000100, spent: 1.6 secs
epoch: 6, test val_loss: 0.418263
epoch: 6, train loss: 3.5448, lr: 0.000100, spent: 1.9 secs
epoch: 7, test val_loss: 0.406784
epoch: 7, train loss: 3.3226, lr: 0.000100, spent: 2.2 secs
epoch: 8, test val_loss: 0.415289
EarlyStopping counter: 1 out of 5
epoch: 8, train loss: 3.2534, lr: 0.000100, spent: 2.5 secs
epoch: 9, test val_loss: 0.395700
epoch: 9, train loss: 3.1036, lr: 0.000100, spent: 2.7 secs
epoch: 10, test val_loss: 

2022-08-31 10:56:53.974 | INFO     | transtab.trainer:train:132 - load best at last from ./checkpoint
2022-08-31 10:56:54.000 | INFO     | transtab.trainer:save_model:239 - saving model checkpoint to ./checkpoint
2022-08-31 10:56:54.130 | INFO     | transtab.trainer:train:137 - training complete, cost 5.6 secs.


epoch: 19, test val_loss: 0.406734
EarlyStopping counter: 5 out of 5
early stopped


In [5]:
# Evaluation on the test split of the dataset loaded for finetuning.
# `testset` unpacks into two parts, named here as features and labels.
x_test, y_test = testset
# Predict on the test features with the model trained in the previous cell.
ypred = transtab.predict(model, x_test)
# Score the predictions with the 'auc' metric. This is the cell's last
# expression, so its return value is displayed as the cell output.
transtab.evaluate(ypred, y_test, metric='auc')

auc 0.95 mean/interval 0.8382(0.06)


[0.8382272091644043]