In [1]:
import transtab

# Seed value handed to transtab.random_seed; kept as a named constant so it
# is easy to find and change.
SEED = 42
transtab.random_seed(SEED)

In [2]:
# Passing a list of dataset names to load_data loads several datasets at once.
pretrain_datasets = ['credit-g', 'credit-approval']
(allset, trainset, valset, testset,
 cat_cols, num_cols, bin_cols) = transtab.load_data(pretrain_datasets)

# Options for the contrastive learner (VPCL).
contrastive_options = dict(
    supervised=True,    # True selects the supervised variant of VPCL
    num_partition=4,    # number of column partitions used for pos/neg sampling
    overlap_ratio=0.5,  # overlap ratio between column partitions during CL
)

# Returns the model together with the collate function that training must use.
model, collate_fn = transtab.build_contrastive_learner(
    cat_cols, num_cols, bin_cols, **contrastive_options
)

########################################
openml data index: 31
load data from credit-g
# data: 1000, # feat: 20, # cate: 11,  # bin: 2, # numerical: 7, pos rate: 0.70
########################################
openml data index: 29
load data from credit-approval
# data: 690, # feat: 15, # cate: 9,  # bin: 0, # numerical: 6, pos rate: 0.56


In [3]:
# Run contrastive pretraining with the settings below.
training_arguments = dict(
    num_epoch=50,
    batch_size=64,
    lr=1e-4,
    eval_metric='val_loss',     # metric evaluated on the validation set
    eval_less_is_better=True,   # a smaller val_loss counts as an improvement
    output_dir='./checkpoint',  # directory the model checkpoint is saved to
)

# The collate function built alongside the contrastive learner is passed
# through explicitly.
transtab.train(
    model,
    trainset,
    valset,
    collate_fn=collate_fn,
    **training_arguments
)

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test val_loss: 5.799497
epoch: 0, train loss: 105.8933, lr: 0.000100, spent: 1.0 secs
epoch: 1, test val_loss: 5.787415
epoch: 1, train loss: 104.5827, lr: 0.000100, spent: 1.9 secs
epoch: 2, test val_loss: 5.783943
epoch: 2, train loss: 104.5336, lr: 0.000100, spent: 2.8 secs
epoch: 3, test val_loss: 5.780502
epoch: 3, train loss: 104.4926, lr: 0.000100, spent: 3.7 secs
epoch: 4, test val_loss: 5.777996
epoch: 4, train loss: 104.4469, lr: 0.000100, spent: 4.7 secs
epoch: 5, test val_loss: 5.775274
epoch: 5, train loss: 104.4308, lr: 0.000100, spent: 5.6 secs
epoch: 6, test val_loss: 5.773897
epoch: 6, train loss: 104.3934, lr: 0.000100, spent: 6.5 secs
epoch: 7, test val_loss: 5.771337
epoch: 7, train loss: 104.3471, lr: 0.000100, spent: 7.4 secs
epoch: 8, test val_loss: 5.769358
epoch: 8, train loss: 104.3560, lr: 0.000100, spent: 8.3 secs
epoch: 9, test val_loss: 5.767703
epoch: 9, train loss: 104.2987, lr: 0.000100, spent: 9.2 secs
epoch: 10, test val_loss: 5.766528
epoch

2022-05-11 12:02:42.405 | INFO     | transtab.trainer:train:131 - load best at last from ./checkpoint
2022-05-11 12:02:42.412 | INFO     | transtab.trainer:save_model:238 - saving model checkpoint to ./checkpoint
2022-05-11 12:02:42.509 | INFO     | transtab.trainer:train:136 - training complete, cost 30.9 secs.


epoch: 32, test val_loss: 5.749441
EarlyStopping counter: 5 out of 5
early stopped


In [4]:
# Finetune the pretrained model on a single target dataset.
#
# The pretrained weights are read from PRETRAIN_DIR and the finetuned
# classifier is written to FINETUNE_DIR. Keeping the two apart means this
# cell never overwrites the checkpoint it loads from, so re-running it always
# starts from the same pretrained weights.
PRETRAIN_DIR = './checkpoint'
FINETUNE_DIR = './checkpoint_finetune'

# load the target dataset
allset, trainset, valset, testset, cat_cols, num_cols, bin_cols \
     = transtab.load_data('credit-approval')

# build transtab classifier model, and load from the pretrained dir
model = transtab.build_classifier(checkpoint=PRETRAIN_DIR)

# update model's categorical/numerical/binary column dict so it matches the
# columns of the target dataset
model.update({'cat':cat_cols,'num':num_cols,'bin':bin_cols})

# start finetuning; validation loss is the tracked metric (lower is better)
training_arguments = {
    'num_epoch':50,
    'eval_metric':'val_loss',
    'eval_less_is_better':True,
    'output_dir':FINETUNE_DIR
    }
transtab.train(model, trainset, valset, **training_arguments)

########################################


2022-05-11 12:02:45.700 | INFO     | transtab.modeling_transtab:load:622 - missing keys: ['clf.fc.weight', 'clf.fc.bias', 'clf.norm.weight', 'clf.norm.bias']
2022-05-11 12:02:45.700 | INFO     | transtab.modeling_transtab:load:623 - unexpected keys: ['projection_head.dense.weight']
2022-05-11 12:02:45.701 | INFO     | transtab.modeling_transtab:load:624 - load model from ./checkpoint
2022-05-11 12:02:45.713 | INFO     | transtab.modeling_transtab:load:217 - load feature extractor from ./checkpoint/extractor/extractor.json


openml data index: 29
load data from credit-approval
# data: 690, # feat: 15, # cate: 9,  # bin: 0, # numerical: 6, pos rate: 0.56


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test val_loss: 0.677445
epoch: 0, train loss: 5.4725, lr: 0.000100, spent: 0.2 secs
epoch: 1, test val_loss: 0.670075
epoch: 1, train loss: 5.2918, lr: 0.000100, spent: 0.5 secs
epoch: 2, test val_loss: 0.607029
epoch: 2, train loss: 4.9795, lr: 0.000100, spent: 0.7 secs
epoch: 3, test val_loss: 0.585563
epoch: 3, train loss: 4.6663, lr: 0.000100, spent: 1.0 secs
epoch: 4, test val_loss: 0.515265
epoch: 4, train loss: 4.3954, lr: 0.000100, spent: 1.2 secs
epoch: 5, test val_loss: 0.490379
epoch: 5, train loss: 4.1109, lr: 0.000100, spent: 1.4 secs
epoch: 6, test val_loss: 0.443343
epoch: 6, train loss: 3.8025, lr: 0.000100, spent: 1.6 secs
epoch: 7, test val_loss: 0.458647
EarlyStopping counter: 1 out of 5
epoch: 7, train loss: 3.5434, lr: 0.000100, spent: 1.9 secs
epoch: 8, test val_loss: 0.416663
epoch: 8, train loss: 3.4231, lr: 0.000100, spent: 2.1 secs
epoch: 9, test val_loss: 0.408597
epoch: 9, train loss: 3.2808, lr: 0.000100, spent: 2.3 secs
epoch: 10, test val_loss: 

2022-05-11 12:02:51.639 | INFO     | transtab.trainer:train:131 - load best at last from ./checkpoint
2022-05-11 12:02:51.645 | INFO     | transtab.trainer:save_model:238 - saving model checkpoint to ./checkpoint
2022-05-11 12:02:51.743 | INFO     | transtab.trainer:train:136 - training complete, cost 6.0 secs.


epoch: 25, test val_loss: 0.480252
EarlyStopping counter: 5 out of 5
early stopped


In [5]:
# Evaluate the finetuned classifier on the test split.
# testset unpacks into two parts -- presumably (features, labels); confirm
# against transtab.load_data.
x_test, y_test = testset
# Predictions from the in-memory model for the test inputs.
ypred = transtab.predict(model, x_test)
# Left as the cell's last expression so the returned value is displayed; the
# recorded output shows a printed AUC summary followed by a one-element list.
transtab.evaluate(ypred, y_test, metric='auc')

auc 0.95 mean/interval 0.8398(0.06)


[0.8398471246601826]