In [30]:
import pandas as pd
import numpy as np
from Data_Clean_Room import data_clean_room
import matplotlib.pyplot as plt
from realtabformer import REaLTabFormer
import os
from pathlib import Path
from ctgan import CTGAN

# This file demonstrates the synthesis process of each mentioned model in the paper

# DEREC-REaLTabFormer

In [2]:
# Identifiers of the task folders (data/task_id_<id>/) processed by the loop below.
task_ids = [10005, 10006, 14584, 22100, 31941, 31996, 34382, 34975]

# For each task: load its feeds and ads tables, hand both to data_clean_room
# keyed on 'user_id', then call derec -> sampling -> synthesize in that order.
for task_id in task_ids:

    d1 = pd.read_csv(f"data/task_id_{task_id}/feeds.csv")
    d2 = pd.read_csv(f"data/task_id_{task_id}/ads.csv")
    dcr = data_clean_room(d1, d2, 'user_id')
    dcr.derec()
    # presumably 200 is the sample size (the other cells also sample 200 users) — TODO confirm against data_clean_room.sampling
    dcr.sampling(200)
    # NOTE(review): any return value of synthesize() is discarded here; the
    # synthetic tables are presumably kept on `dcr` — confirm before exporting.
    dcr.synthesize()


    ###Export codes are commented as the data is synthesized already. 
    # NOTE(review): the names used below (dcr_parent_syn, derec_child_1_syn,
    # derec_child_2_syn) are not defined anywhere in this cell, and both child
    # exports target the same "realtabformer_syn_child_ads.csv" path, so the
    # second would overwrite the first — verify before re-enabling.
    #dcr_parent_syn.to_csv(f"data/task_id_{task_id}/DEREC/synthetic/realtabformer_syn_parent.csv")
    #derec_child_1_syn.to_csv(f"data/task_id_{task_id}/DEREC/synthetic/realtabformer_syn_child_ads.csv")
    #derec_child_2_syn.to_csv(f"data/task_id_{task_id}/DEREC/synthetic/realtabformer_syn_child_ads.csv")

# Control Group

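It is necessary to keep only one observation per `user_id` in one of the tables (here, the feeds table, which then acts as the parent) to 'forcefully' comply with the parent-child format expected by the multi-table synthesizer, which results in worsened performance.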

In [25]:
# Identifiers of the task folders (data/task_id_<id>/) processed by the loop below.
task_ids = [10005, 10006, 14584, 22100, 31941, 31996, 34382, 34975]

# Control group: for each task, force the two tables into a parent/child layout
# (feeds de-duplicated to one row per user = parent, ads = child), train a
# REaLTabFormer parent model and a relational child model, and sample from both.
for task_id in task_ids:

    # Parent table: keep a single feeds row per user_id.
    d = pd.read_csv(f"data/task_id_{task_id}/feeds.csv")
    d_unique = d.drop_duplicates(subset = 'user_id')

    # Sample at most 200 users. The cap must be computed on the de-duplicated
    # frame: checking len(d) instead lets a task with >= 200 rows but < 200
    # unique users reach d_unique.sample(200), which raises ValueError.
    d_feeds_small = d_unique.sample(min(len(d_unique), 200), random_state = 1018)

    # Child table: only the ads rows that belong to the sampled users.
    d_ads = pd.read_csv(f"data/task_id_{task_id}/ads.csv")
    d_ads_small = d_ads[d_ads['user_id'].isin(d_feeds_small['user_id'])]

    dc = d_ads_small
    dp = d_feeds_small

    join_on = "user_id"

    # The parent model is fitted without the join key.
    parent_model = REaLTabFormer(model_type="tabular", epochs = 1, batch_size = 5, train_size = 0.8)
    parent_model.fit(dp.drop(join_on, axis=1), num_bootstrap=5)

    pdir = Path("realtabformer/parent")
    parent_model.save(pdir)

    # Pick the most recently modified "id*" directory under pdir, i.e. the one
    # this iteration's save() is expected to have just written.
    parent_model_path = sorted([p for p in pdir.glob("id*") if p.is_dir()], key=os.path.getmtime)[-1]

    child_model_1 = REaLTabFormer(
        model_type="relational",
        parent_realtabformer_path=parent_model_path, epochs=10, batch_size = 5, train_size = 0.8)

    child_model_1.fit(
        df=dc,
        in_df=dp,
        join_on=join_on, num_bootstrap=10)

    # Generate as many synthetic parents as there were real ones, then turn the
    # row index into a synthetic join-key column.
    parent_samples = parent_model.sample(len(dp))
    parent_samples.index.name = join_on
    parent_samples = parent_samples.reset_index()

    # Generate child rows conditioned on the synthetic parents.
    child_samples = child_model_1.sample(n_samples = len(dc),
        input_unique_ids=parent_samples[join_on],
        input_df=parent_samples.drop(join_on, axis=1),
        output_max_length = None,
        gen_batch = 1)

    child_samples.index.name = 'user_id'

    ###Export codes are commented as the data is synthesized already. 
    # NOTE(review): child_samples carries 'user_id' only as its index name, so
    # exporting it with index = False would drop that key — confirm this is
    # intended before re-enabling.
    #parent_samples.to_csv(f"data/task_id_{task_id}/Control Group/synthetic/realtabformer_syn_parent.csv", index = False)
    #child_samples.to_csv(f"data/task_id_{task_id}/Control Group/synthetic/realtabformer_syn_child.csv", index = False)


Computing the sensitivity threshold...
Using parallel computation!!!




Bootstrap round:   0%|          | 0/5 [00:00<?, ?it/s]

Sensitivity threshold summary:
count    5.000000
mean     0.002273
std      0.010714
min     -0.015909
25%      0.002273
50%      0.005303
75%      0.008333
max      0.011364
dtype: float64
Sensitivity threshold: 0.010757575757575754 qt_max: 0.05


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss


  0%|          | 0/99 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 128 samples generated. Sampling efficiency is: 100.0000%
Critic round: 5,                     sensitivity_threshold: 0.010757575757575754,                         val_sensitivity: -0.0011616161616161626,                             val_sensitivities: [-0.01287878787878788, 0.0053030303030303025, 0.01136363636363636, 0.017424242424242422, -0.00681818181818182, -0.000757575757575759, 0.03257575757575758, -0.01287878787878788, -0.01287878787878788, -0.00984848484848485, -0.000757575757575759, 0.008333333333333331, -0.01893939393939394, 0.008333333333333333, -0.025]




Copying artefacts from: best-disc-model
Copying artefacts from: mean-best-disc-model
Copying artefacts from: not-best-disc-model
Copying artefacts from: last-epoch-model


  torch.load(parent_realtabformer_path / ModelFileName.rtf_model_pt)


Map:   0%|          | 0/4409 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]



Map:   0%|          | 0/86 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Step,Training Loss,Validation Loss


  0%|          | 0/200 [00:00<?, ?it/s]

Generated 0 invalid samples out of total 256 samples generated. Sampling efficiency is: 100.0000%




Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  0%|          | 0/200 [00:00<?, ?it/s]

# CT-GAN

In [31]:
# Identifiers of the task folders (data/task_id_<id>/) read by the CT-GAN loop in this cell.
task_ids = [10005, 10006, 14584, 22100, 31941, 31996, 34382, 34975]

def extract_numeric_columns(dataset):
    """Return a new DataFrame holding only the numeric columns of ``dataset``.

    A column counts as numeric when ``pd.api.types.is_numeric_dtype`` accepts
    its dtype (this includes boolean columns). Column order, the row index and
    each column's original dtype are preserved.

    The previous implementation stacked the selected Series as rows and
    transposed the result, which coerced every column to one common dtype
    (e.g. int -> float64, bool + int -> object) and returned a frame with no
    rows when nothing was numeric. Selecting columns with a mask avoids both.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table to filter.

    Returns
    -------
    pandas.DataFrame
        Copy of the numeric columns of ``dataset``; has zero columns (but the
        original rows) if there are none.
    """
    numeric_mask = [pd.api.types.is_numeric_dtype(dtype) for dtype in dataset.dtypes]
    # .copy() so callers can modify the result without touching ``dataset``.
    return dataset.loc[:, numeric_mask].copy()

# CT-GAN baseline: for each task, join the sampled feeds and ads rows into one
# flat table, keep its numeric columns, and fit/sample a CTGAN model on it.
for task_id in task_ids:
    # Keep a single feeds row per user_id.
    d = pd.read_csv(f"data/task_id_{task_id}/feeds.csv")
    d_unique = d.drop_duplicates(subset = 'user_id')

    # Sample at most 200 users. The cap must be computed on the de-duplicated
    # frame: checking len(d) instead lets a task with >= 200 rows but < 200
    # unique users reach d_unique.sample(200), which raises ValueError.
    d_feeds_small = d_unique.sample(min(len(d_unique), 200), random_state = 1018)

    # Only the ads rows that belong to the sampled users.
    d_ads = pd.read_csv(f"data/task_id_{task_id}/ads.csv")
    d_ads_small = d_ads[d_ads['user_id'].isin(d_feeds_small['user_id'])]

    d_ads = d_ads_small
    d_feeds = d_feeds_small

    # One row per ads record, extended with that user's feeds columns.
    d = pd.merge(d_feeds, d_ads, on = 'user_id', how = 'right')

    # NOTE(review): if 'user_id' is numeric it stays in d_num and is modelled
    # like any other feature — confirm this is intended.
    d_num = extract_numeric_columns(d)

    # Columns with fewer than 20 distinct values are passed to CTGAN as discrete.
    discrete_col = [col for col in d_num.columns if d_num[col].nunique() < 20]

    ctgan = CTGAN(epochs = 50)
    ctgan.fit(d_num, discrete_col)

    # Draw as many synthetic rows as there are rows in the training table.
    synthetic_data = ctgan.sample(len(d_num))

    ###Export codes are commented as the data is synthesized already. 
    #d_num.to_csv(f"data/task_id_{task_id}/CTGAN/original_dataset.csv", index = False)
    #synthetic_data.to_csv(f"data/task_id_{task_id}/CTGAN/synthetic_dataset.csv", index = False)

# TabDDPM

The synthesis was conducted with the TabDDPM synthesizer on the Trustworthy AI Lab's website: https://www.trusetic.com/

The synthesis was done by uploading the respective datasets and choosing the correct data type for each column. To replicate, please select the Trusetic Tier 3 Model, which currently represents TabDDPM.