In [None]:
# Create necessary directories
!mkdir -p modules
!mkdir -p data
!mkdir -p assets

In [None]:
# Install required packages
!pip install torch tqdm numpy pandas matplotlib networkx igraph statsmodels dython

Collecting igraph
  Downloading igraph-0.11.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting dython
  Downloading dython-0.7.9-py3-none-any.whl.metadata (2.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3

# **ADULT DATASET**

In [None]:
# Download and prepare the Adult dataset
!mkdir -p data

# Download the Adult dataset
!wget -O data/adult.data https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
!wget -O data/adult.test https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test

# Process the data
import pandas as pd

# Define column names
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'educational-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]

# Read train data
train_data = pd.read_csv('data/adult.data', header=None, names=columns, sep=', ', engine='python')

# Read test data (skip the first line as it's a header)
test_data = pd.read_csv('data/adult.test', header=None, names=columns, sep=', ', engine='python', skiprows=1)
test_data['income'] = test_data['income'].str.replace('.', '')  # Remove period at the end of income values

# Combine and save
combined_data = pd.concat([train_data, test_data])
combined_data.to_csv('data/adult.csv', index=False)

print("Adult dataset preparation complete!")

--2025-04-24 14:56:52--  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘data/adult.data’

data/adult.data         [     <=>            ]   3.79M  3.25MB/s    in 1.2s    

2025-04-24 14:56:54 (3.25 MB/s) - ‘data/adult.data’ saved [3974305]

--2025-04-24 14:56:54--  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘data/adult.test’

data/adult.test         [    <=>             ]   1.91M  1.85MB/s    in 1.0s    

2025-04-24 14:56:56 (1.85 MB/s) - ‘data/adult.test’ sav

# **Forest Cover Type Dataset**

In [None]:
# Download and prepare the Forest Cover Type dataset
!mkdir -p data

# Download covtype dataset
!wget -O data/covtype.data.gz https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz
!gunzip data/covtype.data.gz

# Process the file
import pandas as pd
import numpy as np

# Define column names
columns = [
    'Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points'
]

# Add soil type and wilderness area columns
for i in range(4):
    columns.append(f'Wilderness_Area_{i+1}')
for i in range(40):
    columns.append(f'Soil_Type_{i+1}')

columns.append('Cover_Type')

# Read the data
data = pd.read_csv('data/covtype.data', header=None, names=columns)

# Process data (simplified for this dataset)
covtype_data = data[['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
                     'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
                     'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
                     'Horizontal_Distance_To_Fire_Points', 'Cover_Type']]

# Save to CSV
covtype_data.to_csv('data/covtype.csv', index=False)

print("Cover type dataset preparation complete!")

--2025-04-24 14:57:25--  https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘data/covtype.data.gz’

data/covtype.data.g     [      <=>           ]  10.72M  6.73MB/s    in 1.6s    

2025-04-24 14:57:28 (6.73 MB/s) - ‘data/covtype.data.gz’ saved [11240707]

Cover type dataset preparation complete!


# **Home Credit Default Risk Dataset**

In [None]:
# Process the Home Credit Default Risk Dataset
!mkdir -p data

# Unzip the application_train.csv.zip file
!unzip -o /content/data/application_train.csv.zip -d /content/data/

# Process the data
import pandas as pd

# Read the CSV file
df = pd.read_csv('/content/data/application_train.csv')

# Take a sample if needed to reduce size (optional, remove if you want to use all data)
df = df.sample(n=50000, random_state=0) if len(df) > 50000 else df

# Save to CSV at the expected path
df.to_csv('/content/data/application_train.csv', index=False)

print("Home Credit Default Risk dataset preparation complete!")

Archive:  /content/data/application_train.csv.zip
  inflating: /content/data/application_train.csv  
Home Credit Default Risk dataset preparation complete!


# **PERSONAL LOAN MODELLING DATASET**

In [None]:
# Process the Personal Loan Modeling Dataset
!mkdir -p data

# Process the data
import pandas as pd

# Read the CSV file
df = pd.read_csv('/content/data/Bank_Personal_Loan_Modelling.csv')

# No need to sample as this dataset is small

# Save to CSV at the expected path
df.to_csv('/content/data/Bank_Personal_Loan_Modelling.csv', index=False)

print("Personal Loan Modeling dataset preparation complete!")

Personal Loan Modeling dataset preparation complete!


# **Taxi Pricing (Cabs)**

In [None]:
# Process the Taxi Pricing (Cabs) Dataset
!mkdir -p data

# Unzip the test.csv.zip file
!unzip -o /content/data/test.csv.zip -d /content/data/

# Process the data
import pandas as pd

# Read the CSV file
df = pd.read_csv('/content/data/test.csv')

# Rename the file to what the code expects
df.to_csv('/content/data/sigma_cabs.csv', index=False)

print("Taxi Pricing (Cabs) dataset preparation complete!")

Archive:  /content/data/test.csv.zip
  inflating: /content/data/test.csv  
Taxi Pricing (Cabs) dataset preparation complete!


# **King County House Sales Dataset**

In [None]:
# Process the King County House Sales Dataset
!mkdir -p data

# Unzip the kc_house_data.csv.zip file
!unzip -o /content/data/kc_house_data.csv.zip -d /content/data/

# Process the data
import pandas as pd

# Read the CSV file
df = pd.read_csv('/content/data/kc_house_data.csv')

# No need to rename this file as it already has the expected name
# Just saving it back to ensure it's properly formatted
df.to_csv('/content/data/kc_house_data.csv', index=False)

print("King County House Sales dataset preparation complete!")

Archive:  /content/data/kc_house_data.csv.zip
  inflating: /content/data/kc_house_data.csv  
King County House Sales dataset preparation complete!


In [None]:
with open('modules/simulation.py', 'w') as f:
    f.write('''
import random
import numpy as np
import torch

"""for reproducibility"""
def set_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed) # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
''')

print("Created simulation.py in the modules folder")

Created simulation.py in the modules folder


# **Implementation of DistVAE for Synthetic Tabular Data Generation**

In [None]:
with open('modules/model.py', 'w') as f:
    f.write('''import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import pandas as pd

class VAE(nn.Module):
    def __init__(self, config, device):
        super(VAE, self).__init__()

        self.config = config
        self.device = device

        """encoder"""
        self.encoder = nn.Sequential(
            nn.Linear(config["CRPS_dim"] + config["softmax_dim"], 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, config["latent_dim"] * 2),
        ).to(device)

        """spline"""
        self.delta = torch.arange(0, 1 + config["step"], step=config["step"]).view(1, -1).to(device)
        self.M = self.delta.size(1) - 1
        self.spline = nn.Sequential(
            nn.Linear(config["latent_dim"], 16),
            nn.ReLU(),
            nn.Linear(16, 64),
            nn.ReLU(),
            nn.Linear(64, config["CRPS_dim"] * (1 + (self.M + 1)) + config["softmax_dim"]),
        ).to(device)

    def get_posterior(self, input):
        h = self.encoder(input)
        mean, logvar = torch.split(h, self.config["latent_dim"], dim=1)
        return mean, logvar

    def sampling(self, mean, logvar, deterministic=False):
        if deterministic:
            z = mean
        else:
            noise = torch.randn(mean.size(0), self.config["latent_dim"]).to(self.device)
            z = mean + torch.exp(logvar / 2) * noise
        return z

    def encode(self, input, deterministic=False):
        mean, logvar = self.get_posterior(input)
        z = self.sampling(mean, logvar, deterministic=deterministic)
        return z, mean, logvar

    def quantile_parameter(self, z):
        # Make sure z is on the correct device
        z = z.to(self.device)
        h = self.spline(z)
        logit = h[:, -self.config["softmax_dim"]:]
        spline = h[:, :-self.config["softmax_dim"]]
        h = torch.split(spline, 1 + (self.M + 1), dim=1)

        gamma = [h_[:, [0]] for h_ in h]
        beta = [nn.Softplus()(h_[:, 1:]) for h_ in h] # positive constraint
        return gamma, beta, logit

    def quantile_function(self, alpha, gamma, beta, j):
        return gamma[j] + (beta[j] * torch.where(alpha - self.delta > 0,
                                                alpha - self.delta,
                                                torch.zeros(()).to(self.device))).sum(axis=1, keepdims=True)

    def _quantile_inverse(self, x, gamma, beta, j):
        delta_ = self.delta.unsqueeze(2).repeat(1, 1, self.M + 1)
        delta_ = torch.where(delta_ - self.delta > 0,
                            delta_ - self.delta,
                            torch.zeros(()).to(self.device))
        mask = gamma[j] + (beta[j] * delta_.unsqueeze(2)).sum(axis=-1).squeeze(0).t()
        mask = torch.where(mask <= x,
                        mask,
                        torch.zeros(()).to(self.device)).type(torch.bool).type(torch.float)
        alpha_tilde = x - gamma[j]
        alpha_tilde += (mask * beta[j] * self.delta).sum(axis=1, keepdims=True)
        alpha_tilde /= (mask * beta[j]).sum(axis=1, keepdims=True) + 1e-6
        alpha_tilde = torch.clip(alpha_tilde, self.config["threshold"], 1) # numerical stability
        return alpha_tilde

    def quantile_inverse(self, x, gamma, beta):
        alpha_tilde_list = []
        for j in range(self.config["CRPS_dim"]):
            alpha_tilde = self._quantile_inverse(x[:, [j]], gamma, beta, j)
            alpha_tilde_list.append(alpha_tilde)
        return alpha_tilde_list

    def forward(self, input, deterministic=False):
        z, mean, logvar = self.encode(input, deterministic=deterministic)
        gamma, beta, logit = self.quantile_parameter(z)
        return z, mean, logvar, gamma, beta, logit

    def gumbel_sampling(self, size, eps = 1e-20):
        U = torch.rand(size).to(self.device)
        G = (- (U + eps).log() + eps).log()
        return G

    def generate_data(self, n, OutputInfo_list, dataset, reverse_col=False):
        data = []
        steps = n // self.config["batch_size"] + 1

        with torch.no_grad():
            for _ in range(steps):
                # Use device for randn
                randn = torch.randn(self.config["batch_size"], self.config["latent_dim"], device=self.device)
                gamma, beta, logit = self.quantile_parameter(randn)

                samples = []
                st = 0
                for j, info in enumerate(OutputInfo_list):
                    if info.activation_fn == "CRPS":
                        # Use device for alpha
                        alpha = torch.rand(self.config["batch_size"], 1, device=self.device)
                        samples.append(self.quantile_function(alpha, gamma, beta, j))

                    elif info.activation_fn == "softmax":
                        ed = st + info.dim
                        out = logit[:, st : ed]

                        """Gumbel-Max Trick"""
                        G = self.gumbel_sampling(out.shape)
                        _, out = (nn.LogSoftmax(dim=1)(out) + G).max(dim=1)

                        samples.append(out.unsqueeze(1))
                        # samples.append(F.one_hot(out, num_classes=info.dim))
                        st = ed

                samples = torch.cat(samples, dim=1)
                data.append(samples)
        data = torch.cat(data, dim=0)
        data = data[:n, :]
        # Move data to CPU for pandas processing
        data = data.cpu().numpy()
        data = pd.DataFrame(data, columns=dataset.continuous + dataset.discrete)

        """un-standardization of synthetic data"""
        data[dataset.continuous] = data[dataset.continuous] * dataset.std.to_numpy() + dataset.mean.to_numpy()

        """post-process integer columns (calibration)"""
        data[dataset.integer] = data[dataset.integer].round(0).astype(int)
        data[dataset.discrete] = data[dataset.discrete].astype(int)

        if reverse_col:
            """reverse to original column names"""
            for dis, disdict in zip(dataset.discrete, dataset.discrete_dicts_reverse):
                data[dis] = data[dis].apply(lambda x: disdict.get(x))

        return data
''')

print("Created model.py in the modules folder")

Created model.py in the modules folder


In [None]:
with open('modules/train.py', 'w') as f:
    f.write('''
import tqdm

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset

def train_VAE(OutputInfo_list, dataloader, model, config, optimizer, device):
    logs = {
        'loss': [],
        'quantile': [],
        'KL': [],
    }
    # for debugging
    logs['activated'] = []

    for (x_batch) in tqdm.tqdm(iter(dataloader), desc="inner loop"):

        if config["cuda"]:
            x_batch = x_batch.cuda()

        # with torch.autograd.set_detect_anomaly(True):
        optimizer.zero_grad()

        z, mean, logvar, gamma, beta, logit = model(x_batch)

        loss_ = []

        """alpha_tilde"""
        alpha_tilde_list = model.quantile_inverse(x_batch, gamma, beta)

        """loss"""
        j = 0
        st = 0
        total_loss = 0
        # tmp1 = []
        # tmp2 = []
        for j, info in enumerate(OutputInfo_list):
            if info.activation_fn == "CRPS":
                term = (1 - model.delta.pow(3)) / 3 - model.delta - torch.maximum(alpha_tilde_list[j], model.delta).pow(2)
                term += 2 * torch.maximum(alpha_tilde_list[j], model.delta) * model.delta

                loss = (2 * alpha_tilde_list[j]) * x_batch[:, [j]]
                loss += (1 - 2 * alpha_tilde_list[j]) * gamma[j]
                loss += (beta[j] * term).sum(axis=1, keepdims=True)
                loss *= 0.5
                total_loss += loss.mean()
                # tmp1.append(x_batch[:, [j]])

            elif info.activation_fn == "softmax":
                ed = st + info.dim
                _, targets = x_batch[:, config["CRPS_dim"] + st : config["CRPS_dim"] + ed].max(dim=1)
                out = logit[:, st : ed]
                # tmp1.append(x_batch[:, config["CRPS_dim"] + st : config["CRPS_dim"] + ed])
                # tmp2.append(out)
                total_loss += nn.CrossEntropyLoss()(out, targets)
                st = ed

        # assert (torch.cat(tmp1, dim=1) - x_batch).sum().item() == 0
        # assert (torch.cat(tmp2, dim=1) - logit).sum().item() == 0

        loss_.append(('quantile', total_loss))

        """KL-Divergence"""
        KL = torch.pow(mean, 2).sum(axis=1)
        KL -= logvar.sum(axis=1)
        KL += torch.exp(logvar).sum(axis=1)
        KL -= config["latent_dim"]
        KL *= 0.5
        KL = KL.mean()
        loss_.append(('KL', KL))

        ### activated: for debugging
        var_ = torch.exp(logvar).mean(axis=0)
        loss_.append(('activated', (var_ < 0.1).sum()))

        loss = total_loss + config["beta"] * KL
        loss_.append(('loss', loss))

        loss.backward()
        optimizer.step()

        """accumulate losses"""
        for x, y in loss_:
            logs[x] = logs.get(x) + [y.item()]

    return logs
''')

print("Created train.py in the modules folder")

Created train.py in the modules folder


In [None]:
# Fix the data type issues in all dataset modules
import os

for dataset in ['adult', 'covtype', 'credit', 'loan', 'cabs', 'kings']:
    dataset_path = f'modules/{dataset}_datasets.py'

    # Check if file exists
    if os.path.exists(dataset_path):
        with open(dataset_path, 'r') as f:
            content = f.read()

        # Modify content to use float32
        modified_content = content.replace(
            "self.x_data = df.to_numpy()",
            "self.x_data = df.to_numpy().astype(np.float32)"
        )

        # Write back
        with open(dataset_path, 'w') as f:
            f.write(modified_content)

        print(f"Fixed {dataset}_datasets.py to handle data type conversion")

In [None]:
with open('modules/adult_datasets.py', 'w') as f:
    f.write('''
import tqdm
import os
import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset

from collections import namedtuple

OutputInfo = namedtuple('OutputInfo', ['dim', 'activation_fn'])

class TabularDataset(Dataset):
    def __init__(self, train=True):
        base = pd.read_csv('./data/adult.csv')
        base = base.sample(frac=1, random_state=0).reset_index(drop=True)
        base = base[(base == '?').sum(axis=1) == 0]

        self.continuous = [
            'age', # target variable
            'educational-num',
            'capital-gain',
            'capital-loss',
            'hours-per-week',
        ]
        self.discrete = [
            'workclass',
            'education',
            'marital-status',
            'occupation',
            'relationship',
            'race',
            'gender',
            'native-country',
            'income', # target variable
        ]
        self.integer = self.continuous
        base = base[self.continuous + self.discrete]
        base = base.dropna()

        self.discrete_dicts = []
        self.discrete_dicts_reverse = []
        for dis in self.discrete:
            discrete_dict = {x:i for i,x in enumerate(sorted(base[dis].unique()))}
            self.discrete_dicts_reverse.append({i:x for i,x in enumerate(sorted(base[dis].unique()))})
            base[dis] = base[dis].apply(lambda x: discrete_dict.get(x))
            self.discrete_dicts.append(discrete_dict)

        self.RegTarget = 'age'
        self.ClfTarget = 'income'

        # one-hot encoding
        df_dummy = []
        for d in self.discrete:
            df_dummy.append(pd.get_dummies(base[d], prefix=d))
        base_dummy = pd.concat([base.drop(columns=self.discrete)] + df_dummy, axis=1)

        split_num = 40000

        if train:
            self.train_raw = base.iloc[:split_num]

            df = base_dummy.iloc[:split_num] # train

            self.mean = df[self.continuous].mean(axis=0)
            self.std = df[self.continuous].std(axis=0)

            df[self.continuous] = df[self.continuous] - self.mean
            df[self.continuous] /= self.std

            self.train = df
            self.x_data = df.to_numpy().astype(np.float32)  # Ensure float32 type
        else:
            self.train_raw = base.iloc[:split_num]
            self.test_raw = base.iloc[split_num:]

            df_train = base_dummy.iloc[:split_num] # train
            df = base_dummy.iloc[split_num:] # test

            self.mean = df_train[self.continuous].mean(axis=0)
            self.std = df_train[self.continuous].std(axis=0)

            df[self.continuous] = df[self.continuous] - self.mean
            df[self.continuous] /= self.std

            self.test = df
            self.x_data = df.to_numpy().astype(np.float32)  # Ensure float32 type

        # Output Information
        self.OutputInfo_list = []
        for c in self.continuous:
            self.OutputInfo_list.append(OutputInfo(1, 'CRPS'))
        for d, dummy in zip(self.discrete, df_dummy):
            self.OutputInfo_list.append(OutputInfo(dummy.shape[1], 'softmax'))

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):
        x = torch.FloatTensor(self.x_data[idx])
        return x
''')

print("Created adult_datasets.py in the modules folder")

Created adult_datasets.py in the modules folder


In [None]:
# Create covtype_datasets.py
with open('modules/covtype_datasets.py', 'w') as f:
    f.write('''
import tqdm
import os
import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset

from collections import namedtuple

OutputInfo = namedtuple('OutputInfo', ['dim', 'activation_fn'])

class TabularDataset(Dataset):
    def __init__(self, train=True):
        base = pd.read_csv('./data/covtype.csv')
        base = base.sample(frac=1, random_state=0).reset_index(drop=True)
        base = base.dropna(axis=0)
        base = base.iloc[:50000]

        self.continuous = [
            'Elevation', # target variable
            'Aspect',
            'Slope',
            'Horizontal_Distance_To_Hydrology',
            'Vertical_Distance_To_Hydrology',
            'Horizontal_Distance_To_Roadways',
            'Hillshade_9am',
            'Hillshade_Noon',
            'Hillshade_3pm',
            'Horizontal_Distance_To_Fire_Points',
        ]
        self.discrete = [
            'Cover_Type', # target variable
        ]
        self.integer = self.continuous
        base = base[self.continuous + self.discrete]

        self.discrete_dicts = []
        self.discrete_dicts_reverse = []
        for dis in self.discrete:
            discrete_dict = {x:i for i,x in enumerate(sorted(base[dis].unique()))}
            self.discrete_dicts_reverse.append({i:x for i,x in enumerate(sorted(base[dis].unique()))})
            base[dis] = base[dis].apply(lambda x: discrete_dict.get(x))
            self.discrete_dicts.append(discrete_dict)

        self.RegTarget = 'Elevation'
        self.ClfTarget = 'Cover_Type'

        # one-hot encoding
        df_dummy = []
        for d in self.discrete:
            df_dummy.append(pd.get_dummies(base[d], prefix=d))
        base_dummy = pd.concat([base.drop(columns=self.discrete)] + df_dummy, axis=1)

        split_num = 45000

        if train:
            self.train_raw = base.iloc[:split_num]

            df = base_dummy.iloc[:split_num] # train

            self.mean = df[self.continuous].mean(axis=0)
            self.std = df[self.continuous].std(axis=0)

            df[self.continuous] = df[self.continuous] - self.mean
            df[self.continuous] /= self.std

            self.train = df
            self.x_data = df.to_numpy().astype(np.float32)
        else:
            self.train_raw = base.iloc[:split_num]
            self.test_raw = base.iloc[split_num:]

            df_train = base_dummy.iloc[:split_num] # train
            df = base_dummy.iloc[split_num:] # test

            self.mean = df_train[self.continuous].mean(axis=0)
            self.std = df_train[self.continuous].std(axis=0)

            df[self.continuous] = df[self.continuous] - self.mean
            df[self.continuous] /= self.std

            self.test = df
            self.x_data = df.to_numpy().astype(np.float32)

        # Output Information
        self.OutputInfo_list = []
        for c in self.continuous:
            self.OutputInfo_list.append(OutputInfo(1, 'CRPS'))
        for d, dummy in zip(self.discrete, df_dummy):
            self.OutputInfo_list.append(OutputInfo(dummy.shape[1], 'softmax'))

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):
        x = torch.FloatTensor(self.x_data[idx])
        return x
''')

# Create credit_datasets.py
with open('modules/credit_datasets.py', 'w') as f:
    f.write('''
import tqdm
import os
import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset

from collections import namedtuple

OutputInfo = namedtuple('OutputInfo', ['dim', 'activation_fn'])

class TabularDataset(Dataset):
    def __init__(self, train=True):
        base = pd.read_csv('./data/application_train.csv')
        base = base.sample(frac=1, random_state=0).reset_index(drop=True)

        self.continuous = [
            'AMT_INCOME_TOTAL',
            'AMT_CREDIT', # target variable
            'AMT_ANNUITY',
            'AMT_GOODS_PRICE',
            'REGION_POPULATION_RELATIVE',
            'DAYS_BIRTH',
            'DAYS_EMPLOYED',
            'DAYS_REGISTRATION',
            'DAYS_ID_PUBLISH',
            'OWN_CAR_AGE',
        ]
        self.discrete = [
            'NAME_CONTRACT_TYPE',
            'CODE_GENDER',
            'FLAG_OWN_REALTY',
            'NAME_TYPE_SUITE',
            'NAME_INCOME_TYPE',
            'NAME_EDUCATION_TYPE',
            'NAME_FAMILY_STATUS',
            'NAME_HOUSING_TYPE',
            'TARGET', # target variable
        ]
        self.integer = [
            'DAYS_BIRTH',
            'DAYS_EMPLOYED',
            'DAYS_ID_PUBLISH']
        base = base[self.continuous + self.discrete]
        base = base.dropna()
        base = base.iloc[:50000]

        self.discrete_dicts = []
        self.discrete_dicts_reverse = []
        for dis in self.discrete:
            discrete_dict = {x:i for i,x in enumerate(sorted(base[dis].unique()))}
            self.discrete_dicts_reverse.append({i:x for i,x in enumerate(sorted(base[dis].unique()))})
            base[dis] = base[dis].apply(lambda x: discrete_dict.get(x))
            self.discrete_dicts.append(discrete_dict)

        self.RegTarget = 'AMT_CREDIT'
        self.ClfTarget = 'TARGET'

        # one-hot encoding
        df_dummy = []
        for d in self.discrete:
            df_dummy.append(pd.get_dummies(base[d], prefix=d))
        base_dummy = pd.concat([base.drop(columns=self.discrete)] + df_dummy, axis=1)

        split_num = 45000

        if train:
            self.train_raw = base.iloc[:split_num]

            df = base_dummy.iloc[:split_num] # train

            self.mean = df[self.continuous].mean(axis=0)
            self.std = df[self.continuous].std(axis=0)

            df[self.continuous] = df[self.continuous] - self.mean
            df[self.continuous] /= self.std

            self.train = df
            self.x_data = df.to_numpy().astype(np.float32)
        else:
            self.train_raw = base.iloc[:split_num]
            self.test_raw = base.iloc[split_num:]

            df_train = base_dummy.iloc[:split_num] # train
            df = base_dummy.iloc[split_num:] # test

            self.mean = df_train[self.continuous].mean(axis=0)
            self.std = df_train[self.continuous].std(axis=0)

            df[self.continuous] = df[self.continuous] - self.mean
            df[self.continuous] /= self.std

            self.test = df
            self.x_data = df.to_numpy().astype(np.float32)

        # Output Information
        self.OutputInfo_list = []
        for c in self.continuous:
            self.OutputInfo_list.append(OutputInfo(1, 'CRPS'))
        for d, dummy in zip(self.discrete, df_dummy):
            self.OutputInfo_list.append(OutputInfo(dummy.shape[1], 'softmax'))

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):
        x = torch.FloatTensor(self.x_data[idx])
        return x
''')

# Create loan_datasets.py
with open('modules/loan_datasets.py', 'w') as f:
    f.write('''
import tqdm
import os
import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset

from collections import namedtuple

OutputInfo = namedtuple('OutputInfo', ['dim', 'activation_fn'])

class TabularDataset(Dataset):
    def __init__(self, train=True):
        base = pd.read_csv('./data/Bank_Personal_Loan_Modelling.csv')
        base = base.sample(frac=1, random_state=0).reset_index(drop=True)

        self.continuous = [
            'Age', # target variable
            'Experience',
            'Income',
            'CCAvg',
            'Mortgage',
        ]
        self.discrete = [
            'Family',
            'Personal Loan', # target variable
            'Securities Account',
            'CD Account',
            'Online',
            'CreditCard'
        ]
        self.integer = [
            'Age',
            'Experience',
            'Income',
            'Mortgage']
        base = base[self.continuous + self.discrete]
        base = base.dropna()

        self.discrete_dicts = []
        self.discrete_dicts_reverse = []
        for dis in self.discrete:
            discrete_dict = {x:i for i,x in enumerate(sorted(base[dis].unique()))}
            self.discrete_dicts_reverse.append({i:x for i,x in enumerate(sorted(base[dis].unique()))})
            base[dis] = base[dis].apply(lambda x: discrete_dict.get(x))
            self.discrete_dicts.append(discrete_dict)

        self.RegTarget = 'Age'
        self.ClfTarget = 'Personal Loan'

        # one-hot encoding
        df_dummy = []
        for d in self.discrete:
            df_dummy.append(pd.get_dummies(base[d], prefix=d))
        base_dummy = pd.concat([base.drop(columns=self.discrete)] + df_dummy, axis=1)

        split_num = 4000

        if train:
            self.train_raw = base.iloc[:split_num]

            df = base_dummy.iloc[:split_num] # train

            self.mean = df[self.continuous].mean(axis=0)
            self.std = df[self.continuous].std(axis=0)

            df[self.continuous] = df[self.continuous] - self.mean
            df[self.continuous] /= self.std

            self.train = df
            self.x_data = df.to_numpy().astype(np.float32)
        else:
            self.train_raw = base.iloc[:split_num]
            self.test_raw = base.iloc[split_num:]

            df_train = base_dummy.iloc[:split_num] # train
            df = base_dummy.iloc[split_num:] # test

            self.mean = df_train[self.continuous].mean(axis=0)
            self.std = df_train[self.continuous].std(axis=0)

            df[self.continuous] = df[self.continuous] - self.mean
            df[self.continuous] /= self.std

            self.test = df
            self.x_data = df.to_numpy().astype(np.float32)

        # Output Information
        self.OutputInfo_list = []
        for c in self.continuous:
            self.OutputInfo_list.append(OutputInfo(1, 'CRPS'))
        for d, dummy in zip(self.discrete, df_dummy):
            self.OutputInfo_list.append(OutputInfo(dummy.shape[1], 'softmax'))

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):
        x = torch.FloatTensor(self.x_data[idx])
        return x
''')


# Create kings_datasets.py
with open('modules/kings_datasets.py', 'w') as f:
    f.write('''
import tqdm
import os
import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset

from collections import namedtuple

OutputInfo = namedtuple('OutputInfo', ['dim', 'activation_fn'])

class TabularDataset(Dataset):
    def __init__(self, train=True):
        base = pd.read_csv('./data/kc_house_data.csv')
        base = base.sample(frac=1, random_state=0).reset_index(drop=True)

        self.continuous = [
            'price',
            'sqft_living',
            'sqft_lot',
            'sqft_above',
            'sqft_basement',
            'yr_built',
            'yr_renovated',
            'lat',
            'long', # target variable
            'sqft_living15',
            'sqft_lot15',
        ]
        self.discrete = [
            'bedrooms',
            'bathrooms',
            'floors',
            'waterfront',
            'view',
            'condition', # target variable
            'grade',
        ]
        self.integer = [
            'price',
            'sqft_living',
            'sqft_lot',
            'sqft_above',
            'sqft_basement',
            'yr_built',
            'yr_renovated',
            'sqft_living15',
            'sqft_lot15',]
        base = base[self.continuous + self.discrete]

        self.discrete_dicts = []
        self.discrete_dicts_reverse = []
        for dis in self.discrete:
            discrete_dict = {x:i for i,x in enumerate(sorted(base[dis].unique()))}
            self.discrete_dicts_reverse.append({i:x for i,x in enumerate(sorted(base[dis].unique()))})
            base[dis] = base[dis].apply(lambda x: discrete_dict.get(x))
            self.discrete_dicts.append(discrete_dict)

        self.RegTarget = 'long'
        self.ClfTarget = 'condition'

        # one-hot encoding
        df_dummy = []
        for d in self.discrete:
            df_dummy.append(pd.get_dummies(base[d], prefix=d))
        base_dummy = pd.concat([base.drop(columns=self.discrete)] + df_dummy, axis=1)

        split_num = 20000

        if train:
            self.train_raw = base.iloc[:split_num]

            df = base_dummy.iloc[:split_num] # train

            self.mean = df[self.continuous].mean(axis=0)
            self.std = df[self.continuous].std(axis=0)

            df[self.continuous] = df[self.continuous] - self.mean
            df[self.continuous] /= self.std

            self.train = df
            self.x_data = df.to_numpy().astype(np.float32)
        else:
            self.train_raw = base.iloc[:split_num]
            self.test_raw = base.iloc[split_num:]

            df_train = base_dummy.iloc[:split_num] # train
            df = base_dummy.iloc[split_num:] # test

            self.mean = df_train[self.continuous].mean(axis=0)
            self.std = df_train[self.continuous].std(axis=0)

            df[self.continuous] = df[self.continuous] - self.mean
            df[self.continuous] /= self.std

            self.test = df
            self.x_data = df.to_numpy().astype(np.float32)

        # Output Information
        self.OutputInfo_list = []
        for c in self.continuous:
            self.OutputInfo_list.append(OutputInfo(1, 'CRPS'))
        for d, dummy in zip(self.discrete, df_dummy):
            self.OutputInfo_list.append(OutputInfo(dummy.shape[1], 'softmax'))

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):
        x = torch.FloatTensor(self.x_data[idx])
        return x
''')

print("Created all dataset modules")

Created all dataset modules


In [None]:
with open('modules/cabs_datasets.py', 'w') as f:
    f.write('''
import tqdm
import os
import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset

from collections import namedtuple

OutputInfo = namedtuple('OutputInfo', ['dim', 'activation_fn'])

class TabularDataset(Dataset):
    def __init__(self, train=True):
        base = pd.read_csv('./data/sigma_cabs.csv')
        base = base.sample(frac=1, random_state=0).reset_index(drop=True)

        # Define continuous and discrete columns based on actual data
        self.continuous = [
            'Trip_Distance', # target variable
            'Life_Style_Index',
            'Customer_Rating',
            'Var1',
            'Var2',
            'Var3',
        ]

        self.discrete = [
            'Type_of_Cab',
            'Customer_Since_Months',
            'Confidence_Life_Style_Index',
            'Destination_Type',
            'Cancellation_Last_1Month',
            'Gender',
        ]

        # Since Surge_Pricing_Type is missing, use Gender as the classification target
        self.RegTarget = 'Trip_Distance'
        self.ClfTarget = 'Gender'  # Using Gender as alternative classification target

        self.integer = [
            'Var1',
            'Var2',
            'Var3']

        # Drop any rows with missing values in our selected columns
        base = base[self.continuous + self.discrete].dropna()

        self.discrete_dicts = []
        self.discrete_dicts_reverse = []
        for dis in self.discrete:
            discrete_dict = {x:i for i,x in enumerate(sorted(base[dis].unique()))}
            self.discrete_dicts_reverse.append({i:x for i,x in enumerate(sorted(base[dis].unique()))})
            base[dis] = base[dis].apply(lambda x: discrete_dict.get(x))
            self.discrete_dicts.append(discrete_dict)

        # one-hot encoding
        df_dummy = []
        for d in self.discrete:
            df_dummy.append(pd.get_dummies(base[d], prefix=d))
        base_dummy = pd.concat([base.drop(columns=self.discrete)] + df_dummy, axis=1)

        # Set split based on data size
        split_num = min(int(len(base) * 0.8), 40000)

        if train:
            self.train_raw = base.iloc[:split_num]

            df = base_dummy.iloc[:split_num] # train

            self.mean = df[self.continuous].mean(axis=0)
            self.std = df[self.continuous].std(axis=0)

            df[self.continuous] = df[self.continuous] - self.mean
            df[self.continuous] /= self.std

            self.train = df
            self.x_data = df.to_numpy().astype(np.float32)
        else:
            self.train_raw = base.iloc[:split_num]

            if len(base) > split_num:
                self.test_raw = base.iloc[split_num:]

                df_train = base_dummy.iloc[:split_num] # train
                df = base_dummy.iloc[split_num:] # test

                self.mean = df_train[self.continuous].mean(axis=0)
                self.std = df_train[self.continuous].std(axis=0)

                df[self.continuous] = df[self.continuous] - self.mean
                df[self.continuous] /= self.std

                self.test = df
                self.x_data = df.to_numpy().astype(np.float32)
            else:
                # Not enough data for testing, use a small portion of training data
                self.test_raw = base.iloc[:min(500, len(base))]

                df_train = base_dummy.iloc[:split_num] # train
                df = base_dummy.iloc[:min(500, len(base_dummy))] # small test sample

                self.mean = df_train[self.continuous].mean(axis=0)
                self.std = df_train[self.continuous].std(axis=0)

                df[self.continuous] = df[self.continuous] - self.mean
                df[self.continuous] /= self.std

                self.test = df
                self.x_data = df.to_numpy().astype(np.float32)

        # Output Information
        self.OutputInfo_list = []
        for c in self.continuous:
            self.OutputInfo_list.append(OutputInfo(1, 'CRPS'))
        for d, dummy in zip(self.discrete, df_dummy):
            self.OutputInfo_list.append(OutputInfo(dummy.shape[1], 'softmax'))

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):
        x = torch.FloatTensor(self.x_data[idx])
        return x
''')

print("Updated cabs_datasets.py to use the actual columns from your data")

Updated cabs_datasets.py to use the actual columns from your data


In [None]:
# Check your current directory
!pwd

# List files to see if simple_train.py exists
!ls -la

# If the file exists but in a different location, find it
!find /content -name "simple_train.py"



/content
total 28
drwxr-xr-x 1 root root 4096 Apr 24 14:55 .
drwxr-xr-x 1 root root 4096 Apr 24 14:49 ..
drwxr-xr-x 2 root root 4096 Apr 24 14:55 assets
drwxr-xr-x 4 root root 4096 Apr 22 13:37 .config
drwxr-xr-x 2 root root 4096 Apr 24 14:57 data
drwxr-xr-x 2 root root 4096 Apr 24 14:58 modules
drwxr-xr-x 1 root root 4096 Apr 22 13:37 sample_data


In [None]:
# Create simple_train.py file
with open('simple_train.py', 'w') as f:
    f.write('''
import os
import numpy as np
import pandas as pd
import tqdm

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from modules.simulation import set_random_seed
from modules.model import VAE
from modules.train import train_VAE

# Configuration
import argparse
def get_args():
    parser = argparse.ArgumentParser('parameters')

    parser.add_argument('--seed', type=int, default=1,
                        help='seed for repeatable results')
    parser.add_argument('--dataset', type=str, default='adult',
                        help='Dataset options')

    parser.add_argument("--latent_dim", default=2, type=int,
                        help="the latent dimension size")
    parser.add_argument("--step", default=0.1, type=float,
                        help="interval size of quantile levels")

    parser.add_argument('--epochs', default=10, type=int,
                        help='the number of epochs')
    parser.add_argument('--batch_size', default=256, type=int,
                        help='batch size')
    parser.add_argument('--lr', default=1e-3, type=float,
                        help='learning rate')
    parser.add_argument('--threshold', default=1e-5, type=float,
                        help='threshold for clipping alpha_tilde')

    parser.add_argument('--beta', default=0.5, type=float,
                        help='scale parameter')

    return parser.parse_args()

def main():
    # Get configuration
    config = vars(get_args())
    config["cuda"] = torch.cuda.is_available()
    device = torch.device('cuda:0') if config["cuda"] else torch.device('cpu')

    print("Configuration:")
    for k, v in config.items():
        print(f"  {k}: {v}")

    # Set random seed
    set_random_seed(config["seed"])
    torch.manual_seed(config["seed"])
    if config["cuda"]:
        torch.cuda.manual_seed(config["seed"])

    # Import dataset module
    import importlib
    dataset_module = importlib.import_module(f'modules.{config["dataset"]}_datasets')
    TabularDataset = dataset_module.TabularDataset

    # Load dataset
    dataset = TabularDataset()
    dataloader = DataLoader(dataset, batch_size=config["batch_size"], shuffle=True)

    # Update config with dataset dimensions
    OutputInfo_list = dataset.OutputInfo_list
    CRPS_dim = sum([x.dim for x in OutputInfo_list if x.activation_fn == 'CRPS'])
    softmax_dim = sum([x.dim for x in OutputInfo_list if x.activation_fn == 'softmax'])
    config["CRPS_dim"] = CRPS_dim
    config["softmax_dim"] = softmax_dim

    print(f"\\nDataset: {config['dataset']}")
    print(f"CRPS dimensions: {CRPS_dim}")
    print(f"Softmax dimensions: {softmax_dim}")

    # Initialize model
    model = VAE(config, device).to(device)

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=config["lr"]
    )

    model.train()

    # Training loop
    print("\\nStarting training...")
    for epoch in range(config["epochs"]):
        logs = train_VAE(OutputInfo_list, dataloader, model, config, optimizer, device)

        print_input = f"[epoch {epoch + 1:03d}]"
        print_input += ''.join([f", {x}: {np.mean(y):.4f}" for x, y in logs.items()])
        print(print_input)

    # Save model
    print("\\nSaving model...")
    os.makedirs('./assets', exist_ok=True)
    torch.save(model.state_dict(), f'./assets/DistVAE_{config["dataset"]}.pth')

    # Generate synthetic data
    print("\\nGenerating synthetic data...")
    model.eval()
    with torch.no_grad():
        n = len(dataset.train)
        synthetic_data = model.generate_data(n, OutputInfo_list, dataset, reverse_col=True)

    # Save synthetic data
    synthetic_data.to_csv(f'./assets/synthetic_{config["dataset"]}.csv', index=False)
    print(f"Synthetic data saved to ./assets/synthetic_{config['dataset']}.csv")

    # Print sample of synthetic data
    print("\\nSample of synthetic data:")
    print(synthetic_data.head())

if __name__ == '__main__':
    main()
''')

print("Created simple_train.py file")

Created simple_train.py file


# **GENERATING SYNTHEIC DATA FOR ALL DATASET**

# **ADULTS_DATASET**

In [None]:
!python simple_train.py --dataset adult --epochs 10 --batch_size 256 --latent_dim 2

Configuration:
  seed: 1
  dataset: adult
  latent_dim: 2
  step: 0.1
  epochs: 10
  batch_size: 256
  lr: 0.001
  threshold: 1e-05
  beta: 0.5
  cuda: True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std

Dataset: adult
CRPS dimensions: 5
Softmax dimensions: 100

Starting training...
inner loop: 100% 157/157 [00:04<00:00, 38.73it/s]
[epoch 001], loss: 13.6668, quantile: 13.5881, KL: 0.1573, activated: 0.0000
inner loop: 100% 15

# **COVOTYPE DATASET**

In [None]:
!python simple_train.py --dataset covtype --epochs 10 --batch_size 256 --latent_dim 2

Configuration:
  seed: 1
  dataset: covtype
  latent_dim: 2
  step: 0.1
  epochs: 10
  batch_size: 256
  lr: 0.001
  threshold: 1e-05
  beta: 0.5
  cuda: True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std

Dataset: covtype
CRPS dimensions: 10
Softmax dimensions: 7

Starting training...
inner loop: 100% 176/176 [00:04<00:00, 36.22it/s]
[epoch 001], loss: 4.4516, quantile: 4.4400, KL: 0.0232, activated: 0.0000
inner loop: 100% 1

# **CREDIT DATASET**

In [None]:
!python simple_train.py --dataset credit --epochs 10 --batch_size 256 --latent_dim 2

Configuration:
  seed: 1
  dataset: credit
  latent_dim: 2
  step: 0.1
  epochs: 10
  batch_size: 256
  lr: 0.001
  threshold: 1e-05
  beta: 0.5
  cuda: True

Dataset: credit
CRPS dimensions: 10
Softmax dimensions: 37

Starting training...
inner loop: 100% 67/67 [00:02<00:00, 30.54it/s]
[epoch 001], loss: 12.0883, quantile: 12.0594, KL: 0.0577, activated: 0.0000
inner loop: 100% 67/67 [00:01<00:00, 38.14it/s]
[epoch 002], loss: 8.7645, quantile: 8.7473, KL: 0.0344, activated: 0.0000
inner loop: 100% 67/67 [00:01<00:00, 39.31it/s]
[epoch 003], loss: 8.4560, quantile: 8.4461, KL: 0.0198, activated: 0.0000
inner loop: 100% 67/67 [00:01<00:00, 36.28it/s]
[epoch 004], loss: 8.4034, quantile: 8.3841, KL: 0.0386, activated: 0.0000
inner loop: 100% 67/67 [00:01<00:00, 36.01it/s]
[epoch 005], loss: 8.3685, quantile: 8.3342, KL: 0.0687, activated: 0.0000
inner loop: 100% 67/67 [00:01<00:00, 37.61it/s]
[epoch 006], loss: 8.3358, quantile: 8.2751, KL: 0.1214, activated: 0.0000
inner loop: 100% 67/

# **LOAN DATASET**

In [None]:
!python simple_train.py --dataset loan --epochs 10 --batch_size 256 --latent_dim 2


Configuration:
  seed: 1
  dataset: loan
  latent_dim: 2
  step: 0.1
  epochs: 10
  batch_size: 256
  lr: 0.001
  threshold: 1e-05
  beta: 0.5
  cuda: True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std

Dataset: loan
CRPS dimensions: 5
Softmax dimensions: 14

Starting training...
inner loop: 100% 16/16 [00:00<00:00, 21.76it/s]
[epoch 001], loss: 6.9936, quantile: 6.9689, KL: 0.0495, activated: 0.0000
inner loop: 100% 16/16 [00

In [None]:
import pandas as pd

# Check the actual columns in the cabs dataset
df = pd.read_csv('./data/sigma_cabs.csv')
print("Columns in sigma_cabs.csv:")
print(df.columns.tolist())

Columns in sigma_cabs.csv:
['Trip_ID', 'Trip_Distance', 'Type_of_Cab', 'Customer_Since_Months', 'Life_Style_Index', 'Confidence_Life_Style_Index', 'Destination_Type', 'Customer_Rating', 'Cancellation_Last_1Month', 'Var1', 'Var2', 'Var3', 'Gender']


# **cabs dataset**

In [None]:
!python simple_train.py --dataset cabs --epochs 10 --batch_size 256 --latent_dim 2

Configuration:
  seed: 1
  dataset: cabs
  latent_dim: 2
  step: 0.1
  epochs: 10
  batch_size: 256
  lr: 0.001
  threshold: 1e-05
  beta: 0.5
  cuda: True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std

Dataset: cabs
CRPS dimensions: 6
Softmax dimensions: 44

Starting training...
inner loop: 100% 88/88 [00:02<00:00, 43.00it/s]
[epoch 001], loss: 11.3605, quantile: 11.3311, KL: 0.0588, activated: 0.0000
inner loop: 100% 88/88 [

In [None]:
!python simple_train.py --dataset kings --epochs 10 --batch_size 256 --latent_dim 2

Configuration:
  seed: 1
  dataset: kings
  latent_dim: 2
  step: 0.1
  epochs: 10
  batch_size: 256
  lr: 0.001
  threshold: 1e-05
  beta: 0.5
  cuda: True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std

Dataset: kings
CRPS dimensions: 11
Softmax dimensions: 73

Starting training...
inner loop: 100% 79/79 [00:02<00:00, 30.23it/s]
[epoch 001], loss: 14.0153, quantile: 13.9565, KL: 0.1176, activated: 0.0000
inner loop: 100% 79/7

# **RUNNING INFERENCE SCRIPT**

In [None]:
# Fix the simple_inference.py script
with open('simple_inference.py', 'w') as f:
    f.write('''
import os
import numpy as np
import pandas as pd
import tqdm
import matplotlib.pyplot as plt

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from modules.simulation import set_random_seed
from modules.model import VAE

from statsmodels.distributions.empirical_distribution import ECDF
from scipy import interpolate

# Configuration
import argparse
def get_args():
    parser = argparse.ArgumentParser('parameters')

    parser.add_argument('--dataset', type=str, default='adult',
                        help='Dataset options: covtype, credit, loan, adult, cabs, kings')
    parser.add_argument('--beta', default=0.5, type=float,
                        help='observation noise')
    parser.add_argument("--latent_dim", default=2, type=int,
                        help="the latent dimension size")
    parser.add_argument("--step", default=0.1, type=float,
                        help="interval size of quantile levels")
    parser.add_argument('--threshold', default=1e-5, type=float,
                        help='threshold for clipping alpha_tilde')

    return parser.parse_args()

def main():
    # Get configuration
    args = get_args()
    config = vars(args)

    # Setup directories
    if not os.path.exists(f'./assets/{config["dataset"]}'):
        os.makedirs(f'./assets/{config["dataset"]}')

    config["cuda"] = torch.cuda.is_available()
    device = torch.device('cuda:0') if config["cuda"] else torch.device('cpu')

    print(f"Using device: {device}")
    print(f"Evaluating dataset: {config['dataset']}")

    # Set random seed
    set_random_seed(1)
    torch.manual_seed(1)
    if config["cuda"]:
        torch.cuda.manual_seed(1)

    # Import dataset module
    import importlib
    dataset_module = importlib.import_module(f'modules.{config["dataset"]}_datasets')
    TabularDataset = dataset_module.TabularDataset

    dataset = TabularDataset()
    test_dataset = TabularDataset(train=False)

    # Update config with dataset dimensions
    OutputInfo_list = dataset.OutputInfo_list
    CRPS_dim = sum([x.dim for x in OutputInfo_list if x.activation_fn == 'CRPS'])
    softmax_dim = sum([x.dim for x in OutputInfo_list if x.activation_fn == 'softmax'])
    config["CRPS_dim"] = CRPS_dim
    config["softmax_dim"] = softmax_dim

    # Initialize model
    model = VAE(config, device).to(device)

    # Load model
    model_path = f'./assets/DistVAE_{config["dataset"]}.pth'
    print(f"Loading model from {model_path}")
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Quantile Estimation with sampling mechanism
    print("Evaluating quantile estimation...")

    n = 100
    MC = 1000  # Monte Carlo samples
    x_linspace = np.linspace(
        [np.min(dataset.x_data[:, k]) for k in range(len(dataset.continuous))],
        [np.max(dataset.x_data[:, k]) for k in range(len(dataset.continuous))],
        n)
    x_linspace = torch.from_numpy(x_linspace).to(device).float()

    alpha_hat = torch.zeros((n, len(dataset.continuous)), device=device)
    for _ in tqdm.tqdm(range(MC), desc="Estimate CDF..."):
        randn = torch.randn(n, config["latent_dim"], device=device)
        with torch.no_grad():
            gamma, beta, _ = model.quantile_parameter(randn)
            alpha_tilde_list = model.quantile_inverse(x_linspace, gamma, beta)
            alpha_hat += torch.cat(alpha_tilde_list, dim=1)
    alpha_hat /= MC

    # Alpha-rate
    alpha_levels = np.array([0.1, 0.3, 0.5, 0.7, 0.9])
    alpha_rate = []

    # Move to CPU for numpy operations
    alpha_hat_cpu = alpha_hat.cpu().numpy()
    x_linspace_cpu = x_linspace.cpu().numpy()

    for j in range(len(dataset.continuous)):
        tmp = []
        for alpha in alpha_levels:
            if len(np.where(alpha_hat_cpu[:, j] < alpha)[0]):
                cut1 = np.where(alpha_hat_cpu[:, j] < alpha)[0][-1]
            else:
                cut1 = 0
            if len(np.where(alpha < alpha_hat_cpu[:, j])[0]):
                cut2 = np.where(alpha < alpha_hat_cpu[:, j])[0][0]
            else:
                cut2 = -1

            f_inter = interpolate.interp1d(
                [alpha_hat_cpu[cut1, j], alpha_hat_cpu[cut2, j]],
                [x_linspace_cpu[:, j][cut1], x_linspace_cpu[:, j][cut2]])
            try:
                tmp.append((test_dataset.x_data[:, j] <= f_inter(alpha)).mean())
            except:
                tmp.append((test_dataset.x_data[:, j] <= x_linspace_cpu[:, j][cut2]).mean())
        alpha_rate.append(tmp)

    alpha_rate = np.array(alpha_rate).mean(axis=0)

    # Save alpha_rate results
    pd.DataFrame(
        np.concatenate([
            alpha_rate[None, :],
            np.abs(alpha_rate - alpha_levels)[None, :]
        ], axis=0).round(3),
        columns=[str(x) for x in alpha_levels]
    ).to_csv(f'./assets/{config["dataset"]}/{config["dataset"]}_alpha_rate.csv')

    # Visualize CDFs
    # Adjust figure layout based on dataset
    if config["dataset"] in ["covtype", "credit"]:
        fig, ax = plt.subplots(2, CRPS_dim // 2,
                               figsize=(3 * CRPS_dim // 2, 3 * 2))
    elif config["dataset"] in ["loan", "adult", "cabs"]:
        fig, ax = plt.subplots(1, CRPS_dim,
                               figsize=(3 * CRPS_dim, 3 * 1))
    elif config["dataset"] == "kings":
        fig, ax = plt.subplots(2, CRPS_dim // 2 + 1,
                               figsize=(3 * CRPS_dim // 2 + 1, 3 * 2))
    else:
        fig, ax = plt.subplots(1, CRPS_dim, figsize=(3 * CRPS_dim, 3))

    # Get original data in original scale
    orig = dataset.x_data[:, :len(dataset.continuous)] * np.array(dataset.std)
    orig += np.array(dataset.mean)
    orig = pd.DataFrame(orig, columns=dataset.continuous)
    if hasattr(dataset, 'integer'):
        orig[dataset.integer] = orig[dataset.integer].astype(int)

    for k, v in enumerate(dataset.continuous):
        x_linspace_orig = [np.arange(x, y, 1) for x, y in zip(
            [np.min(orig.to_numpy()[:, k])],
            [np.max(orig.to_numpy()[:, k])])][0]

        if hasattr(dataset, 'integer') and v in dataset.integer:
            ecdf = ECDF(orig[dataset.continuous].to_numpy()[:, k])
            emp = [ecdf(x) for x in x_linspace_orig]
            ax.flatten()[k].step(
                (x_linspace_orig - dataset.mean[k]) / dataset.std[k],
                emp, where='post',
                label="empirical", linewidth=3.5, color=u'#ff7f0e')
        else:
            q = np.linspace(0, 1, 100)
            ax.flatten()[k].step(
                np.quantile(dataset.x_data[:, k], q=q),
                q, where='post',
                label="empirical", linewidth=3.5, color=u'#ff7f0e')

        ax.flatten()[k].plot(
            x_linspace_cpu[:, k], alpha_hat_cpu[:, k],
            label="estimate", linewidth=3.5, linestyle='dashed', color=u'#1f77b4')

        ax.flatten()[k].set_xlabel(v, fontsize=12)
        ax.flatten()[k].tick_params(axis="x", labelsize=14)
        ax.flatten()[k].tick_params(axis="y", labelsize=14)

    plt.legend()
    plt.tight_layout()
    plt.savefig(f'./assets/{config["dataset"]}/{config["dataset"]}_estimated_quantile.png')
    plt.close()

    print(f"Results saved to ./assets/{config['dataset']}/")
    print("Inference evaluation complete!")

if __name__ == '__main__':
    main()
''')

print("Fixed simple_inference.py by adding missing config parameters")

Fixed simple_inference.py by adding missing config parameters


In [None]:
for dataset in ['adult', 'covtype', 'credit', 'loan', 'cabs', 'kings']:
    print(f"\n{'='*50}")
    print(f"Running inference for dataset: {dataset}")
    print(f"{'='*50}")
    !python simple_inference.py --dataset {dataset}


Running inference for dataset: adult
Using device: cuda:0
Evaluating dataset: adult
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.cont

# **CALIBRATING THE DATA**

In [None]:
# Create simple_calibration.py
with open('simple_calibration.py', 'w') as f:
    f.write('''
import os
import numpy as np
import pandas as pd
import tqdm
import matplotlib.pyplot as plt

import torch
from torch import nn
import torch.nn.functional as F

from modules.simulation import set_random_seed
from modules.model import VAE

from statsmodels.distributions.empirical_distribution import ECDF

# Configuration
import argparse
def get_args():
    parser = argparse.ArgumentParser('parameters')

    parser.add_argument('--dataset', type=str, default='adult',
                        help='Dataset options: covtype, credit, loan, adult, cabs, kings')
    parser.add_argument("--latent_dim", default=2, type=int,
                        help="the latent dimension size")
    parser.add_argument("--step", default=0.1, type=float,
                        help="interval size of quantile levels")
    parser.add_argument('--threshold', default=1e-5, type=float,
                        help='threshold for clipping alpha_tilde')

    return parser.parse_args()

def main():
    # Get configuration
    args = get_args()
    config = vars(args)

    # Setup
    if not os.path.exists(f'./assets/{config["dataset"]}'):
        os.makedirs(f'./assets/{config["dataset"]}')

    config["cuda"] = torch.cuda.is_available()
    device = torch.device('cuda:0') if config["cuda"] else torch.device('cpu')

    print(f"Using device: {device}")
    print(f"Calibrating dataset: {config['dataset']}")

    # Set random seed
    set_random_seed(1)
    torch.manual_seed(1)
    if config["cuda"]:
        torch.cuda.manual_seed(1)

    # Import dataset module
    import importlib
    dataset_module = importlib.import_module(f'modules.{config["dataset"]}_datasets')
    TabularDataset = dataset_module.TabularDataset

    dataset = TabularDataset()

    # Update config with dataset dimensions
    OutputInfo_list = dataset.OutputInfo_list
    CRPS_dim = sum([x.dim for x in OutputInfo_list if x.activation_fn == 'CRPS'])
    softmax_dim = sum([x.dim for x in OutputInfo_list if x.activation_fn == 'softmax'])
    config["CRPS_dim"] = CRPS_dim
    config["softmax_dim"] = softmax_dim

    # Initialize model
    model = VAE(config, device).to(device)

    # Load model
    model_path = f'./assets/DistVAE_{config["dataset"]}.pth'
    print(f"Loading model from {model_path}")
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Get raw data for calibration
    df = dataset.train_raw[dataset.continuous]

    # Select a continuous variable for calibration demonstration
    j = 1 if len(dataset.continuous) > 1 else 0  # Use second continuous var if available
    print(f"Calibrating {dataset.continuous[j]}")

    # Monte Carlo estimation
    MC = 1000  # Monte Carlo samples

    # Step 0: Estimate CDF for various x values (for demonstration)
    n = 100
    x_linspace_est = np.linspace(
        np.min(dataset.x_data[:, j]),
        np.max(dataset.x_data[:, j]),
        n)

    print("Estimating CDF...")
    alpha_est = torch.zeros((len(x_linspace_est), 1), device=device)
    for _ in tqdm.tqdm(range(MC), desc="Estimate CDF..."):
        randn = torch.randn(len(x_linspace_est), config["latent_dim"], device=device)
        with torch.no_grad():
            gamma, beta, _ = model.quantile_parameter(randn)
            x_tmp = torch.from_numpy(x_linspace_est[:, None]).to(device).float()
            alpha_tilde = model._quantile_inverse(x_tmp, gamma, beta, j)
            alpha_est += alpha_tilde
    alpha_est /= MC

    # Convert to original scale for visualization
    x_linspace_est = x_linspace_est * dataset.std[j] + dataset.mean[j]

    # Calibration Step 1: Estimate F(x + 0.5), F(x - 0.5)
    x_linspace = [np.arange(x, y+2, 1) - 0.5 for x, y in zip(
        [int(np.min(df.to_numpy()[:, j]))],
        [int(np.max(df.to_numpy()[:, j]))])][0]

    print("Calibration Step 1...")
    alpha_hat = torch.zeros((len(x_linspace), 1), device=device)
    for _ in tqdm.tqdm(range(MC), desc="Estimate CDF..."):
        randn = torch.randn(len(x_linspace), config["latent_dim"], device=device)
        with torch.no_grad():
            gamma, beta, _ = model.quantile_parameter(randn)
            x_tmp = torch.from_numpy(x_linspace[:, None]).to(device).float()
            x_tmp = (x_tmp - dataset.mean[j]) / dataset.std[j]
            alpha_tilde = model._quantile_inverse(x_tmp, gamma, beta, j)
            alpha_hat += alpha_tilde
    alpha_hat /= MC

    # Move to CPU for numpy operations
    alpha_hat_cpu = alpha_hat.cpu().numpy()

    # Prepare x_linspace for steps 2 and 3
    x_linspace_step2 = [np.arange(x, y+1, 1) for x, y in zip(
        [int(np.min(df.to_numpy()[:, j]))],
        [int(np.max(df.to_numpy()[:, j]))])][0]

    # Calibration Step 2: Discretization F^*(x) = F^*(x-1) + F(x+0.5) - F(x-0.5)
    print("Calibration Step 2...")
    alpha_cal = []
    for i in range(len(alpha_hat_cpu)-1):
        alpha_cal.append((alpha_hat_cpu[i+1] - alpha_hat_cpu[i]).item())
    alpha_cal = np.array(alpha_cal) / np.sum(alpha_cal)
    alpha_cal = np.cumsum(alpha_cal)

    # Calibration Step 3: Ensure monotonicity
    print("Calibration Step 3...")
    alpha_mono = [alpha_cal[0]]
    for i in range(1, len(alpha_cal)):
        if alpha_cal[i] < alpha_mono[-1]:
            alpha_mono.append(alpha_mono[-1])
        else:
            alpha_mono.append(alpha_cal[i])

    # Visualize the results
    print("Visualizing results...")
    ecdf = ECDF(df.to_numpy()[:, j])
    emp = [ecdf(x) for x in x_linspace_step2]

    fig, ax = plt.subplots(1, 1, figsize=(7, 4))

    ax.step(x_linspace_step2, emp, label="empirical", where='post',
            linewidth=3.5, color=u'#ff7f0e')
    ax.plot(x_linspace_est, alpha_est.cpu().numpy(), label="estimate",
            linewidth=3.5, color=u'#2ca02c')
    ax.step(x_linspace_step2, alpha_mono, label="calibration", where='post',
            linewidth=3.5, linestyle='--', color='black')

    ax.set_xlabel(dataset.continuous[j], fontsize=15)
    ax.tick_params(axis='x', labelsize=14)
    ax.tick_params(axis='y', labelsize=14)
    plt.grid(True, axis='y', linestyle='--')

    plt.legend(fontsize=14)
    plt.tight_layout()
    plt.savefig(f'./assets/{config["dataset"]}/{config["dataset"]}_CDF_calibration.png')
    plt.close()

    print(f"CDF calibration visualization saved to ./assets/{config['dataset']}/{config['dataset']}_CDF_calibration.png")
    print("Calibration complete!")

if __name__ == '__main__':
    main()
''')

print("Created simple_calibration.py")

Created simple_calibration.py


In [None]:
for dataset in ['adult', 'covtype', 'credit', 'loan', 'cabs', 'kings']:
    print(f"\n{'='*50}")
    print(f"Running calibration for dataset: {dataset}")
    print(f"{'='*50}")
    !python simple_calibration.py --dataset {dataset}


Running calibration for dataset: adult
Using device: cuda:0
Calibrating dataset: adult
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
Loading model from ./assets/DistVAE_adult.pth
Calibrating educational-num
Estimating CDF...
Estimate CDF...: 100% 1000/1000 [00:01<00:00, 770.23it/s]
  x_linspace_est = x_linspace_est * dataset.std[j] + dataset.mean[j]
Calibration Step 1...
  x_tmp = (x_tmp - dataset.mean[j]) / dataset.std[j]
Es

# **SYNTHEIZING**

In [None]:
# Create simple_synthesize.py
with open('simple_synthesize.py', 'w') as f:
    f.write('''
import os
import numpy as np
import pandas as pd
import tqdm
import matplotlib.pyplot as plt

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from modules.simulation import set_random_seed
from modules.model import VAE
from modules.evaluation import (
    regression_eval,
    classification_eval,
    statistical_similarity,
    DCR_metric,
    attribute_disclosure
)

# Configuration
import argparse
def get_args():
    parser = argparse.ArgumentParser('parameters')

    parser.add_argument('--dataset', type=str, default='adult',
                        help='Dataset options: covtype, credit, loan, adult, cabs, kings')
    parser.add_argument('--beta', default=0.5, type=float,
                        help='observation noise')

    return parser.parse_args()

def main():
    # Your evaluation code will go here
    args = get_args()
    print(f"Evaluating synthetic data quality for {args.dataset} dataset...")

if __name__ == '__main__':
    main()
''')

print("Created simple_synthesize.py file")

Created simple_synthesize.py file


# **CREATE EVALUATION**

In [None]:
# Create modules/evaluation.py
with open('modules/evaluation.py', 'w') as f:
    f.write('''
import numpy as np
import pandas as pd
import tqdm

import statsmodels.api as sm
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression
)
from sklearn.ensemble import (
    RandomForestRegressor,
    RandomForestClassifier,
    GradientBoostingRegressor,
    GradientBoostingClassifier
)

from sklearn.metrics import f1_score, precision_score, recall_score
from scipy.spatial import distance_matrix

from statsmodels.distributions.empirical_distribution import ECDF
from scipy.stats import wasserstein_distance

from sklearn import metrics

def regression_eval(train, test, target, mean, std):
    train[target] = train[target] * std + mean
    test[target] = test[target] * std + mean

    covariates = [x for x in train.columns if x not in [target]]

    result = []
    for name, regr in [
        ('linear', None),
        ('RF', RandomForestRegressor(random_state=0)),
        ('GradBoost', GradientBoostingRegressor(random_state=0))]:

        if name == 'linear':
            regr = sm.OLS(train[target], train[covariates]).fit()
        else:
            regr.fit(train[covariates], train[target])
        pred = regr.predict(test[covariates])

        mare = (test[target] - pred).abs()
        mare /= test[target].abs() + 1e-6
        mare = mare.mean()

        result.append((name, mare))
        print("[{}] MARE: {:.3f}".format(name, mare))
    return result

def classification_eval(train, test, target):
    covariates = [x for x in train.columns if not x.startswith(target)]
    target_ = [x for x in train.columns if x.startswith(target)]
    train_target = train[target_].idxmax(axis=1)
    test_target = test[target_].idxmax(axis=1).to_numpy()

    result = []
    for name, clf in [
        ('logistic', LogisticRegression(multi_class='ovr', fit_intercept=False, max_iter=1000)),
        ('RF', RandomForestClassifier(random_state=0)),
        ('GradBoost', GradientBoostingClassifier(random_state=0))]:

        clf.fit(train[covariates], train_target)
        pred = clf.predict(test[covariates])

        f1 = f1_score(test_target, pred, average='micro')

        result.append((name, f1))
        print("[{}] F1: {:.3f}".format(name, f1))
    return result

def statistical_similarity(train, synthetic, standardize, continuous=None):
    if standardize:
        train[continuous] -= train[continuous].mean(axis=0)
        train[continuous] /= train[continuous].std(axis=0)
        train = train.to_numpy()

        synthetic[continuous] -= synthetic[continuous].mean(axis=0)
        synthetic[continuous] /= synthetic[continuous].std(axis=0)
        synthetic = synthetic.to_numpy()

    Dn_list = []
    W1_list = []
    for j in range(train.shape[1]):
        xj = train[:, j]
        ecdf = ECDF(xj)
        ecdf_hat = ECDF(synthetic[:, j])

        Dn = np.abs(ecdf(xj) - ecdf_hat(xj)).max()
        W1 = wasserstein_distance(xj, synthetic[:, j])

        Dn_list.append(Dn)
        W1_list.append(W1)
    return Dn_list, W1_list

def DCR_metric(train, synthetic, data_percent=15):
    """
    Computes the Distance to Closest Record metric.

    Returns:
    - 5th percentile distance between real and synthetic
    - 5th percentile distance within real
    - 5th percentile distance within synthetic
    """
    # Sampling smaller sets to reduce computation time
    real_sampled = train.sample(n=int(len(train)*(.01*data_percent)), random_state=42).to_numpy()
    fake_sampled = synthetic.sample(n=int(len(synthetic)*(.01*data_percent)), random_state=42).to_numpy()

    # Computing pair-wise distances
    dist_rf = metrics.pairwise_distances(real_sampled, Y=fake_sampled, metric='minkowski', n_jobs=-1)
    dist_rr = metrics.pairwise_distances(real_sampled, Y=None, metric='minkowski', n_jobs=-1)
    dist_ff = metrics.pairwise_distances(fake_sampled, Y=None, metric='minkowski', n_jobs=-1)

    # Removes distances of data points to themselves
    rd_dist_rr = dist_rr[~np.eye(dist_rr.shape[0],dtype=bool)].reshape(dist_rr.shape[0],-1)
    rd_dist_ff = dist_ff[~np.eye(dist_ff.shape[0],dtype=bool)].reshape(dist_ff.shape[0],-1)

    # Computing smallest nearest neighbour distances
    smallest_two_indexes_rf = [dist_rf[i].argsort()[:2] for i in range(len(dist_rf))]
    smallest_two_rf = [dist_rf[i][smallest_two_indexes_rf[i]] for i in range(len(dist_rf))]
    smallest_two_indexes_rr = [rd_dist_rr[i].argsort()[:2] for i in range(len(rd_dist_rr))]
    smallest_two_rr = [rd_dist_rr[i][smallest_two_indexes_rr[i]] for i in range(len(rd_dist_rr))]
    smallest_two_indexes_ff = [rd_dist_ff[i].argsort()[:2] for i in range(len(rd_dist_ff))]
    smallest_two_ff = [rd_dist_ff[i][smallest_two_indexes_ff[i]] for i in range(len(rd_dist_ff))]

    # Computing 5th percentiles
    min_dist_rf = np.array([i[0] for i in smallest_two_rf])
    fifth_perc_rf = np.percentile(min_dist_rf,5)
    min_dist_rr = np.array([i[0] for i in smallest_two_rr])
    fifth_perc_rr = np.percentile(min_dist_rr,5)
    min_dist_ff = np.array([i[0] for i in smallest_two_ff])
    fifth_perc_ff = np.percentile(min_dist_ff,5)

    return [fifth_perc_rf, fifth_perc_rr, fifth_perc_ff]

def attribute_disclosure(K, compromised, synthetic, attr_compromised, dataset):
    dist = distance_matrix(
        compromised[attr_compromised].to_numpy(),
        synthetic[attr_compromised].to_numpy(),
        p=2)
    K_idx = dist.argsort(axis=1)[:, :K]

    def most_common(lst):
        return max(set(lst), key=lst.count)

    votes = []
    trues = []
    for i in tqdm.tqdm(range(len(K_idx)), desc="Majority vote..."):
        true = np.zeros((len(dataset.discrete), ))
        vote = np.zeros((len(dataset.discrete), ))
        for j in range(len(dataset.discrete)):
            true[j] = compromised.to_numpy()[i, len(dataset.continuous) + j]
            vote[j] = most_common(list(synthetic.to_numpy()[K_idx[i], len(dataset.continuous) + j]))
        votes.append(vote)
        trues.append(true)
    votes = np.vstack(votes)
    trues = np.vstack(trues)

    acc = 0
    f1 = 0
    for j in range(trues.shape[1]):
        acc += (trues[:, j] == votes[:, j]).mean()
        f1 += f1_score(trues[:, j], votes[:, j], average="macro", zero_division=0)
    acc /= trues.shape[1]
    f1 /= trues.shape[1]

    return acc, f1
''')

print("Created evaluation.py file")

Created evaluation.py file


# **CREATE SYNTHESIZE**

In [None]:
# Create complete updated simple_synthesize.py
with open('simple_synthesize.py', 'w') as f:
    f.write('''
import os
import numpy as np
import pandas as pd
import tqdm

import torch
from torch import nn
import torch.nn.functional as F

from modules.simulation import set_random_seed
from modules.model import VAE
from modules.evaluation import (
    regression_eval,
    classification_eval,
    statistical_similarity,
    DCR_metric,
    attribute_disclosure
)

# Configuration
import argparse
def get_args():
    parser = argparse.ArgumentParser('parameters')

    parser.add_argument('--dataset', type=str, default='adult',
                        help='Dataset options: covtype, credit, loan, adult, cabs, kings')
    parser.add_argument('--beta', default=0.5, type=float,
                        help='observation noise')

    return parser.parse_args()

def main():
    # Get configuration
    args = get_args()
    config = vars(args)

    print(f"Evaluating synthetic data quality for {config['dataset']} dataset...")

    # Create results storage
    results = {
        'dataset': config['dataset'],
        'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    # Load dataset module
    import importlib
    dataset_module = importlib.import_module(f'modules.{config["dataset"]}_datasets')
    TabularDataset = dataset_module.TabularDataset

    # Load real dataset
    dataset = TabularDataset()
    test_dataset = TabularDataset(train=False)

    # Load synthetic data
    syndata_path = f'./assets/synthetic_{config["dataset"]}.csv'
    print(f"Loading synthetic data from {syndata_path}")
    syndata_original = pd.read_csv(syndata_path)

    # Create a copy of synthetic data for preprocessing
    syndata = syndata_original.copy()

    # Convert categorical variables from strings to numeric codes
    for i, dis in enumerate(dataset.discrete):
        # Create a mapping from string values to numeric codes using the original dictionary
        if dis in syndata.columns:
            # Check if the column contains string values
            if syndata[dis].dtype == 'object':
                mapping = dataset.discrete_dicts[i]
                # Apply the mapping, handle any new values by assigning a default value
                syndata[dis] = syndata[dis].apply(lambda x: mapping.get(x, 0) if x in mapping else 0)

    # Correlation Structure
    try:
        from dython.nominal import associations
        print("\\nEvaluating Correlation Structure...")
        syn_asso = associations(
            syndata_original, nominal_columns=dataset.discrete,
            compute_only=True)
        true_asso = associations(
            dataset.train_raw, nominal_columns=dataset.discrete,
            compute_only=True)
        corr_dist = np.linalg.norm(true_asso["corr"] - syn_asso["corr"])
        print(f'Correlation Matrix Distance: {corr_dist:.3f}')
        results['corr_dist'] = corr_dist
    except Exception as e:
        print(f"Error in correlation analysis: {e}")
        results['corr_dist'] = np.nan

    # Statistical Similarity
    print("\\nEvaluating Statistical Similarity...")
    try:
        # Make sure all data is numeric for statistical similarity
        train_copy = dataset.train_raw.copy()
        syn_copy = syndata.copy()

        Dn, W1 = statistical_similarity(
            train_copy, syn_copy,
            standardize=True, continuous=dataset.continuous)

        cont_Dn = np.mean(Dn[:len(dataset.continuous)])
        disc_Dn = np.mean(Dn[len(dataset.continuous):])
        cont_W1 = np.mean(W1[:len(dataset.continuous)])
        disc_W1 = np.mean(W1[len(dataset.continuous):])

        print(f'K-S (continuous): {cont_Dn:.3f}')
        print(f'1-WD (continuous): {cont_W1:.3f}')
        print(f'K-S (discrete): {disc_Dn:.3f}')
        print(f'1-WD (discrete): {disc_W1:.3f}')

        results['K-S_continuous'] = cont_Dn
        results['1-WD_continuous'] = cont_W1
        results['K-S_discrete'] = disc_Dn
        results['1-WD_discrete'] = disc_W1
    except Exception as e:
        print(f"Error in statistical similarity analysis: {e}")
        results['K-S_continuous'] = np.nan
        results['1-WD_continuous'] = np.nan
        results['K-S_discrete'] = np.nan
        results['1-WD_discrete'] = np.nan

    # Distance to Closest Record
    print("\\nEvaluating Distance to Closest Record...")
    try:
        # Standardize synthetic data - only use continuous columns
        train_cont = dataset.train[dataset.continuous].copy()
        syn_cont = syndata[dataset.continuous].copy()

        # Ensure all data is numeric
        train_cont = train_cont.apply(pd.to_numeric, errors='coerce')
        syn_cont = syn_cont.apply(pd.to_numeric, errors='coerce')

        # Standardize
        syn_cont = (syn_cont - syn_cont.mean()) / syn_cont.std()

        DCR = DCR_metric(train_cont, syn_cont)

        print(f'DCR (R&S): {DCR[0]:.3f}')
        print(f'DCR (R): {DCR[1]:.3f}')
        print(f'DCR (S): {DCR[2]:.3f}')

        results['DCR_R&S'] = DCR[0]
        results['DCR_R'] = DCR[1]
        results['DCR_S'] = DCR[2]
    except Exception as e:
        print(f"Error in DCR analysis: {e}")
        results['DCR_R&S'] = np.nan
        results['DCR_R'] = np.nan
        results['DCR_S'] = np.nan

    # Attribute Disclosure
    print("\\nEvaluating Attribute Disclosure...")
    try:
        # Create a standardized version of train_raw
        train_raw_numeric = dataset.train_raw.copy()

        # Sample a subset of records for compromise
        compromised_idx = np.random.choice(
            range(len(train_raw_numeric)),
            min(400, int(len(train_raw_numeric) * 0.01)),
            replace=False)

        # Standardize continuous columns
        train_raw_numeric[dataset.continuous] = (train_raw_numeric[dataset.continuous] -
                                            train_raw_numeric[dataset.continuous].mean()) / train_raw_numeric[dataset.continuous].std()

        # Get compromised records
        compromised = train_raw_numeric.iloc[compromised_idx].reset_index(drop=True)

        # Check for attribute disclosure with different K values
        attr_num = min(5, len(dataset.continuous))
        attr_compromised = dataset.continuous[:attr_num]

        for K in [1, 10, 100]:
            try:
                acc, f1 = attribute_disclosure(
                    K, compromised, syndata, attr_compromised, dataset)
                print(f'AD F1 (S={attr_num},K={K}): {f1:.3f}')
                results[f'AD_F1_K{K}'] = f1
            except Exception as e:
                print(f"Error in attribute disclosure evaluation for K={K}: {e}")
                results[f'AD_F1_K{K}'] = np.nan
    except Exception as e:
        print(f"Error in attribute disclosure setup: {e}")
        results['AD_F1_K1'] = np.nan
        results['AD_F1_K10'] = np.nan
        results['AD_F1_K100'] = np.nan

    # ML Utility - Regression
    print("\\nEvaluating Machine Learning Utility in Regression...")
    try:
        # For the real data baseline
        base_reg = regression_eval(
            dataset.train.copy(), test_dataset.test.copy(), dataset.RegTarget,
            dataset.mean[dataset.RegTarget], dataset.std[dataset.RegTarget])
        base_reg_score = np.mean([x[1] for x in base_reg])
        print(f'MARE (Baseline): {base_reg_score:.3f}')
        results['MARE_baseline'] = base_reg_score
    except Exception as e:
        print(f"Error in baseline regression evaluation: {e}")
        results['MARE_baseline'] = np.nan

    try:
        # Prepare synthetic data for ML evaluation
        syn_ml = pd.DataFrame()

        # Add continuous columns
        for col in dataset.continuous:
            if col in syndata.columns:
                syn_ml[col] = pd.to_numeric(syndata[col], errors='coerce')

        # Standardize continuous columns
        mean_vals = syn_ml[dataset.continuous].mean()
        std_vals = syn_ml[dataset.continuous].std()
        syn_ml[dataset.continuous] = (syn_ml[dataset.continuous] - mean_vals) / std_vals

        # Add dummy columns for discrete variables
        for i, dis in enumerate(dataset.discrete):
            # Get original dummy columns from training data
            original_dummies = [c for c in dataset.train.columns if c.startswith(f"{dis}_")]

            # Create numeric version of synthetic discrete column
            if dis in syndata.columns:
                syn_discrete = syndata[dis].copy()
                # Convert to numeric using mapping if necessary
                if syn_discrete.dtype == 'object':
                    syn_discrete = syn_discrete.map(dataset.discrete_dicts[i]).fillna(0).astype(int)
                # Create dummies
                syn_dummies = pd.get_dummies(syn_discrete, prefix=dis)

                # Ensure all original dummy columns exist
                for dummy_col in original_dummies:
                    if dummy_col not in syn_dummies.columns:
                        syn_dummies[dummy_col] = 0

                # Add to synthetic ML data
                for dummy_col in original_dummies:
                    if dummy_col in syn_dummies.columns:
                        syn_ml[dummy_col] = syn_dummies[dummy_col]
                    else:
                        syn_ml[dummy_col] = 0

        # Ensure all columns from original training data are present
        for col in dataset.train.columns:
            if col not in syn_ml.columns:
                syn_ml[col] = 0

        # Match column order to original training data
        syn_ml = syn_ml[dataset.train.columns]

        # Convert all to float type to avoid object type issues
        syn_ml = syn_ml.astype(float)

        # Evaluate regression performance with synthetic data
        syn_reg = regression_eval(
            syn_ml, test_dataset.test.copy(), dataset.RegTarget,
            mean_vals[dataset.RegTarget], std_vals[dataset.RegTarget])
        syn_reg_score = np.mean([x[1] for x in syn_reg])
        print(f'MARE (Synthetic): {syn_reg_score:.3f}')
        results['MARE_synthetic'] = syn_reg_score
    except Exception as e:
        print(f"Error in synthetic data regression evaluation: {e}")
        results['MARE_synthetic'] = np.nan

    # ML Utility - Classification
    print("\\nEvaluating Machine Learning Utility in Classification...")
    try:
        # Evaluate classification on real data
        base_clf = classification_eval(
            dataset.train.copy(), test_dataset.test.copy(), dataset.ClfTarget)
        base_clf_score = np.mean([x[1] for x in base_clf])
        print(f'F1 (Baseline): {base_clf_score:.3f}')
        results['F1_baseline'] = base_clf_score
    except Exception as e:
        print(f"Error in baseline classification evaluation: {e}")
        results['F1_baseline'] = np.nan

    try:
        # Evaluate classification on synthetic data
        syn_clf = classification_eval(
            syn_ml, test_dataset.test.copy(), dataset.ClfTarget)
        syn_clf_score = np.mean([x[1] for x in syn_clf])
        print(f'F1 (Synthetic): {syn_clf_score:.3f}')
        results['F1_synthetic'] = syn_clf_score
    except Exception as e:
        print(f"Error in synthetic data classification evaluation: {e}")
        results['F1_synthetic'] = np.nan

    # Create results directory
    os.makedirs('./assets/results/', exist_ok=True)

    # Save the results to CSV
    results_df = pd.DataFrame([results])
    results_path = f'./assets/results/{config["dataset"]}_evaluation.csv'
    results_df.to_csv(results_path, index=False)

    print(f"\\nEvaluation complete! Results saved to {results_path}")
    print("\\nResults summary:")
    print(results_df)

if __name__ == '__main__':
    main()
''')

print("Created comprehensive updated simple_synthesize.py that properly saves results")

Created comprehensive updated simple_synthesize.py that properly saves results


In [None]:
!python simple_synthesize.py --dataset adult

Evaluating synthetic data quality for adult dataset...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is

In [None]:
# Evaluate synthetic data for covtype dataset
!python simple_synthesize.py --dataset covtype

Evaluating synthetic data quality for covtype dataset...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value 

In [None]:
!python simple_synthesize.py --dataset credit

Evaluating synthetic data quality for credit dataset...
Loading synthetic data from ./assets/synthetic_credit.csv

Evaluating Correlation Structure...
Correlation Matrix Distance: 2.818

Evaluating Statistical Similarity...
K-S (continuous): 0.129
1-WD (continuous): 0.128
K-S (discrete): 0.022
1-WD (discrete): 0.046

Evaluating Distance to Closest Record...
DCR (R&S): 0.841
DCR (R): 0.558
DCR (S): 0.901

Evaluating Attribute Disclosure...
Majority vote...: 100% 169/169 [00:00<00:00, 310.61it/s]
AD F1 (S=5,K=1): 0.278
Majority vote...: 100% 169/169 [00:00<00:00, 312.96it/s]
AD F1 (S=5,K=10): 0.291
Majority vote...: 100% 169/169 [00:00<00:00, 315.51it/s]
AD F1 (S=5,K=100): 0.282

Evaluating Machine Learning Utility in Regression...
Error in baseline regression evaluation: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
[linear] MARE: nan
Error in synthetic data regression evaluation: Found array with 0 sample(s) (shape=(0, 46)) while a minimum of 1 is r

In [None]:
!python simple_synthesize.py --dataset loan


Evaluating synthetic data quality for loan dataset...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is 

In [None]:
!python simple_synthesize.py --dataset cabs

Evaluating synthetic data quality for cabs dataset...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is 

In [None]:
!python simple_synthesize.py --dataset kings

Evaluating synthetic data quality for kings dataset...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is

# **SHADOW DATA SCRIPT**

In [None]:
# Fix shadow_data.py by adding batch_size to config
with open('shadow_data.py', 'w') as f:
    f.write('''
import os
import numpy as np
import pandas as pd
import tqdm

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from modules.simulation import set_random_seed
from modules.model import VAE

# Configuration
import argparse
def get_args():
    parser = argparse.ArgumentParser('parameters')

    parser.add_argument('--dataset', type=str, default='adult',
                        help='Dataset options: covtype, credit, loan, adult, cabs, kings')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed')

    return parser.parse_args()

def main():
    # Get configuration
    args = get_args()
    config = vars(args)

    # Create privacy directory
    if not os.path.exists(f'./privacy/{config["dataset"]}'):
        os.makedirs(f'./privacy/{config["dataset"]}')

    config["cuda"] = torch.cuda.is_available()
    device = torch.device('cuda:0') if config["cuda"] else torch.device('cpu')

    print(f"Using device: {device}")
    print(f"Generating shadow data for dataset: {config['dataset']}")

    # Set random seed
    set_random_seed(config["seed"])
    torch.manual_seed(config["seed"])
    if config["cuda"]:
        torch.cuda.manual_seed(config["seed"])

    # Import dataset module
    import importlib
    dataset_module = importlib.import_module(f'modules.{config["dataset"]}_datasets')
    TabularDataset = dataset_module.TabularDataset

    # Load dataset
    dataset = TabularDataset()
    test_dataset = TabularDataset(train=False)

    # Load model
    model_path = f'./assets/DistVAE_{config["dataset"]}.pth'
    print(f"Loading model from {model_path}")

    # Update config with dataset dimensions and other required parameters
    OutputInfo_list = dataset.OutputInfo_list
    CRPS_dim = sum([x.dim for x in OutputInfo_list if x.activation_fn == 'CRPS'])
    softmax_dim = sum([x.dim for x in OutputInfo_list if x.activation_fn == 'softmax'])
    config["CRPS_dim"] = CRPS_dim
    config["softmax_dim"] = softmax_dim
    config["latent_dim"] = 2
    config["step"] = 0.1
    config["threshold"] = 1e-5
    config["beta"] = 0.5
    config["batch_size"] = 256  # Added missing batch_size

    # Initialize model
    model = VAE(config, device).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Generate shadow datasets
    print("Generating shadow data...")
    n_train = len(dataset.train)
    n_test = len(test_dataset.test)
    K = 1  # Number of shadow models

    for s in tqdm.tqdm(range(K), desc="Generating shadow train and test datasets..."):
        torch.manual_seed(s)

        # Generate synthetic train data
        with torch.no_grad():
            train_synthetic = model.generate_data(n_train, OutputInfo_list, dataset)

        train_path = f'./privacy/{config["dataset"]}/train_{config["seed"]}_synthetic{s}.csv'
        train_synthetic.to_csv(train_path)
        print(f"Shadow training data saved to {train_path}")

        # Generate synthetic test data
        with torch.no_grad():
            test_synthetic = model.generate_data(n_test, OutputInfo_list, dataset)

        test_path = f'./privacy/{config["dataset"]}/test_{config["seed"]}_synthetic{s}.csv'
        test_synthetic.to_csv(test_path)
        print(f"Shadow test data saved to {test_path}")

    print("Shadow data generation complete!")

if __name__ == '__main__':
    main()
''')

print("Fixed shadow_data.py by adding batch_size to config")

Fixed shadow_data.py by adding batch_size to config


# **SHADOW MAIN**

In [None]:
# Simplify the shadow_main.py approach
with open('shadow_main.py', 'w') as f:
    f.write('''
import os
import numpy as np
import pandas as pd
import tqdm

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, Dataset

from modules.simulation import set_random_seed
from modules.model import VAE
from modules.train import train_VAE

# Configuration
import argparse
def get_args():
    parser = argparse.ArgumentParser('parameters')

    parser.add_argument('--seed', type=int, default=1,
                        help='seed for repeatable results')
    parser.add_argument('--dataset', type=str, default='adult',
                        help='Dataset options')

    parser.add_argument("--latent_dim", default=2, type=int,
                        help="the latent dimension size")
    parser.add_argument("--step", default=0.1, type=float,
                        help="interval size of quantile levels")

    parser.add_argument('--epochs', default=5, type=int,
                        help='the number of epochs')
    parser.add_argument('--batch_size', default=256, type=int,
                        help='batch size')
    parser.add_argument('--lr', default=1e-3, type=float,
                        help='learning rate')
    parser.add_argument('--threshold', default=1e-5, type=float,
                        help='threshold for clipping alpha_tilde')

    parser.add_argument('--beta', default=0.5, type=float,
                        help='scale parameter')

    return parser.parse_args()

class ShadowDataset(Dataset):
    def __init__(self, file_path, dataset_template):
        """
        Creates a shadow dataset based on the original dataset's structure
        file_path: path to the shadow data CSV
        dataset_template: original TabularDataset instance to copy preprocessing from
        """
        self.template = dataset_template

        # Load data
        base = pd.read_csv(file_path)

        # Process continuous and discrete columns like the original dataset
        # Map discrete values to their indices
        for i, dis in enumerate(self.template.discrete):
            if dis in base.columns:
                mapping = self.template.discrete_dicts[i]
                base[dis] = base[dis].apply(lambda x: mapping.get(x, 0) if isinstance(x, str) else int(x))

        # One-hot encoding
        df_dummy = []
        for d in self.template.discrete:
            if d in base.columns:
                df_dummy.append(pd.get_dummies(base[d], prefix=d))

        # Combine continuous and one-hot encoded discrete
        base_dummy = pd.concat([base[self.template.continuous]] + df_dummy, axis=1)

        # Standardize continuous columns
        self.mean = base_dummy[self.template.continuous].mean(axis=0)
        self.std = base_dummy[self.template.continuous].std(axis=0)

        base_dummy[self.template.continuous] = (base_dummy[self.template.continuous] - self.mean) / self.std

        # Match columns from original dataset
        self.x_data = []
        for c in self.template.continuous:
            if c in base_dummy.columns:
                self.x_data.append(base_dummy[c].values.reshape(-1, 1))

        # Get expected one-hot columns from template
        template_dummy_cols = []
        for col in self.template.train.columns:
            if col not in self.template.continuous:
                template_dummy_cols.append(col)

        # Add one-hot columns in correct order
        for col in template_dummy_cols:
            if col in base_dummy.columns:
                self.x_data.append(base_dummy[col].values.reshape(-1, 1))
            else:
                # Add zero column if missing
                self.x_data.append(np.zeros((len(base_dummy), 1)))

        # Concatenate all columns
        self.x_data = np.hstack(self.x_data).astype(np.float32)
        print(f"Shadow dataset created with shape {self.x_data.shape}")

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):
        return torch.FloatTensor(self.x_data[idx])

def main():
    # Get configuration
    config = vars(get_args())
    config["cuda"] = torch.cuda.is_available()
    device = torch.device('cuda:0') if config["cuda"] else torch.device('cpu')

    print("Configuration:")
    for k, v in config.items():
        print(f"  {k}: {v}")

    # Set random seed
    set_random_seed(config["seed"])
    torch.manual_seed(config["seed"])
    if config["cuda"]:
        torch.cuda.manual_seed(config["seed"])

    # Import dataset module
    import importlib
    dataset_module = importlib.import_module(f'modules.{config["dataset"]}_datasets')
    TabularDataset = dataset_module.TabularDataset

    # Get original dataset to use as template
    orig_dataset = TabularDataset()

    # Update config with dataset dimensions
    OutputInfo_list = orig_dataset.OutputInfo_list
    CRPS_dim = sum([x.dim for x in OutputInfo_list if x.activation_fn == 'CRPS'])
    softmax_dim = sum([x.dim for x in OutputInfo_list if x.activation_fn == 'softmax'])
    config["CRPS_dim"] = CRPS_dim
    config["softmax_dim"] = softmax_dim

    print(f"Dataset dimensions - CRPS: {CRPS_dim}, Softmax: {softmax_dim}")
    print(f"Expected input size: {CRPS_dim + softmax_dim}")

    # Load shadow datasets
    K = 1  # Number of shadow models
    shadow_datasets = []

    for s in range(K):
        shadow_path = f'./privacy/{config["dataset"]}/train_{config["seed"]}_synthetic{s}.csv'
        if os.path.exists(shadow_path):
            try:
                # Create shadow dataset using original dataset as template
                shadow_dataset = ShadowDataset(shadow_path, orig_dataset)
                shadow_datasets.append(shadow_dataset)
                print(f"Loaded shadow dataset {s}")
            except Exception as e:
                print(f"Error loading shadow dataset {s}: {e}")
                import traceback
                traceback.print_exc()
        else:
            print(f"Shadow dataset {shadow_path} not found")

    if not shadow_datasets:
        print("No shadow datasets found. Please generate shadow data first.")
        return

    # Train shadow models
    for k, shadow_dataset in enumerate(shadow_datasets):
        print(f"\\nTraining {k}th shadow model...\\n")

        # Initialize model
        model = VAE(config, device).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
        model.train()

        # Create dataloader
        dataloader = DataLoader(shadow_dataset, batch_size=config["batch_size"], shuffle=True)

        # Training loop
        for epoch in range(config["epochs"]):
            logs = train_VAE(OutputInfo_list, dataloader, model, config, optimizer, device)

            print_input = f"[epoch {epoch + 1:03d}]"
            print_input += ''.join([f", {x}: {np.mean(y):.4f}" for x, y in logs.items()])
            print(print_input)

        # Save shadow model
        os.makedirs('./assets/shadow', exist_ok=True)
        torch.save(model.state_dict(), f'./assets/shadow/shadow_DistVAE_{config["dataset"]}_{k}.pth')
        print(f"Shadow model {k} saved to ./assets/shadow/shadow_DistVAE_{config['dataset']}_{k}.pth")

    print("\\nShadow model training complete!")

if __name__ == '__main__':
    main()
''')

print("Created simplified shadow_main.py that properly handles the data format")

Created simplified shadow_main.py that properly handles the data format


# **SHADOW ATTACK**

In [None]:
# Fix shadow_attack.py to handle empty shadow test data
with open('shadow_attack.py', 'w') as f:
    f.write('''
import os
import numpy as np
import pandas as pd
import tqdm

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, Dataset

from modules.simulation import set_random_seed
from modules.model import VAE

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Configuration
import argparse
def get_args():
    parser = argparse.ArgumentParser('parameters')

    parser.add_argument('--dataset', type=str, default='adult',
                        help='Dataset options: covtype, credit, loan, adult, cabs, kings')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed')

    return parser.parse_args()

# Custom Dataset class that matches format with original dataset
class CustomDataset(Dataset):
    def __init__(self, df, original_dataset):
        """
        Create a dataset with exact same column structure as the original
        """
        self.df = df
        self.orig = original_dataset

        # Create numeric versions of all features
        df_processed = pd.DataFrame()

        # Process continuous features
        for col in self.orig.continuous:
            if col in self.df.columns:
                df_processed[col] = pd.to_numeric(self.df[col], errors='coerce')

        # Standardize
        df_processed[self.orig.continuous] = (df_processed[self.orig.continuous] -
                                             df_processed[self.orig.continuous].mean()) / df_processed[self.orig.continuous].std()

        # Process discrete features
        for i, dis in enumerate(self.orig.discrete):
            if dis in self.df.columns:
                # Convert to numeric using original mapping
                if self.df[dis].dtype == 'object':
                    mapping = self.orig.discrete_dicts[i]
                    df_processed[dis] = self.df[dis].map(lambda x: mapping.get(x, 0) if x in mapping else 0)
                else:
                    df_processed[dis] = self.df[dis]

                # One-hot encode
                dummies = pd.get_dummies(df_processed[dis], prefix=dis)
                for col in dummies.columns:
                    df_processed[col] = dummies[col]

        # Make sure we have all columns expected by the model
        original_columns = self.orig.train.columns
        for col in original_columns:
            if col not in df_processed.columns:
                df_processed[col] = 0

        # Get data in the right order and format
        self.data = df_processed[original_columns].to_numpy().astype(np.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.FloatTensor(self.data[idx])

def main():
    # Get configuration
    args = get_args()
    config = vars(args)

    # Add required configuration parameters
    config["latent_dim"] = 2
    config["step"] = 0.1
    config["threshold"] = 1e-5
    config["batch_size"] = 256  # Add batch_size

    # Basic setup
    if not os.path.exists('./privacy/results'):
        os.makedirs('./privacy/results')

    config["cuda"] = torch.cuda.is_available()
    device = torch.device('cuda:0') if config["cuda"] else torch.device('cpu')

    print(f"Using device: {device}")
    print(f"Running privacy attack for dataset: {config['dataset']}")

    # Set random seed
    set_random_seed(config["seed"])
    torch.manual_seed(config["seed"])
    if config["cuda"]:
        torch.cuda.manual_seed(config["seed"])

    # Import dataset module to get OutputInfo_list
    import importlib
    dataset_module = importlib.import_module(f'modules.{config["dataset"]}_datasets')
    TabularDataset = dataset_module.TabularDataset

    # Get OutputInfo_list from original dataset
    dataset = TabularDataset()
    test_dataset = TabularDataset(train=False)

    # Check if test dataset has data
    has_test_data = len(test_dataset.x_data) > 0
    if not has_test_data:
        print("Warning: Test dataset is empty. Using a subset of training data as test set.")

    OutputInfo_list = dataset.OutputInfo_list
    CRPS_dim = sum([x.dim for x in OutputInfo_list if x.activation_fn == 'CRPS'])
    softmax_dim = sum([x.dim for x in OutputInfo_list if x.activation_fn == 'softmax'])
    config["CRPS_dim"] = CRPS_dim
    config["softmax_dim"] = softmax_dim

    # Load shadow models
    K = 1  # Number of shadow models
    shadow_models = []
    for k in range(K):
        model_path = f'./assets/shadow/shadow_DistVAE_{config["dataset"]}_{k}.pth'
        if os.path.exists(model_path):
            model = VAE(config, device).to(device)
            model.load_state_dict(torch.load(model_path, map_location=device))
            model.eval()
            shadow_models.append(model)
            print(f"Loaded shadow model {k}")
        else:
            print(f"Shadow model {model_path} not found")

    if not shadow_models:
        print("No shadow models found. Please train shadow models first.")
        return

    # Determine classification target
    if config["dataset"] == "covtype":
        target = 'Cover_Type'
    elif config["dataset"] == "credit":
        target = 'TARGET'
    elif config["dataset"] == "loan":
        target = 'Personal Loan'
    elif config["dataset"] == "adult":
        target = 'income'
    elif config["dataset"] == "cabs":
        target = 'Gender'
    elif config["dataset"] == "kings":
        target = 'condition'
    else:
        raise ValueError('Not supported dataset!')

    # Load and process shadow data
    shadow_data_train = []
    shadow_targets_train = []
    for k in range(K):
        try:
            # Load training data (in)
            df = pd.read_csv(f'./privacy/{config["dataset"]}/train_{config["seed"]}_synthetic{k}.csv')

            # Create properly formatted dataset
            shadow_data_train.append(CustomDataset(df, dataset))

            # Get targets
            if target in df.columns:
                if df[target].dtype == 'object':
                    # For string targets
                    target_idx = dataset.discrete.index(target)
                    target_dict = dataset.discrete_dicts[target_idx]
                    shadow_targets_train.append(df[target].map(target_dict).fillna(0).astype(int).values)
                else:
                    shadow_targets_train.append(df[target].astype(int).values)
                print(f"Loaded shadow training dataset {k}")
            else:
                print(f"Target column {target} not found in shadow dataset {k}")
        except Exception as e:
            print(f"Error loading shadow training dataset {k}: {e}")
            import traceback
            traceback.print_exc()

    shadow_data_test = []
    shadow_targets_test = []
    for k in range(K):
        try:
            # Load test data (out)
            test_path = f'./privacy/{config["dataset"]}/test_{config["seed"]}_synthetic{k}.csv'
            if os.path.exists(test_path):
                df = pd.read_csv(test_path)

                if len(df) > 0:
                    # Create properly formatted dataset
                    shadow_data_test.append(CustomDataset(df, dataset))

                    # Get targets
                    if target in df.columns:
                        if df[target].dtype == 'object':
                            # For string targets
                            target_idx = dataset.discrete.index(target)
                            target_dict = dataset.discrete_dicts[target_idx]
                            shadow_targets_test.append(df[target].map(target_dict).fillna(0).astype(int).values)
                        else:
                            shadow_targets_test.append(df[target].astype(int).values)
                        print(f"Loaded shadow test dataset {k}")
                    else:
                        print(f"Target column {target} not found in shadow test dataset {k}")
                else:
                    print(f"Shadow test dataset {k} is empty")
            else:
                print(f"Shadow test dataset {test_path} not found")

            # If no test data, use a portion of train data as test
            if not shadow_data_test:
                print("No shadow test data available. Using a portion of shadow train data as test.")
                # Take the last 20% of the shadow training data as test
                if k < len(shadow_data_train) and k < len(shadow_targets_train):
                    train_size = len(shadow_data_train[k])
                    split_point = int(0.8 * train_size)

                    # Create a copy of the train dataset
                    train_dataset = shadow_data_train[k]
                    train_targets = shadow_targets_train[k]

                    # Split into train and test
                    # For targets, we can just split the numpy array
                    train_targets_split = train_targets[:split_point]
                    test_targets_split = train_targets[split_point:]
                    shadow_targets_train[k] = train_targets_split
                    shadow_targets_test.append(test_targets_split)

                    # For data, we need to create a new Dataset that fetches from the correct indices
                    class SubsetDataset(Dataset):
                        def __init__(self, dataset, indices):
                            self.dataset = dataset
                            self.indices = indices

                        def __len__(self):
                            return len(self.indices)

                        def __getitem__(self, idx):
                            return self.dataset[self.indices[idx]]

                    # Create train and test subset datasets
                    train_indices = list(range(split_point))
                    test_indices = list(range(split_point, train_size))

                    shadow_data_train[k] = SubsetDataset(train_dataset, train_indices)
                    shadow_data_test.append(SubsetDataset(train_dataset, test_indices))
                    print(f"Created test set from training data with {len(test_indices)} samples")
        except Exception as e:
            print(f"Error processing shadow test dataset {k}: {e}")
            import traceback
            traceback.print_exc()

    # Verify we have both train and test data
    if not shadow_data_train or not shadow_data_test:
        print("Error: Missing either shadow train or test datasets")
        return

    # Extract latent representations from training data (in)
    latents = []
    for k in range(len(shadow_models)):
        if k < len(shadow_data_train):
            dataloader = DataLoader(shadow_data_train[k], batch_size=config["batch_size"], shuffle=False)
            zs = []
            for x_batch in tqdm.tqdm(dataloader, desc=f"Extracting latents from shadow train {k}"):
                if config["cuda"]:
                    x_batch = x_batch.cuda()
                with torch.no_grad():
                    mean, _ = shadow_models[k].get_posterior(x_batch)
                zs.append(mean.cpu().numpy())
            if zs:
                zs = np.vstack(zs)
                latents.append(zs)
                print(f"Extracted {len(zs)} latent vectors from shadow train {k}")
            else:
                print(f"No latent vectors extracted from shadow train {k}")

    # Extract latent representations from test data (out)
    latents_test = []
    for k in range(len(shadow_models)):
        if k < len(shadow_data_test):
            dataloader = DataLoader(shadow_data_test[k], batch_size=config["batch_size"], shuffle=False)
            zs = []
            for x_batch in tqdm.tqdm(dataloader, desc=f"Extracting latents from shadow test {k}"):
                if config["cuda"]:
                    x_batch = x_batch.cuda()
                with torch.no_grad():
                    mean, _ = shadow_models[k].get_posterior(x_batch)
                zs.append(mean.cpu().numpy())
            if zs:
                zs = np.vstack(zs)
                latents_test.append(zs)
                print(f"Extracted {len(zs)} latent vectors from shadow test {k}")
            else:
                print(f"No latent vectors extracted from shadow test {k}")

    # Verify we have both train and test latent representations
    if not latents or not latents_test:
        print("Error: Missing latent representations from either train or test data")
        # If we have train latents but no test latents, create test latents from train
        if latents and not latents_test:
            print("Creating test latents by splitting train latents")
            for k in range(len(latents)):
                train_size = len(latents[k])
                split_point = int(0.8 * train_size)
                latents_test.append(latents[k][split_point:])
                latents[k] = latents[k][:split_point]

                # Also split the targets
                if k < len(shadow_targets_train):
                    shadow_targets_test.append(shadow_targets_train[k][split_point:])
                    shadow_targets_train[k] = shadow_targets_train[k][:split_point]
        else:
            return

    # Find unique classes
    unique_classes = set()
    for targets_list in shadow_targets_train:
        unique_classes.update(np.unique(targets_list))
    for targets_list in shadow_targets_test:
        unique_classes.update(np.unique(targets_list))

    target_num = len(unique_classes)
    print(f"Found {target_num} unique target classes")

    # Prepare attack training data
    attack_training = {}
    for t in unique_classes:
        # In samples (from training)
        in_samples = []
        for k in range(len(latents)):
            if k < len(shadow_targets_train):
                class_indices = np.where(shadow_targets_train[k] == t)[0]
                if len(class_indices) > 0:
                    in_samples.append(latents[k][class_indices])

        # Out samples (from testing)
        out_samples = []
        for k in range(len(latents_test)):
            if k < len(shadow_targets_test):
                class_indices = np.where(shadow_targets_test[k] == t)[0]
                if len(class_indices) > 0:
                    out_samples.append(latents_test[k][class_indices])

        if in_samples and out_samples:
            in_data = np.vstack(in_samples)
            in_data = np.hstack([in_data, np.ones((len(in_data), 1))])  # Add in/out label (1 = in)

            out_data = np.vstack(out_samples)
            out_data = np.hstack([out_data, np.zeros((len(out_data), 1))])  # Add in/out label (0 = out)

            attack_data = np.vstack([in_data, out_data])
            attack_training[t] = attack_data

            print(f"Class {t}: {len(in_data)} in samples, {len(out_data)} out samples")

    # Train attack models
    attackers = {}
    for t in attack_training:
        attack_data = attack_training[t]
        clf = GradientBoostingClassifier(random_state=0).fit(
            attack_data[:, :config["latent_dim"]],
            attack_data[:, -1])
        attackers[t] = clf
        print(f"Trained attacker for class {t}")

    # Load the target model
    model_path = f'./assets/DistVAE_{config["dataset"]}.pth'
    target_model = VAE(config, device).to(device)
    target_model.load_state_dict(torch.load(model_path, map_location=device))
    target_model.eval()

    # Get ground-truth latent representations
    dataloader = DataLoader(dataset, batch_size=config["batch_size"], shuffle=False)
    gt_latents = []
    for x_batch in tqdm.tqdm(dataloader, desc="Extracting latents from real train"):
        if config["cuda"]:
            x_batch = x_batch.cuda()
        with torch.no_grad():
            mean, _ = target_model.get_posterior(x_batch)
        gt_latents.append(mean.cpu().numpy())
    gt_latents = np.vstack(gt_latents)

    # Check if test data is available
    if has_test_data and len(test_dataset.x_data) > 0:
        dataloader_test = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False)
        gt_latents_test = []
        for x_batch in tqdm.tqdm(dataloader_test, desc="Extracting latents from real test"):
            if config["cuda"]:
                x_batch = x_batch.cuda()
            with torch.no_grad():
                mean, _ = target_model.get_posterior(x_batch)
            gt_latents_test.append(mean.cpu().numpy())
        if gt_latents_test:
            gt_latents_test = np.vstack(gt_latents_test)
        else:
            # If no test latents were extracted, split training latents
            print("No test latents were extracted. Using portion of training latents as test.")
            train_size = len(gt_latents)
            split_point = int(0.8 * train_size)
            gt_latents_test = gt_latents[split_point:]
            gt_latents = gt_latents[:split_point]
    else:
        # If no test data, use a portion of training data as "test"
        train_size = len(gt_latents)
        split_point = int(0.8 * train_size)
        gt_latents_test = gt_latents[split_point:]
        gt_latents = gt_latents[:split_point]
        print(f"Using {len(gt_latents_test)} samples from training as test set")

    # Get ground-truth targets
    gt_targets = []
    gt_targets_test = []

    # Try to get target column from train_raw
    if hasattr(dataset, 'train_raw') and target in dataset.train_raw.columns:
        if dataset.train_raw[target].dtype == 'object':
            target_idx = dataset.discrete.index(target)
            gt_targets_all = np.array([dataset.discrete_dicts[target_idx].get(x, 0)
                                 for x in dataset.train_raw[target].values])

            # Split according to latent split if we used train data for test
            if len(gt_latents) < len(dataset.train_raw):
                split_point = len(gt_latents)
                gt_targets = gt_targets_all[:split_point]
                gt_targets_test = gt_targets_all[split_point:split_point+len(gt_latents_test)]
            else:
                gt_targets = gt_targets_all

                if has_test_data and hasattr(test_dataset, 'test_raw') and target in test_dataset.test_raw.columns:
                    gt_targets_test = np.array([dataset.discrete_dicts[target_idx].get(x, 0)
                                             for x in test_dataset.test_raw[target].values])
                else:
                    # If we couldn't get test targets, log an error
                    print("Could not get test targets. Privacy evaluation may not be accurate.")
                    # Try to create some targets for testing from training data
                    train_size = len(gt_targets)
                    split_point = int(0.8 * train_size)
                    gt_targets_test = gt_targets[split_point:]
                    gt_targets = gt_targets[:split_point]
        else:
            gt_targets_all = dataset.train_raw[target].values

            # Split according to latent split if we used train data for test
            if len(gt_latents) < len(dataset.train_raw):
                split_point = len(gt_latents)
                gt_targets = gt_targets_all[:split_point]
                gt_targets_test = gt_targets_all[split_point:split_point+len(gt_latents_test)]
            else:
                gt_targets = gt_targets_all

                if has_test_data and hasattr(test_dataset, 'test_raw') and target in test_dataset.test_raw.columns:
                    gt_targets_test = test_dataset.test_raw[target].values
                else:
                    # If we couldn't get test targets, use a portion of train targets
                    train_size = len(gt_targets)
                    split_point = int(0.8 * train_size)
                    gt_targets_test = gt_targets[split_point:]
                    gt_targets = gt_targets[:split_point]

    # If failed to get targets, try to derive from the one-hot columns
    if len(gt_targets) == 0:
        target_cols = [c for c in dataset.train.columns if c.startswith(f"{target}_")]
        if target_cols:
            try:
                gt_targets_all = dataset.train[target_cols].idxmax(axis=1).str.replace(f"{target}_", "").astype(int).values

                # Split according to latent split if we used train data for test
                if len(gt_latents) < len(dataset.train):
                    split_point = len(gt_latents)
                    gt_targets = gt_targets_all[:split_point]
                    gt_targets_test = gt_targets_all[split_point:split_point+len(gt_latents_test)]
                else:
                    gt_targets = gt_targets_all

                    if has_test_data:
                        gt_targets_test = test_dataset.test[target_cols].idxmax(axis=1).str.replace(f"{target}_", "").astype(int).values
                    else:
                        # Use portion of train targets as test
                        train_size = len(gt_targets)
                        split_point = int(0.8 * train_size)
                        gt_targets_test = gt_targets[split_point:]
                        gt_targets = gt_targets[:split_point]
            except:
                # Another attempt with different approach
                gt_targets_all = np.argmax(dataset.train[target_cols].values, axis=1)

                # Split according to latent split if we used train data for test
                if len(gt_latents) < len(dataset.train):
                    split_point = len(gt_latents)
                    gt_targets = gt_targets_all[:split_point]
                    gt_targets_test = gt_targets_all[split_point:split_point+len(gt_latents_test)]
                else:
                    gt_targets = gt_targets_all

                    if has_test_data:
                        gt_targets_test = np.argmax(test_dataset.test[target_cols].values, axis=1)
                    else:
                        # Use portion of train targets as test
                        train_size = len(gt_targets)
                        split_point = int(0.8 * train_size)
                        gt_targets_test = gt_targets[split_point:]
                        gt_targets = gt_targets[:split_point]

    if len(gt_targets) == 0 or len(gt_targets_test) == 0:
        print(f"Could not extract target values for {target}")
        return

    # Make sure targets match latents in length
    if len(gt_targets) > len(gt_latents):
        gt_targets = gt_targets[:len(gt_latents)]
    elif len(gt_targets) < len(gt_latents):
        gt_latents = gt_latents[:len(gt_targets)]

    if len(gt_targets_test) > len(gt_latents_test):
        gt_targets_test = gt_targets_test[:len(gt_latents_test)]
    elif len(gt_targets_test) < len(gt_latents_test):
        gt_latents_test = gt_latents_test[:len(gt_targets_test)]

    print(f"Extracted target values with {len(np.unique(gt_targets))} unique classes")
    print(f"Train samples: {len(gt_latents)}, Test samples: {len(gt_latents_test)}")

    # Use same number of samples from both sets for balanced evaluation
    min_samples = min(len(gt_latents), len(gt_latents_test))
    gt_latents = gt_latents[:min_samples]
    gt_targets = gt_targets[:min_samples]
    gt_latents_test = gt_latents_test[:min_samples]
    gt_targets_test = gt_targets_test[:min_samples]

    # Perform membership inference attack
    all_preds = []
    all_true = []

    # Test on real training data (should be "in")
    for t in attackers:
        class_indices = np.where(gt_targets == t)[0]
        if len(class_indices) > 0:
            latents_class = gt_latents[class_indices]
            preds = attackers[t].predict(latents_class)
            all_preds.extend(preds)
            all_true.extend(np.ones_like(preds))  # These are "in" samples

    # Test on real test data (should be "out")
    for t in attackers:
        class_indices = np.where(gt_targets_test == t)[0]
        if len(class_indices) > 0:
            latents_class = gt_latents_test[class_indices]
            preds = attackers[t].predict(latents_class)
            all_preds.extend(preds)
            all_true.extend(np.zeros_like(preds))  # These are "out" samples

    # Calculate attack performance metrics
    if all_preds and all_true:
        all_preds = np.array(all_preds)
        all_true = np.array(all_true)

        acc = accuracy_score(all_true, all_preds)
        f1 = f1_score(all_true, all_preds)
        try:
            auc = roc_auc_score(all_true, all_preds)
        except:
            auc = 0.5  # Default for random guessing

        print('\\nMembership Inference Attack Results:')
        print(f'Accuracy: {acc:.3f}')
        print(f'F1: {f1:.3f}')
        print(f'AUC: {auc:.3f}')

        # Save results
        privacy_results = {
            'dataset': config['dataset'],
            'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
            'MI_accuracy': acc,
            'MI_f1': f1,
            'MI_auc': auc
        }

        os.makedirs('./privacy/results', exist_ok=True)
        pd.DataFrame([privacy_results]).to_csv(
            f'./privacy/results/{config["dataset"]}_privacy_evaluation.csv', index=False)

        print(f"\\nPrivacy evaluation complete! Results saved to ./privacy/results/{config['dataset']}_privacy_evaluation.csv")
    else:
        print("No predictions made. Attack could not be completed.")

if __name__ == '__main__':
    main()
''')

print("Further improved shadow_attack.py to handle empty shadow test latents")

Further improved shadow_attack.py to handle empty shadow test latents


# **GENERATE THE SHADOW DATASETS**

In [None]:
# Create necessary directories
!mkdir -p ./privacy
!mkdir -p ./assets/shadow

# Generate shadow datasets
!python shadow_data.py --dataset adult

Using device: cuda:0
Generating shadow data for dataset: adult
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A 

## **SHADOW MODELS**

In [None]:
!python shadow_main.py --dataset adult

Configuration:
  seed: 1
  dataset: adult
  latent_dim: 2
  step: 0.1
  epochs: 5
  batch_size: 256
  lr: 0.001
  threshold: 1e-05
  beta: 0.5
  cuda: True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
Dataset dimensions - CRPS: 5, Softmax: 100
Expected input size: 105
Shadow dataset created with shape (40000, 105)
Loaded shadow dataset 0

Training 0th shadow model...

inner loop: 100% 157/157 [00:03<00:00, 49.62it/s]
[epoch 0

In [None]:
!python shadow_attack.py --dataset adult

Using device: cuda:0
Running privacy attack for dataset: adult
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A 

In [None]:
!python shadow_data.py --dataset covtype

Using device: cuda:0
Generating shadow data for dataset: covtype
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean


In [None]:
!python shadow_main.py --dataset covtype

Configuration:
  seed: 1
  dataset: covtype
  latent_dim: 2
  step: 0.1
  epochs: 5
  batch_size: 256
  lr: 0.001
  threshold: 1e-05
  beta: 0.5
  cuda: True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
Dataset dimensions - CRPS: 10, Softmax: 7
Expected input size: 17
Shadow dataset created with shape (45000, 17)
Loaded shadow dataset 0

Training 0th shadow model...

inner loop: 100% 176/176 [00:04<00:00, 35.52it/s]
[epoch 00

In [None]:
!python shadow_attack.py --dataset covtype

Using device: cuda:0
Running privacy attack for dataset: covtype
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean


In [None]:
!python shadow_data.py --dataset credit

Using device: cuda:0
Generating shadow data for dataset: credit
Loading model from ./assets/DistVAE_credit.pth
Generating shadow data...
Generating shadow train and test datasets...:   0% 0/1 [00:00<?, ?it/s]Shadow training data saved to ./privacy/credit/train_1_synthetic0.csv
Shadow test data saved to ./privacy/credit/test_1_synthetic0.csv
Generating shadow train and test datasets...: 100% 1/1 [00:00<00:00,  1.44it/s]
Shadow data generation complete!


In [None]:
!python shadow_main.py --dataset credit

Configuration:
  seed: 1
  dataset: credit
  latent_dim: 2
  step: 0.1
  epochs: 5
  batch_size: 256
  lr: 0.001
  threshold: 1e-05
  beta: 0.5
  cuda: True
Dataset dimensions - CRPS: 10, Softmax: 37
Expected input size: 47
Shadow dataset created with shape (16953, 47)
Loaded shadow dataset 0

Training 0th shadow model...

inner loop: 100% 67/67 [00:02<00:00, 29.84it/s]
[epoch 001], loss: 11.9140, quantile: 11.8831, KL: 0.0617, activated: 0.0000
inner loop: 100% 67/67 [00:01<00:00, 37.18it/s]
[epoch 002], loss: 8.2069, quantile: 8.1859, KL: 0.0421, activated: 0.0000
inner loop: 100% 67/67 [00:01<00:00, 38.16it/s]
[epoch 003], loss: 7.8724, quantile: 7.8587, KL: 0.0274, activated: 0.0000
inner loop: 100% 67/67 [00:01<00:00, 36.67it/s]
[epoch 004], loss: 7.7954, quantile: 7.7685, KL: 0.0537, activated: 0.0000
inner loop: 100% 67/67 [00:01<00:00, 37.20it/s]
[epoch 005], loss: 7.7480, quantile: 7.6984, KL: 0.0993, activated: 0.0000
Shadow model 0 saved to ./assets/shadow/shadow_DistVAE_cre

In [None]:
!python shadow_attack.py --dataset credit

Using device: cuda:0
Running privacy attack for dataset: credit
Loaded shadow model 0
Loaded shadow training dataset 0
Shadow test dataset 0 is empty
No shadow test data available. Using a portion of shadow train data as test.
Created test set from training data with 3391 samples
Extracting latents from shadow train 0: 100% 53/53 [00:00<00:00, 310.62it/s]
Extracted 13562 latent vectors from shadow train 0
Extracting latents from shadow test 0: 100% 14/14 [00:00<00:00, 567.25it/s]
Extracted 3391 latent vectors from shadow test 0
Found 2 unique target classes
Class 0: 12561 in samples, 3163 out samples
Class 1: 1001 in samples, 228 out samples
Trained attacker for class 0
Trained attacker for class 1
Extracting latents from real train: 100% 67/67 [00:00<00:00, 589.56it/s]
Using 3391 samples from training as test set
Extracted target values with 2 unique classes
Train samples: 13562, Test samples: 3391

Membership Inference Attack Results:
Accuracy: 0.486
F1: 0.451
AUC: 0.486

Privacy eva

In [None]:
!python shadow_data.py --dataset loan

Using device: cuda:0
Generating shadow data for dataset: loan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A v

In [None]:
!python shadow_main.py --dataset loan

Configuration:
  seed: 1
  dataset: loan
  latent_dim: 2
  step: 0.1
  epochs: 5
  batch_size: 256
  lr: 0.001
  threshold: 1e-05
  beta: 0.5
  cuda: True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
Dataset dimensions - CRPS: 5, Softmax: 14
Expected input size: 19
Shadow dataset created with shape (4000, 19)
Loaded shadow dataset 0

Training 0th shadow model...

inner loop: 100% 16/16 [00:00<00:00, 22.63it/s]
[epoch 001], lo

In [None]:
!python shadow_attack.py --dataset loan

Using device: cuda:0
Running privacy attack for dataset: loan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A v

In [None]:
!python shadow_data.py --dataset cabs

Using device: cuda:0
Generating shadow data for dataset: cabs
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A v

In [None]:
!python shadow_main.py --dataset cabs

Configuration:
  seed: 1
  dataset: cabs
  latent_dim: 2
  step: 0.1
  epochs: 5
  batch_size: 256
  lr: 0.001
  threshold: 1e-05
  beta: 0.5
  cuda: True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
Dataset dimensions - CRPS: 6, Softmax: 44
Expected input size: 50
Shadow dataset created with shape (22329, 50)
Loaded shadow dataset 0

Training 0th shadow model...

inner loop: 100% 88/88 [00:02<00:00, 43.83it/s]
[epoch 001], l

In [None]:
!python shadow_attack.py --dataset cabs

Using device: cuda:0
Running privacy attack for dataset: cabs
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A v

In [None]:
!python shadow_data.py --dataset kings

Using device: cuda:0
Generating shadow data for dataset: kings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A 

In [None]:
!python shadow_main.py --dataset kings

Configuration:
  seed: 1
  dataset: kings
  latent_dim: 2
  step: 0.1
  epochs: 5
  batch_size: 256
  lr: 0.001
  threshold: 1e-05
  beta: 0.5
  cuda: True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
Dataset dimensions - CRPS: 11, Softmax: 73
Expected input size: 84
Shadow dataset created with shape (20000, 84)
Loaded shadow dataset 0

Training 0th shadow model...

inner loop: 100% 79/79 [00:02<00:00, 28.28it/s]
[epoch 001],

In [None]:
!python shadow_attack.py --dataset kings

Using device: cuda:0
Running privacy attack for dataset: kings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] /= self.std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[self.continuous] = df[self.continuous] - self.mean
A 

In [None]:
# Create a summary of all evaluation results
import pandas as pd
import os
import glob

# Find all evaluation files
result_files = glob.glob('./assets/results/*_evaluation.csv')

# Read and combine results
dfs = []
for file in result_files:
    df = pd.read_csv(file)
    dfs.append(df)

if dfs:
    all_results = pd.concat(dfs)

    # Create a summary table
    summary = all_results[['dataset', 'K-S_continuous', '1-WD_continuous',
                          'DCR_R&S', 'AD_F1_K10']]

    # Save summary
    summary.to_csv('./assets/results/summary.csv', index=False)

    # Display the summary
    print("Summary of evaluation results:")
    print(summary)
else:
    print("No evaluation results found")

Summary of evaluation results:
   dataset  K-S_continuous  1-WD_continuous   DCR_R&S  AD_F1_K10
0  covtype        0.043640         0.064336  0.810164   0.000000
0    kings        0.145727         0.122728  0.573692   0.127746
0     loan        0.128200         0.178420  0.314732   0.392122
0     cabs        0.055981         0.078423  0.401437   0.158693
0    adult        0.142070         0.151013  0.066024   0.146588
0   credit        0.128874         0.127680  0.841083   0.291246
