In [None]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

In [18]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 

In [25]:
X = adult.data.features
y = adult.data.targets

In [189]:
X

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States
48838,64,,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States
48839,38,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States
48840,44,Private,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States


In [44]:
distinct_values = {col: X[col].unique() for col in X.columns}

# Optional: print in readable format
for col, values in distinct_values.items():
    print(f"{col} size: {len(values)}")

age size: 74
workclass size: 10
fnlwgt size: 28523
education size: 16
education-num size: 16
marital-status size: 7
occupation size: 16
relationship size: 6
race size: 5
sex size: 2
capital-gain size: 123
capital-loss size: 99
hours-per-week size: 96
native-country size: 43


All except fnlwgt are categorical.

In [46]:
X.drop('fnlwgt', inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop('fnlwgt', inplace=True, axis=1)


In [48]:
discrete_columns = X.columns
discrete_columns

Index(['age', 'workclass', 'education', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

In [191]:
from ctgan.data_transformer import DataTransformer

transformer = DataTransformer()
transformer.fit(X, discrete_columns)
train_data = transformer.transform(X)

train_data

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [192]:
transformer.output_info_list

[[SpanInfo(dim=74, activation_fn='softmax')],
 [SpanInfo(dim=10, activation_fn='softmax')],
 [SpanInfo(dim=16, activation_fn='softmax')],
 [SpanInfo(dim=16, activation_fn='softmax')],
 [SpanInfo(dim=7, activation_fn='softmax')],
 [SpanInfo(dim=16, activation_fn='softmax')],
 [SpanInfo(dim=6, activation_fn='softmax')],
 [SpanInfo(dim=5, activation_fn='softmax')],
 [SpanInfo(dim=2, activation_fn='softmax')],
 [SpanInfo(dim=123, activation_fn='softmax')],
 [SpanInfo(dim=99, activation_fn='softmax')],
 [SpanInfo(dim=96, activation_fn='softmax')],
 [SpanInfo(dim=43, activation_fn='softmax')]]

In [193]:
from ctgan.data_sampler import DataSampler

data_sampler = DataSampler(train_data, transformer.output_info_list, log_frequency=True)

In [194]:
c1, m1, col, opt = data_sampler.sample_condvec(3)
print(c1.shape, m1.shape, col, opt)

(3, 513) (3, 13) [7 5 7] [1 9 0]


In [167]:
class Discriminator(nn.Module):
    """Discriminator for the CTGAN."""

    def __init__(self, input_dim, discriminator_dim, pac=10):
        super(Discriminator, self).__init__()
        dim = input_dim * pac
        self.pac = pac
        self.pacdim = dim
        seq = []
        for item in list(discriminator_dim):
            seq += [nn.Linear(dim, item), nn.LeakyReLU(0.2), nn.Dropout(0.5)]
            dim = item

        seq += [nn.Linear(dim, 1)]
        self.seq = nn.Sequential(*seq)

    def calc_gradient_penalty(self, real_data, fake_data, device='cpu', pac=10, lambda_=10):
        """Compute the gradient penalty."""
        alpha = torch.rand(real_data.size(0) // pac, 1, 1, device=device)
        alpha = alpha.repeat(1, pac, real_data.size(1))
        alpha = alpha.view(-1, real_data.size(1))

        interpolates = alpha * real_data + ((1 - alpha) * fake_data)

        disc_interpolates = self(interpolates)

        gradients = torch.autograd.grad(
            outputs=disc_interpolates,
            inputs=interpolates,
            grad_outputs=torch.ones(disc_interpolates.size(), device=device),
            create_graph=True,
            retain_graph=True,
            only_inputs=True,
        )[0]

        gradients_view = gradients.view(-1, pac * real_data.size(1)).norm(2, dim=1) - 1
        gradient_penalty = ((gradients_view) ** 2).mean() * lambda_

        return gradient_penalty

    def forward(self, input_):
        """Apply the Discriminator to the `input_`."""
        assert input_.size()[0] % self.pac == 0
        return self.seq(input_.view(-1, self.pacdim))


# Bayesian Residual Layer
class BayesianResidual(nn.Module):
    def __init__(self, i, o):
        super(BayesianResidual, self).__init__()
        self.fc_mu = nn.Linear(i, o)
        self.fc_logvar = nn.Linear(i, o)
        self.bn = nn.BatchNorm1d(o)
        self.relu = nn.ReLU()

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, input_):
        mu = self.fc_mu(input_)
        logvar = self.fc_logvar(input_)
        out = self.reparameterize(mu, logvar)
        out = self.bn(out)
        out = self.relu(out)
        return torch.cat([out, input_], dim=1)

    def kl_divergence(self):
        kl = 0
        for param_mu, param_logvar in [(self.fc_mu.weight, self.fc_logvar.weight), (self.fc_mu.bias, self.fc_logvar.bias)]:
            kl += -0.5 * torch.sum(1 + param_logvar - param_mu.pow(2) - param_logvar.exp())
        return kl

# Bayesian Generator
class BayesianGenerator(nn.Module):
    def __init__(self, embedding_dim, generator_dim, data_dim):
        super(BayesianGenerator, self).__init__()
        dim = embedding_dim
        seq = []
        for item in list(generator_dim):
            seq += [BayesianResidual(dim, item)]
            dim += item
        self.residuals = nn.ModuleList(seq)
        self.final_mu = nn.Linear(dim, data_dim)
        self.final_logvar = nn.Linear(dim, data_dim)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, input_):
        x = input_
        #print(x.shape)
        for residual in self.residuals:
            x = residual(x)
        mu = self.final_mu(x)
        logvar = self.final_logvar(x)
        data = self.reparameterize(mu, logvar)
        return data

    def kl_divergence(self):
        kl = 0
        for residual in self.residuals:
            kl += residual.kl_divergence()
        for param_mu, param_logvar in [(self.final_mu.weight, self.final_logvar.weight), (self.final_mu.bias, self.final_logvar.bias)]:
            kl += -0.5 * torch.sum(1 + param_logvar - param_mu.pow(2) - param_logvar.exp())
        return kl

# Apply Activate (unchanged)
def _apply_activate(data, transformer):
    """Apply proper activation function to the output of the generator."""
    data_t = []
    st = 0
    for column_info in transformer.output_info_list:
        for span_info in column_info:
            if span_info.activation_fn == 'tanh':
                ed = st + span_info.dim
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif span_info.activation_fn == 'softmax':
                ed = st + span_info.dim
                transformed = F.gumbel_softmax(data[:, st:ed], tau=0.2)
                data_t.append(transformed)
                st = ed
            else:
                raise ValueError(f'Unexpected activation function {span_info.activation_fn}.')
    return torch.cat(data_t, dim=1)

# Cond Loss (unchanged)
def cond_loss(data, c, m, transformer):
    """Compute the cross entropy loss on the fixed discrete column."""
    loss = []
    st = 0
    st_c = 0
    for column_info in transformer.output_info_list:
        for span_info in column_info:
            if len(column_info) != 1 or span_info.activation_fn != 'softmax':
                # not discrete column
                st += span_info.dim
            else:
                ed = st + span_info.dim
                ed_c = st_c + span_info.dim
                tmp = F.cross_entropy(
                    data[:, st:ed], torch.argmax(c[:, st_c:ed_c], dim=1), reduction='none'
                )
                loss.append(tmp)
                st = ed
                st_c = ed_c
    loss = torch.stack(loss, dim=1)
    return (loss * m).sum() / data.size()[0]

def gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
    return F.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim)

batch_size = 500
embedding_dim = 128
generator_dim = (256, 256)
discriminator_dim = (256, 256)
data_dim = transformer.output_dimensions
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

generator = BayesianGenerator(embedding_dim + data_sampler.dim_cond_vec(), generator_dim, data_dim).to(device)
discriminator = Discriminator(data_dim + data_sampler.dim_cond_vec(), discriminator_dim).to(device)

optimizerG = torch.optim.Adam(
            generator.parameters(),
            lr=2e-4,
            betas=(0.5, 0.9),
            weight_decay=1e-6
        )

optimizerD = torch.optim.Adam(
            discriminator.parameters(),
            lr=2e-4,
            betas=(0.5, 0.9),
            weight_decay=1e-6
        )

mean = torch.zeros(batch_size, embedding_dim, device=device)
std = mean + 1

# Training Loop
num_epochs = 300
discriminator_steps = 1
steps_per_epoch = max(len(train_data) // batch_size, 1)
kl_weight = 0.005  # Added for Bayesian regularization


for i in range(num_epochs):
    for id_ in range(steps_per_epoch):
        # Discriminator Training
        for n in range(discriminator_steps):
            fakez = torch.normal(mean=mean, std=std).to(device)

            condvec = data_sampler.sample_condvec(batch_size)
            if condvec is None:
                c1, m1, col, opt = None, None, None, None
                real = data_sampler.sample_data(train_data, batch_size, col, opt)
            else:
                c1, m1, col, opt = condvec
                c1 = torch.from_numpy(c1).to(device)
                m1 = torch.from_numpy(m1).to(device)
                #print(fakez.shape, c1.shape)
                fakez = torch.cat([fakez, c1], dim=1)

                perm = np.arange(batch_size)
                np.random.shuffle(perm)
                real = data_sampler.sample_data(train_data, batch_size, col[perm], opt[perm])
                c2 = c1[perm]
            fake = generator(fakez)
            fakeact = _apply_activate(fake, transformer=transformer)

            real = torch.from_numpy(real.astype('float32')).to(device)

            if c1 is not None:
                fake_cat = torch.cat([fakeact, c1], dim=1)
                real_cat = torch.cat([real, c2], dim=1)
            else:
                real_cat = real
                fake_cat = fakeact

            y_fake = discriminator(fake_cat)
            y_real = discriminator(real_cat)

            pen = discriminator.calc_gradient_penalty(real_cat, fake_cat, device, pac=10)
            loss_d = -(torch.mean(y_real) - torch.mean(y_fake))

            optimizerD.zero_grad(set_to_none=False)
            pen.backward(retain_graph=True)
            loss_d.backward()
            optimizerD.step()

        # Generator Training
        fakez = torch.normal(mean=mean, std=std).to(device)
        condvec = data_sampler.sample_condvec(batch_size)

        if condvec is None:
            c1, m1, col, opt = None, None, None, None
        else:
            c1, m1, col, opt = condvec
            c1 = torch.from_numpy(c1).to(device)
            m1 = torch.from_numpy(m1).to(device)
            fakez = torch.cat([fakez, c1], dim=1)

        fake = generator(fakez)
        fakeact = _apply_activate(fake, transformer=transformer)

        if c1 is not None:
            y_fake = discriminator(torch.cat([fakeact, c1], dim=1))
        else:
            y_fake = discriminator(fakeact)

        if condvec is None:
            cross_entropy = 0
        else:
            cross_entropy = cond_loss(fake, c1, m1, transformer=transformer)

        kl_div = generator.kl_divergence()  # Bayesian KL divergence term
        loss_g = -torch.mean(y_fake) + cross_entropy + kl_weight * kl_div

        optimizerG.zero_grad(set_to_none=False)
        loss_g.backward()
        optimizerG.step()

    generator_loss = loss_g.detach().cpu().item()
    discriminator_loss = loss_d.detach().cpu().item()
    kl_div_value = kl_div.detach().cpu().item()

    print(f"Epoch [{i+1}/{num_epochs}], D Loss: {discriminator_loss:.4f}, G Loss: {generator_loss:.4f}, KL Div: {kl_div_value:.4f}")

Epoch [1/300], D Loss: -1.1629, G Loss: 1.2222, KL Div: 117.2592
Epoch [2/300], D Loss: 0.0464, G Loss: 1.5376, KL Div: 89.6965
Epoch [3/300], D Loss: 0.0601, G Loss: 1.2585, KL Div: 76.8216
Epoch [4/300], D Loss: 0.0739, G Loss: 1.6931, KL Div: 67.9400
Epoch [5/300], D Loss: 0.1331, G Loss: 1.6046, KL Div: 62.2727
Epoch [6/300], D Loss: 0.0905, G Loss: 1.4813, KL Div: 59.1348
Epoch [7/300], D Loss: -0.0828, G Loss: 1.7664, KL Div: 58.4815
Epoch [8/300], D Loss: -0.0865, G Loss: 1.9000, KL Div: 60.3811
Epoch [9/300], D Loss: 0.0575, G Loss: 1.6614, KL Div: 65.1289
Epoch [10/300], D Loss: -0.1302, G Loss: 1.6682, KL Div: 72.5939
Epoch [11/300], D Loss: -0.0907, G Loss: 1.4978, KL Div: 80.8934
Epoch [12/300], D Loss: -0.0232, G Loss: 1.2334, KL Div: 86.8840
Epoch [13/300], D Loss: 0.1047, G Loss: 1.1794, KL Div: 90.2241
Epoch [14/300], D Loss: -0.0516, G Loss: 0.5057, KL Div: 91.6800
Epoch [15/300], D Loss: 0.0612, G Loss: 1.2616, KL Div: 92.0751
Epoch [16/300], D Loss: -0.2855, G Loss: 

In [None]:
def sample(n, transformer, generator, batch_size, condition_column=None, condition_value=None):
    """Sample data similar to the training data.

    Choosing a condition_column and condition_value will increase the probability of the
    discrete condition_value happening in the condition_column.

    Args:
        n (int):
            Number of rows to sample.
        condition_column (string):
            Name of a discrete column.
        condition_value (string):
            Name of the category in the condition_column which we wish to increase the
            probability of happening.

    Returns:
        numpy.ndarray or pandas.DataFrame
    """
    if condition_column is not None and condition_value is not None:
        condition_info = transformer.convert_column_name_value_to_id(
            condition_column, condition_value
        )
        global_condition_vec = data_sampler.generate_cond_from_condition_column_info(
            condition_info, batch_size
        )
    else:
        global_condition_vec = None

    steps = n // batch_size + 1
    data = []
    for i in range(steps):
        mean = torch.zeros(batch_size, embedding_dim)
        std = mean + 1
        fakez = torch.normal(mean=mean, std=std).to(device)

        if global_condition_vec is not None:
            condvec = global_condition_vec.copy()
        else:
            condvec = data_sampler.sample_original_condvec(batch_size)

        if condvec is None:
            pass
        else:
            c1 = condvec
            c1 = torch.from_numpy(c1).to(device)
            fakez = torch.cat([fakez, c1], dim=1)

        with torch.no_grad():
            outputs = [generator(fakez).cpu().numpy() for _ in range(10)]
            variance = np.var(outputs, axis=0)
            print(f"Output variance: {variance.mean():.4f}")
            
        fake = generator(fakez)
        fakeact = _apply_activate(fake, transformer=transformer)
        data.append(fakeact.detach().cpu().numpy())

    data = np.concatenate(data, axis=0)
    data = data[:n]

    return transformer.inverse_transform(data)

sample(4, transformer=transformer, generator=generator, batch_size=batch_size)

Output variance: 3.4168


Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,43,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Wife,White,Female,0,0,40,United-States
1,35,Private,HS-grad,9,Divorced,Prof-specialty,Husband,White,Female,0,2603,70,United-States
2,34,Private,HS-grad,9,Divorced,Transport-moving,Not-in-family,White,Female,0,0,60,United-States
3,40,Private,7th-8th,3,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
