In [5]:
import pandas as pd

df_train = pd.read_csv('/content/drive/MyDrive/235713_신용카드 사용자 연체 예측 AI 경진대회_data/open/train.csv')
df_submission = pd.read_csv('/content/drive/MyDrive/235713_신용카드 사용자 연체 예측 AI 경진대회_data/open/sample_submission.csv')
df_test = pd.read_csv('/content/drive/MyDrive/235713_신용카드 사용자 연체 예측 AI 경진대회_data/open/test.csv')

In [4]:
df_train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [2]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.mixture import GaussianMixture
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [13]:
# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class ModeSpecificNormalization:
    def __init__(self, n_modes):
        self.n_modes = n_modes
        self.models = {}

    def fit(self, data, col_idx):
        """Fit VGM to the specified column data."""
        col_data = data[:, col_idx].reshape(-1, 1)
        gmm = GaussianMixture(n_components=self.n_modes)
        gmm.fit(col_data)
        self.models[col_idx] = gmm

    def transform(self, data):
        """Transform the data using mode-specific normalization."""
        transformed_data = []

        for col_idx, gmm in self.models.items():
            col_data = data[:, col_idx].cpu().numpy().reshape(-1, 1)  # CPU로 이동 후 NumPy 변환
            probs = gmm.predict_proba(col_data)
            modes = gmm.means_.reshape(-1)
            stds = gmm.covariances_.reshape(-1) ** 0.5

            mode_idx = torch.tensor(probs.argmax(axis=1), device=device)
            # 원-핫 인코딩된 모드 벡터 제외
            normalized_values = (col_data.squeeze() - modes[mode_idx.cpu().numpy()]) / stds[mode_idx.cpu().numpy()]

            transformed_data.append(torch.tensor(normalized_values, device=device).unsqueeze(1))

        transformed_data = torch.cat(transformed_data, dim=1)
        return transformed_data

class ConditionalGenerator(nn.Module):
    def __init__(self, noise_dim, condition_dim, output_dim):
        super(ConditionalGenerator, self).__init__()
        self.fc1 = nn.Linear(noise_dim + condition_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, output_dim)
        self.tanh = nn.Tanh()
        self.drop = nn.Dropout(0.2)

    def forward(self, noise, cond):
        x = torch.cat([noise, cond], dim=1)
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.drop(x)
        x = self.tanh(self.fc3(x))
        return x

class Discriminator(nn.Module):
    def __init__(self, input_dim, condition_dim):
        super(Discriminator, self).__init__()
        self.fc1 = nn.Linear(input_dim + condition_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)
        self.drop = nn.Dropout(0.2)
        self.leaky_relu = nn.LeakyReLU(0.2)

    def forward(self, x, cond):
        x = torch.cat([x, cond], dim=1)
        x = self.leaky_relu(self.fc1(x))
        x = self.drop(x)
        x = self.leaky_relu(self.fc2(x))
        x = self.fc3(x)
        return x

# WGAN-GP Loss functions
def gradient_penalty(discriminator, real_data, fake_data, conditions):
    alpha = torch.rand(real_data.size(0), 1, device=device)
    interpolates = (alpha * real_data + (1 - alpha) * fake_data).requires_grad_(True)
    disc_interpolates = discriminator(interpolates, conditions)
    gradients = torch.autograd.grad(outputs=disc_interpolates, inputs=interpolates,
                                    grad_outputs=torch.ones(disc_interpolates.size(), device=device),
                                    create_graph=True, retain_graph=True, only_inputs=True)[0]
    gradients = gradients.view(gradients.size(0), -1)
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * 10
    return gradient_penalty


Using device: cuda


In [23]:
# Load data
df_train = pd.read_csv('/content/drive/MyDrive/235713_신용카드 사용자 연체 예측 AI 경진대회_data/open/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/235713_신용카드 사용자 연체 예측 AI 경진대회_data/open/test.csv')
df_submission = pd.read_csv('/content/drive/MyDrive/235713_신용카드 사용자 연체 예측 AI 경진대회_data/open/sample_submission.csv')

# Data preprocessing
continuous_cols = ['income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'family_size', 'begin_month']
categorical_cols = ['gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']
target_col = 'credit'

data = df_train.drop(columns=[target_col])
target_data = df_train[target_col]

# Impute missing values
imputer_categorical = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = imputer_categorical.fit_transform(data[categorical_cols])
df_test[categorical_cols] = imputer_categorical.transform(df_test[categorical_cols])

print(data.isna().sum().sum())

scaler = StandardScaler()
continuous_data = scaler.fit_transform(data[continuous_cols])
continuous_data_test = scaler.transform(df_test[continuous_cols])

encoder = OneHotEncoder(sparse_output=False)
categorical_data = encoder.fit_transform(data[categorical_cols])
categorical_data_test = encoder.transform(df_test[categorical_cols])

# Prepare the ModeSpecificNormalization
normalizer = ModeSpecificNormalization(n_modes=3)
for col_idx in range(len(continuous_cols)):
    normalizer.fit(continuous_data, col_idx)

normalized_data = normalizer.transform(torch.tensor(continuous_data, dtype=torch.float))
normalized_data_test = normalizer.transform(torch.tensor(continuous_data_test, dtype=torch.float))

# Convert to torch tensors
normalized_data = normalized_data.clone().detach().to(device).float()
categorical_data = torch.tensor(categorical_data, dtype=torch.float, device=device)
combined_data = torch.cat([normalized_data, categorical_data], dim=1).float()

normalized_data_test = normalized_data_test.clone().detach().to(device).float()
categorical_data_test = torch.tensor(categorical_data_test, dtype=torch.float, device=device)
combined_data_test = torch.cat([normalized_data_test, categorical_data_test], dim=1).float()

# Prepare conditions
conditions = torch.tensor(target_data.values, dtype=torch.long, device=device)
conditions = F.one_hot(conditions, num_classes=3).float()

print("Combined data shape (train):", combined_data.shape)
print("Combined data shape (test):", combined_data_test.shape)

0
Combined data shape (train): torch.Size([26457, 50])
Combined data shape (test): torch.Size([10000, 50])


In [24]:
# Training parameters
noise_dim = 10
condition_dim = conditions.shape[1]
output_dim = combined_data.shape[1]
batch_size = 64
epochs = 1000
learning_rate = 0.0001

# Initialize generator and discriminator
generator = ConditionalGenerator(noise_dim, condition_dim, output_dim).to(device)
discriminator = Discriminator(output_dim, condition_dim).to(device)

# Optimizers
optimizer_g = optim.Adam(generator.parameters(), lr=learning_rate, betas=(0.5, 0.9))
optimizer_d = optim.Adam(discriminator.parameters(), lr=learning_rate, betas=(0.5, 0.9))

# Compute class sampling weights to handle class imbalance
class_counts = target_data.value_counts().sort_index().values
class_weights = (1 / class_counts) / np.sum(1 / class_counts)

# Extend class weights to match the size of the conditions array
condition_weights = class_weights[conditions.argmax(dim=1).cpu().numpy()]
condition_weights = condition_weights / condition_weights.sum()

# Training loop
for epoch in range(epochs):
    for _ in range(5):  # Train discriminator more
        # Generate fake data
        noise = torch.randn(batch_size, noise_dim, device=device)
        condition_indices = np.random.choice(len(conditions), size=batch_size, p=condition_weights)
        sampled_conditions = conditions[condition_indices]
        fake_data = generator(noise, sampled_conditions)

        # Real data (replace this with your actual tabular data)
        real_data = combined_data[condition_indices]

        # Train Discriminator
        optimizer_d.zero_grad()
        real_output = discriminator(real_data, sampled_conditions)
        fake_output = discriminator(fake_data.detach(), sampled_conditions)

        d_loss_real = -torch.mean(real_output)
        d_loss_fake = torch.mean(fake_output)

        gp = gradient_penalty(discriminator, real_data, fake_data, sampled_conditions)
        d_loss = d_loss_real + d_loss_fake + gp

        d_loss.backward(retain_graph=True)
        optimizer_d.step()

    # Train Generator
    noise = torch.randn(batch_size, noise_dim, device=device)
    condition_indices = np.random.choice(len(conditions), size=batch_size, p=condition_weights)
    sampled_conditions = conditions[condition_indices]
    fake_data = generator(noise, sampled_conditions)

    optimizer_g.zero_grad()
    fake_output = discriminator(fake_data, sampled_conditions)
    g_loss = -torch.mean(fake_output)

    g_loss.backward()
    optimizer_g.step()

    if epoch % 100 == 0:
        print(f'Epoch [{epoch}/{epochs}], d_loss: {d_loss.item()}, g_loss: {g_loss.item()}')

print("Training finished.")

Epoch [0/1000], d_loss: 7.476580619812012, g_loss: 0.019571388140320778
Epoch [100/1000], d_loss: -0.45576077699661255, g_loss: -1.0589337348937988
Epoch [200/1000], d_loss: -0.571284830570221, g_loss: -0.15224477648735046
Epoch [300/1000], d_loss: -0.5501803755760193, g_loss: 0.08545323461294174
Epoch [400/1000], d_loss: -0.39934515953063965, g_loss: 0.03738342970609665
Epoch [500/1000], d_loss: -0.40719279646873474, g_loss: -0.17261779308319092
Epoch [600/1000], d_loss: -0.4385802149772644, g_loss: -0.32440847158432007
Epoch [700/1000], d_loss: -0.46238023042678833, g_loss: -0.29440826177597046
Epoch [800/1000], d_loss: -0.5487584471702576, g_loss: -0.19212721288204193
Epoch [900/1000], d_loss: -0.47673773765563965, g_loss: -0.31255751848220825
Training finished.


In [25]:
# Initialize generator and discriminator
generator = ConditionalGenerator(noise_dim, condition_dim, output_dim).to(device)
discriminator = Discriminator(output_dim, condition_dim).to(device)

# 생성된 데이터 샘플링 함수
def generate_samples(generator, num_samples, noise_dim, condition_dim, device):
    noise = torch.randn(num_samples, noise_dim, device=device)
    # 조건부 벡터를 원-핫 인코딩된 형태로 생성
    condition_indices = torch.randint(0, condition_dim, (num_samples,), device=device)
    conditions = F.one_hot(condition_indices, num_classes=condition_dim).float()
    with torch.no_grad():
        samples = generator(noise, conditions)
    # category_indices를 numpy 배열로 변환하여 샘플의 마지막 열로 추가
    category_indices = condition_indices.cpu().numpy().reshape(-1, 1)
    samples = samples.cpu().numpy()
    # 기존 샘플에 조건부 벡터 추가
    samples_with_labels = np.hstack((samples, category_indices))
    return samples_with_labels

# 샘플 생성
num_samples = 10
generated_samples = generate_samples(generator, num_samples, noise_dim, condition_dim, device)

print("Generated samples shape:", generated_samples.shape)

Generated samples shape: (10, 51)


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal, Categorical

class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(Encoder, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc_mu = nn.Linear(hidden_dim, hidden_dim)
        self.fc_logvar = nn.Linear(hidden_dim, hidden_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        h1 = self.relu(self.fc1(x))
        h2 = self.relu(self.fc2(h1))
        mu = self.fc_mu(h2)
        logvar = self.fc_logvar(h2)
        return mu, logvar

class Decoder(nn.Module):
    def __init__(self, latent_dim, hidden_dim, output_dim_cont, output_dim_cat):
        super(Decoder, self).__init__()
        self.fc1 = nn.Linear(latent_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

        # Continuous variables
        self.fc_mu = nn.Linear(hidden_dim, output_dim_cont)
        self.fc_logvar = nn.Linear(hidden_dim, output_dim_cont)

        # Categorical variables
        self.fc_alpha = nn.Linear(hidden_dim, output_dim_cat)
        self.fc_beta = nn.Linear(hidden_dim, output_dim_cat)
        self.fc_delta = nn.Linear(hidden_dim, output_dim_cat)

        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, z):
        h1 = self.relu(self.fc1(z))
        h2 = self.relu(self.fc2(h1))

        # Continuous
        mu_cont = self.fc_mu(h2)
        logvar_cont = self.fc_logvar(h2)

        # Categorical
        alpha = self.tanh(self.fc_alpha(h2))
        beta = self.softmax(self.fc_beta(h2))
        delta = self.softmax(self.fc_delta(h2))

        return mu_cont, logvar_cont, alpha, beta, delta

class TVAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim, output_dim_cont, output_dim_cat):
        super(TVAE, self).__init__()
        self.encoder = Encoder(input_dim, hidden_dim)
        self.decoder = Decoder(latent_dim, hidden_dim, output_dim_cont, output_dim_cat)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        mu, logvar = self.encoder(x)
        z = self.reparameterize(mu, logvar)
        mu_cont, logvar_cont, alpha, beta, delta = self.decoder(z)
        return mu_cont, logvar_cont, alpha, beta, delta, mu, logvar

def loss_function(recon_x_cont, x_cont, recon_alpha, x_alpha, recon_beta, x_beta, mu, logvar):
    # Reconstruction loss for continuous variables
    recon_loss_cont = nn.functional.mse_loss(recon_x_cont, x_cont, reduction='sum')

    # Reconstruction loss for categorical variables
    recon_loss_alpha = nn.functional.cross_entropy(recon_alpha, x_alpha, reduction='sum')
    recon_loss_beta = nn.functional.cross_entropy(recon_beta, x_beta, reduction='sum')

    # KL divergence loss
    kld_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    return recon_loss_cont + recon_loss_alpha + recon_loss_beta + kld_loss

def train(model, data_loader, epochs, learning_rate):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for batch in data_loader:
            optimizer.zero_grad()

            x_cont, x_alpha, x_beta = batch
            recon_x_cont, logvar_cont, recon_alpha, recon_beta, recon_delta, mu, logvar = model(x_cont)

            loss = loss_function(recon_x_cont, x_cont, recon_alpha, x_alpha, recon_beta, x_beta, mu, logvar)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(data_loader.dataset)}')


In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np
import torch.nn.functional as F

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the dataset
df_train = pd.read_csv('/content/drive/MyDrive/235713_신용카드 사용자 연체 예측 AI 경진대회_data/open/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/235713_신용카드 사용자 연체 예측 AI 경진대회_data/open/test.csv')
df_submission = pd.read_csv('/content/drive/MyDrive/235713_신용카드 사용자 연체 예측 AI 경진대회_data/open/sample_submission.csv')

# Data preprocessing
continuous_cols = ['income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'family_size', 'begin_month']
categorical_cols = ['gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']
target_col = 'credit'

data = df_train.drop(columns=[target_col])
target_data = df_train[target_col]

# Impute missing values
imputer_categorical = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = imputer_categorical.fit_transform(data[categorical_cols])
df_test[categorical_cols] = imputer_categorical.transform(df_test[categorical_cols])

print(data.isna().sum().sum())

scaler = StandardScaler()
continuous_data = scaler.fit_transform(data[continuous_cols])
continuous_data_test = scaler.transform(df_test[continuous_cols])

encoder = OneHotEncoder(sparse_output=False)
categorical_data = encoder.fit_transform(data[categorical_cols])
categorical_data_test = encoder.transform(df_test[categorical_cols])

# Convert to torch tensors
continuous_data = torch.tensor(continuous_data, dtype=torch.float, device=device)
categorical_data = torch.tensor(categorical_data, dtype=torch.float, device=device)
combined_data = torch.cat([continuous_data, categorical_data], dim=1).float()

continuous_data_test = torch.tensor(continuous_data_test, dtype=torch.float, device=device)
categorical_data_test = torch.tensor(categorical_data_test, dtype=torch.float, device=device)
combined_data_test = torch.cat([continuous_data_test, categorical_data_test], dim=1).float()

# Prepare conditions
conditions = torch.tensor(target_data.values, dtype=torch.long, device=device)
conditions = F.one_hot(conditions, num_classes=3).float()

print("Combined data shape (train):", combined_data.shape)
print("Combined data shape (test):", combined_data_test.shape)

# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, data, conditions):
        self.data = data
        self.conditions = conditions

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.conditions[idx]

train_dataset = CustomDataset(combined_data, conditions)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Define Encoder and Decoder
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(Encoder, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        h1 = self.relu(self.fc1(x))
        h2 = self.relu(self.fc2(h1))
        mu = self.fc_mu(h2)
        logvar = self.fc_logvar(h2)
        return mu, logvar

class Decoder(nn.Module):
    def __init__(self, latent_dim, hidden_dim, output_dim_cont, output_dim_cat):
        super(Decoder, self).__init__()
        self.fc1 = nn.Linear(latent_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

        # Continuous variables
        self.fc_mu = nn.Linear(hidden_dim, output_dim_cont)
        self.fc_logvar = nn.Linear(hidden_dim, output_dim_cont)

        # Categorical variables
        self.fc_cat = nn.Linear(hidden_dim, output_dim_cat)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, z):
        h1 = self.relu(self.fc1(z))
        h2 = self.relu(self.fc2(h1))

        # Continuous
        mu_cont = self.fc_mu(h2)
        logvar_cont = self.fc_logvar(h2)

        # Categorical
        cat_logits = self.fc_cat(h2)

        return mu_cont, logvar_cont, cat_logits


class TVAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim, output_dim_cont, output_dim_cat):
        super(TVAE, self).__init__()
        self.encoder = Encoder(input_dim, hidden_dim, latent_dim)
        self.decoder = Decoder(latent_dim, hidden_dim, output_dim_cont, output_dim_cat)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        mu, logvar = self.encoder(x)
        z = self.reparameterize(mu, logvar)
        mu_cont, logvar_cont, cat_logits = self.decoder(z)
        return mu_cont, logvar_cont, cat_logits, mu, logvar


def loss_function(recon_x_cont, x_cont, recon_cat, x_cat, mu, logvar):
    recon_loss_cont = nn.functional.mse_loss(recon_x_cont, x_cont, reduction='sum')
    recon_loss_cat = nn.functional.cross_entropy(recon_cat, x_cat.argmax(dim=1), reduction='sum')
    kld_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss_cont + recon_loss_cat + kld_loss



def train(model, data_loader, epochs, learning_rate):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for batch in data_loader:
            optimizer.zero_grad()
            x, conditions = batch
            recon_x_cont, logvar_cont, recon_cat, mu, logvar = model(x)
            loss = loss_function(recon_x_cont, x[:, :output_dim_cont], recon_cat, x[:, output_dim_cont:], mu, logvar)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(data_loader.dataset)}')

input_dim = combined_data.shape[1]  # Should be 50 (5 continuous + 45 categorical)
hidden_dim = 128
latent_dim = 64
output_dim_cont = 5  # Number of continuous features
output_dim_cat = 45  # Number of categorical features

model = TVAE(input_dim, hidden_dim, latent_dim, output_dim_cont, output_dim_cat).to(device)
train(model, train_loader, epochs=10, learning_rate=1e-2)

0
Combined data shape (train): torch.Size([26457, 50])
Combined data shape (test): torch.Size([10000, 50])
Epoch 1, Loss: 5.253577198369273
Epoch 2, Loss: 4.787064900333191
Epoch 3, Loss: 4.541385061063116
Epoch 4, Loss: 4.517228416301282
Epoch 5, Loss: 4.491863594324801
Epoch 6, Loss: 4.5306490596677955
Epoch 7, Loss: 4.497359748491693
Epoch 8, Loss: 4.489550388259908
Epoch 9, Loss: 4.469125096181643
Epoch 10, Loss: 4.471826140615735


In [32]:
def generate_synthetic_data(model, num_samples):
    model.eval()
    with torch.no_grad():
        z = torch.randn(num_samples, latent_dim).to(device)
        mu_cont, logvar_cont, cat_logits = model.decoder(z)

        # For continuous variables
        std_cont = torch.exp(0.5 * logvar_cont)
        eps_cont = torch.randn_like(std_cont)
        generated_cont = mu_cont + eps_cont * std_cont

        # For categorical variables
        generated_cat = cat_logits

        # Concatenate all generated data
        print(generated_cont.shape, generated_cat.shape)
        generated_data = torch.cat([generated_cont, generated_cat], dim=1)

        return generated_data.cpu().numpy()

# Generate new synthetic data
num_samples = 10
synthetic_data = generate_synthetic_data(model, num_samples)
print(synthetic_data.shape)  # 출력 형태 확인

torch.Size([10, 5]) torch.Size([10, 45])
(10, 50)


In [33]:
synthetic_data

array([[ 1.93938780e+00,  1.16848552e+00,  3.10837626e-02,
         1.25575042e+00, -8.73213828e-01,  4.18477154e+00,
         2.29797959e+00, -4.17949982e+01, -4.10353279e+01,
        -4.21837196e+01, -3.96843643e+01, -4.08985481e+01,
        -4.19957314e+01, -4.11915054e+01, -4.13144608e+01,
        -4.02735710e+01, -4.13967628e+01, -4.24532166e+01,
        -3.81071777e+01, -4.46095161e+01, -4.07484818e+01,
        -3.99831314e+01, -4.16856384e+01, -4.09219360e+01,
        -4.02073936e+01, -3.98976898e+01, -3.94478989e+01,
        -4.01754456e+01, -4.07278214e+01, -3.97370682e+01,
        -3.86423454e+01, -4.16986771e+01, -4.02304802e+01,
        -3.96603279e+01, -4.14991226e+01, -4.23510551e+01,
        -4.04643326e+01, -4.00885010e+01, -3.96647415e+01,
        -4.05934334e+01, -4.16506577e+01, -3.94071503e+01,
        -4.11521988e+01, -3.80369225e+01, -4.08037872e+01,
        -4.03212280e+01, -4.11377106e+01, -4.19138603e+01,
        -4.26980019e+01, -3.98244781e+01],
       [ 2.40