In [1]:
import os
import inspect
import pandas as pd

from utils import encode_preprocess
from katabatic.utils.split_dataset import split_dataset

from katabatic.models.medgan.models import MEDGAN
from katabatic.models.tabsyn.models import TabSyn
from katabatic.models.ganblr.models import GANBLR
from katabatic.models.pategan.models import PATEGAN
from katabatic.models.codi.models import CODI

from katabatic.evaluate.tstr.evaluation import TSTREvaluation

# Make sure we're in the repo root.
# The checkout location is machine-specific, so it can be overridden with the
# KATABATIC_ROOT environment variable. When the variable is unset the original
# path is used, so the behaviour on the author's machine is unchanged.
# NOTE(review): the project imports above run before this chdir, so they are
# resolved relative to wherever the kernel was started, not this directory.
REPO_ROOT = os.environ.get(
    "KATABATIC_ROOT",
    "/Users/sindhujaghosh/Downloads/Data_bytes_ketabatics/Katabatic_copy",
)
print(os.getcwd())
os.chdir(REPO_ROOT)  # raises FileNotFoundError if the directory does not exist
print("CWD:", os.getcwd())


  from .autonotebook import tqdm as notebook_tqdm


/Users/sindhujaghosh/Downloads/Data_bytes_ketabatics/Katabatic_copy
CWD: /Users/sindhujaghosh/Downloads/Data_bytes_ketabatics/Katabatic_copy


In [2]:
# Sanity check: show what sits in the working directory and in raw_data/.
for heading, folder in (("Top-level:", "."), ("\nraw_data contents:", "raw_data")):
    print(heading)
    print(os.listdir(folder))


Top-level:
['train_test_data', 'trainer_great', 'Untitled1.ipynb', '.DS_Store', 'LICENSE', 'synthetic_data', 'dev_deps.py', 'Untitled.ipynb', 'Makefile', 'pyproject.toml', 'raw_data', '__pycache__', 'MODEL_CONTRIBUTIONS.md', 'README.md', 'example.ipynb', 'Results', 'sample_data', '.gitignore', 'utils.py', 'CONTRIBUTING.md', 'examples', 'scripts', 'synthetic', '.python-version', '.ipynb_checkpoints', 'poetry.lock', '.git', 'main.py', 'encoded_data', 'discretized_data', 'katabatic']

raw_data contents:
['adult.csv', 'magic.csv', 'Untitled.ipynb', 'car.csv', 'shuttle.csv', '.ipynb_checkpoints', 'car1.csv', 'nursery.csv']


In [3]:
# Step 1 - run encode_preprocess on the raw adult CSV, writing the result
# to the encoded_data folder.
raw_path, encoded_path = "raw_data/adult.csv", "encoded_data/adult.csv"
encode_preprocess(raw_path, encoded_path)
print("Encoded dataset saved to:", encoded_path)

# Step 2 - split the encoded CSV into train/test files under output_dir.
# output_dir is reused by later cells to load the splits.
output_dir = "train_test_data"
split_dataset(input_csv=encoded_path, output_dir=output_dir)

print("\ntrain_test_data contents:", os.listdir(output_dir))


Preprocessing: raw_data/adult.csv
Saved preprocessed discrete dataset to: encoded_data/adult.csv
Encoded dataset saved to: encoded_data/adult.csv
Loaded data with shape: (32561, 15)
Saved train/test full data
Train size: (26048, 15), Test size: (6513, 15)
Train label distribution:
 14
0    0.759175
1    0.240825
Name: proportion, dtype: float64
Test label distribution:
 14
0    0.759251
1    0.240749
Name: proportion, dtype: float64
Saved X/y split
Training shape: (26048, 14) (26048,)
Test shape: (6513, 14) (6513,)

train_test_data contents: ['X_num_train.npy', 'X_num_test.npy', 'y_train.npy', 'train_full.csv', 'x_train.csv', 'test_full.csv', 'y_test.npy', 'y_train.csv', 'y_test.csv', 'x_test.csv']


In [4]:
def _read_split(csv_name):
    """Load one split CSV from ``output_dir`` as a DataFrame."""
    return pd.read_csv(os.path.join(output_dir, csv_name))


# Features stay as DataFrames; labels are flattened to 1-D arrays with ravel().
X_train, y_train = _read_split("x_train.csv"), _read_split("y_train.csv").values.ravel()
X_test, y_test = _read_split("x_test.csv"), _read_split("y_test.csv").values.ravel()

# Bare tuple as the last expression so the notebook displays the four shapes.
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((26048, 14), (6513, 14), (26048,), (6513,))

In [5]:
import os
import inspect

def run_generator_pipeline(model_cls, model_name, dataset_dir, synthetic_root, model_kwargs=None):
    """
    Instantiate a generator model, train it on ``dataset_dir`` and return it
    together with the directory reserved for its synthetic output.

    The signature of ``model.train`` is inspected to decide how to pass the
    dataset location:
        - ``dataset_dir=...`` if train() has a parameter with that name,
        - otherwise ``data_dir=...`` if it has that one,
        - otherwise the first parameter of train() (skipping ``self``):
          by keyword when it is a normal parameter, positionally when it is
          positional-only or ``*args``, and as ``dataset_dir=...`` when it is
          ``**kwargs``.
    ``synthetic_dir`` is forwarded only when train() declares a parameter
    with that name.

    Args:
        model_cls: Class (or factory) called as ``model_cls(**model_kwargs)``.
        model_name: Used for log messages and as the sub-folder name under
            ``synthetic_root``.
        dataset_dir: Location of the training data handed to ``train``.
        synthetic_root: Parent directory; ``synthetic_root/model_name`` is
            created if it does not exist.
        model_kwargs: Optional dict of constructor keyword arguments.

    Returns:
        (model, synthetic_dir). When train() does not accept
        ``synthetic_dir`` the directory is still created and returned, but
        the model decides on its own where it writes, so it may stay empty.

    Raises:
        TypeError: if ``model.train`` takes no parameters at all, so there is
            no way to hand it the dataset location.
    """
    if model_kwargs is None:
        model_kwargs = {}

    # e.g. synthetic_data/adult/medgan
    synthetic_dir = os.path.join(synthetic_root, model_name)
    os.makedirs(synthetic_dir, exist_ok=True)

    # Instantiate model
    model = model_cls(**model_kwargs)

    print(f"\n=== Training {model_name} ===")
    print("dataset_dir :", dataset_dir)
    print("synthetic_dir:", synthetic_dir)

    # Inspect train signature
    params = inspect.signature(model.train).parameters

    call_args = []
    call_kwargs = {}

    # Map our dataset_dir -> whatever the model expects
    if "dataset_dir" in params:
        call_kwargs["dataset_dir"] = dataset_dir
    elif "data_dir" in params:
        call_kwargs["data_dir"] = dataset_dir
    else:
        # Fallback: first non-self parameter, passed in the way its kind allows.
        candidates = [p for name, p in params.items() if name != "self"]
        if not candidates:
            raise TypeError(
                f"{model_name}.train() takes no parameters; "
                "cannot pass the dataset directory to it"
            )
        first = candidates[0]
        if first.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.VAR_POSITIONAL):
            # Cannot be addressed by keyword, so pass it positionally.
            call_args.append(dataset_dir)
        elif first.kind is inspect.Parameter.VAR_KEYWORD:
            # train(**kwargs): use our own canonical name as the key.
            call_kwargs["dataset_dir"] = dataset_dir
        else:
            call_kwargs[first.name] = dataset_dir

    # Add synthetic_dir if supported
    forwards_synthetic_dir = "synthetic_dir" in params
    if forwards_synthetic_dir:
        call_kwargs["synthetic_dir"] = synthetic_dir

    # Call train with the detected arguments
    if call_args:
        print("Calling train with:", call_args, call_kwargs)
    else:
        print("Calling train with:", call_kwargs)
    model.train(*call_args, **call_kwargs)

    if forwards_synthetic_dir:
        print(f"{model_name} training completed. Synthetic data saved to: {synthetic_dir}")
    else:
        # Do not claim a location the model was never told about.
        print(
            f"{model_name} training completed. train() does not accept synthetic_dir, "
            f"so the model chose its own output location; {synthetic_dir} may be empty."
        )
    return model, synthetic_dir


In [6]:
# Locations shared by every generator run in this notebook.
dataset_dir, synthetic_root = "train_test_data", "synthetic_data/adult"

# Constructor keyword arguments; run_generator_pipeline forwards them
# unchanged as MEDGAN(**medgan_kwargs).
medgan_kwargs = {
    "encoder_dim": 128,
    "latent_dim": 128,
    "generator_hidden_dim": 256,
    "discriminator_hidden_dim": 256,
    "generator_num_layers": 3,
    "discriminator_num_layers": 3,
    "ae_pretrain_epochs": 150,
    "gan_epochs": 1500,
    "batch_size": 256,
    "ae_lr": 1e-3,
    "generator_lr": 5e-4,
    "discriminator_lr": 5e-4,
    "dropout": 0.1,
    "bn_decay": 0.99,
    "random_state": 42,
    "device": "cpu",
}

medgan, medgan_synth_dir = run_generator_pipeline(
    MEDGAN,
    "medgan",
    dataset_dir=dataset_dir,
    synthetic_root=synthetic_root,
    model_kwargs=medgan_kwargs,
)





INFO:katabatic.models.medgan.models:Training MedGAN Model
INFO:katabatic.models.medgan.models:Loaded training data: (26048, 14)
INFO:katabatic.models.medgan.models:Data normalized to [0, 1] range
INFO:katabatic.models.medgan.models:Original range: [0.00, 1484705.00]
INFO:katabatic.models.medgan.models:Normalized range: [0.00, 1.00]
INFO:katabatic.models.medgan.models:
Phase 1: Pretraining Autoencoder for 150 epochs...



=== Training medgan ===
dataset_dir : train_test_data
synthetic_dir: synthetic_data/adult/medgan
Calling train with: {'dataset_dir': 'train_test_data', 'synthetic_dir': 'synthetic_data/adult/medgan'}


INFO:katabatic.models.medgan.models:Epoch 1/150: AE Loss = 0.459054
INFO:katabatic.models.medgan.models:Epoch 10/150: AE Loss = 0.325585
INFO:katabatic.models.medgan.models:Epoch 20/150: AE Loss = 0.323615
INFO:katabatic.models.medgan.models:Epoch 30/150: AE Loss = 0.322873
INFO:katabatic.models.medgan.models:Epoch 40/150: AE Loss = 0.322578
INFO:katabatic.models.medgan.models:Epoch 50/150: AE Loss = 0.322313
INFO:katabatic.models.medgan.models:Epoch 60/150: AE Loss = 0.322141
INFO:katabatic.models.medgan.models:Epoch 70/150: AE Loss = 0.322078
INFO:katabatic.models.medgan.models:Epoch 80/150: AE Loss = 0.321955
INFO:katabatic.models.medgan.models:Epoch 90/150: AE Loss = 0.321866
INFO:katabatic.models.medgan.models:Epoch 100/150: AE Loss = 0.321785
INFO:katabatic.models.medgan.models:Epoch 110/150: AE Loss = 0.321715
INFO:katabatic.models.medgan.models:Epoch 120/150: AE Loss = 0.321679
INFO:katabatic.models.medgan.models:Epoch 130/150: AE Loss = 0.321626
INFO:katabatic.models.medgan.mo

medgan training completed. Synthetic data saved to: synthetic_data/adult/medgan


In [7]:
import os
import numpy as np
import pandas as pd

output_dir = "train_test_data"

print("Files in train_test_data before creating .npy:")
print(os.listdir(output_dir))

# Re-read the CSV splits written earlier (same order as before: X then y).
X_train_df, X_test_df, y_train_df, y_test_df = (
    pd.read_csv(os.path.join(output_dir, csv_name))
    for csv_name in ("x_train.csv", "x_test.csv", "y_train.csv", "y_test.csv")
)

# Features become float32 arrays; labels are flattened to 1-D.
# Note: this rebinds X_train / X_test / y_train / y_test to numpy arrays.
X_train, X_test = (frame.values.astype("float32") for frame in (X_train_df, X_test_df))
y_train, y_test = (frame.values.ravel() for frame in (y_train_df, y_test_df))

# Write the arrays under the .npy file names intended for TabSyn.
for npy_name, array in (
    ("X_num_train.npy", X_train),
    ("X_num_test.npy", X_test),
    ("y_train.npy", y_train),
    ("y_test.npy", y_test),
):
    np.save(os.path.join(output_dir, npy_name), array)

print("\nCreated .npy files for TabSyn.")
print("Files in train_test_data now:")
print(os.listdir(output_dir))


Files in train_test_data before creating .npy:
['X_num_train.npy', 'X_num_test.npy', 'y_train.npy', 'train_full.csv', 'x_train.csv', 'test_full.csv', 'y_test.npy', 'y_train.csv', 'y_test.csv', 'x_test.csv']

Created .npy files for TabSyn.
Files in train_test_data now:
['X_num_train.npy', 'X_num_test.npy', 'y_train.npy', 'train_full.csv', 'x_train.csv', 'test_full.csv', 'y_test.npy', 'y_train.csv', 'y_test.csv', 'x_test.csv']


In [8]:
# TabSyn with its default constructor arguments (no model_kwargs supplied).
tabsyn, tabsyn_synth = run_generator_pipeline(
    TabSyn, "tabsyn", dataset_dir=dataset_dir, synthetic_root=synthetic_root
)




=== Training tabsyn ===
dataset_dir : train_test_data
synthetic_dir: synthetic_data/adult/tabsyn
Calling train with: {'data_dir': 'train_test_data'}


                                                                                                                        

[TabSyn] Synthetic data saved:
  X -> synthetic/train_test_data/tabsyn/x_synth.csv
  y -> synthetic/train_test_data/tabsyn/y_synth.csv
tabsyn training completed. Synthetic data saved to: synthetic_data/adult/tabsyn


In [None]:
# GANBLR with its default constructor arguments (no model_kwargs supplied).
ganblr, ganblr_synth = run_generator_pipeline(
    GANBLR, "ganblr", dataset_dir=dataset_dir, synthetic_root=synthetic_root
)



=== Training ganblr ===
dataset_dir : train_test_data
synthetic_dir: synthetic_data/adult/ganblr
Calling train with: {'dataset': 'train_test_data'}
Loaded X shape: (26048, 14), y shape: (26048,)
warmup run:


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:



# GANBLR is already trained by the cell directly above. Repeating the call here
# would build a new model, train it again from scratch and overwrite
# `ganblr` / `ganblr_synth`, so the duplicated run is not repeated in this cell.

# PATEGAN with its default constructor arguments.
pategan, pategan_synth = run_generator_pipeline(
    model_cls=PATEGAN,
    model_name="pategan",
    dataset_dir=dataset_dir,
    synthetic_root=synthetic_root
)

# CODI with its default constructor arguments.
codi, codi_synth = run_generator_pipeline(
    model_cls=CODI,
    model_name="codi",
    dataset_dir=dataset_dir,
    synthetic_root=synthetic_root
)