In [6]:
import os
os.getcwd()



'/Users/sindhujaghosh/Downloads/Data_bytes_ketabatics/Katabatic'

In [5]:
from utils import encode_preprocess

dataset_path = "raw_data/car.csv"
encoded_path = "encoded_data/car.csv"

encode_preprocess(dataset_path, encoded_path)
print("Encoded dataset saved to:", encoded_path)


Preprocessing: raw_data/car.csv
Saved preprocessed discrete dataset to: encoded_data/car.csv
Encoded dataset saved to: encoded_data/car.csv


In [4]:
import inspect
import katabatic.models.medgan.models as medgan_module

print([name for name, obj in inspect.getmembers(medgan_module) if inspect.isclass(obj)])


['Autoencoder', 'Discriminator', 'Generator', 'MEDGAN', 'Model', 'Path']


In [2]:
import inspect
from katabatic.models.medgan.models import MEDGAN

print(inspect.signature(MEDGAN))


(encoder_dim: int = 128, latent_dim: int = 128, generator_hidden_dim: int = 128, discriminator_hidden_dim: int = 128, generator_num_layers: int = 2, discriminator_num_layers: int = 2, ae_pretrain_epochs: int = 100, gan_epochs: int = 1000, batch_size: int = 1000, ae_lr: float = 0.001, generator_lr: float = 0.001, discriminator_lr: float = 0.001, dropout: float = 0.1, bn_decay: float = 0.99, random_state: int = 42, device: Optional[str] = None)


In [3]:
from utils import encode_preprocess
from katabatic.pipeline.train_test_split.pipeline import TrainTestSplitPipeline
from katabatic.models.medgan.models import MEDGAN

# 0. (Optional but recommended) Limit threads to avoid OpenMP/BLAS craziness
import os
os.environ["OMP_NUM_THREADS"] = "1"

# 1. Preprocess raw data into encoded/discrete format
dataset_path = "raw_data/car.csv"
encoded_path = "encoded_data/car.csv"

encode_preprocess(dataset_path, encoded_path)
print("Encoded dataset saved to:", encoded_path)

# 2. Set pipeline paths
input_csv = encoded_path          # encoded data
output_dir = "sample_data/car"    # where train/test splits go

# 3. Build a *lightweight* MEDGAN factory for the pipeline
def medgan_factory():
    return MEDGAN(
        encoder_dim=64,
        latent_dim=64,
        generator_hidden_dim=64,
        discriminator_hidden_dim=64,
        generator_num_layers=1,
        discriminator_num_layers=1,
        ae_pretrain_epochs=10,   # was 100
        gan_epochs=50,           # was 1000
        batch_size=128,          # was 1000
        ae_lr=0.001,
        generator_lr=0.001,
        discriminator_lr=0.001,
        dropout=0.1,
        bn_decay=0.99,
        random_state=42,
        device="cpu",            # force CPU, avoid weird device issues
    )

# 4. Create pipeline with MEDGAN
pipeline = TrainTestSplitPipeline(model=medgan_factory)

# 5. Run pipeline end-to-end
results = pipeline.run(
    input_csv=input_csv,
    output_dir=output_dir,
    synthetic_dir="synthetic/car/medgan",
    real_test_dir="sample_data/car"
)

print(results)


Preprocessing: raw_data/car.csv


INFO:katabatic.models.medgan.models:Training MedGAN Model
INFO:katabatic.models.medgan.models:Loaded training data: (1382, 6)


Saved preprocessed discrete dataset to: encoded_data/car.csv
Encoded dataset saved to: encoded_data/car.csv
Loaded data with shape: (1728, 7)
Saved train/test full data
Train size: (1382, 7), Test size: (346, 7)
Train label distribution:
 6
2    0.700434
0    0.222142
1    0.039797
3    0.037627
Name: proportion, dtype: float64
Test label distribution:
 6
2    0.699422
0    0.222543
1    0.040462
3    0.037572
Name: proportion, dtype: float64
Saved X/y split
Training shape: (1382, 6) (1382,)
Test shape: (346, 6) (346,)


INFO:katabatic.models.medgan.models:Data normalized to [0, 1] range
INFO:katabatic.models.medgan.models:Original range: [0.00, 3.00]
INFO:katabatic.models.medgan.models:Normalized range: [0.00, 1.00]
INFO:katabatic.models.medgan.models:
Phase 1: Pretraining Autoencoder for 10 epochs...
INFO:katabatic.models.medgan.models:Epoch 1/10: AE Loss = 0.576262
INFO:katabatic.models.medgan.models:Epoch 10/10: AE Loss = 0.353090
INFO:katabatic.models.medgan.models:
Phase 2: Training GAN for 50 epochs...
INFO:katabatic.models.medgan.models:Epoch 1/50: D Loss = 1.402180, G Loss = 0.678335
INFO:katabatic.models.medgan.models:
Generating 1382 synthetic samples...
INFO:katabatic.models.medgan.models:Adding dummy samples to ensure all classes are present...
INFO:katabatic.models.medgan.models:Added 1 dummy samples
INFO:katabatic.models.medgan.models:
Synthetic data saved to: synthetic/car/medgan
INFO:katabatic.models.medgan.models:Training complete!



Results saved to: Results/car/medgan_tstr.csv

TSTR Evaluation Results:

LR:
Accuracy: 0.4104
F1 Score: 0.4740

MLP:
Accuracy: 0.6040
F1 Score: 0.5821

RF:
Accuracy: 0.4422
F1 Score: 0.4968

XGBoost:
Accuracy: 0.5520
F1 Score: 0.5665
Train test split pipeline executed successfully.


Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [7]:
import pandas as pd

t_medgan = pd.read_csv("Results/car/medgan_tstr.csv")
t_ganblr = pd.read_csv("Results/car/ganblr_tstr.csv")
t_tabsyn = pd.read_csv("Results/car/tabsyn_tstr.csv")

t_ganblr, t_tabsyn, t_medgan


(     Model    Metric   Value
 0       LR  Accuracy  0.7023
 1       LR  F1 Score  0.6459
 2      MLP  Accuracy  0.8266
 3      MLP  F1 Score  0.8202
 4       RF  Accuracy  0.8728
 5       RF  F1 Score  0.8769
 6  XGBoost  Accuracy  0.8671
 7  XGBoost  F1 Score  0.8746,
      Model    Metric   Value
 0       LR  Accuracy  0.6012
 1       LR  F1 Score  0.5880
 2      MLP  Accuracy  0.4538
 3      MLP  F1 Score  0.4832
 4       RF  Accuracy  0.3410
 5       RF  F1 Score  0.4043
 6  XGBoost  Accuracy  0.3121
 7  XGBoost  F1 Score  0.3745,
      Model    Metric   Value
 0       LR  Accuracy  0.4104
 1       LR  F1 Score  0.4740
 2      MLP  Accuracy  0.6040
 3      MLP  F1 Score  0.5821
 4       RF  Accuracy  0.4422
 5       RF  F1 Score  0.4968
 6  XGBoost  Accuracy  0.5520
 7  XGBoost  F1 Score  0.5665)

In [3]:
from utils import encode_preprocess
from katabatic.pipeline.train_test_split.pipeline import TrainTestSplitPipeline
from katabatic.models.medgan.models import MEDGAN
import os

# Make threading gentler
os.environ["OMP_NUM_THREADS"] = "1"

# 1. Preprocess (again, just to be safe)
dataset_path = "raw_data/car.csv"
encoded_path = "encoded_data/car.csv"
encode_preprocess(dataset_path, encoded_path)
print("Encoded dataset saved to:", encoded_path)

input_csv = encoded_path
output_dir = "sample_data/car"

# 2. Slightly stronger MedGAN
def medgan_tuned():
    return MEDGAN(
        encoder_dim=128,
        latent_dim=64,
        generator_hidden_dim=128,
        discriminator_hidden_dim=128,
        generator_num_layers=2,
        discriminator_num_layers=2,
        ae_pretrain_epochs=20,   # 10 â†’ 20
        gan_epochs=300,          # 50 â†’ 100
        batch_size=256,          # keep same (safe)
        ae_lr=0.001,
        generator_lr=0.001,
        discriminator_lr=0.001,
        dropout=0.1,
        bn_decay=0.99,
        random_state=42,
        device="cpu",
    )

pipeline = TrainTestSplitPipeline(model=medgan_tuned)

results = pipeline.run(
    input_csv=input_csv,
    output_dir=output_dir,
    synthetic_dir="synthetic/car/medgan_tuned",
    real_test_dir="sample_data/car"
)

print(results)


INFO:katabatic.models.medgan.models:Training MedGAN Model
INFO:katabatic.models.medgan.models:Loaded training data: (1382, 6)
INFO:katabatic.models.medgan.models:Data normalized to [0, 1] range
INFO:katabatic.models.medgan.models:Original range: [0.00, 3.00]
INFO:katabatic.models.medgan.models:Normalized range: [0.00, 1.00]
INFO:katabatic.models.medgan.models:
Phase 1: Pretraining Autoencoder for 20 epochs...
INFO:katabatic.models.medgan.models:Epoch 1/20: AE Loss = 0.612333
INFO:katabatic.models.medgan.models:Epoch 10/20: AE Loss = 0.340160
INFO:katabatic.models.medgan.models:Epoch 20/20: AE Loss = 0.327602
INFO:katabatic.models.medgan.models:
Phase 2: Training GAN for 300 epochs...


Preprocessing: raw_data/car.csv
Saved preprocessed discrete dataset to: encoded_data/car.csv
Encoded dataset saved to: encoded_data/car.csv
Loaded data with shape: (1728, 7)
Saved train/test full data
Train size: (1382, 7), Test size: (346, 7)
Train label distribution:
 6
2    0.700434
0    0.222142
1    0.039797
3    0.037627
Name: proportion, dtype: float64
Test label distribution:
 6
2    0.699422
0    0.222543
1    0.040462
3    0.037572
Name: proportion, dtype: float64
Saved X/y split
Training shape: (1382, 6) (1382,)
Test shape: (346, 6) (346,)


INFO:katabatic.models.medgan.models:Epoch 1/300: D Loss = 1.356985, G Loss = 0.688650
INFO:katabatic.models.medgan.models:Epoch 100/300: D Loss = 0.080323, G Loss = 4.843256
INFO:katabatic.models.medgan.models:Epoch 200/300: D Loss = 0.203447, G Loss = 4.628125
INFO:katabatic.models.medgan.models:Epoch 300/300: D Loss = 0.268759, G Loss = 4.071495
INFO:katabatic.models.medgan.models:
Generating 1382 synthetic samples...
INFO:katabatic.models.medgan.models:Adding dummy samples to ensure all classes are present...
INFO:katabatic.models.medgan.models:Added 2 dummy samples
INFO:katabatic.models.medgan.models:
Synthetic data saved to: synthetic/car/medgan_tuned
INFO:katabatic.models.medgan.models:Training complete!



Results saved to: Results/car/medgan_tuned_tstr.csv

TSTR Evaluation Results:

LR:
Accuracy: 0.6098
F1 Score: 0.5597

MLP:
Accuracy: 0.6994
F1 Score: 0.5757

RF:
Accuracy: 0.6936
F1 Score: 0.5870

XGBoost:
Accuracy: 0.6387
F1 Score: 0.5525
Train test split pipeline executed successfully.


Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [2]:
import pandas as pd

medgan_base = pd.read_csv("Results/car/medgan_tstr.csv")
medgan_tuned = pd.read_csv("Results/car/medgan_tuned_tstr.csv")

print("Base MedGAN:")
display(medgan_base)

print("Tuned MedGAN:")
display(medgan_tuned)


Base MedGAN:


Unnamed: 0,Model,Metric,Value
0,LR,Accuracy,0.4104
1,LR,F1 Score,0.474
2,MLP,Accuracy,0.604
3,MLP,F1 Score,0.5821
4,RF,Accuracy,0.4422
5,RF,F1 Score,0.4968
6,XGBoost,Accuracy,0.552
7,XGBoost,F1 Score,0.5665


Tuned MedGAN:


Unnamed: 0,Model,Metric,Value
0,LR,Accuracy,0.4855
1,LR,F1 Score,0.4957
2,MLP,Accuracy,0.5434
3,MLP,F1 Score,0.5311
4,RF,Accuracy,0.4509
5,RF,F1 Score,0.472
6,XGBoost,Accuracy,0.4017
7,XGBoost,F1 Score,0.4321


In [15]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Load real splits saved by pipeline
X_train = pd.read_csv("sample_data/car/X_train.csv")
y_train = pd.read_csv("sample_data/car/y_train.csv").values.ravel()
X_test  = pd.read_csv("sample_data/car/X_test.csv")
y_test  = pd.read_csv("sample_data/car/y_test.csv").values.ravel()

# Strong baseline: RandomForest
clf_real = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    class_weight="balanced_subsample",
    random_state=42,
    n_jobs=-1
)
clf_real.fit(X_train, y_train)
y_pred_real = clf_real.predict(X_test)

acc_real = accuracy_score(y_test, y_pred_real)
print("Baseline REAL-data accuracy:", acc_real)


0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


Baseline REAL-data accuracy: 0.9797687861271677


In [16]:
import pandas as pd

X_train = pd.read_csv("sample_data/car/X_train.csv")
y_train = pd.read_csv("sample_data/car/y_train.csv").values.ravel()

# Combine features + label for MedGAN
train_data = pd.concat(
    [X_train.reset_index(drop=True),
     pd.Series(y_train, name="label")],
    axis=1
)

print(train_data.shape)  # should be (1382, 7)
train_data.head()


(1382, 7)


Unnamed: 0,0,1,2,3,4,5,label
0,2,2,1,1,1,2,0
1,2,0,1,0,2,2,2
2,1,0,3,2,2,2,0
3,2,1,0,1,1,0,1
4,3,1,1,0,2,0,2


In [20]:
from katabatic.models.medgan.models import MEDGAN

medgan = MEDGAN(
    encoder_dim=128,
    latent_dim=128,
    generator_hidden_dim=128,
    discriminator_hidden_dim=128,
    generator_num_layers=2,
    discriminator_num_layers=2,
    ae_pretrain_epochs=150,   # â†‘ stabilize autoencoder more
    gan_epochs=800,           # â†“ slightly less than 1000 to avoid collapse
    batch_size=256,           # â†‘ better than default huge 1000
    ae_lr=1e-3,
    generator_lr=2e-4,        # â†“ slow learning â†’ more stable training
    discriminator_lr=2e-4,    # â†“ match generator
    dropout=0.1,
    bn_decay=0.99,
    random_state=42,
    device="cpu"              # change to "cuda" if you have GPU
)

medgan.fit(train_data.values)   # ðŸ‘ˆ NO epochs here!


AttributeError: 'MEDGAN' object has no attribute 'fit'

In [21]:
from katabatic.models.medgan.models import MEDGAN
import inspect

medgan = MEDGAN(
    encoder_dim=128,
    latent_dim=128,
    generator_hidden_dim=128,
    discriminator_hidden_dim=128,
    generator_num_layers=2,
    discriminator_num_layers=2,
    ae_pretrain_epochs=150,
    gan_epochs=800,
    batch_size=256,
    ae_lr=1e-3,
    generator_lr=2e-4,
    discriminator_lr=2e-4,
    dropout=0.1,
    bn_decay=0.99,
    random_state=42,
    device="cpu"
)

# List all *public* methods/attributes
print([name for name in dir(medgan) if not name.startswith("_")])


['ae_lr', 'ae_pretrain_epochs', 'autoencoder', 'batch_size', 'bn_decay', 'check_dependencies', 'device', 'discriminator', 'discriminator_hidden_dim', 'discriminator_lr', 'discriminator_num_layers', 'dropout', 'encoder_dim', 'evaluate', 'gan_epochs', 'generator', 'generator_hidden_dim', 'generator_lr', 'generator_num_layers', 'get_required_dependencies', 'input_dim_', 'is_fitted', 'latent_dim', 'random_state', 'sample', 'train']


In [22]:
import inspect
print(inspect.signature(medgan.sample))


(n: int) -> numpy.ndarray


In [24]:
# Just train â€“ epochs etc. are already set in __init__
medgan.train(train_data.values)

print("Is fitted?", medgan.is_fitted)


TypeError: MEDGAN.train() missing 1 required positional argument: 'synthetic_dir'

In [23]:
from collections import Counter
import pandas as pd

n_synth = len(train_data) * 3   # 3Ã— synthetic dataset

# Generate synthetic samples
synth = medgan.sample(n_synth)

# Build a dataframe with same column names as train_data
synth_df = pd.DataFrame(synth, columns=train_data.columns)

# Extract features + label
X_synth = synth_df.drop(columns=["label"])
y_synth = synth_df["label"].round().astype(int)  # ensure integer labels

print("Synthetic label counts:", Counter(y_synth))


RuntimeError: Model must be trained before sampling