In [2]:
# Imports
import sys
from pathlib import Path

# Resolve project root and ensure it's on sys.path.
#
# Starting from the current working directory, walk upwards and pick the first
# directory that contains one of the project markers. Every candidate is
# checked, including the highest ancestor considered. If no candidate matches,
# fall back to the working directory (and say so) instead of silently putting
# an unrelated ancestor directory at the front of sys.path.
_START_DIR = Path.cwd().resolve()
_ROOT_MARKERS = ("pyproject.toml", "raw_data")
_MAX_PARENT_HOPS = 5  # number of ancestor levels searched above the start dir

_root_candidates = [_START_DIR, *list(_START_DIR.parents)[:_MAX_PARENT_HOPS]]
ROOT = next(
    (
        candidate
        for candidate in _root_candidates
        if any((candidate / marker).exists() for marker in _ROOT_MARKERS)
    ),
    None,
)
if ROOT is None:
    # No marker found: keep going with the working directory so the failure
    # (if any) surfaces later with paths the reader recognises.
    print(
        f"Warning: no project marker {_ROOT_MARKERS} found within "
        f"{_MAX_PARENT_HOPS} levels above {_START_DIR}; using it as ROOT.",
        file=sys.stderr,
    )
    ROOT = _START_DIR

# Prepend so project-local modules win over same-named installed packages.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from katabatic.pipeline.train_test_split.pipeline import TrainTestSplitPipeline
from utils import discretize_preprocess
from katabatic.models.medgan.models import MEDGAN


In [3]:
# Preprocess data
# Input and output CSV locations, both relative to the resolved project root.
dataset_path = ROOT / "raw_data" / "car.csv"
output_path = ROOT / "discretized_data" / "car.csv"

# Fail fast with a clear message before creating any output directory.
if not dataset_path.is_file():
    raise FileNotFoundError(f"Raw dataset not found: {dataset_path}")

# Make sure the destination folder exists; a no-op when it is already there.
output_path.parent.mkdir(parents=True, exist_ok=True)

# The helper is called with plain string paths: (input CSV, output CSV).
discretize_preprocess(str(dataset_path), str(output_path))



Preprocessing: /Users/vikumdabare/Documents/Work/Katabatic/katabatic/raw_data/car.csv
Saved preprocessed discrete dataset to: /Users/vikumdabare/Documents/Work/Katabatic/katabatic/discretized_data/car.csv


In [4]:
# Run pipeline
# All locations are handed to the pipeline as plain strings.
output_dir = str(ROOT / "sample_data" / "car")
synthetic_dir = str(ROOT / "synthetic" / "car" / "medgan")
input_csv = str(output_path)  # the discretized CSV from the preprocessing cell

# The real test set location is the same directory as the pipeline's output_dir.
real_test_dir = output_dir

pipeline = TrainTestSplitPipeline(model=MEDGAN)
run_kwargs = {
    "input_csv": input_csv,
    "output_dir": output_dir,
    "synthetic_dir": synthetic_dir,
    "real_test_dir": real_test_dir,
}
result = pipeline.run(**run_kwargs)
print(result)



INFO:katabatic.models.medgan.models:Training MedGAN Model
INFO:katabatic.models.medgan.models:Loaded training data: (1382, 6)
INFO:katabatic.models.medgan.models:Data normalized to [0, 1] range
INFO:katabatic.models.medgan.models:Original range: [0.00, 3.00]
INFO:katabatic.models.medgan.models:Normalized range: [0.00, 1.00]
INFO:katabatic.models.medgan.models:
Phase 1: Pretraining Autoencoder for 100 epochs...


Loaded data with shape: (1728, 7)
Saved train/test full data
Train size: (1382, 7), Test size: (346, 7)
Train label distribution:
 6
2    0.700434
0    0.222142
1    0.039797
3    0.037627
Name: proportion, dtype: float64
Test label distribution:
 6
2    0.699422
0    0.222543
1    0.040462
3    0.037572
Name: proportion, dtype: float64
Saved X/y split
Training shape: (1382, 6) (1382,)
Test shape: (346, 6) (346,)


INFO:katabatic.models.medgan.models:Epoch 1/100: AE Loss = 0.662855
INFO:katabatic.models.medgan.models:Epoch 10/100: AE Loss = 0.369596
INFO:katabatic.models.medgan.models:Epoch 20/100: AE Loss = 0.342510
INFO:katabatic.models.medgan.models:Epoch 30/100: AE Loss = 0.332869
INFO:katabatic.models.medgan.models:Epoch 40/100: AE Loss = 0.329690
INFO:katabatic.models.medgan.models:Epoch 50/100: AE Loss = 0.323772
INFO:katabatic.models.medgan.models:Epoch 60/100: AE Loss = 0.325450
INFO:katabatic.models.medgan.models:Epoch 70/100: AE Loss = 0.321960
INFO:katabatic.models.medgan.models:Epoch 80/100: AE Loss = 0.320639
INFO:katabatic.models.medgan.models:Epoch 90/100: AE Loss = 0.320572
INFO:katabatic.models.medgan.models:Epoch 100/100: AE Loss = 0.320972
INFO:katabatic.models.medgan.models:
Phase 2: Training GAN for 1000 epochs...
INFO:katabatic.models.medgan.models:Epoch 1/1000: D Loss = 1.369953, G Loss = 0.698235
INFO:katabatic.models.medgan.models:Epoch 100/1000: D Loss = 0.052621, G Los


Results saved to: Results/car/medgan_tstr.csv

TSTR Evaluation Results:

LR:
Accuracy: 0.3844
F1 Score: 0.4365

MLP:
Accuracy: 0.5751
F1 Score: 0.5380

RF:
Accuracy: 0.5491
F1 Score: 0.5370

XGBoost:
Accuracy: 0.5231
F1 Score: 0.5215
Train test split pipeline executed successfully.
