In [1]:
!pip -q install sdv==0.18.3 ctgan==0.7.4 pandas==2.2.2 numpy==1.26.4 scikit-learn==1.5.1 scipy==1.13.1 tqdm==4.66.4

ERROR: Ignored the following yanked versions: 1.13.0
ERROR: Ignored the following versions that require a different python version: 0.10.0 Requires-Python >=3.6,<3.9; 0.10.0.dev0 Requires-Python >=3.6,<3.9; 0.10.1 Requires-Python >=3.6,<3.9; 0.10.1.dev0 Requires-Python >=3.6,<3.9; 0.11.0 Requires-Python >=3.6,<3.9; 0.11.0.dev0 Requires-Python >=3.6,<3.9; 0.12.0 Requires-Python >=3.6,<3.9; 0.12.0.dev0 Requires-Python >=3.6,<3.9; 0.12.0.dev1 Requires-Python >=3.6,<3.9; 0.12.1 Requires-Python >=3.6,<3.9; 0.12.1.dev0 Requires-Python >=3.6,<3.9; 0.13.0 Requires-Python >=3.6,<3.10; 0.13.0.dev0 Requires-Python >=3.6,<3.10; 0.13.1 Requires-Python >=3.6,<3.10; 0.13.1.dev0 Requires-Python >=3.6,<3.10; 0.14.0 Requires-Python >=3.6,<3.10; 0.14.0.dev0 Requires-Python >=3.6,<3.10; 0.14.0.dev1 Requires-Python >=3.6,<3.10; 0.14.0.dev2 Requires-Python >=3.6,<3.10; 0.14.1 Requires-Python >=3.6,<3.10; 0.14.1.dev0 Requires-Python >=3.6,<3.10; 0.15.0 Requires-Python >=3.6,<3.10; 0.15.0.dev0 Requires-Python

In [None]:
# Full pipeline: Step 1 -> Step 3 (continuous-only)
# Paste into Colab/Jupyter. On a fresh runtime, run the pip install cell above first.



import os
import json
import time
import math
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from scipy.stats import ks_2samp, wasserstein_distance, entropy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import torch

# Generators
from sdv.tabular import GaussianCopula
from ctgan import TVAE  # TVAE from ctgan is fine for continuous data

# Suppress every warning of category UserWarning for the rest of the session.
warnings.simplefilter("ignore", UserWarning)
# Let pandas print up to 120 columns and 200 characters per line before truncating.
pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 200)

# ----------------------------- CONFIG ----------------------------------------
# Single dictionary of settings; it is passed as-is to run_pipeline().
CONFIG = {
    # === Data ===
    "CSV_PATH": "your_data.csv",      # <-- Change: path to the input CSV (read with pd.read_csv)
    "TARGET_COLUMNS": [],             # columns excluded from the feature set; empty => every column is a feature
    "FEATURE_COLUMNS": None,          # None => use all non-target columns; otherwise an explicit list of columns
    "ALL_CONTINUOUS": True,           # NOTE(review): not read by any code visible in this cell

    # === Generator choice & sizes ===
    "GENERATORS": ["GAUSSIAN_COPULA", "TVAE"],  # run in this order; each writes its own outputs
    "SYNTHETIC_SIZE": 100_000,        # rows sampled from each fitted generator
    "RANDOM_SEED": 42,                # used for set_seed() and for every evaluation subsample

    # TVAE training hyperparams (train_tvae drops keys the installed TVAE constructor does not accept)
    "TVAE_PARAMS": {
        "epochs": 300,           # increase if underfitting; reduce if too slow
        "batch_size": 512,
        "compress_dims": (256, 256),
        "decompress_dims": (256, 256),
        "embedding_dim": 128,
        "l2scale": 1e-6,
        "cuda": torch.cuda.is_available(),   # evaluated once, when this cell runs
        "verbose": True
    },

    # Placeholder only: train_gaussian_copula() builds the model with no arguments and
    # does not read this entry.
    "GAUSSIANCOPULA_PARAMS": {
        # Intentionally empty.
    },

    # === Validation ===
    "EVAL_SAMPLE_SIZE": 50_000,   # upper bound on rows (real and synthetic) used for the metrics
    "QC_TOPK": 10,                # rows shown in each "Top drift by ..." table
    # NOTE(review): these thresholds are not referenced by the pipeline code visible in this
    # cell; the summary prints raw metric values without comparing them to anything.
    "QUALITY_THRESHOLDS": {
        "KS_warn": 0.2,
        "WD_norm_warn": 0.1,
        "JS_warn": 0.1,
        "Corr_Frob_warn": 0.2,
        "RF_AUC_warn": 0.65,
        "MMD_warn": 0.1
    },

    # === Outputs ===
    "OUT_DIR": "synth_outputs",   # folder to save synthetic CSVs & reports
}

# Side effect at cell execution: create the output folder (no error if it already exists).
os.makedirs(CONFIG["OUT_DIR"], exist_ok=True)

# ----------------------------- Utilities -------------------------------------

def set_seed(seed=42):
    """Seed NumPy, Python's ``random`` module and PyTorch with ``seed``.

    NumPy is seeded unconditionally. Seeding ``random`` and PyTorch is best
    effort: any exception raised while doing so is swallowed. When CUDA is
    available, all CUDA devices are seeded as well and cuDNN is switched to
    deterministic, non-benchmark mode.

    Args:
        seed: Value handed to every seeding call (default 42).

    Returns:
        None.
    """
    np.random.seed(seed)

    # Standard-library RNG.
    try:
        import random as py_random
        py_random.seed(seed)
    except Exception:
        pass

    # PyTorch: CPU generator first, then the CUDA side if a GPU is visible.
    try:
        torch.manual_seed(seed)
        if not torch.cuda.is_available():
            return
        torch.cuda.manual_seed_all(seed)
        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
        cudnn.benchmark = False
    except Exception:
        pass

def load_data(path):
    """Read the CSV at ``path`` into a DataFrame.

    Args:
        path: Filesystem path of the CSV file.

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.

    Raises:
        AssertionError: If ``path`` does not exist.
    """
    assert os.path.exists(path), f"CSV not found: {path}"
    return pd.read_csv(path)

def infer_features(df, target_cols=None, feature_cols=None):
    """Select the feature columns of ``df``.

    Args:
        df: Source DataFrame.
        target_cols: Column names to leave out when ``feature_cols`` is not
            given. ``None`` is treated as an empty list.
        feature_cols: Explicit list of columns to use. When ``None``, every
            column of ``df`` that is not in ``target_cols`` is used, in the
            order it appears in ``df``.

    Returns:
        ``(X, feature_cols)`` where ``X`` is a copy of ``df[feature_cols]``.
    """
    excluded = target_cols or []
    if feature_cols is None:
        feature_cols = [name for name in df.columns if name not in excluded]
    return df[feature_cols].copy(), feature_cols

def compute_ranges(X, overrides=None):
    """Return the empirical ``(min, max)`` of every numeric column of ``X``.

    NaNs are ignored when computing the bounds (``np.nanmin`` / ``np.nanmax``).
    Non-numeric columns are skipped.

    Args:
        X: DataFrame to inspect.
        overrides: Optional mapping merged on top of the computed ranges;
            its entries replace computed ones and may add new keys.

    Returns:
        Dict mapping column name to a ``(low, high)`` tuple of floats, plus
        whatever ``overrides`` contributed.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    bounds = {
        name: (float(np.nanmin(X[name].values)), float(np.nanmax(X[name].values)))
        for name in X.columns
        if is_numeric(X[name])
    }
    bounds.update(overrides or {})
    return bounds

def basic_impute_numeric(X):
    """Return a copy of ``X`` with missing values filled column by column.

    Numeric columns get their NaN-ignoring median; any other column gets the
    literal string ``"UNK"``. The input frame is not modified.

    Args:
        X: DataFrame to impute.

    Returns:
        A new DataFrame with the same columns as ``X``.
    """
    filled = X.copy()
    is_numeric = pd.api.types.is_numeric_dtype
    for name in filled.columns:
        column = filled[name]
        # Non-numeric columns are not expected for all-continuous data; "UNK" is the fallback.
        fill_value = np.nanmedian(column.values) if is_numeric(column) else "UNK"
        filled[name] = column.fillna(fill_value)
    return filled

# ----------------------- Sampling & models -----------------------------------

def train_gaussian_copula(X):
    """Fit a default-constructed ``GaussianCopula`` on ``X`` and return it.

    ``GaussianCopula`` is the class imported from ``sdv.tabular`` at the top
    of this file. A later cell in this notebook shows that module missing
    from SDV 1.25.0, so this function needs an SDV release that still
    provides ``sdv.tabular``.

    Args:
        X: Training data passed straight to ``model.fit``.

    Returns:
        The fitted model. The elapsed fit time is printed in minutes.
    """
    copula = GaussianCopula()
    started = time.time()
    copula.fit(X)
    print(f"GaussianCopula fit done in {(time.time()-started)/60:.2f} min.")
    return copula

def train_tvae(X, params):
    """Build a ``TVAE`` from ``params`` and fit it on ``X``.

    Only the entries of ``params`` that the installed ``TVAE`` constructor
    accepts are forwarded, so one config can serve several ctgan releases.
    Accepted names come from ``inspect.signature``; the previous
    ``__code__.co_varnames`` lookup also listed the constructor's local
    variables, so a config key that matched a local name was forwarded and
    made the constructor raise ``TypeError``.

    Args:
        X: Training data passed straight to ``model.fit``.
        params: Mapping of constructor keyword arguments; ``None`` is treated
            as empty. Keys the constructor does not accept are dropped and
            reported on stdout.

    Returns:
        The fitted model. The elapsed fit time is printed in minutes.
    """
    import inspect  # local import: nothing else in the file needs it

    params = params or {}
    ctor_params = inspect.signature(TVAE.__init__).parameters
    takes_var_kwargs = any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in ctor_params.values()
    )
    keyword_kinds = (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY)
    accepted = {
        name for name, p in ctor_params.items()
        if name != "self" and p.kind in keyword_kinds
    }

    # A constructor with **kwargs takes everything; otherwise keep only named parameters.
    kwargs = {
        k: v for k, v in params.items()
        if k != "self" and (takes_var_kwargs or k in accepted)
    }
    ignored = sorted(k for k in params if k not in kwargs)
    if ignored:
        # Make the drop visible instead of silently changing the requested configuration.
        print(f"TVAE: ignoring params not accepted by this TVAE version: {ignored}")

    model = TVAE(**kwargs)
    t0 = time.time()
    model.fit(X)
    print(f"TVAE fit done in {(time.time()-t0)/60:.2f} min.")
    return model

def sample_and_clip(model, n, feature_ranges, integer_cols=None, clip=True):
    """Draw ``n`` rows from ``model`` and constrain them to known ranges.

    Args:
        model: Object exposing ``sample(n)`` that returns a DataFrame.
        n: Number of rows to request.
        feature_ranges: Mapping of column name to ``(low, high)``.
        integer_cols: Optional column names to round to whole numbers. After
            rounding, a column that has a range is clipped to
            ``[ceil(low), floor(high)]``; this second clip happens regardless
            of ``clip``.
        clip: When true, every numeric column that has an entry in
            ``feature_ranges`` is clipped to ``[low, high]``.

    Returns:
        The sampled DataFrame with the adjustments applied.
    """
    synth = model.sample(n)

    if clip:
        for name in synth.columns:
            if pd.api.types.is_numeric_dtype(synth[name]) and name in feature_ranges:
                low, high = feature_ranges[name]
                synth[name] = synth[name].clip(lower=low, upper=high)

    for name in (integer_cols or ()):
        if name not in synth.columns:
            continue
        # Round via the nullable Int64 dtype, then store back as float.
        synth[name] = np.round(synth[name]).astype("Int64").astype(float)
        if name in feature_ranges:
            low, high = feature_ranges[name]
            synth[name] = synth[name].clip(lower=math.ceil(low), upper=math.floor(high))

    return synth

# ----------------------- Validation metrics ----------------------------------

def jensen_shannon_divergence(p, q, eps=1e-12):
    """Jensen-Shannon divergence between two non-negative weight vectors.

    Each input has ``eps`` added to every entry and is then normalised to sum
    to one. The inputs themselves are not modified.

    Args:
        p: First weight vector (array-like).
        q: Second weight vector, same length as ``p``.
        eps: Smoothing constant added to every entry before normalising.

    Returns:
        ``0.5 * (KL(p || m) + KL(q || m))`` with ``m = (p + q) / 2``, computed
        with ``scipy.stats.entropy`` (natural logarithm).
    """
    def _as_distribution(weights):
        shifted = np.asarray(weights, dtype=float) + eps
        return shifted / shifted.sum()

    p_dist = _as_distribution(p)
    q_dist = _as_distribution(q)
    mixture = 0.5 * (p_dist + q_dist)
    return 0.5 * (entropy(p_dist, mixture) + entropy(q_dist, mixture))

def numeric_hist_js(x, y, bins=50):
    """Jensen-Shannon divergence between histograms of two samples.

    Both samples are binned on the same grid, spanning the combined minimum
    to the combined maximum, and the density histograms are passed to
    ``jensen_shannon_divergence``.

    Args:
        x: First sample.
        y: Second sample.
        bins: Number of histogram bins.

    Returns:
        The divergence, or ``np.nan`` when the combined range is not finite
        or has zero width.
    """
    lo = np.nanmin([np.min(x), np.min(y)])
    hi = np.nanmax([np.max(x), np.max(y)])
    degenerate = not np.isfinite(lo) or not np.isfinite(hi) or lo == hi
    if degenerate:
        return np.nan

    grid = dict(bins=bins, range=(lo, hi), density=True)
    hist_x = np.histogram(x, **grid)[0]
    hist_y = np.histogram(y, **grid)[0]
    return jensen_shannon_divergence(hist_x, hist_y)

def normalized_wasserstein(x, y):
    """Wasserstein distance between ``x`` and ``y`` divided by their joint range.

    Args:
        x: First sample.
        y: Second sample.

    Returns:
        ``wasserstein_distance(x, y) / (hi - lo)`` where ``lo`` / ``hi`` are
        the combined minimum / maximum, or ``np.nan`` when that range is not
        finite or has zero width.
    """
    lo = np.nanmin([np.min(x), np.min(y)])
    hi = np.nanmax([np.max(x), np.max(y)])
    usable = np.isfinite(lo) and np.isfinite(hi) and lo != hi
    if not usable:
        return np.nan
    span = hi - lo
    return wasserstein_distance(x, y) / span

def per_feature_report_numeric(real_df, synth_df, qc_topk=10):
    """Compute per-column drift metrics between real and synthetic data.

    For every column of ``real_df`` both frames are coerced to numeric
    (unparseable entries become NaN and are dropped) and three metrics are
    computed: the two-sample KS statistic, the range-normalised Wasserstein
    distance, and the histogram Jensen-Shannon divergence. A column with no
    usable values on either side gets NaN for all three.

    Args:
        real_df: Reference data; its columns define what is reported.
        synth_df: Synthetic data; must contain the same columns.
        qc_topk: Number of worst columns printed per metric.

    Returns:
        DataFrame with columns ``column``, ``KS``, ``Wasserstein_norm``, ``JS``
        (one row per column of ``real_df``). The top-``qc_topk`` tables are
        printed as a side effect.
    """
    def _metrics(real_values, synth_values):
        if len(real_values) == 0 or len(synth_values) == 0:
            return np.nan, np.nan, np.nan
        return (
            ks_2samp(real_values, synth_values).statistic,
            normalized_wasserstein(real_values, synth_values),
            numeric_hist_js(real_values, synth_values),
        )

    records = []
    for name in real_df.columns:
        real_values = pd.to_numeric(real_df[name], errors='coerce').dropna()
        synth_values = pd.to_numeric(synth_df[name], errors='coerce').dropna()
        ks, wd, js = _metrics(real_values, synth_values)
        records.append({"column": name, "KS": ks, "Wasserstein_norm": wd, "JS": js})
    report = pd.DataFrame(records)

    headings = (
        ("KS", "\nTop drift by KS:"),
        ("Wasserstein_norm", "\nTop drift by Wasserstein_norm:"),
        ("JS", "\nTop drift by JS:"),
    )
    for metric, heading in headings:
        print(heading)
        print(report.sort_values(metric, ascending=False).head(qc_topk)[["column", metric]])
    return report

def corr_frobenius_diff(real_df, synth_df):
    """Compare the correlation structure of real and synthetic data.

    The numeric columns of ``real_df`` are selected from both frames, coerced
    to numeric and forward-filled. Pearson and Spearman correlation matrices
    are computed for each frame (undefined correlations are treated as 0) and
    the Frobenius norm of the real-minus-synthetic difference is divided by
    the number of matrix entries.

    Args:
        real_df: Reference data; decides which columns count as numeric.
        synth_df: Synthetic data; must contain those same columns.

    Returns:
        ``{"pearson_frob": float, "spearman_frob": float}``; both values are
        NaN when ``real_df`` has no numeric column.
    """
    num_cols = [c for c in real_df.columns if pd.api.types.is_numeric_dtype(real_df[c])]
    if not num_cols:
        return {"pearson_frob": np.nan, "spearman_frob": np.nan}

    # DataFrame.ffill() is the supported spelling of fillna(method="ffill"), which
    # recent pandas releases deprecate.
    rr = real_df[num_cols].apply(pd.to_numeric, errors='coerce').ffill()
    ss = synth_df[num_cols].apply(pd.to_numeric, errors='coerce').ffill()

    def _scaled_frobenius(method):
        # Constant columns give NaN correlations; treat those as "no correlation".
        c_real = rr.corr(method=method).fillna(0).to_numpy()
        c_synth = ss.corr(method=method).fillna(0).to_numpy()
        return float(np.linalg.norm(c_real - c_synth, ord="fro") / c_real.size)

    return {
        "pearson_frob": _scaled_frobenius("pearson"),
        "spearman_frob": _scaled_frobenius("spearman"),
    }

def gaussian_mmd(X, Y, sigma=None, max_samples=5000, random_state=42):
    """Maximum mean discrepancy between two samples under a Gaussian kernel.

    Both samples are randomly capped at ``max_samples`` rows, standardised
    jointly with ``StandardScaler`` and compared with the unbiased MMD^2
    estimator (diagonal terms excluded from the within-sample averages).

    Pairwise squared distances use ``|a|^2 + |b|^2 - 2 a.b``, so peak memory
    is one ``n x m`` matrix. Broadcasting ``a[:, None, :] - b[None, :, :]``
    instead materialises an ``n x m x d`` tensor, which is many gigabytes at
    the default ``max_samples``.

    Args:
        X: 2-D array-like, rows are observations.
        Y: 2-D array-like with the same number of columns as ``X``.
        sigma: Kernel bandwidth. When ``None`` the median heuristic is used:
            ``sqrt(median squared distance / 2)`` over the first 1000
            standardised rows of each sample, floored at ``1e-6``.
        max_samples: Maximum rows kept per sample (drawn without replacement).
        random_state: Seed for the row subsampling.

    Returns:
        ``sqrt(max(MMD^2, 0))`` as a float, or NaN when either sample has
        fewer than two rows (the unbiased estimator is undefined there).
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)

    rng = np.random.default_rng(random_state)
    if len(X) > max_samples:
        X = X[rng.choice(len(X), max_samples, replace=False)]
    if len(Y) > max_samples:
        Y = Y[rng.choice(len(Y), max_samples, replace=False)]

    n, m = len(X), len(Y)
    if n < 2 or m < 2:
        # n * (n - 1) or m * (m - 1) would be zero below.
        return float("nan")

    XY = StandardScaler().fit_transform(np.vstack([X, Y]))
    Xs, Ys = XY[:n], XY[n:]

    def _sq_dists(a, b):
        # Squared Euclidean distances without an (len(a), len(b), d) intermediate.
        a_sq = np.einsum("ij,ij->i", a, a)[:, None]
        b_sq = np.einsum("ij,ij->i", b, b)[None, :]
        d2 = a_sq + b_sq - 2.0 * (a @ b.T)
        # Round-off can push near-zero distances slightly negative.
        return np.maximum(d2, 0.0)

    if sigma is None:
        # Cap BOTH samples so the heuristic stays cheap whatever their relative sizes.
        bandwidth_cap = 1000
        Z = np.vstack([Xs[:bandwidth_cap], Ys[:bandwidth_cap]])
        dists = _sq_dists(Z, Z)
        med = np.median(dists[np.triu_indices_from(dists, k=1)])
        sigma = max(np.sqrt(med / 2), 1e-6)

    def k(a, b):
        return np.exp(-_sq_dists(a, b) / (2 * sigma ** 2))

    Kxx = k(Xs, Xs)
    Kyy = k(Ys, Ys)
    Kxy = k(Xs, Ys)

    mmd2 = (Kxx.sum() - np.trace(Kxx)) / (n * (n - 1)) \
         + (Kyy.sum() - np.trace(Kyy)) / (m * (m - 1)) \
         - 2 * Kxy.mean()
    return float(np.sqrt(max(mmd2, 0.0)))

def classifier_two_sample_auc(real_df, synth_df, random_state=42):
    """Classifier two-sample test: how well can real and synthetic be told apart?

    Rows of ``real_df`` are labelled 0 and rows of ``synth_df`` 1. Numeric
    columns are median-imputed and standardised (non-numeric columns are
    dropped), then a 300-tree random forest is scored with 5-fold stratified
    cross-validation on ROC AUC.

    Args:
        real_df: Real rows.
        synth_df: Synthetic rows with the same columns.
        random_state: Seed for both the forest and the fold shuffling.

    Returns:
        Mean ROC AUC over the five folds, as a float.
    """
    combined = pd.concat([real_df, synth_df], axis=0).copy()
    labels = np.array([0] * len(real_df) + [1] * len(synth_df))

    numeric_cols = [c for c in combined.columns if pd.api.types.is_numeric_dtype(combined[c])]
    column_steps = []
    if numeric_cols:
        numeric_pipe = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ])
        column_steps.append(("num", numeric_pipe, numeric_cols))

    preprocessor = ColumnTransformer(transformers=column_steps, remainder="drop")
    features = preprocessor.fit_transform(combined)

    forest = RandomForestClassifier(n_estimators=300, random_state=random_state, n_jobs=-1)
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    fold_aucs = cross_val_score(forest, features, labels, cv=folds, scoring="roc_auc", n_jobs=-1)
    return float(fold_aucs.mean())

# ----------------------------- Runner ----------------------------------------

def run_pipeline(config):
    """Train every configured generator, sample synthetic data and score it.

    Steps, in order:
      1. Seed the RNGs, load the CSV, pick the feature columns and impute
         missing values.
      2. Record the empirical (min, max) of every numeric feature.
      3. For each name in ``config["GENERATORS"]``: fit the model, sample
         ``config["SYNTHETIC_SIZE"]`` rows clipped to the recorded ranges,
         write them to CSV, then evaluate a subsample of at most
         ``config["EVAL_SAMPLE_SIZE"]`` rows per side with per-feature drift
         metrics, correlation differences, a classifier two-sample AUC and a
         Gaussian-kernel MMD.
      4. Write one JSON summary per generator and one combined JSON.

    Args:
        config: Settings dictionary. Keys read here: ``RANDOM_SEED``,
            ``CSV_PATH``, ``TARGET_COLUMNS``, ``FEATURE_COLUMNS``,
            ``GENERATORS``, ``TVAE_PARAMS``, ``SYNTHETIC_SIZE``, ``OUT_DIR``,
            ``EVAL_SAMPLE_SIZE`` and ``QC_TOPK``.

    Returns:
        Dict mapping generator name to its summary (output paths, per-feature
        stats, correlation stats, two-sample AUC and MMD).

    Raises:
        ValueError: If ``config["GENERATORS"]`` contains a name other than
            ``"GAUSSIAN_COPULA"`` or ``"TVAE"``.
    """
    set_seed(config["RANDOM_SEED"])
    df = load_data(config["CSV_PATH"])
    X, feat_cols = infer_features(df, target_cols=config["TARGET_COLUMNS"], feature_cols=config["FEATURE_COLUMNS"])
    print(f"Loaded data {df.shape}; using {len(feat_cols)} features (first 8): {feat_cols[:8]}")

    # Fill missing values (median for numeric columns) before fitting anything.
    X_clean = basic_impute_numeric(X)

    # Empirical per-column (min, max); used later to clip the synthetic samples.
    feature_ranges = compute_ranges(X_clean, overrides={})
    print("Sample feature ranges (first 8):")
    for i, (k, v) in enumerate(list(feature_ranges.items())[:8]):
        print(f"  {k}: {v}")
        if i >= 7: break  # redundant with the [:8] slice above; kept as-is

    results_summary = {}

    for gen_name in config["GENERATORS"]:
        print("\n" + "=" * 80)
        print(f"RUN GENERATOR: {gen_name}")
        print("=" * 80)

        if gen_name == "GAUSSIAN_COPULA":
            model = train_gaussian_copula(X_clean)
        elif gen_name == "TVAE":
            model = train_tvae(X_clean, config["TVAE_PARAMS"])
        else:
            raise ValueError("Unknown generator: " + str(gen_name))

        # Sample all synthetic rows in a single call; no integer rounding is requested.
        n = int(config["SYNTHETIC_SIZE"])
        print(f"Sampling {n:,} rows from {gen_name} ...")
        S = sample_and_clip(model, n, feature_ranges, integer_cols=None, clip=True)
        # Restrict to, and order by, the feature columns used for training.
        S = S[feat_cols]
        # The ",": thousands separator added by the format spec is stripped again for the file name.
        out_csv = os.path.join(config["OUT_DIR"], f"synthetic_{gen_name.lower()}_{n:,}.csv".replace(",", ""))
        S.to_csv(out_csv, index=False)
        print(f"Saved synthetic CSV: {out_csv}")

        # Validation: evaluate on at most eval_n rows from each side to keep it fast.
        eval_n = min(config["EVAL_SAMPLE_SIZE"], len(S))
        S_eval = S.sample(eval_n, random_state=config["RANDOM_SEED"]) if len(S) > eval_n else S.copy()
        X_eval = X_clean.sample(eval_n, random_state=config["RANDOM_SEED"]) if len(X_clean) > eval_n else X_clean.copy()

        # 1) Per-feature metrics (KS, normalised Wasserstein, histogram JS); also saved to CSV.
        per_feat = per_feature_report_numeric(X_eval, S_eval, qc_topk=config["QC_TOPK"])
        per_feat_csv = os.path.join(config["OUT_DIR"], f"drift_per_feature_{gen_name.lower()}.csv")
        per_feat.to_csv(per_feat_csv, index=False)

        # 2) Correlation differences (Pearson and Spearman)
        corr_stats = corr_frobenius_diff(X_eval, S_eval)

        # 3) Classifier two-sample AUC; a failure is reported and recorded as NaN
        #    so the remaining metrics and generators still run.
        try:
            auc = classifier_two_sample_auc(X_eval, S_eval, random_state=config["RANDOM_SEED"])
        except Exception as e:
            print("Classifier test failed:", e)
            auc = float("nan")

        # 4) MMD on the numeric columns only
        num_cols_eval = [c for c in X_eval.columns if pd.api.types.is_numeric_dtype(X_eval[c])]
        if num_cols_eval:
            Xn = X_eval[num_cols_eval].to_numpy(dtype=float)
            Sn = S_eval[num_cols_eval].to_numpy(dtype=float)
            mmd = gaussian_mmd(Xn, Sn, sigma=None, max_samples=5000, random_state=config["RANDOM_SEED"])
        else:
            mmd = float("nan")

        # Summary printing: raw metric values only; nothing here compares them
        # against config["QUALITY_THRESHOLDS"].
        print("\nSUMMARY for", gen_name)
        print(f"  Worst KS (numeric): {float(per_feat.loc[:, 'KS'].max()):.4f}")
        print(f"  Worst Wasserstein_norm: {float(per_feat.loc[:, 'Wasserstein_norm'].max()):.4f}")
        print(f"  Worst JS: {float(per_feat.loc[:, 'JS'].max()):.4f}")
        print(f"  Pearson corr Frobenius diff: {corr_stats['pearson_frob']:.4f}")
        print(f"  Spearman corr Frobenius diff: {corr_stats['spearman_frob']:.4f}")
        print(f"  Classifier two-sample AUC: {auc:.4f} (0.5 ~ indistinguishable)")
        print(f"  MMD (Gaussian kernel): {mmd:.4f}")

        # Collect everything produced for this generator.
        results_summary[gen_name] = {
            "synthetic_csv": out_csv,
            "per_feature_csv": per_feat_csv,
            "per_feature_stats": per_feat.to_dict(orient="records"),
            "corr_stats": corr_stats,
            "two_sample_auc": float(auc),
            "mmd": float(mmd),
        }

        # Save a JSON summary per generator
        summary_path = os.path.join(config["OUT_DIR"], f"summary_{gen_name.lower()}.json")
        with open(summary_path, "w") as f:
            json.dump(results_summary[gen_name], f, indent=2)
        print(f"Saved summary JSON: {summary_path}")

    # Save combined summary (all generators in one file)
    combined_path = os.path.join(config["OUT_DIR"], "combined_summaries.json")
    with open(combined_path, "w") as f:
        json.dump(results_summary, f, indent=2)
    print(f"\nAll generator runs complete. Combined summary saved to: {combined_path}")
    return results_summary

# ----------------------------- Execute ---------------------------------------
if __name__ == "__main__":
    # EDIT: change CONFIG["CSV_PATH"] before running.
    # Exit early with an explanatory message when the configured CSV does not exist.
    if not os.path.exists(CONFIG["CSV_PATH"]):
        raise SystemExit(f"Please set CONFIG['CSV_PATH'] to your dataset CSV (current: {CONFIG['CSV_PATH']})")
    # `results` maps each generator name to the summary dict returned by run_pipeline.
    results = run_pipeline(CONFIG)


In [None]:
pip install sdv

In [5]:
from sdv.tabular import GaussianCopula

ModuleNotFoundError: No module named 'sdv.tabular'

In [6]:
# Diagnostic cell: print the installed SDV package summary (sub-packages, version, file location).
import sdv
help(sdv)

Help on package sdv:

NAME
    sdv - Top-level package for SDV.

PACKAGE CONTENTS
    _utils
    cag (package)
    constraints (package)
    data_processing (package)
    datasets (package)
    errors
    evaluation (package)
    io (package)
    lite (package)
    logging (package)
    metadata (package)
    metrics (package)
    multi_table (package)
    sampling (package)
    sequential (package)
    single_table (package)
    utils (package)
    version (package)

DATA
    __all__ = ['constraints', 'data_processing', 'datasets', 'evaluation',...
    __email__ = 'info@sdv.dev'

VERSION
    1.25.0

AUTHOR
    DataCebo, Inc.

FILE
    c:\users\joy otto\appdata\local\programs\python\python312\lib\site-packages\sdv\__init__.py


