# Synthetic Dataset Expansion Using the Conditional Tabular Generative Adversarial Network (CTGAN) Method

The cleaned dataset `Sleep_Study.csv` is used as the input for the expansion.

In [2]:
%pip install sdv

Collecting sdv
  Using cached sdv-1.32.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.42.18-py3-none-any.whl.metadata (6.8 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.42.18-py3-none-any.whl.metadata (5.9 kB)
Collecting cloudpickle>=2.1.0 (from sdv)
  Downloading cloudpickle-3.1.2-py3-none-any.whl.metadata (7.1 kB)
Collecting graphviz>=0.13.2 (from sdv)
  Using cached graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting tqdm>=4.29 (from sdv)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting copulas>=0.12.1 (from sdv)
  Using cached copulas-0.12.3-py3-none-any.whl.metadata (9.5 kB)
Collecting ctgan>=0.11.1 (from sdv)
  Using cached ctgan-0.11.1-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Using cached deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.18.2 (from sdv)
  Using cached rdt-1.18.2-py3-none-any.whl.metadata (10 kB)
Collecting



In [3]:
# ============================================================
# Sleep_Study → expand to 1000 rows with CTGAN (SDV)
# - Handles mixed types (categorical + numeric)
# - Enforces rounding for integer-like fields
# - Enforces min/max bounds for numerics
# - Validates proportions & numeric drift
# ============================================================
from sdv.single_table import CTGANSynthesizer, GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata


In [4]:
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

In [5]:
# -------------------------------
# 1) Load data
# -------------------------------
# Forward slashes are accepted on Windows, macOS and Linux alike, whereas a
# backslash-separated relative path only resolves on Windows.
DATA_PATH = "../Data_Cleaning/Sleep_Study.csv"
df = pd.read_csv(DATA_PATH)
print("Loaded:", df.shape)

Loaded: (253, 23)


In [6]:
# -------------------------------
# 2) Column lists (from your dataset introspection)
# -------------------------------
# Columns treated as categorical.
categorical_cols = [
    'GENDER',
    'CL_YEAR',
    'LNO',
    'DEP_STATUS',
    'ANX_STATUS',
    'STRESS',
    'ALCO_USE',
]

# Columns treated as numeric.
numeric_cols = [
    'GPA',
    'COG_ZSCORE',
    'PS_QUAL',
    'DEP_SCORE',
    'ANX_SCORE',
    'STR_SCORE',
    'DAS_SCORE',
    'HAPPINESS',
    'DRINKS',
    'WD_BED',
    'WD_RISE',
    'WD_SLEEP',
    'WE_BED',
    'WeekendRise',
    'WE_SLEEP',
    'AVG_SLEEP',
]

# Subset of the numeric columns that hold whole numbers (scores, counts);
# these are rounded after sampling.
integer_cols = [
    'PS_QUAL',
    'DEP_SCORE',
    'ANX_SCORE',
    'STR_SCORE',
    'DAS_SCORE',
    'HAPPINESS',
    'DRINKS',
]


In [7]:
# -------------------------------
# 3) Prepare training data
# -------------------------------
# Work on a copy so the loaded frame stays untouched.
train_df = df.copy()

# IMPORTANT: CTGAN needs categoricals as plain object dtype (not pandas 'category')
for name in categorical_cols:
    if name not in train_df.columns:
        continue
    train_df[name] = train_df[name].astype(object)

# Columns with at most one distinct non-null value (constant or all-NaN) are removed
const_or_allna = [
    name
    for name in train_df.columns
    if train_df[name].nunique(dropna=True) <= 1
]
if const_or_allna:
    train_df = train_df.drop(columns=const_or_allna)
    # Keep the three column lists in step with the columns that remain.
    categorical_cols, numeric_cols, integer_cols = (
        [name for name in group if name not in const_or_allna]
        for group in (categorical_cols, numeric_cols, integer_cols)
    )


In [8]:
# -------------------------------
# 4) SDV metadata
# -------------------------------
# Start from auto-detection on the training frame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(train_df)

# Then override the detected sdtypes with the intended ones,
# categorical columns first and numeric columns second.
forced_sdtypes = (
    (categorical_cols, "categorical"),
    (numeric_cols, "numerical"),
)
for group, sdtype in forced_sdtypes:
    for name in group:
        if name in metadata.columns:
            metadata.update_column(name, sdtype=sdtype)

print("Metadata ready.")

Metadata ready.


In [17]:
# -------------------------------
# 5) Train CTGAN (tuned for stability on this size)
# -------------------------------
# Seed the global NumPy and PyTorch generators before fitting so that a
# Restart & Run All on CPU retrains a comparable model instead of a
# different one on every run (best-effort reproducibility).
import torch

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

synth = CTGANSynthesizer(
    metadata,
    epochs=800,                 # a bit longer for tighter marginals
    batch_size=100,             # kept a multiple of pac
    pac=10,                     # default = 10; batch_size above is a multiple of it
    verbose=True,
    enforce_min_max_values=True,
    enforce_rounding=True,      # helps keep integer-like numerics as ints
    cuda=False                  # force CPU (safer if CUDA not configured)
)

# Fit
synth.fit(train_df)

Gen. (-4.30) | Discrim. (0.25): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 800/800 [02:18<00:00,  5.79it/s]


In [19]:
# -------------------------------
# 6) Sample to reach 1000 rows
# -------------------------------
TARGET_ROWS = 1000
# Number of synthetic rows still missing; never negative.
need_rows = max(0, TARGET_ROWS - len(df))
print("Need to generate:", need_rows)

if need_rows > 0:
    synth_new = synth.sample(num_rows=need_rows)

    # Post-process: ensure types + light rounding/clipping safety
    for c in categorical_cols:
        if c in synth_new.columns:
            synth_new[c] = synth_new[c].astype(object)

    for c in numeric_cols:
        if c in synth_new.columns and c in df.columns:
            synth_new[c] = pd.to_numeric(synth_new[c], errors="coerce")
            # Optional extra safety: clip to observed range
            synth_new[c] = synth_new[c].clip(df[c].min(), df[c].max())

    # Ensure integer-like columns are whole numbers
    for c in integer_cols:
        if c in synth_new.columns:
            synth_new[c] = pd.to_numeric(synth_new[c], errors="coerce").round().astype('Int64')

    # Columns removed before training because they were constant are absent
    # from the sample. Restore their single observed value so the reindex
    # below does not turn them into NaN for every synthetic row. Columns that
    # are entirely missing in the original have no value to restore and stay NaN.
    for c in df.columns:
        if c not in synth_new.columns:
            observed = df[c].dropna().unique()
            if len(observed) == 1:
                synth_new[c] = observed[0]

    # Reassemble in original column order: original rows first, synthetic rows after
    out = pd.concat(
        [df.reset_index(drop=True),
         synth_new.reindex(columns=df.columns)],
        ignore_index=True
    )
else:
    # Already at or above the target: nothing to generate
    out = df.copy()

print("Final shape:", out.shape)


Need to generate: 747
Final shape: (1000, 23)


In [21]:
# -------------------------------
# 7) Validation summaries
# -------------------------------
def cat_props(frame, cols, k=20):
    """Return the ``k`` most frequent levels of each listed column as proportions.

    Columns in ``cols`` that are not present in ``frame`` are skipped.
    Missing values are counted as a level of their own, and the proportions
    are rounded to three decimals.

    Returns a dict mapping column name to a Series of proportions.
    """
    present = (name for name in cols if name in frame.columns)
    return {
        name: frame[name].value_counts(normalize=True, dropna=False).head(k).round(3)
        for name in present
    }

print("\n=== Categorical Proportions (original vs expanded) ===")
# Proportions per categorical column, before and after expansion.
orig_cat = cat_props(df, categorical_cols)
new_cat = cat_props(out, categorical_cols)

for name in categorical_cols:
    # Only report columns that exist in the original frame.
    if name not in orig_cat:
        continue
    print(f"\n[{name}]")
    print("Original:\n", orig_cat[name])
    print("Expanded:\n", new_cat[name])

def numeric_summary(frame, cols):
    """Return rounded descriptive statistics for the listed columns.

    Columns in ``cols`` that are not present in ``frame`` are skipped, the
    same way ``cat_props`` treats absent columns, instead of raising
    ``KeyError``.

    Returns a DataFrame indexed by column name with the statistics
    mean, std, min, 25%, 50%, 75% and max rounded to three decimals, or an
    empty DataFrame when no requested column is available.
    """
    present = [c for c in (cols or []) if c in frame.columns]
    if not present:
        return pd.DataFrame()
    stats = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
    return frame[present].describe().T[stats].round(3)

print("\n=== Numeric Summary (original vs expanded) ===")
print("Original:\n", numeric_summary(df, numeric_cols))
print("\nExpanded:\n", numeric_summary(out, numeric_cols))

# `out` is the original rows followed by the synthetic rows, so everything
# after the first len(df) rows is the generated part. The drift checks below
# compare the original against that part only: testing `df` against `out`
# would compare a sample with a superset of itself, which breaks the
# independence assumption of the two-sample KS test and understates drift.
synth_part = out.iloc[len(df):]
if synth_part.empty:
    print("\nNo synthetic rows were generated; skipping drift checks.")
else:
    print(f"\nDrift checks compare the original rows with the {len(synth_part)} synthetic rows only.")

# KS tests to quantify distribution similarity (lower KS_stat & higher p are better)
ks_rows = []
for c in [col for col in numeric_cols if col in out.columns]:
    # Cast to float so nullable integer columns are passed as plain numeric arrays
    real_vals = df[c].dropna().astype(float)
    synth_vals = synth_part[c].dropna().astype(float)
    if real_vals.empty or synth_vals.empty:
        continue  # nothing to compare for this column
    dstat, pval = ks_2samp(real_vals, synth_vals)
    ks_rows.append((c, float(dstat), float(pval)))
ks_df = pd.DataFrame(ks_rows, columns=["column","KS_stat","p_value"]).sort_values("KS_stat")
print("\n=== Kolmogorov–Smirnov tests (numeric) ===")
print(ks_df)

# Correlation drift: mean absolute difference between the two correlation matrices
common_num = [c for c in numeric_cols if c in df.columns and c in out.columns]
if common_num and not synth_part.empty:
    corr_orig = df[common_num].corr(numeric_only=True)
    corr_new  = synth_part[common_num].corr(numeric_only=True)
    corr_diff = (corr_new - corr_orig).abs().mean().mean()
    print("\nMean abs correlation drift:", round(float(corr_diff), 3))



=== Categorical Proportions (original vs expanded) ===

[GENDER]
Original:
 GENDER
Male      0.597
Female    0.403
Name: proportion, dtype: float64
Expanded:
 GENDER
Male      0.666
Female    0.334
Name: proportion, dtype: float64

[CL_YEAR]
Original:
 CL_YEAR
Sophomore    0.375
Senior       0.225
Junior       0.213
Freshman     0.186
Name: proportion, dtype: float64
Expanded:
 CL_YEAR
Sophomore    0.386
Junior       0.262
Senior       0.193
Freshman     0.159
Name: proportion, dtype: float64

[LNO]
Original:
 LNO
Neither    0.644
Owl        0.194
Lark       0.162
Name: proportion, dtype: float64
Expanded:
 LNO
Neither    0.659
Owl        0.187
Lark       0.154
Name: proportion, dtype: float64

[DEP_STATUS]
Original:
 DEP_STATUS
normal      0.826
moderate    0.134
severe      0.040
Name: proportion, dtype: float64
Expanded:
 DEP_STATUS
normal      0.794
moderate    0.151
severe      0.055
Name: proportion, dtype: float64

[ANX_STATUS]
Original:
 ANX_STATUS
normal      0.715
moderate  

In [23]:
# -------------------------------
# 8) Save
# -------------------------------
# Write the expanded table to CSV without the index column.
OUT_CSV = "SStudy.csv"
out.to_csv(path_or_buf=OUT_CSV, index=False)
print(f"\nSaved → {OUT_CSV}")


Saved → SStudy.csv


In [25]:
# -------------------------------
# 9) OPTIONAL: Gaussian Copula baseline (toggle if desired)
# -------------------------------
USE_COPULA = False  # set to True to also generate a copula-based expansion
if USE_COPULA:
    gc = GaussianCopulaSynthesizer(
        metadata,
        enforce_min_max_values=True,
        enforce_rounding=True
    )
    # `metadata` was detected from train_df, so fit on that same frame;
    # fitting on df would mismatch the metadata whenever columns were
    # dropped during preparation.
    gc.fit(train_df)
    need_rows_gc = max(0, TARGET_ROWS - len(df))
    if need_rows_gc > 0:
        gc_new = gc.sample(need_rows_gc)
        out_gc = pd.concat([df, gc_new.reindex(columns=df.columns)], ignore_index=True)
    else:
        # Already at or above the target: nothing to generate
        out_gc = df.copy()
    # Use a separate file so the CTGAN expansion saved as "SStudy.csv" is not overwritten.
    OUT_CSV_COPULA = "SStudy_copula.csv"
    out_gc.to_csv(OUT_CSV_COPULA, index=False)
    print(f"Also saved → {OUT_CSV_COPULA}")