In [4]:
!pip install sdv pandas scikit-learn matplotlib seaborn


Collecting sdv
  Downloading sdv-1.26.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.40.30-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.40.30-py3-none-any.whl.metadata (5.7 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.3-py3-none-any.whl.metadata (9.5 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.17.0 (from sdv)
  Downloading rdt-1.18.1-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.23.0-py3-none-any.whl.metadata (9.4 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.15.0,>=0.14.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s

In [5]:
from google.colab import files
uploaded = files.upload()

Saving creditcard.csv to creditcard.csv


In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load uploaded dataset
df = pd.read_csv("creditcard.csv")
print("Shape:", df.shape)

# Scale Time and Amount
scaler = StandardScaler()
df[['Time', 'Amount']] = scaler.fit_transform(df[['Time', 'Amount']])

# Split train/test
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42, stratify=df['Class'])

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


Shape: (284807, 31)
Train shape: (227845, 31)
Test shape: (56962, 31)


In [7]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
import time

# Auto-detect metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=train_df)

# CTGAN synthesizer
synthesizer = CTGANSynthesizer(metadata=metadata, epochs=100, batch_size=500, verbose=True)

t0 = time.time()
synthesizer.fit(train_df)
print("Training finished in {:.1f} seconds".format(time.time() - t0))

# Generate synthetic
synthetic = synthesizer.sample(num_rows=len(train_df))
print("Generated synthetic shape:", synthetic.shape)

# Preview synthetic
synthetic.head()


Gen. (-3.18) | Discrim. (-0.12): 100%|██████████| 10/10 [04:04<00:00, 24.46s/it]


Training finished in 1242.1 seconds
Generated synthetic shape: (227845, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,1.392422,1.376191,0.678161,-2.479935,-1.746897,1.838316,-0.89781,-1.071403,0.099064,-0.736785,...,0.434339,0.541519,0.236915,0.02755,-0.917447,0.425847,0.401674,0.226557,0.294491,1
1,0.662781,-1.561263,1.165459,0.238129,3.221139,0.401492,-0.444623,0.515828,0.990073,1.537424,...,0.307103,0.229585,-0.112959,-1.555788,-0.127399,-0.098131,0.071738,-0.057105,-0.295764,0
2,-0.629615,1.338936,-1.400478,-0.733326,-2.032423,-1.448417,3.922522,-1.496456,-0.391093,0.942305,...,-0.159078,0.075532,0.078456,0.03933,-0.446409,0.98529,0.045776,0.113445,-0.010296,0
3,0.239907,-12.762601,1.991095,1.278242,0.884015,1.652225,-5.905386,-3.972871,-2.293907,-2.067733,...,-0.807002,0.084885,0.379799,-0.047703,-0.204243,0.637151,2.586005,0.025401,1.620489,0
4,1.543337,-3.133889,-1.643872,-2.238136,0.613339,0.448849,-1.544689,-0.862014,0.116371,-1.234519,...,0.511263,-0.586558,0.248476,-1.559365,-0.252168,-0.546865,0.202233,-0.063994,-0.158495,1


In [8]:
synthetic.to_csv("synthetic_train.csv", index=False)
from google.colab import files
files.download("synthetic_train.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
# Utility evaluation: train on REAL vs train on SYNTHETIC, test on REAL test set
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Use the existing dataframes we already created in Colab
real_train = train_df.copy()
real_test = test_df.copy()
synth_train = synthetic.copy()

TARGET = "Class"

# Prepare X/y
X_real = real_train.drop(columns=[TARGET])
y_real = real_train[TARGET].astype(int)

X_test = real_test.drop(columns=[TARGET])
y_test = real_test[TARGET].astype(int)

X_synth = synth_train.drop(columns=[TARGET])
y_synth = synth_train[TARGET].astype(int)

def train_and_eval(X_train, y_train, X_test, y_test, label):
    clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    probas = clf.predict_proba(X_test)[:,1]
    print("\n Eval:", label, "\n")
    print(classification_report(y_test, preds, digits=4))
    print("ROC AUC: {:.4f}".format(roc_auc_score(y_test, probas)))

# 1) Train on real, test on real
train_and_eval(X_real, y_real, X_test, y_test, "Train on REAL")

# 2) Train on synthetic, test on real
train_and_eval(X_synth, y_synth, X_test, y_test, "Train on SYNTHETIC")



=== Eval: Train on REAL ===

              precision    recall  f1-score   support

           0     0.9997    0.9999    0.9998     56864
           1     0.9419    0.8265    0.8804        98

    accuracy                         0.9996     56962
   macro avg     0.9708    0.9132    0.9401     56962
weighted avg     0.9996    0.9996    0.9996     56962

ROC AUC: 0.9623

=== Eval: Train on SYNTHETIC ===

              precision    recall  f1-score   support

           0     0.9998    0.9839    0.9918     56864
           1     0.0868    0.8878    0.1582        98

    accuracy                         0.9837     56962
   macro avg     0.5433    0.9358    0.5750     56962
weighted avg     0.9982    0.9837    0.9904     56962

ROC AUC: 0.9652


In [12]:
# Workaround: generate synthetic pool and extract fraud (Class==1) samples
import pandas as pd
import math
from time import time

target_n = 2000   # how many fraud examples we want (adjustable)
batch_size = 10000
max_rounds = 25   # stop after this many batches to avoid infinite loops

fraud_rows = []
generated = 0
t0 = time()

for r in range(max_rounds):
    batch = synthesizer.sample(num_rows=batch_size)
    generated += len(batch)
    # ensure 'Class' is present and integer-like
    if 'Class' not in batch.columns:
        print("ERROR: 'Class' column missing in generated data.")
        break
    fraud_batch = batch[batch['Class'] == 1]
    if len(fraud_batch) > 0:
        fraud_rows.append(fraud_batch)
    found = sum(len(df) for df in fraud_rows) if fraud_rows else 0
    print(f"Round {r+1}: generated {len(batch)}, found fraud so far: {found}")
    if found >= target_n:
        break

total_fraud = pd.concat(fraud_rows, ignore_index=True) if fraud_rows else pd.DataFrame(columns=batch.columns)
total_fraud = total_fraud.head(target_n)  # trim to exact target
print("\n Result ")
print("Total synthetic rows generated:", generated)
print("Total synthetic fraud rows collected:", len(total_fraud))
print("\nFirst 5 fraud examples:\n")
print(total_fraud.head().to_string(index=False))


Round 1: generated 10000, found fraud so far: 3285

=== Result ===
Total synthetic rows generated: 10000
Total synthetic fraud rows collected: 2000

First 5 fraud examples:

     Time         V1       V2        V3        V4         V5        V6        V7        V8        V9       V10      V11       V12       V13        V14       V15       V16       V17       V18       V19       V20       V21       V22       V23       V24       V25       V26       V27       V28    Amount  Class
-1.125966 -15.094955 8.659724 -8.507307 13.224168 -10.397174 -2.951941 -6.774206  2.149823  2.442560 -7.632504 4.067225 -2.307019  1.119052  -3.172735 -0.108829 -0.911656 -0.459504 -3.733025  0.634175  0.319689  1.787806 -3.738201 -0.094587  0.137328  0.806928 -0.246123  1.469582  0.810483 -0.353229      1
-0.275223  -2.740046 7.611116 -2.190586  2.938401  -0.207021 -0.332565 -3.866952  0.691045 -0.644362  1.009611 3.600105 -0.636049  1.010557 -13.474025 -0.661921 -1.673700 -2.386826 -2.744681 -0.351444 -0.215214

In [13]:
# Build hybrid training set: real negatives + synthetic fraud positives, then evaluate
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils import shuffle
import numpy as np

# Use variables already in memory:
# real_train, real_test, total_fraud (from previous step), synthetic (full pool)
# If names differ, adapt them accordingly.

# 1) get all real negative (Class==0) from real_train
real_neg = real_train[real_train['Class'] == 0].copy()
len_neg = len(real_neg)
print("Real negatives (train):", len_neg)

# 2) use the synthetic fraud examples we collected (total_fraud). If you used a different var, replace it.
synthetic_pos = total_fraud.copy()  # should contain 'Class'==1
print("Synthetic fraud examples available:", len(synthetic_pos))

# If synthetic_pos has fewer rows than needed, we can sample with replacement; here we will use up to len_neg to avoid extreme imbalance
# Choose number of positives to use (you can adjust). We'll use min(len_neg//50, len(synthetic_pos)) to create ~2% positives similar to original ratio.
target_pos = min(max(len_neg // 50, len(synthetic_pos)), len(synthetic_pos))
# Explanation: original fraud ratio ~0.17% -> len_neg//50 gives ~2% positives; adjust as needed.
print("Using synthetic positives (target):", target_pos)

synthetic_pos_sample = synthetic_pos.sample(n=target_pos, replace=(target_pos > len(synthetic_pos)), random_state=42)

# 3) build hybrid df: combine real_neg + synthetic_pos_sample
hybrid_df = pd.concat([real_neg, synthetic_pos_sample], ignore_index=True)
hybrid_df = shuffle(hybrid_df, random_state=42).reset_index(drop=True)

print("Hybrid training shape (rows, cols):", hybrid_df.shape)
print("Hybrid class distribution:\n", hybrid_df['Class'].value_counts())

# 4) train & evaluate
X_h = hybrid_df.drop(columns=['Class'])
y_h = hybrid_df['Class'].astype(int)

X_test = real_test.drop(columns=['Class'])
y_test = real_test['Class'].astype(int)

clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
clf.fit(X_h, y_h)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)[:,1]

print("\nEval: Train on HYBRID (real negatives + synthetic positives)\n")
print(classification_report(y_test, preds, digits=4))
print("ROC AUC: {:.4f}".format(roc_auc_score(y_test, probas)))


Real negatives (train): 227451
Synthetic fraud examples available: 2000
Using synthetic positives (target): 2000
Hybrid training shape (rows, cols): (229451, 31)
Hybrid class distribution:
 Class
0    227451
1      2000
Name: count, dtype: int64

=== Eval: Train on HYBRID (real negatives + synthetic positives) ===

              precision    recall  f1-score   support

           0     0.9988    0.9999    0.9993     56864
           1     0.8182    0.2755    0.4122        98

    accuracy                         0.9986     56962
   macro avg     0.9085    0.6377    0.7058     56962
weighted avg     0.9984    0.9986    0.9983     56962

ROC AUC: 0.9449


In [14]:
# Rebuild hybrid set with higher proportion of synthetic frauds (~10% of negatives)
target_pos = min(len(real_neg) // 10, len(synthetic_pos))  # ~22k frauds if available, but we only have 2000
print("Using synthetic positives (target):", target_pos)

synthetic_pos_sample = synthetic_pos.sample(n=target_pos, replace=(target_pos > len(synthetic_pos)), random_state=42)

hybrid_df2 = pd.concat([real_neg, synthetic_pos_sample], ignore_index=True)
hybrid_df2 = shuffle(hybrid_df2, random_state=42).reset_index(drop=True)

print("Hybrid2 training shape:", hybrid_df2.shape)
print("Class distribution:\n", hybrid_df2['Class'].value_counts())

X_h2 = hybrid_df2.drop(columns=['Class'])
y_h2 = hybrid_df2['Class'].astype(int)

clf2 = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
clf2.fit(X_h2, y_h2)
preds2 = clf2.predict(X_test)
probas2 = clf2.predict_proba(X_test)[:,1]

print("\n=== Eval: Train on HYBRID2 (more synthetic frauds) ===\n")
print(classification_report(y_test, preds2, digits=4))
print("ROC AUC: {:.4f}".format(roc_auc_score(y_test, probas2)))


Using synthetic positives (target): 2000
Hybrid2 training shape: (229451, 31)
Class distribution:
 Class
0    227451
1      2000
Name: count, dtype: int64

=== Eval: Train on HYBRID2 (more synthetic frauds) ===

              precision    recall  f1-score   support

           0     0.9988    0.9999    0.9993     56864
           1     0.8182    0.2755    0.4122        98

    accuracy                         0.9986     56962
   macro avg     0.9085    0.6377    0.7058     56962
weighted avg     0.9984    0.9986    0.9983     56962

ROC AUC: 0.9449


In [39]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors

os.makedirs("images", exist_ok=True)

# 1) Save distribution plots for each feature
features_to_plot = ["Amount", "V1", "V2", "V3"]
for feat in features_to_plot:
    plt.figure(figsize=(8,4))
    sns.kdeplot(train_df[feat], label="Real", fill=True, alpha=0.4)
    sns.kdeplot(synthetic[feat], label="Synthetic", fill=True, alpha=0.4)
    plt.title(f"Distribution of {feat}: Real vs Synthetic")
    plt.legend()
    plt.tight_layout()
    fname = f"images/dist_{feat}.png"
    plt.savefig(fname, dpi=180)
    plt.close()
    print("Saved:", fname)

# 2) Nearest-neighbor privacy distances
real_no_class = real_train.drop(columns=['Class'])
synth_no_class = synthetic.drop(columns=['Class'])

nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(real_no_class.values)
distances, indices = nbrs.kneighbors(synth_no_class.values)
distances = distances.ravel()

metrics = {
    "nearest_neighbor": {
        "min": float(np.min(distances)),
        "25%": float(np.percentile(distances, 25)),
        "median": float(np.median(distances)),
        "mean": float(np.mean(distances)),
        "75%": float(np.percentile(distances, 75)),
        "max": float(np.max(distances))
    },
    "n_synthetic": int(len(synth_no_class)),
    "n_real": int(len(real_no_class))
}

# Save JSON metrics
with open("images/privacy_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)
print("Saved: images/privacy_metrics.json")

md_text = f"""## Privacy metrics (Nearest-neighbor distances)

- Min: {metrics['nearest_neighbor']['min']:.4f}
- 25%: {metrics['nearest_neighbor']['25%']:.4f}
- Median: {metrics['nearest_neighbor']['median']:.4f}
- Mean: {metrics['nearest_neighbor']['mean']:.4f}
- 75%: {metrics['nearest_neighbor']['75%']:.4f}
- Max: {metrics['nearest_neighbor']['max']:.4f}
"""
with open("images/privacy_metrics.md", "w") as f:
    f.write(md_text)
print("Saved: images/privacy_metrics.md")

# 3) Save distances histogram
plt.figure(figsize=(8,4))
plt.hist(distances, bins=100, color="purple", alpha=0.7)
plt.title("Histogram of Synthetic→Nearest Real Distance")
plt.xlabel("Distance")
plt.ylabel("Frequency")
plt.tight_layout()
hist_fname = "images/distance_histogram.png"
plt.savefig(hist_fname, dpi=180)
plt.close()
print("Saved:", hist_fname)

# 4)  Save small sample CSVs
real_sample = real_train.sample(n=500, random_state=42)
synth_sample = synthetic.sample(n=500, random_state=42)
real_sample.to_csv("images/real_sample_500.csv", index=False)
synth_sample.to_csv("images/synth_sample_500.csv", index=False)
print("Saved: images/real_sample_500.csv and images/synth_sample_500.csv")

# 5) Print summary
print("\nSummary metrics:")
for k,v in metrics['nearest_neighbor'].items():
    print(f"{k}: {v:.4f}")


Saved: images/dist_Amount.png
Saved: images/dist_V1.png
Saved: images/dist_V2.png
Saved: images/dist_V3.png
Saved: images/privacy_metrics.json
Saved: images/privacy_metrics.md
Saved: images/distance_histogram.png
Saved: images/real_sample_500.csv and images/synth_sample_500.csv

Summary metrics:
min: 1.4385
25%: 3.5252
median: 4.3770
mean: 6.1002
75%: 7.4857
max: 26.0296


In [41]:
!zip -r images.zip images
from google.colab import files
files.download("images.zip")


  adding: images/ (stored 0%)
  adding: images/real_sample_500.csv (deflated 54%)
  adding: images/privacy_metrics.md (deflated 19%)
  adding: images/privacy_metrics.json (deflated 36%)
  adding: images/synth_sample_500.csv (deflated 53%)
  adding: images/dist_V3.png (deflated 12%)
  adding: images/distance_histogram.png (deflated 21%)
  adding: images/dist_V2.png (deflated 15%)
  adding: images/dist_V1.png (deflated 13%)
  adding: images/dist_Amount.png (deflated 18%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>