In [None]:
!pip install sdv pandas scikit-learn matplotlib seaborn


In [None]:
# Colab-only step: bring the dataset into the runtime's working directory.
# The loading cell below reads the file by the fixed name "creditcard.csv",
# so the uploaded file must carry exactly that name.
from google.colab import files
# `uploaded` keeps whatever files.upload() returns; it is not read again in the cells shown here.
uploaded = files.upload()

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load uploaded dataset (left untouched below: `df` keeps the raw, unscaled values)
df = pd.read_csv("creditcard.csv")
print("Shape:", df.shape)

# Split train/test FIRST. The scaler must only ever see training rows; fitting it
# on the full frame would leak the test set's mean/variance into training and
# make every downstream "test on REAL" score slightly optimistic.
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42, stratify=df['Class'])
# Explicit copies so the column assignments below write into independent frames.
train_df = train_df.copy()
test_df = test_df.copy()

# Scale Time and Amount: statistics are learned on train, then re-used on test
scaler = StandardScaler()
scaled_cols = ['Time', 'Amount']
train_df[scaled_cols] = scaler.fit_transform(train_df[scaled_cols])
test_df[scaled_cols] = scaler.transform(test_df[scaled_cols])

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


In [None]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
import time

# Let SDV infer the column types from the training frame
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=train_df)

# Build the CTGAN model; hyper-parameters are collected in one place
ctgan_options = dict(epochs=100, batch_size=500, verbose=True)
synthesizer = CTGANSynthesizer(metadata=metadata, **ctgan_options)

# Fit on the training split and report the wall-clock duration
t0 = time.time()
synthesizer.fit(train_df)
elapsed = time.time() - t0
print(f"Training finished in {elapsed:.1f} seconds")

# Draw as many synthetic rows as there are real training rows
n_synthetic_rows = len(train_df)
synthetic = synthesizer.sample(num_rows=n_synthetic_rows)
print("Generated synthetic shape:", synthetic.shape)

# Last expression of the cell: rendered as the preview table
synthetic.head()


In [None]:
from google.colab import files

# Write the synthetic training set to disk, then hand the file to the browser.
synthetic_csv_name = "synthetic_train.csv"
synthetic.to_csv(synthetic_csv_name, index=False)
files.download(synthetic_csv_name)


In [None]:
# Utility evaluation: train on REAL vs train on SYNTHETIC, test on REAL test set
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

TARGET = "Class"

# Work on copies of the frames created by the earlier cells so that nothing
# done here can alter train_df / test_df / synthetic.
real_train = train_df.copy()
real_test = test_df.copy()
synth_train = synthetic.copy()

# Feature matrix / integer label vector for each of the three frames
X_real, y_real = real_train.drop(columns=[TARGET]), real_train[TARGET].astype(int)
X_test, y_test = real_test.drop(columns=[TARGET]), real_test[TARGET].astype(int)
X_synth, y_synth = synth_train.drop(columns=[TARGET]), synth_train[TARGET].astype(int)

def train_and_eval(X_train, y_train, X_test, y_test, label):
    """Fit a random forest on one training set and report its scores on a test set.

    Prints a header containing ``label``, the classification report for the
    hard predictions, and the ROC AUC of the predicted fraud probability.

    Args:
        X_train: Feature table used for fitting.
        y_train: Labels for ``X_train``; the fraud class is assumed to be
            encoded as ``1``, as in the rest of this notebook.
        X_test: Feature table used for evaluation.
        y_test: Ground-truth labels for ``X_test``.
        label: Free-text name of the run, shown in the printed header.

    Returns:
        None. All results are printed.
    """
    clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)

    print("\n Eval:", label, "\n")
    print(classification_report(y_test, preds, digits=4))

    # predict_proba returns one column per class *seen during fit*, ordered as
    # clf.classes_. A training set without any fraud rows (possible for
    # generated data) yields a single column, so a hard-coded [:, 1] would
    # raise IndexError. Locate the positive column explicitly instead.
    fitted_classes = list(clf.classes_)
    if 1 in fitted_classes:
        probas = clf.predict_proba(X_test)[:, fitted_classes.index(1)]
        print("ROC AUC: {:.4f}".format(roc_auc_score(y_test, probas)))
    else:
        print("ROC AUC: n/a (training data contains no Class == 1 rows)")

# Both runs are scored on the same real test set; only the training source
# differs: 1) real training data, 2) synthetic training data.
for X_fit, y_fit, run_label in (
    (X_real, y_real, "Train on REAL"),
    (X_synth, y_synth, "Train on SYNTHETIC"),
):
    train_and_eval(X_fit, y_fit, X_test, y_test, run_label)


In [None]:
# Workaround: generate synthetic pool and extract fraud (Class==1) samples
import pandas as pd
import math
# Import the module, not `from time import time`: the CTGAN cell calls
# `time.time()`, and rebinding the global name `time` to a function would make
# that cell fail with AttributeError when it is re-run after this one.
import time

target_n = 2000   # how many fraud examples we want (adjustable)
batch_size = 10000
max_rounds = 25   # stop after this many batches to avoid infinite loops

fraud_rows = []
generated = 0
found = 0             # running count of fraud rows collected so far
batch_columns = None  # column layout of the generated data, for the empty-result fallback
t0 = time.time()

for r in range(max_rounds):
    batch = synthesizer.sample(num_rows=batch_size)
    generated += len(batch)
    batch_columns = batch.columns
    # the label column is required to filter fraud rows
    if 'Class' not in batch.columns:
        print("ERROR: 'Class' column missing in generated data.")
        break
    fraud_batch = batch[batch['Class'] == 1]
    if len(fraud_batch) > 0:
        fraud_rows.append(fraud_batch)
        found += len(fraud_batch)
    print(f"Round {r+1}: generated {len(batch)}, found fraud so far: {found}")
    if found >= target_n:
        break

# Fall back to an empty frame (with the generated columns, when known) if no
# fraud row was produced or the loop never ran.
total_fraud = pd.concat(fraud_rows, ignore_index=True) if fraud_rows else pd.DataFrame(columns=batch_columns)
total_fraud = total_fraud.head(target_n)  # trim to exact target
print("\n Result ")
print("Total synthetic rows generated:", generated)
print("Total synthetic fraud rows collected:", len(total_fraud))
print("Sampling time: {:.1f} seconds".format(time.time() - t0))
if len(total_fraud) < target_n:
    # Make the shortfall visible: later cells size their training sets from this pool.
    print(f"WARNING: collected fewer fraud rows than requested ({len(total_fraud)} < {target_n}).")
print("\nFirst 5 fraud examples:\n")
print(total_fraud.head().to_string(index=False))


In [None]:
# Build hybrid training set: real negatives + synthetic fraud positives, then evaluate
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils import shuffle
import numpy as np

# Use variables already in memory:
# real_train, real_test (utility-evaluation cell) and total_fraud (fraud-collection cell).
# If names differ, adapt them accordingly.

# 1) get all real negative (Class==0) from real_train
real_neg = real_train[real_train['Class'] == 0].copy()
len_neg = len(real_neg)
print("Real negatives (train):", len_neg)

# 2) use the synthetic fraud examples we collected (total_fraud). If you used a different var, replace it.
synthetic_pos = total_fraud.copy()  # should contain 'Class'==1
print("Synthetic fraud examples available:", len(synthetic_pos))

# Number of synthetic positives to use: about 2% of the negatives (len_neg // 50,
# versus ~0.17% fraud in the original data), but never more than the pool holds.
# The previous expression min(max(a, b), b) always evaluated to b, so the 2% cap
# was silently ignored. max(..., 1) keeps at least one positive on tiny datasets.
target_pos = min(max(len_neg // 50, 1), len(synthetic_pos))
print("Using synthetic positives (target):", target_pos)

if target_pos == 0:
    # Fail before the expensive fit: with no positives the forest learns a single
    # class and predict_proba(...)[:, 1] below would raise an opaque IndexError.
    raise ValueError("No synthetic fraud rows available (total_fraud is empty); cannot build the hybrid training set.")

# target_pos never exceeds the pool size, so sampling is always without replacement
synthetic_pos_sample = synthetic_pos.sample(n=target_pos, replace=False, random_state=42)

# 3) build hybrid df: combine real_neg + synthetic_pos_sample
hybrid_df = pd.concat([real_neg, synthetic_pos_sample], ignore_index=True)
hybrid_df = shuffle(hybrid_df, random_state=42).reset_index(drop=True)

print("Hybrid training shape (rows, cols):", hybrid_df.shape)
print("Hybrid class distribution:\n", hybrid_df['Class'].value_counts())

# 4) train & evaluate
X_h = hybrid_df.drop(columns=['Class'])
y_h = hybrid_df['Class'].astype(int)

X_test = real_test.drop(columns=['Class'])
y_test = real_test['Class'].astype(int)

clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
clf.fit(X_h, y_h)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)[:,1]

print("\nEval: Train on HYBRID (real negatives + synthetic positives)\n")
print(classification_report(y_test, preds, digits=4))
print("ROC AUC: {:.4f}".format(roc_auc_score(y_test, probas)))


In [None]:
# Second hybrid experiment: aim for synthetic frauds at ~10% of the real negatives.
# Per the original note this would be ~22k rows, but the pool collected earlier
# holds only 2000, so the pool size is the effective cap.
requested_pos = len(real_neg) // 10
target_pos = min(requested_pos, len(synthetic_pos))
print("Using synthetic positives (target):", target_pos)

synthetic_pos_sample = synthetic_pos.sample(
    n=target_pos,
    replace=target_pos > len(synthetic_pos),
    random_state=42,
)

# Real negatives + sampled synthetic positives, shuffled with a fixed seed
hybrid_df2 = shuffle(
    pd.concat([real_neg, synthetic_pos_sample], ignore_index=True),
    random_state=42,
).reset_index(drop=True)

print("Hybrid2 training shape:", hybrid_df2.shape)
print("Class distribution:\n", hybrid_df2['Class'].value_counts())

y_h2 = hybrid_df2['Class'].astype(int)
X_h2 = hybrid_df2.drop(columns=['Class'])

# Same forest configuration as the first hybrid run; scored on the same X_test / y_test
clf2 = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1).fit(X_h2, y_h2)
preds2 = clf2.predict(X_test)
probas2 = clf2.predict_proba(X_test)[:, 1]

print("\n=== Eval: Train on HYBRID2 (more synthetic frauds) ===\n")
print(classification_report(y_test, preds2, digits=4))
print(f"ROC AUC: {roc_auc_score(y_test, probas2):.4f}")


In [None]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors

# All artefacts produced by this cell are written below ./images
os.makedirs("images", exist_ok=True)

# 1) One KDE overlay (real vs synthetic) per selected feature, saved as PNG
features_to_plot = ["Amount", "V1", "V2", "V3"]
for feat in features_to_plot:
    plt.figure(figsize=(8,4))
    for series_label, frame in (("Real", train_df), ("Synthetic", synthetic)):
        sns.kdeplot(frame[feat], label=series_label, fill=True, alpha=0.4)
    plt.title("Distribution of {}: Real vs Synthetic".format(feat))
    plt.legend()
    plt.tight_layout()
    fname = "images/dist_" + feat + ".png"
    plt.savefig(fname, dpi=180)
    plt.close()  # release the figure; it is only needed on disk
    print("Saved:", fname)

# 2) Nearest-neighbor privacy distances:
#    for every synthetic row, the distance to its closest real training row.
real_no_class = real_train.drop(columns=['Class'])
synth_no_class = synthetic.drop(columns=['Class'])

nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto')
nbrs.fit(real_no_class.values)
distances, indices = nbrs.kneighbors(synth_no_class.values)
distances = distances.ravel()  # (n, 1) -> (n,) since only one neighbor is requested

# Summary statistics, in the order they are written out and printed
summary_stats = (
    ("min", np.min),
    ("25%", lambda d: np.percentile(d, 25)),
    ("median", np.median),
    ("mean", np.mean),
    ("75%", lambda d: np.percentile(d, 75)),
    ("max", np.max),
)
metrics = {
    "nearest_neighbor": {stat: float(fn(distances)) for stat, fn in summary_stats},
    "n_synthetic": int(len(synth_no_class)),
    "n_real": int(len(real_no_class)),
}

# Save JSON metrics
with open("images/privacy_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)
print("Saved: images/privacy_metrics.json")

# Same numbers as a small Markdown report: one bullet per statistic
md_labels = (
    ("Min", "min"),
    ("25%", "25%"),
    ("Median", "median"),
    ("Mean", "mean"),
    ("75%", "75%"),
    ("Max", "max"),
)
md_bullets = [
    "- {}: {:.4f}".format(shown, metrics['nearest_neighbor'][key])
    for shown, key in md_labels
]
md_text = "## Privacy metrics (Nearest-neighbor distances)\n\n" + "\n".join(md_bullets) + "\n"
with open("images/privacy_metrics.md", "w") as f:
    f.write(md_text)
print("Saved: images/privacy_metrics.md")

# 3) Histogram of the nearest-neighbor distances, written to disk only
hist_fname = "images/distance_histogram.png"
hist_style = dict(bins=100, color="purple", alpha=0.7)

plt.figure(figsize=(8,4))
plt.hist(distances, **hist_style)
plt.xlabel("Distance")
plt.ylabel("Frequency")
plt.title("Histogram of Synthetic→Nearest Real Distance")
plt.tight_layout()
plt.savefig(hist_fname, dpi=180)
plt.close()
print("Saved:", hist_fname)

# 4) Export a fixed-seed sample of 500 rows from each of the real and synthetic frames
real_sample = real_train.sample(n=500, random_state=42)
synth_sample = synthetic.sample(n=500, random_state=42)
for sample_frame, sample_path in (
    (real_sample, "images/real_sample_500.csv"),
    (synth_sample, "images/synth_sample_500.csv"),
):
    sample_frame.to_csv(sample_path, index=False)
print("Saved: images/real_sample_500.csv and images/synth_sample_500.csv")

# 5) Echo the nearest-neighbor statistics, one per line
print("\nSummary metrics:")
for stat_name, stat_value in metrics['nearest_neighbor'].items():
    print("{}: {:.4f}".format(stat_name, stat_value))


In [None]:
# Bundle everything written to ./images (plots, metrics, sample CSVs) into one archive...
!zip -r images.zip images
from google.colab import files
# ...and pass the archive to files.download (Colab-only helper).
files.download("images.zip")
