In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read real data
df=pd.read_csv('Original_embeddings/kripkeOutput.csv')
X_real=df.drop(columns=['relative_runtime'])
y_real=df['relative_runtime']

# Read MIN generated Data
df=pd.read_csv('MIN_embeddings/kripke_MIN.csv')
X_min=df.drop(columns=['relative_runtime'])
y_min=df['relative_runtime']

# Read be_great generated data
df1=pd.read_csv('be_greatEmbeddings/kripke_beGreat.csv')
X_beGreat=df1.drop(columns=['relative_runtime'])
y_beGreat=df1['relative_runtime']

# Read be_great generated data
df1=pd.read_csv('CTGAN_embeddings/kripke_CTGAN.csv')
X_ctgan=df1.drop(columns=['relative_runtime'])
y_ctgan=df1['relative_runtime']

In [3]:
print(X_real.shape)
print(y_real.shape)

(210, 65)
(210,)


In [4]:
X_real_train, X_real_test, y_real_train, y_real_test = train_test_split(
    X_real, y_real, test_size=0.3, random_state=42
)

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_real_train)

X_real_train = scaler.transform(X_real_train)
X_real_test  = scaler.transform(X_real_test)
X_ctgan      = scaler.transform(X_ctgan)
X_min        = scaler.transform(X_min)
X_beGreat    = scaler.transform(X_beGreat)

## Maximum Mean Discrepancy
#### It tests the difference between the real and generated data distrbution

In [6]:
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

def compute_mmd(X, Y, gamma=None):
    if gamma is None:
        gamma = 1.0 / X.shape[1]

    K_xx = rbf_kernel(X, X, gamma=gamma)
    K_yy = rbf_kernel(Y, Y, gamma=gamma)
    K_xy = rbf_kernel(X, Y, gamma=gamma)

    return K_xx.mean() + K_yy.mean() - 2 * K_xy.mean()

mmd_ctgan = compute_mmd(X_real_test, X_ctgan)
mmd_min   = compute_mmd(X_real_test, X_min)
mmd_beGreat=compute_mmd(X_real_test,X_beGreat)
print(mmd_ctgan)
print(mmd_min)
print(mmd_beGreat)

0.05716934221972658
0.2008693647029487
0.3438467085717972


In [7]:
print(X_real_test.shape)
print(X_ctgan.shape)
print(X_beGreat.shape)

(63, 65)
(500, 65)
(10, 65)


# Co-relation


In [8]:
from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=0.0)
X_real_f = vt.fit_transform(X_real_test)
X_ctgan_f = vt.transform(X_ctgan)
X_min_f = vt.transform(X_min)
X_beGreat_f=vt.transform(X_beGreat)

In [9]:
def marginal_error(X_real, X_gen):
    mean_err = np.mean(np.abs(X_real.mean(axis=0) - X_gen.mean(axis=0)))
    std_err  = np.mean(np.abs(X_real.std(axis=0)  - X_gen.std(axis=0)))
    return mean_err, std_err

mean_ctgan, std_ctgan   = marginal_error(X_real_f, X_ctgan_f)
mean_min,   std_min     = marginal_error(X_real_f, X_min_f)
mean_beGreat,std_beGreat= marginal_error(X_real_f,X_beGreat_f)

In [10]:
def remove_constant_columns(*arrays):
    std = np.std(arrays[0], axis=0)
    mask = std > 0
    return [a[:, mask] for a in arrays]

X_real_f, X_ctgan_f, X_min_f ,X_beGreat_f = remove_constant_columns(
    X_real_f, X_real_f, X_min_f, X_beGreat_f
)

In [11]:
def corr_error(X_real, X_gen):
    corr_real = np.corrcoef(X_real, rowvar=False)
    corr_gen  = np.corrcoef(X_gen,  rowvar=False)
    return np.linalg.norm(corr_real - corr_gen, ord='fro')

corr_ctgan = corr_error(X_real_f, X_ctgan_f)
corr_min   = corr_error(X_real_f, X_min_f)
corr_beGreat = corr_error(X_real_f, X_beGreat_f)

print(corr_ctgan)
print(corr_min)
print(corr_beGreat)

0.0
10.297698458586014
5.655544883697136


# TSTR (Train on Synthetic, Test on Real):

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def tstr_regression(X_syn, y_syn, X_real, y_real):
    reg = RandomForestRegressor(
        n_estimators=200,
        random_state=42
    )
    reg.fit(X_syn, y_syn)
    y_pred = reg.predict(X_real)

    mse  = mean_squared_error(y_real, y_pred)
    rmse = np.sqrt(mse)

    return {
        "RMSE": rmse,
        "MAE": mean_absolute_error(y_real, y_pred),
        "R2": r2_score(y_real, y_pred)
    }

res_ctgan   = tstr_regression(X_ctgan,   y_ctgan,   X_real_test, y_real_test)
res_min     = tstr_regression(X_min,     y_min,     X_real_test, y_real_test)
res_beGreat = tstr_regression(X_beGreat, y_beGreat, X_real_test, y_real_test)

print(res_ctgan)
print(res_min)
print(res_beGreat)

{'RMSE': np.float64(0.5297977941547412), 'MAE': 0.42313490379209545, 'R2': -0.27194253895341425}
{'RMSE': np.float64(0.7089765974007567), 'MAE': 0.6486178296349218, 'R2': -1.2777759348710553}
{'RMSE': np.float64(0.43024165270371467), 'MAE': 0.2979784727746034, 'R2': 0.16117356796081073}


# Coverage

In [None]:
from sklearn.neighbors import NearestNeighbors

def coverage(X_real, X_gen, epsilon):
    nn = NearestNeighbors(n_neighbors=1).fit(X_gen)
    distances, _ = nn.kneighbors(X_real)
    return np.mean(distances < epsilon)

coverage_ctgan    = coverage(X_real_test, X_ctgan, epsilon=2)
coverage_min      = coverage(X_real_test, X_min,   epsilon=2)
coverage_beGreat  = coverage(X_real_test, X_beGreat,   epsilon=2)
print(coverage_ctgan)
print(coverage_min)
print(coverage_beGreat)

With epsilon=1, the MIN dataset shows better coverage than CTGAN, because:
coverage is literally counting the fraction of real points that have a generated neighbor within distance 1.
MIN achieves ~14% coverage, while CTGAN achieves 0%.(epilson=1 is strict coverage)
So for this strict “pointwise closeness” metric, MIN data is “closer” to some real points than CTGAN.

When epilson=1, I got 0.0 and 0.14 (which is really strict)

When epilson=2, I got
0.7142857142857143 and 
1.0
But when I keep this to more than 1 like 5, both of them have coverage of 1

In [None]:
from sklearn.neighbors import NearestNeighbors

def nn_distance_ratio(X_gen, X_real):
    nn_real = NearestNeighbors(n_neighbors=1).fit(X_real)
    d_real, _ = nn_real.kneighbors(X_gen)

    nn_gen = NearestNeighbors(n_neighbors=2).fit(X_gen)
    d_gen, _ = nn_gen.kneighbors(X_gen)

    return np.mean(d_gen[:,1] / (d_real[:,0] + 1e-8))

nn_ratio_ctgan     = nn_distance_ratio(X_ctgan, X_real_train)
nn_ratio_min       = nn_distance_ratio(X_min,   X_real_train)
nn_ratio_beGreat   = nn_distance_ratio(X_beGreat,   X_real_train)
print(nn_ratio_ctgan)
print(nn_ratio_min)
print(nn_ratio_beGreat)

# Privacy / Memorization Check

Positive gap is expected in most synthetic data generators — they see the training data, so points tend to be closer to it.
Smaller positive gap → generator generalizes better (less overfitting)

Larger positive gap → generator may be memorizing training points

In [None]:
def train_test_nn_gap(X_gen, X_train, X_test):
    nn_train = NearestNeighbors(n_neighbors=1).fit(X_train)
    nn_test  = NearestNeighbors(n_neighbors=1).fit(X_test)

    d_train, _ = nn_train.kneighbors(X_gen)
    d_test,  _ = nn_test.kneighbors(X_gen)

    return d_test.mean() - d_train.mean()

gap_ctgan       = train_test_nn_gap(X_ctgan, X_real_train, X_real_test)
gap_min         = train_test_nn_gap(X_min,   X_real_train, X_real_test)
gap_beGreat     = train_test_nn_gap(X_beGreat,X_real_train, X_real_test)
print(gap_ctgan)
print(gap_min)
print(gap_beGreat)