In [2]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
# Source files for the experiment. Both entries currently hold the same
# placeholder string; point each one at its own CSV before running.
ORIGINAL_CSV_PATH = 'path'
SYNTHETIC_CSV_PATH = 'path'

# Real records, and the generated records they are compared against below.
original_df = pd.read_csv(ORIGINAL_CSV_PATH)
synthetic_df = pd.read_csv(SYNTHETIC_CSV_PATH)


# **MVAE-EA**

In [6]:
# INFERENCE ATTACK MODEL
# Trains a logistic-regression "attacker" that tries to tell real rows from
# synthetic rows using a single feature column, then prints classification
# metrics and the gap in mean predicted probability between the two groups.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# settings for this experiment, kept in one place so they are easy to tune
ATTACK_FEATURE = 'Rating'   # the only column the attacker is allowed to see
ATTACK_TEST_SIZE = 0.2      # fraction of the combined rows held out for evaluation
ATTACK_SEED = 42            # shared seed for the shuffle and the train/test split

# aligning the datasets by matching their indices
common_indices = original_df.index.intersection(synthetic_df.index)
if len(common_indices) == 0:
    # fail here with a clear message instead of deep inside train_test_split
    raise ValueError("original_df and synthetic_df share no index labels; nothing to compare")

# Keep only the shared rows and attach the binary label. .assign() returns a
# new frame, so the label is never written into a selection taken from the
# source frames (no chained-assignment warning, and the cell is safe to re-run).
aligned_original_df = original_df.loc[common_indices].assign(is_training=1)    # 1 for real (training)
aligned_synthetic_df = synthetic_df.loc[common_indices].assign(is_training=0)  # 0 for synthetic (non-training)

# combine and shuffle the aligned data
combined_df = pd.concat([aligned_original_df, aligned_synthetic_df]).sample(
    frac=1, random_state=ATTACK_SEED
)

# extracting the features and labels
# scikit-learn expects a 2-D feature matrix, hence the reshape to one column
X_combined = combined_df[ATTACK_FEATURE].values.reshape(-1, 1)
y_combined = combined_df['is_training'].values  # target labels (1 if real, 0 if synthetic)

# splitting the combined data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y_combined, test_size=ATTACK_TEST_SIZE, random_state=ATTACK_SEED
)

# initialising and training a logreg binary classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# predicting on the test set
y_pred = classifier.predict(X_test)
y_pred_proba = classifier.predict_proba(X_test)[:, 1]  # probability of the "real" class (label 1)

# using classification metrics for evaluation
# zero_division=0 yields the same 0.0 scikit-learn returns by default when a
# metric is undefined (e.g. the classifier never predicts the positive class),
# but without emitting an UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# calculating the privacy loss (epsilon)
# NOTE: this value is the absolute difference between the mean predicted
# probability on real test rows and on synthetic test rows. It is a heuristic
# separation score, not the epsilon of formal differential privacy.
# Despite their names, the two arrays below are boolean masks over the test set.
train_indices = y_test == 1
non_train_indices = y_test == 0

epsilon = np.abs(y_pred_proba[train_indices].mean() - y_pred_proba[non_train_indices].mean())
print(f"Privacy Loss (epsilon): {epsilon}")


Accuracy: 0.4461538461538462
Precision: 0.375
Recall: 0.04285714285714286
F1 Score: 0.07692307692307693
Privacy Loss (epsilon): 0.00018479879692573364
