In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

original_df = pd.read_csv('path')

# normalising the data
rating_data = original_df['Rating'].values.reshape(-1, 1)
rating_min = rating_data.min()
rating_max = rating_data.max()
rating_normalized = (rating_data - rating_min) / (rating_max - rating_min)

# setting hyperparameters
noise_dim = 100
batch_size = 64
epochs = 100
epsilon = 10  # adjusted privacy budget
delta = 1e-5   # probability of privacy violation

# calculate noise multiplier (sigma) for DP-SGD
def compute_sigma(epsilon, delta, batch_size, num_steps):
    return np.sqrt(2 * np.log(1.25 / delta)) / epsilon

num_steps = len(rating_normalized) // batch_size * epochs
sigma = compute_sigma(epsilon, delta, batch_size, num_steps)

# defining the generator
def make_generator():
    model = keras.Sequential([
        layers.Dense(256, activation="relu", input_shape=(noise_dim,)),
        layers.BatchNormalization(),
        layers.Dense(512, activation="relu"),
        layers.BatchNormalization(),
        layers.Dense(1024, activation="relu"),
        layers.BatchNormalization(),
        layers.Dense(1, activation="sigmoid")  # Sigmoid for normalized data
    ])
    return model

# defining the discriminator
def make_discriminator():
    model = keras.Sequential([
        layers.Dense(1024, activation="relu", input_shape=(1,)),
        layers.Dropout(0.3),
        layers.Dense(512, activation="relu"),
        layers.Dropout(0.3),
        layers.Dense(256, activation="relu"),
        layers.Dropout(0.3),
        layers.Dense(1, activation="sigmoid")
    ])
    return model

# creating generator and discriminator
generator = make_generator()
discriminator = make_discriminator()

# defining the DPGAN
class DPGAN(keras.Model):
    def __init__(self, discriminator, generator, noise_dim):
        super(DPGAN, self).__init__()
        self.discriminator = discriminator
        self.generator = generator
        self.noise_dim = noise_dim
        
    def compile(self, d_optimizer, g_optimizer, loss_fn):
        super(DPGAN, self).compile()
        self.d_optimizer = d_optimizer
        self.g_optimizer = g_optimizer
        self.loss_fn = loss_fn
        
    def train_step(self, real_data):
        batch_size = tf.shape(real_data)[0]
        noise = tf.random.normal([batch_size, self.noise_dim])
        
        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            generated_data = self.generator(noise, training=True)
            
            real_output = self.discriminator(real_data, training=True)
            fake_output = self.discriminator(generated_data, training=True)
            
            gen_loss = self.loss_fn(tf.ones_like(fake_output), fake_output)
            disc_loss = self.loss_fn(tf.ones_like(real_output), real_output) + \
                        self.loss_fn(tf.zeros_like(fake_output), fake_output)
        
        # applying DP-SGD to discriminator
        disc_grads = disc_tape.gradient(disc_loss, self.discriminator.trainable_variables)
        disc_grads = [g + tf.random.normal(shape=g.shape, mean=0.0, stddev=sigma * g.dtype.as_numpy_dtype(sigma)) 
                      for g in disc_grads]
        self.d_optimizer.apply_gradients(zip(disc_grads, self.discriminator.trainable_variables))
        
        # trainging generator
        gen_grads = gen_tape.gradient(gen_loss, self.generator.trainable_variables)
        self.g_optimizer.apply_gradients(zip(gen_grads, self.generator.trainable_variables))
        
        return {"d_loss": disc_loss, "g_loss": gen_loss}

# custom loss function with wasserstein distance
def wasserstein_loss(y_true, y_pred):
    return tf.reduce_mean(y_true * y_pred)

# instnatiating and compiling the DPGAN
dpgan = DPGAN(discriminator, generator, noise_dim)
dpgan.compile(
    d_optimizer=keras.optimizers.RMSprop(learning_rate=0.00005),
    g_optimizer=keras.optimizers.RMSprop(learning_rate=0.00005),
    loss_fn=wasserstein_loss
)

# training the DPGAN
history = dpgan.fit(rating_normalized, batch_size=batch_size, epochs=epochs)

# generating the synthetic data
num_samples = len(rating_normalized)
noise = tf.random.normal([num_samples, noise_dim])
generated_data = generator(noise).numpy()

# clipping generated data to [0, 1] range
generated_data = np.clip(generated_data, 0, 1)

# denormalise the generated data - to get it back under the form of ratings
synthetic_ratings = generated_data * (rating_max - rating_min) + rating_min

# creating a new df with synthetic ratings
synthetic_df = pd.DataFrame({'Rating': synthetic_ratings.flatten()})

print("Original data shape:", original_df.shape)
print("Synthetic data shape:", synthetic_df.shape)
print("Original data mean:", original_df['Rating'].mean())
print("Synthetic data mean:", synthetic_df['Rating'].mean())
print("Original data std:", original_df['Rating'].std())
print("Synthetic data std:", synthetic_df['Rating'].std())


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - d_loss: 0.5030 - g_loss: 0.5031
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - d_loss: 0.5025 - g_loss: 0.5032
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - d_loss: 0.5023 - g_loss: 0.5028
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - d_loss: 0.5025 - g_loss: 0.5022
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - d_loss: 0.5024 - g_loss: 0.5019
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - d_loss: 0.5023 - g_loss: 0.5018
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - d_loss: 0.5019 - g_loss: 0.5020
Epoch 8/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - d_loss: 0.5018 - g_loss: 0.5020
Epoch 9/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0

In [3]:
synthetic_df.to_csv('path')