In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error

# Step 1: Build a mock user-item interaction matrix
num_users = 100
num_items = 50
# Draw integer ratings in [0, 5] for every (user, item) pair ...
raw_ratings = np.random.randint(0, 6, size=(num_users, num_items))
# ... then keep only whether a rating is non-zero (1 = interaction, 0 = none).
data = (raw_ratings > 0).astype(int)

# Step 2: Define the GAN Architecture
def build_generator():
    """Return the generator network.

    Maps a length-10 noise vector through a 128-unit ReLU layer to
    ``num_items`` sigmoid outputs, one per item.
    """
    hidden = layers.Dense(128, activation='relu', input_dim=10)  # Input noise dimension
    scores = layers.Dense(num_items, activation='sigmoid')  # Output size = number of items
    return keras.Sequential([hidden, scores])

def build_discriminator():
    """Return the discriminator network.

    Takes a length-``num_items`` interaction vector and emits a single
    sigmoid output used as the real-vs-fake score.
    """
    hidden = layers.Dense(128, activation='relu', input_shape=(num_items,))
    verdict = layers.Dense(1, activation='sigmoid')  # Binary classification (real or fake)
    return keras.Sequential([hidden, verdict])

# Step 3: Compile the GAN
# Order matters here: the discriminator is compiled on its own first (while
# still trainable), and only afterwards frozen and embedded in the combined
# generator -> discriminator model.
generator = build_generator()
discriminator = build_discriminator()
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Freeze the discriminator before building the combined model so that training
# `gan` is meant to update only the generator.
# NOTE(review): this relies on Keras honouring the `trainable` value in effect
# when each model was compiled -- confirm for the Keras version in use.
discriminator.trainable = False  # Freeze discriminator during generator training
gan_input = layers.Input(shape=(10,))  # Same noise dimension the generator expects
generated_data = generator(gan_input)
gan_output = discriminator(generated_data)
gan = keras.Model(gan_input, gan_output)
gan.compile(optimizer='adam', loss='binary_crossentropy')

# Step 4: Train the GAN with Data Saving
def train_gan(epochs, batch_size, save_interval, log_interval=100):
    """Alternate discriminator and generator updates, saving samples periodically.

    Each iteration trains the discriminator on one batch of real rows from
    ``data`` (label 1) and one batch of generator output (label 0), then
    trains the generator through ``gan`` with label 1.

    Args:
        epochs: Number of training iterations (one batch each).
        batch_size: Number of real and of fake samples per iteration.
        save_interval: Every this many iterations, ``num_users`` generated
            rows are thresholded at 0.5 and written to
            ``generated_data_epoch_<epoch>.csv``. ``0`` disables saving.
        log_interval: Every this many iterations the losses are printed.
            ``0`` disables logging.
    """
    noise_dim = 10  # Must match the generator's input dimension
    # The label arrays never change, so build them once.
    real_labels = np.ones((batch_size, 1))   # Real = 1
    fake_labels = np.zeros((batch_size, 1))  # Fake = 0

    for epoch in range(epochs):
        # Train Discriminator
        idx = np.random.randint(0, num_users, batch_size)
        real_data = data[idx]

        noise = np.random.rand(batch_size, noise_dim)  # Random noise for generator
        # verbose=0: avoid printing a progress bar on every training step.
        fake_data = generator.predict(noise, verbose=0)

        d_loss_real = discriminator.train_on_batch(real_data, real_labels)
        d_loss_fake = discriminator.train_on_batch(fake_data, fake_labels)

        # Train Generator: label the fakes as real to try to trick the discriminator
        noise = np.random.rand(batch_size, noise_dim)
        g_loss = gan.train_on_batch(noise, real_labels)

        # Print losses (the discriminator returns [loss, accuracy]; index 0 is the loss)
        if log_interval and epoch % log_interval == 0:
            print(f"Epoch {epoch}: D Loss Real: {d_loss_real[0]}, D Loss Fake: {d_loss_fake[0]}, G Loss: {g_loss}")

        # Save generated data every `save_interval` epochs
        if save_interval and epoch % save_interval == 0:
            all_generated_data = generator.predict(np.random.rand(num_users, noise_dim), verbose=0)
            all_generated_data_binary = (all_generated_data > 0.5).astype(int)
            # fmt="%d": the matrix is 0/1 integers, so don't write them as
            # 18-digit scientific-notation floats.
            np.savetxt(f"generated_data_epoch_{epoch}.csv", all_generated_data_binary, delimiter=",", fmt="%d")
            print(f"Generated data saved at epoch {epoch}")

# Train the GAN: 1000 iterations, 32 samples per batch, and a generated-data
# snapshot written to CSV every 100 iterations (epochs 0, 100, ..., 900).
train_gan(epochs=1000, batch_size=32, save_interval=100)

# Step 5: Load and Compare Original and Generated Data Using SVD
# Write the original interaction matrix to CSV so that both datasets are read
# back through the same CSV round-trip.
pd.DataFrame(data).to_csv('original_data.csv', index=False, header=False)

# Load original and generated data
original_data = pd.read_csv('original_data.csv', header=None).values
generated_data = pd.read_csv('generated_data_epoch_100.csv', header=None).values  # Load generated data at epoch 100

# Fit a single SVD basis on the original data and project BOTH datasets onto
# it. Fitting a second, independent SVD on the generated data would place the
# two projections in different coordinate systems, making their difference
# meaningless. random_state pins the randomized solver for reproducible results.
svd = TruncatedSVD(n_components=10, random_state=0)

# Fit on, and transform, the original data
original_svd = svd.fit_transform(original_data)

# Project the generated data onto the components learned from the original data
generated_svd = svd.transform(generated_data)

# Step 6: Compare the Results
# NOTE(review): rows are compared position-by-position, but generated rows are
# random samples rather than reconstructions of specific users, so treat this
# MSE as a rough indicator only -- consider a distribution-level metric.
mse = mean_squared_error(original_svd, generated_svd)
print(f"Mean Squared Error between original and generated data: {mse}")


Epoch 0: D Loss Real: 0.48980164527893066, D Loss Fake: 0.9296141862869263, G Loss: 0.5091569423675537
Generated data saved at epoch 0
Epoch 100: D Loss Real: 0.3754900097846985, D Loss Fake: 0.708217442035675, G Loss: 0.6972532272338867
Generated data saved at epoch 100
Epoch 200: D Loss Real: 0.39108389616012573, D Loss Fake: 0.6736850142478943, G Loss: 0.7465229034423828
Generated data saved at epoch 200
Epoch 300: D Loss Real: 0.3225743770599365, D Loss Fake: 0.5647570490837097, G Loss: 0.9243791103363037
Generated data saved at epoch 300
Epoch 400: D Loss Real: 0.15530376136302948, D Loss Fake: 0.27876558899879456, G Loss: 1.5811638832092285
Generated data saved at epoch 400
Epoch 500: D Loss Real: 0.09302350878715515, D Loss Fake: 0.22453196346759796, G Loss: 1.904223918914795
Generated data saved at epoch 500
Epoch 600: D Loss Real: 0.067965067923069, D Loss Fake: 0.12917275726795197, G Loss: 2.3134279251098633
Generated data saved at epoch 600
Epoch 700: D Loss Real: 0.36834731