In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from tensorflow.keras.layers import Input,Dense,Lambda,Reshape,Conv1DTranspose, Conv1D,Flatten
from tensorflow.keras.models import Model,Sequential
from tensorflow import keras
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import time
from numpy import zeros
from numpy import ones
from numpy.random import rand
from numpy.random import randn

# Load and Define Data

In [2]:
# Read the treated PUMA-1204 CSV (path is relative to the notebook) and display it.
df = pd.read_csv(filepath_or_buffer='../Data/PUMA-1204-2012to2016-5%_Treated.csv')
df

Unnamed: 0,HINCP,NP,AGEP,RAC1P,ESR,SEX,WIF,HUPAC,HHT,PUMA,ST
0,3,4,46,1,1,2,3,2,1,1204,24
1,3,4,44,1,1,1,3,2,1,1204,24
2,3,4,13,1,0,2,3,2,1,1204,24
3,3,4,8,1,0,2,3,2,1,1204,24
4,4,3,70,2,6,1,3,4,1,1204,24
...,...,...,...,...,...,...,...,...,...,...,...
4499,4,2,73,1,6,2,1,4,1,1204,24
4500,5,4,38,1,1,1,3,2,1,1204,24
4501,5,4,36,9,1,2,3,2,1,1204,24
4502,5,4,12,1,0,1,3,2,1,1204,24


In [3]:
# Replace missing entries with the mean of their own column, then display the result.
df = df.fillna(value=df.mean())
df

Unnamed: 0,HINCP,NP,AGEP,RAC1P,ESR,SEX,WIF,HUPAC,HHT,PUMA,ST
0,3,4,46,1,1,2,3,2,1,1204,24
1,3,4,44,1,1,1,3,2,1,1204,24
2,3,4,13,1,0,2,3,2,1,1204,24
3,3,4,8,1,0,2,3,2,1,1204,24
4,4,3,70,2,6,1,3,4,1,1204,24
...,...,...,...,...,...,...,...,...,...,...,...
4499,4,2,73,1,6,2,1,4,1,1204,24
4500,5,4,38,1,1,1,3,2,1,1204,24
4501,5,4,36,9,1,2,3,2,1,1204,24
4502,5,4,12,1,0,1,3,2,1,1204,24


In [4]:
# Show the table dimensions as (rows, columns).
df.shape

(4504, 11)

# Data Preprocessing

In [5]:
# Hold out 200 rows as the test split. The fixed random_state makes the split
# (and everything derived from it) reproducible across Restart & Run All;
# without it a different set of rows is held out on every run.
X_train, X_test = train_test_split(df, test_size=200, random_state=42)

In [6]:
# Fit the scaler on the training split only, then apply those same min/max
# statistics to the test split. Calling fit_transform on X_test would refit the
# scaler, so the two splits would be scaled with different ranges and `minmax`
# would end up holding test-set statistics instead of the ones the models were
# trained with.
minmax = MinMaxScaler()
X_train = minmax.fit_transform(X_train)
X_test = minmax.transform(X_test)

In [7]:
def fit_batchsize(X,batch_size):
    """Trim X so that its length is an exact multiple of batch_size.

    The leading rows are kept; any remainder at the end is dropped.

    Args:
        X: Sliceable sequence or array supporting len().
        batch_size: Batch size the returned length must be divisible by.

    Returns:
        The leading slice of X whose length is the largest multiple of
        batch_size not exceeding len(X).
    """
    leftover = len(X) % batch_size
    keep = len(X) - leftover
    return X[:keep]
# Drop trailing rows so that both splits divide evenly into batches of 10.
batch_size = 10
X_train, X_test = (fit_batchsize(split, batch_size) for split in (X_train, X_test))

In [8]:
# Cast both splits to float32. Previously only X_train was cast, so X_test was
# passed to the same models as float64 and depended on implicit conversion.
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

# Define VAE-GAN Model

In [9]:
class Sampling(layers.Layer):
    """Draws a latent vector z from (z_mean, z_log_var) via the reparameterization trick."""

    def call(self, inputs):
        # inputs is the pair (mean, log-variance), both indexed as (batch, dim).
        mu, log_variance = inputs
        mu_shape = tf.shape(mu)
        # Standard-normal noise with the same (batch, dim) shape as the mean.
        noise = tf.keras.backend.random_normal(shape=(mu_shape[0], mu_shape[1]))
        # exp(0.5 * log_var) is the standard deviation.
        sigma = tf.exp(0.5 * log_variance)
        return mu + sigma * noise


In [10]:
def encoder(latent_dim=2, n_features=11):
    """Build the convolutional VAE encoder.

    Args:
        latent_dim: Size of the latent vector.
        n_features: Number of input features per row (previously hard-coded
            to 11; the default keeps the original behavior).

    Returns:
        A keras.Model mapping a (batch, n_features) input to the list
        [z_mean, z_log_var, z], where z is sampled by the Sampling layer.
    """
    x = keras.Input(shape=(n_features,))
    # Add a channel axis so the features can be treated as a 1-D sequence.
    model = layers.Reshape((n_features, 1))(x)
    model = layers.Conv1D(12, 3, activation="relu", strides=1, padding="same")(model)
    model = layers.Conv1D(24, 3, activation="relu", strides=1, padding="same")(model)
    model = layers.Flatten()(model)
    z_mean = layers.Dense(latent_dim, name="z_mean")(model)
    z_log_var = layers.Dense(latent_dim, name="z_log_var")(model)
    z = Sampling()([z_mean, z_log_var])
    meansigma = keras.Model(x, [z_mean, z_log_var, z])
    return meansigma

In [11]:
def decgen(latent_dim=2, n_features=11):
    """Build the network used both as VAE decoder and as GAN generator.

    Args:
        latent_dim: Size of the latent input vector.
        n_features: Number of output features per row (previously hard-coded
            to 11; the default keeps the original behavior).

    Returns:
        A keras.Model mapping a (batch, latent_dim) input to a
        (batch, n_features) output; the final sigmoid keeps values in [0, 1].
    """
    x = keras.Input(shape=(latent_dim,))
    model = layers.Dense(n_features * 24, activation="relu")(x)
    model = layers.Reshape((n_features, 24))(model)
    model = layers.Conv1DTranspose(24, 3, activation="relu", strides=1, padding="same")(model)
    model = layers.Conv1DTranspose(12, 3, activation="relu", strides=1, padding="same")(model)
    model = layers.Conv1DTranspose(1, 3, activation="sigmoid", padding="same")(model)
    # Drop the single channel axis to return flat feature rows.
    model = layers.Reshape((n_features,))(model)
    model = keras.Model(x, model)
    return model
    

In [12]:
def discriminator(n_features=11):
    """Build the convolutional GAN discriminator.

    Args:
        n_features: Number of input features per row (previously hard-coded
            to 11; the default keeps the original behavior).

    Returns:
        A keras.Model mapping a (batch, n_features) input to a (batch, 1)
        sigmoid output, i.e. a probability rather than a logit.
    """
    x = keras.Input(shape=(n_features,))
    # Add a channel axis so the features can be treated as a 1-D sequence.
    model = layers.Reshape((n_features, 1))(x)
    model = layers.Conv1D(12, 3, activation="relu", strides=1, padding="same")(model)
    model = layers.Conv1D(24, 3, activation="relu", strides=1, padding="same")(model)
    model = layers.Flatten()(model)
    model = layers.Dense(1, activation="sigmoid")(model)
    output = keras.Model(x, model)
    return output

In [13]:
class VAE(keras.Model):
    """Variational autoencoder wrapping an encoder and a decoder model.

    Args:
        encoder: Model returning (z_mean, z_log_var, z) for a batch of rows.
        decoder: Model mapping z back to a reconstruction of the input rows.
        **kwargs: Forwarded to keras.Model.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def train_step(self, data):
        """Run one optimization step on a batch and return the loss values.

        Returns:
            Dict with "loss", "reconstruction_loss" and "kl_loss".
        """
        # fit(x, y) delivers a tuple; only the inputs are needed.
        if isinstance(data, tuple):
            data = data[0]
        with tf.GradientTape() as tape:
            # Use the models this instance was constructed with. The previous
            # code called the notebook-level names `encoder` / `decgen`, which
            # only worked when those globals happened to be bound to the same
            # model objects.
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            reconstruction_loss = tf.reduce_mean(
                keras.losses.binary_crossentropy(data, reconstruction)
            )
            # Scale the per-feature mean up by the number of features
            # (11 for this dataset, matching the former hard-coded factor).
            n_features = tf.cast(tf.shape(data)[-1], reconstruction_loss.dtype)
            reconstruction_loss *= n_features
            kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
            kl_loss = tf.reduce_mean(kl_loss)
            kl_loss *= -0.5
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {
            "loss": total_loss,
            "reconstruction_loss": reconstruction_loss,
            "kl_loss": kl_loss,
        }

In [14]:
class GAN(keras.Model):
  """GAN trainer that alternates discriminator and generator updates.

  Args:
    disc: Discriminator model scoring rows as real or fake.
    gen: Generator model mapping latent vectors to rows.
    latent_dim: Size of the latent vectors fed to the generator.
  """

  # initialize models with latent dimensions
  def __init__(self, disc, gen, latent_dim=2):
    super(GAN, self).__init__()
    self.discriminator = disc
    self.generator = gen
    self.latent_dim = latent_dim

  # compile with optimizers and loss function
  def compile(self, optD, optG, loss_fn):
    """Store the discriminator optimizer, generator optimizer and loss."""
    super(GAN, self).compile()
    self.optD = optD
    self.optG = optG
    self.loss_fn = loss_fn

  # custom training function
  def train_step(self, real_data):
    """Run one discriminator update followed by one generator update.

    Returns:
      Dict with "d_loss" and "g_loss".
    """
    if isinstance(real_data, tuple):
      real_data = real_data[0]

    # get current batch size
    bs = tf.shape(real_data)[0]
    z = tf.random.normal(shape=(bs, self.latent_dim))
    fake_data = self.generator(z)
    # tf.concat requires matching dtypes; align real rows with generator output.
    real_data = tf.cast(real_data, fake_data.dtype)

    # combine real and fake rows in a single tensor along with their labels
    combined_data = tf.concat([real_data, fake_data], axis=0)
    labels = tf.concat([tf.ones((bs, 1)), tf.zeros((bs, 1))], axis=0)

    # train the discriminator; only the forward pass is recorded on the tape,
    # gradients are computed and applied after leaving the context
    with tf.GradientTape() as tape:
      preds = self.discriminator(combined_data)
      d_loss = self.loss_fn(labels, preds)
    grads = tape.gradient(d_loss, self.discriminator.trainable_weights)
    self.optD.apply_gradients(zip(grads, self.discriminator.trainable_weights))

    # misleading labels for generator: fakes are labelled as real
    misleading_labels = tf.ones((bs, 1))
    z = tf.random.normal(shape=(bs, self.latent_dim))

    # train the generator; only generator weights are updated here
    with tf.GradientTape() as tape:
      fake_preds = self.discriminator(self.generator(z))
      g_loss = self.loss_fn(misleading_labels, fake_preds)
    grads = tape.gradient(g_loss, self.generator.trainable_weights)
    self.optG.apply_gradients(zip(grads, self.generator.trainable_weights))
    return {"d_loss": d_loss, "g_loss": g_loss}
# create GAN model using already built D and G

In [15]:
# Instantiate the networks. NOTE: these assignments rebind the builder-function
# names `encoder`, `decgen` and `discriminator` to model instances (later cells
# call them as models), so this cell cannot be re-run without first re-running
# the cells that define the builders.
encoder=encoder()
decgen=decgen()
vae = VAE(encoder, decgen)
vae.compile(optimizer=keras.optimizers.Adam())
discriminator = discriminator()
# The GAN shares `decgen` with the VAE, so GAN training continues from the
# decoder weights learned by the VAE.
gan = GAN(discriminator, decgen)
# compile your model with loss and optimizers
# The discriminator ends in a sigmoid and therefore outputs probabilities, not
# logits; from_logits must be False or the loss applies a second sigmoid.
gan.compile(
    keras.optimizers.Adam(),
    keras.optimizers.Adam(),
    keras.losses.BinaryCrossentropy(from_logits=False))

In [16]:
# Train the VAE for 20 epochs; the targets are the inputs themselves, and
# VAE.train_step only reads the first element of the (x, y) pair.
# NOTE(review): `batch_size` from the preprocessing cell is not passed here, so
# Keras uses its default batch size — confirm that is intended.
vae.fit(x=X_train, y=X_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1f5b988e848>

In [17]:
# Train the GAN for 20 epochs on the same scaled training rows.
# NOTE(review): `batch_size` from the preprocessing cell is not passed here, so
# Keras uses its default batch size — confirm that is intended.
gan.fit(x=X_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1f5babb7488>

In [18]:
# Encode the training rows, decode the sampled latent vectors, and display the
# resulting reconstructions (still in the scaled [0, 1] range).
z_mean, z_log_var, z = encoder(X_train)
generated_data=decgen(z)
generated_data

<tf.Tensor: shape=(4300, 11), dtype=float32, numpy=
array([[9.9473369e-01, 1.3376576e-01, 1.0022947e-01, ..., 1.0000000e+00,
        1.8294677e-11, 3.6588706e-13],
       [8.5846269e-01, 6.2751102e-01, 9.8416150e-02, ..., 1.8339843e-02,
        1.1907995e-02, 3.4481149e-05],
       [9.3095660e-01, 3.7155163e-01, 1.7663574e-01, ..., 9.8069745e-01,
        5.6144595e-04, 3.6842587e-06],
       ...,
       [9.6666193e-01, 1.8747780e-01, 3.3370593e-01, ..., 9.3422392e-07,
        3.7485026e-10, 5.7782423e-10],
       [9.8784173e-01, 1.8430734e-01, 1.4304379e-01, ..., 9.9999809e-01,
        7.7912743e-09, 1.5486953e-10],
       [9.5766485e-01, 3.1954810e-01, 3.7307432e-01, ..., 4.4721099e-05,
        9.6934812e-08, 4.5855884e-08]], dtype=float32)>

In [19]:
# Cast once to a float32 tensor so every encoder call receives the same input
# type (previously the first call got a numpy array and the rest a tensor).
X_test = tf.cast(X_test, tf.float32)

fin_X_test = X_test

# The encoder's Sampling layer draws fresh noise on each call, so every pass
# produces a different reconstruction of the same test rows. Collect the
# 1 initial + 100 repeated passes in a list and concatenate once, instead of
# re-copying the growing tensor with tf.concat on every iteration.
n_passes = 1 + 100
batches = []
for i in range(n_passes):
    z_mean, z_log_var, z = encoder(X_test)
    batches.append(decgen(z))
generated_data = tf.concat(batches, 0)

print(generated_data)


tf.Tensor(
[[8.9940596e-01 4.5875603e-01 1.7085603e-01 ... 7.6712221e-02
  3.5344362e-03 1.6990572e-05]
 [9.0090966e-01 4.5791599e-01 1.7774639e-01 ... 4.9769521e-02
  1.9700527e-03 1.2578583e-05]
 [9.3519294e-01 3.7960297e-01 2.1795657e-01 ... 2.2873238e-01
  1.8861890e-04 2.0930961e-06]
 ...
 [5.9778768e-01 4.5255604e-01 2.1429482e-01 ... 1.9189927e-05
  3.9558709e-03 1.6605140e-05]
 [9.4908142e-01 3.2546484e-01 1.6812876e-01 ... 9.9931812e-01
  4.8176626e-05 7.9916481e-07]
 [9.1406256e-01 4.4053292e-01 2.0403141e-01 ... 1.2953788e-02
  3.3870339e-04 4.5991305e-06]], shape=(20200, 11), dtype=float32)


In [20]:
# Map the scaled generator outputs back to the original feature ranges.
# NOTE(review): this overwrites generated_data, so re-running the cell would
# apply the inverse transform a second time.
generated_data=minmax.inverse_transform(generated_data)
generated_data

array([[5.59762383e+00, 4.75253618e+00, 1.60604667e+01, ...,
        1.15342444e+00, 1.20400353e+03, 2.40000170e+01],
       [5.60363865e+00, 4.74749595e+00, 1.67081602e+01, ...,
        1.09953904e+00, 1.20400197e+03, 2.40000126e+01],
       [5.74077177e+00, 4.27761781e+00, 2.04879178e+01, ...,
        1.45746475e+00, 1.20400019e+03, 2.40000021e+01],
       ...,
       [4.39115071e+00, 4.71533626e+00, 2.01437132e+01, ...,
        1.00003838e+00, 1.20400396e+03, 2.40000166e+01],
       [5.79632568e+00, 3.95278907e+00, 1.58041033e+01, ...,
        2.99863625e+00, 1.20400005e+03, 2.40000008e+01],
       [5.65625024e+00, 4.64319754e+00, 1.91789523e+01, ...,
        1.02590758e+00, 1.20400034e+03, 2.40000046e+01]])

In [21]:
# Snap every generated value to the nearest whole number, then display the result.
generated_data = np.round(generated_data, decimals=0)
generated_data

array([[6.000e+00, 5.000e+00, 1.600e+01, ..., 1.000e+00, 1.204e+03,
        2.400e+01],
       [6.000e+00, 5.000e+00, 1.700e+01, ..., 1.000e+00, 1.204e+03,
        2.400e+01],
       [6.000e+00, 4.000e+00, 2.000e+01, ..., 1.000e+00, 1.204e+03,
        2.400e+01],
       ...,
       [4.000e+00, 5.000e+00, 2.000e+01, ..., 1.000e+00, 1.204e+03,
        2.400e+01],
       [6.000e+00, 4.000e+00, 1.600e+01, ..., 3.000e+00, 1.204e+03,
        2.400e+01],
       [6.000e+00, 5.000e+00, 1.900e+01, ..., 1.000e+00, 1.204e+03,
        2.400e+01]])

In [22]:
# Wrap the array in a DataFrame and restore the source column names; the scaler
# was fit on splits of `df`, so the column order is unchanged. Without this the
# exported file would carry 0..10 as headers.
generated_data = pd.DataFrame(generated_data, columns=df.columns)

In [23]:
# Write the generated rows to a CSV file in the notebook's working directory.
generated_data.to_csv(path_or_buf='VAE-GAN reconstruction PUMA==1204.csv')