In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from tensorflow.keras.layers import Input,Dense,Lambda
from tensorflow.keras.models import Model
from tensorflow import keras
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler

# Load and Define Data

In [2]:
df = pd.read_csv('../Data/SYN PUMA= 100  GEOID= 24001000100.csv')
df

Unnamed: 0,AGEP,SEX,ESR,RAC1P,NP,HHT,HINCP,HUPAC,WIF,GEOID10,SANITYID
0,44,1,7,1,2,6,2,3,2,24001000100,0
1,44,2,2,1,2,6,1,4,4,24001000100,0
2,20,2,2,1,2,1,3,4,2,24001000100,0
3,31,1,2,1,3,2,3,2,2,24001000100,0
4,63,1,7,1,3,5,5,4,3,24001000100,0
...,...,...,...,...,...,...,...,...,...,...,...
3713,41,2,7,1,3,6,4,4,3,24001000100,0
3714,35,1,2,1,3,1,4,2,3,24001000100,0
3715,49,2,7,1,2,1,2,4,4,24001000100,0
3716,67,2,1,1,2,1,4,4,1,24001000100,0


In [3]:
df.shape

(3718, 11)

# Data Preprocessing

In [4]:
X_train,X_test=train_test_split(df,test_size=400)


In [5]:
X_train

Unnamed: 0,AGEP,SEX,ESR,RAC1P,NP,HHT,HINCP,HUPAC,WIF,GEOID10,SANITYID
253,67,1,7,1,3,1,2,4,2,24001000100,0
1541,38,2,7,1,2,1,4,4,3,24001000100,0
784,76,2,7,1,1,1,5,4,3,24001000100,0
2835,41,1,7,1,1,1,1,4,2,24001000100,0
2535,35,1,2,1,3,1,1,4,2,24001000100,0
...,...,...,...,...,...,...,...,...,...,...,...
3059,83,1,7,1,2,6,2,2,4,24001000100,0
540,80,2,7,1,7,6,3,4,2,24001000100,0
2879,57,1,7,1,3,1,1,4,4,24001000100,0
957,1,1,1,1,2,1,1,4,1,24001000100,0


In [6]:
minmax=MinMaxScaler()
X_train = minmax.fit_transform(X_train)
X_test = minmax.fit_transform(X_test)

In [7]:
X_train

array([[0.6875    , 0.        , 1.        , ..., 0.33333333, 0.        ,
        0.        ],
       [0.38541667, 1.        , 1.        , ..., 0.66666667, 0.        ,
        0.        ],
       [0.78125   , 1.        , 1.        , ..., 0.66666667, 0.        ,
        0.        ],
       ...,
       [0.58333333, 0.        , 1.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.1875    , 1.        , 1.        , ..., 0.33333333, 0.        ,
        0.        ]])

In [8]:
def fit_batchsize(X,batch_size):
    n_size = (len(X)//batch_size)*batch_size
    X = X[0:n_size]

    return X
batch_size = 100
X_train = fit_batchsize(X_train, batch_size)
X_test = fit_batchsize(X_test, batch_size)

In [9]:
type(X_test)

numpy.ndarray

# Define VAE Network

In [10]:
class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

In [11]:
latent_dim = 2

encoder_inputs = keras.Input(shape=(11,))
x = layers.Reshape((11,1))(encoder_inputs)
x = layers.Conv1D(12,3, activation="relu", strides=1, padding="same")(x)
x = layers.Conv1D(24,3,activation="relu", strides=1, padding="same")(x)
x = layers.Flatten()(x)
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
encoder.summary()

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 11)]         0                                            
__________________________________________________________________________________________________
reshape (Reshape)               (None, 11, 1)        0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 11, 12)       48          reshape[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 11, 24)       888         conv1d[0][0]                     
____________________________________________________________________________________________

In [12]:
latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(11 * 24, activation="relu")(latent_inputs)
x = layers.Reshape((11, 24))(x)
x = layers.Conv1DTranspose(24, 3, activation="relu", strides=1, padding="same")(x)
x = layers.Conv1DTranspose(12, 3, activation="relu", strides=1, padding="same")(x)
x = layers.Conv1DTranspose(1, 3, activation="sigmoid", padding="same")(x)
decoder_outputs=layers.Reshape((11,))(x)
decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 2)]               0         
_________________________________________________________________
dense (Dense)                (None, 264)               792       
_________________________________________________________________
reshape_1 (Reshape)          (None, 11, 24)            0         
_________________________________________________________________
conv1d_transpose (Conv1DTran (None, 11, 24)            1752      
_________________________________________________________________
conv1d_transpose_1 (Conv1DTr (None, 11, 12)            876       
_________________________________________________________________
conv1d_transpose_2 (Conv1DTr (None, 11, 1)             37        
_________________________________________________________________
reshape_2 (Reshape)          (None, 11)                0   

In [13]:
class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def train_step(self, data):
        if isinstance(data, tuple):
            data = data[0]
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = encoder(data)
            reconstruction = decoder(z)
            reconstruction_loss = tf.reduce_mean(
                keras.losses.binary_crossentropy(data, reconstruction)
            )
            reconstruction_loss *= 11
            kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
            kl_loss = tf.reduce_mean(kl_loss)
            kl_loss *= -0.5
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {
            "loss": total_loss,
            "reconstruction_loss": reconstruction_loss,
            "kl_loss": kl_loss,
        }

# Training the VAE Network

In [14]:
vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())
history=vae.fit(X_train,
        X_train, 
        epochs=18, 
        batch_size=100)

Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


# Reconstruct and Augment Data

In [15]:
z_mean, z_log_var, z = encoder(X_train)
reconstruction=decoder(z)
X_train=tf.convert_to_tensor(X_train)

fin_X_train=X_train

for i in range(100):
    z_mean, z_log_var, z = encoder(X_train)
    reconstruction=tf.concat([reconstruction,decoder(z)],0)

print(reconstruction)


tf.Tensor(
[[5.3038853e-01 1.8047455e-01 9.8638082e-01 ... 5.5475008e-01
  4.1547418e-04 4.3499470e-04]
 [5.4254568e-01 8.3702874e-01 9.7822142e-01 ... 5.7998073e-01
  6.8873167e-04 5.7005882e-04]
 [5.3346473e-01 9.9851072e-01 8.2195938e-01 ... 5.5475068e-01
  8.3755833e-05 1.0669325e-04]
 ...
 [4.9826902e-01 9.9448037e-01 1.4860260e-01 ... 5.1490110e-01
  4.7498941e-04 5.4073334e-04]
 [4.9640143e-01 2.2233725e-03 2.9579768e-01 ... 4.7763178e-01
  1.1720657e-03 1.4424622e-03]
 [5.4760617e-01 9.9482042e-01 9.5923376e-01 ... 5.6587797e-01
  1.1663375e-04 1.2294427e-04]], shape=(333300, 11), dtype=float32)


In [16]:
reconstruction=minmax.inverse_transform(reconstruction)
reconstruction

array([[4.86134139e+01, 1.18047455e+00, 6.91828489e+00, ...,
        2.66425025e+00, 2.40010001e+10, 4.34994698e-04],
       [4.96589282e+01, 1.83702874e+00, 6.86932850e+00, ...,
        2.73994219e+00, 2.40010001e+10, 5.70058823e-04],
       [4.88779668e+01, 1.99851072e+00, 5.93175626e+00, ...,
        2.66425204e+00, 2.40010001e+10, 1.06693253e-04],
       ...,
       [4.58511358e+01, 1.99448037e+00, 1.89161563e+00, ...,
        2.54470330e+00, 2.40010001e+10, 5.40733337e-04],
       [4.56905229e+01, 1.00222337e+00, 2.77478606e+00, ...,
        2.43289533e+00, 2.40010001e+10, 1.44246221e-03],
       [5.00941306e+01, 1.99482042e+00, 6.75540257e+00, ...,
        2.69763392e+00, 2.40010001e+10, 1.22944271e-04]])

In [17]:
reconstruction=np.round(reconstruction)
reconstruction

array([[4.90000000e+01, 1.00000000e+00, 7.00000000e+00, ...,
        3.00000000e+00, 2.40010001e+10, 0.00000000e+00],
       [5.00000000e+01, 2.00000000e+00, 7.00000000e+00, ...,
        3.00000000e+00, 2.40010001e+10, 0.00000000e+00],
       [4.90000000e+01, 2.00000000e+00, 6.00000000e+00, ...,
        3.00000000e+00, 2.40010001e+10, 0.00000000e+00],
       ...,
       [4.60000000e+01, 2.00000000e+00, 2.00000000e+00, ...,
        3.00000000e+00, 2.40010001e+10, 0.00000000e+00],
       [4.60000000e+01, 1.00000000e+00, 3.00000000e+00, ...,
        2.00000000e+00, 2.40010001e+10, 0.00000000e+00],
       [5.00000000e+01, 2.00000000e+00, 7.00000000e+00, ...,
        3.00000000e+00, 2.40010001e+10, 0.00000000e+00]])

In [18]:
reconstruction=pd.DataFrame(reconstruction)

In [19]:
reconstruction.to_csv('reconstruction.csv')