# Importing and Transforming data

In [3]:
import pandas as pd
import numpy as np

real_data = pd.read_csv('real_ctdata.csv')
syn_data = pd.read_csv('ctgan_synthetic_382.csv')

In [4]:
real_data.drop(["subject_id"], axis = 1, inplace = True)
syn_data.drop(["subject_id"], axis = 1, inplace = True)
real_data = real_data.loc[:, ~real_data.columns.str.contains('^Unnamed')]
syn_data = syn_data.loc[:, ~syn_data.columns.str.contains('^Unnamed')]

In [5]:
x_train = real_data
x_test = syn_data
cat_mask = x_train.dtypes == object
cat_list = x_train.columns[cat_mask].tolist()
# Or mask categorical columns manually

In [6]:
cat_train = x_train.select_dtypes(include=[object])
cat_test = x_test.select_dtypes(include=[object])
num_train = x_train.select_dtypes(include=[float])
num_test =  x_test.select_dtypes(include=[float])

## 1. Encode Categorical Columns

We transform the categorical columns into numerical value using label encoder on top of which we apply one-hot encoder. 

In [98]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# For training set
le = LabelEncoder()
cat_train_2 = cat_train.apply(le.fit_transform)
oe = OneHotEncoder(sparse=False)
cat_train_oe = oe.fit_transform(cat_train_2)
cat_train_oe = pd.DataFrame(cat_train_oe)

# For test set
cat_test_2 = cat_test.apply(le.fit_transform)
cat_test_oe = oe.fit_transform(cat_test_2)
cat_test_oe = pd.DataFrame(cat_test_oe)

# print out the shapes
print(cat_train_oe.shape)
print(cat_test_oe.shape)

(382, 15)
(382, 15)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


## 2. Normalize numerical data

In [99]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1, 1))
# For training set
num_train = scaler.fit_transform(num_train)
num_train = pd.DataFrame(num_train)
# For testing set
num_test = scaler.fit_transform(num_test)
num_test = pd.DataFrame(num_test)
# print out the shapes
print(num_train.shape)
print(num_test.shape)

(382, 1)
(382, 1)


In [100]:
# Integrate datasets
x_train = pd.concat([cat_train_oe, num_train], axis=1, sort=False)
x_test = pd.concat([cat_test_oe, num_test], axis=1, sort=False)
x_train = np.array(x_train)
x_test = np.array(x_test)
# Flatten the data into vectors
x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))
print(x_train.shape)
print(x_test.shape)

(382, 16)
(382, 16)


# Autoencoder Structure Building

A single fully-connected neural layer as encoder and decoder

In [101]:
# this is the size of our encoded representations
latent_dim = 1  # 1 representation vector
original_dim= x_train.shape[1]

In [102]:
from keras.layers import Input, Dense
from keras.models import Model


# this is our input placeholder
input_data = Input(shape=(original_dim,))
# "encoded" is the encoded representation of the input
encoded = Dense(latent_dim, activation='relu')(input_data)
# "decoded" is the lossy reconstruction of the input
decoded = Dense(original_dim, activation='sigmoid')(encoded)

# this model maps an input to its reconstruction (Define a model that would turn input_data into decoded output)
autoencoder = Model(input_data, decoded)

Create a separate encoder model:

In [103]:
# this model maps an input to its encoded representation
encoder = Model(input_data, encoded)

As well as the decoder model:

In [104]:
# create a placeholder for an encoded (assigned # of dimensions) input
encoded_input = Input(shape=(latent_dim,))
# retrieve the last layer of the autoencoder model
decoder_layer = autoencoder.layers[-1]
# create the decoder model
decoder = Model(encoded_input, decoder_layer(encoded_input)) 

# Autoencoder Model Training

In [105]:
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

Now let's train our autoencoder for 50 epochs:

In [None]:
autoencoder.fit(x_train, x_train,
                epochs=50,
                batch_size=256,
                shuffle=True,
                validation_split = 0.2)

# Autoencoder Model Inference

In [None]:
# encode and decode some digits
# note that we take them from the *test* set
encoded_testdata = encoder.predict(x_test)
decoded_testdata = decoder.predict(encoded_testdata)

In [None]:
encoded_traindata = encoder.predict(x_train)
decoded_traindata = decoder.predict(encoded_traindata)

# Calculate Similarity Score

In [None]:
### Decide the bins by yourself:
# The upper bound should be 2 more steps more than the maximum value of both vectors
# Controling the whole length of the bins to around 200 woyld be optimal 

bins = np.arange(0,2.3,0.01)

real_inds = pd.DataFrame(np.digitize(encoded_traindata, bins), columns = ['inds'])
syn_inds = pd.DataFrame(np.digitize(encoded_testdata, bins), columns = ['inds'])


In [None]:
def identify_probs(table,column):
    counts = table[column].value_counts()
    freqs = {counts.index[i]: counts.values[i] for i in range(len(counts.index))}
    for i in range(1, len(bins)+1):
        if i not in freqs.keys():
            freqs[i] = 0
    sorted_freqs = {}
    for k in sorted(freqs.keys()):
        sorted_freqs[k] = freqs[k]
    probs = []
    for k,v in sorted_freqs.items():
        probs.append(v/len(table[column]))
    return sorted_freqs, np.array(probs)

In [None]:
from scipy.spatial import distance

real_p = identify_probs(real_inds,'inds')[1]
syn_p = identify_probs(syn_inds,'inds')[1]
def cos_similarity(p,q):
    return 1 - distance.cosine(p, q)
cos_similarity(real_p,syn_p)

# Dimension Reduction Visualization

## 1. T-SNE

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# T-sne visualization
tsne = TSNE(n_components = 2, random_state = 0)
tsne_train = tsne.fit_transform(encoded_traindata)
tsne_test = tsne.fit_transform(encoded_testdata)
tsne_train_df = pd.DataFrame(data = tsne_train, columns = ('Dim_1','Dim_2'))
tsne_test_df = pd.DataFrame(data = tsne_test, columns = ('Dim_1','Dim_2'))

plt.figure(figsize = [14, 5])
plt.subplot(121)
plt.title('Original dataset')
plt.scatter(tsne_train_df['Dim_1'],tsne_train_df['Dim_2'], marker = 'o')
plt.xlabel('Dimension 1',fontsize=14)
plt.ylabel('Dimension 2',fontsize=14)
plt.axis([-30, 40, -40, 40])

plt.subplot(122)
plt.title('Synthetic dataset')
plt.scatter(tsne_test_df['Dim_1'],tsne_test_df['Dim_2'], marker = 'o')
plt.xlabel('Dimension 1',fontsize=14)
plt.ylabel('Dimension 2',fontsize=14)
plt.axis([-30, 40, -40, 40])

plt.show()

## 2. PCA

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# T-sne visualization
pca = PCA(n_components=2, random_state = 0)
pca_train = pca.fit_transform(encoded_traindata)
pca_test = pca.fit_transform(encoded_testdata)
pca_train_df = pd.DataFrame(data = pca_train, columns = ('Dim_1','Dim_2'))
pca_test_df = pd.DataFrame(data = pca_test, columns = ('Dim_1','Dim_2'))

plt.figure(figsize = [14, 5])
plt.subplot(121)
plt.title('Original dataset')
plt.scatter(pca_train_df['Dim_1'],pca_train_df['Dim_2'], marker = 'o')
plt.xlabel('Dimension 1',fontsize=14)
plt.ylabel('Dimension 2',fontsize=14)
plt.axis([-1.0, 2.0, -0.5, 1.5])

plt.subplot(122)
plt.title('Synthetic dataset')
plt.scatter(pca_test_df['Dim_1'],pca_test_df['Dim_2'], marker = 'o')
plt.xlabel('Dimension 1',fontsize=14)
plt.ylabel('Dimension 2',fontsize=14)
plt.axis([-1.0, 2.0, -0.5, 1.5])

plt.show()