# Importing and Transforming data

In [3]:
import pandas as pd
import numpy as np

real_data = pd.read_csv('real_ctdata.csv')
syn_data = pd.read_csv('ctgan_synthetic_382.csv')

# Cleaning data

## 1. Drop columns 

In [20]:
real_data = real_data.loc[:, ~real_data.columns.str.contains('^Unnamed|id|ID')]
syn_data = syn_data.loc[:, ~syn_data.columns.str.contains('^Unnamed|ID|id')]
real_data = real_data.dropna()
syn_data = syn_data.dropna()

## 2. Encode categorical columns and normalize numerical columns (if applicable)

In [13]:
# Categorical columns that are string types
cat_list = ['ADMISSION_TYPE', 'INSURANCE', 'ETHNICITY', 'GENDER']
# Categorical columns that are numerical types
numcat_list = [ 'HOSPITAL_EXPIRE_FLAG','EXPIRE_FLAG']
# Numerical columns
num_list = ['LOS','certain conditions originating in the perinatal period',
       'complications of pregnancy, childbirth, and the puerperium',
       'congenital anomalies',
       'diseases of the blood and blood-forming organs',
       'diseases of the circulatory system',
       'diseases of the digestive system',
       'diseases of the genitourinary system',
       'diseases of the musculoskeletal system and connective tissue',
       'diseases of the nervous system', 'diseases of the respiratory system',
       'diseases of the sense organs',
       'diseases of the skin and subcutaneous tissue',
       'endocrine, nutritional and metabolic diseases, and immunity disorders',
       'external causes of injury and supplemental classification',
       'infectious and parasitic diseases', 'injury and poisoning',
       'mental disorders', 'neoplasms',
       'symptoms, signs, and ill-defined conditions']

In [14]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
le = LabelEncoder()
oe = OneHotEncoder(sparse=False)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1, 1))

def transform_cat(df):
    df_2 = df.apply(le.fit_transform)
    df_oe = oe.fit_transform(df_2)
    df_oe = pd.DataFrame(df_oe)
    return df_oe

def transform_numcat(df):
    df_oe = oe.fit_transform(df)
    df_oe = pd.DataFrame(df_oe)
    return df_oe

def transform_num(df):
    df_2 = scaler.fit_transform(df)
    df_2 = pd.DataFrame(df_2)
    return df_2


In [16]:
numcat_train = transform_numcat(real_data[numcat_list])
num_train = transform_num(real_data[num_list])
cat_train = transform_cat(real_data[cat_list])

numcat_test = transform_numcat(syn_data[numcat_list])
num_test = transform_num(syn_data[num_list])
cat_test = transform_cat(syn_data[cat_list])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [18]:
# Integrate datasets
x_train = pd.concat([numcat_test, cat_train], axis=1, sort=False)
x_train = pd.concat([x_train, num_train], axis=1, sort=False)

x_test = pd.concat([numcat_test, cat_test], axis=1, sort=False)
x_test = pd.concat([x_test, num_test], axis=1, sort=False)


(93083, 151)
(93000, 113)


## 3. Reshape for modeling

In [None]:
x_train = np.array(x_train)
x_test = np.array(x_test)
# Flatten the data into vectors
x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))
print(x_train.shape)
print(x_test.shape)

# Autoencoder

### Autoencoder Structure Building

A single fully-connected neural layer as encoder and decoder

In [102]:
from keras.layers import Input, Dense
from keras.models import Model
def modeling_autoencoder(latent_dim, x_train):
    original_dim= x_train.shape[1]

    # this is our input placeholder
    input_data = Input(shape=(original_dim,))
    # "encoded" is the encoded representation of the input
    encoded = Dense(latent_dim, activation='relu')(input_data)
    # "decoded" is the lossy reconstruction of the input
    decoded = Dense(original_dim, activation='sigmoid')(encoded)

    # this model maps an input to its reconstruction (Define a model that would turn input_data into decoded output)
    autoencoder = Model(input_data, decoded)
    
    #### Create a separate encoder model ####
    # this model maps an input to its encoded representation
    encoder = Model(input_data, encoded)
    
    #### as well as the decoder model ####
    # create a placeholder for an encoded (assigned # of dimensions) input
    encoded_input = Input(shape=(latent_dim,))
    # retrieve the last layer of the autoencoder model
    decoder_layer = autoencoder.layers[-1]
    # create the decoder model
    decoder = Model(encoded_input, decoder_layer(encoded_input)) 
    
    #### Autoencoder model training ####
    autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
    
    autoencoder.fit(x_train, x_train,
                epochs=50,
                batch_size=256,
                shuffle=True,
                validation_split = 0.2)
    
    return encoder, decoder


    

### Autoencoder Model Inference

In [None]:
trained_encoder = modeling_autoencoder(1, x_train)[0]
encoded_testdata = trained_encoder.predict(x_test)
encoded_traindata = trained_encoder.predict(x_train)

# Calculate Similarity Score

In [None]:
### Decide the bins by yourself:
# The upper bound should be 2 more steps more than the maximum value of both vectors
# Controling the whole length of the bins to around 200 woyld be optimal 

bins = np.arange(0,2100,20)

real_inds = pd.DataFrame(np.digitize(encoded_traindata, bins), columns = ['inds'])
syn_inds = pd.DataFrame(np.digitize(encoded_testdata, bins), columns = ['inds'])



In [None]:
def identify_probs(table,column):
    counts = table[column].value_counts()
    freqs = {counts.index[i]: counts.values[i] for i in range(len(counts.index))}
    for i in range(1, len(bins)+1):
        if i not in freqs.keys():
            freqs[i] = 0
    sorted_freqs = {}
    for k in sorted(freqs.keys()):
        sorted_freqs[k] = freqs[k]
    probs = []
    for k,v in sorted_freqs.items():
        probs.append(v/len(table[column]))
    return sorted_freqs, np.array(probs)

In [None]:
from scipy.spatial import distance

real_p = identify_probs(real_inds,'inds')[1]
syn_p = identify_probs(syn_inds,'inds')[1]
def cos_similarity(p,q):
    return 1 - distance.cosine(p, q)
cos_similarity(real_p,syn_p)

# Dimension Reduction Visualization

### Extract 5-dimensional data from autoencoder

In [None]:
trained_encoder = modeling_autoencoder(5, x_train)[0]
encoded_testdata = trained_encoder.predict(x_test)
encoded_traindata = trained_encoder.predict(x_train)

## 1. PCA

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# T-sne visualization
pca = PCA(n_components=2, random_state = 0)
pca_train = pca.fit_transform(encoded_traindata)
pca_test = pca.fit_transform(encoded_testdata)
pca_train_df = pd.DataFrame(data = pca_train, columns = ('Dim_1','Dim_2'))
pca_test_df = pd.DataFrame(data = pca_test, columns = ('Dim_1','Dim_2'))

plt.figure(figsize = [14, 5])
plt.subplot(121)
plt.title('Original dataset')
plt.scatter(pca_train_df['Dim_1'],pca_train_df['Dim_2'], marker = 'o')
plt.xlabel('Dimension 1',fontsize=14)
plt.ylabel('Dimension 2',fontsize=14)
# plt.axis([-1.0, 2.0, -0.5, 1.5]) 

plt.subplot(122)
plt.title('Synthetic dataset')
plt.scatter(pca_test_df['Dim_1'],pca_test_df['Dim_2'], marker = 'o')
plt.xlabel('Dimension 1',fontsize=14)
plt.ylabel('Dimension 2',fontsize=14)
# plt.axis([-1.0, 2.0, -0.5, 1.5])

plt.show()

## 2. T-SNE

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# T-sne visualization
tsne = TSNE(n_components = 2, random_state = 0)
tsne_train = tsne.fit_transform(encoded_traindata)
tsne_test = tsne.fit_transform(encoded_testdata)
tsne_train_df = pd.DataFrame(data = tsne_train, columns = ('Dim_1','Dim_2'))
tsne_test_df = pd.DataFrame(data = tsne_test, columns = ('Dim_1','Dim_2'))

plt.figure(figsize = [14, 5])
plt.subplot(121)
plt.title('Original dataset')
plt.scatter(tsne_train_df['Dim_1'],tsne_train_df['Dim_2'], marker = 'o')
plt.xlabel('Dimension 1',fontsize=14)
plt.ylabel('Dimension 2',fontsize=14)
# plt.axis([-30, 40, -40, 40])

plt.subplot(122)
plt.title('Synthetic dataset')
plt.scatter(tsne_test_df['Dim_1'],tsne_test_df['Dim_2'], marker = 'o')
plt.xlabel('Dimension 1',fontsize=14)
plt.ylabel('Dimension 2',fontsize=14)
# plt.axis([-30, 40, -40, 40])

plt.show()