# Similarity Evaluation Analysis (SEA) Dataset A

In [1]:
#import libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os
print('Libraries imported!!')

Libraries imported!!


In [2]:
#define directory of functions and actual directory
HOME_PATH = '' #home directory of the project
FUNCTIONS_DIR = 'EVALUATION FUNCTIONS/PRIVACY'
ACTUAL_DIR = os.getcwd()

#change directory to functions directory
# (the imports below rely on the current working directory being searched for modules)
os.chdir(HOME_PATH + FUNCTIONS_DIR)
try:
    #import functions for the similarity evaluation analysis
    from similarity_evaluation import scale_data
    from similarity_evaluation import pairwise_euclidean_distance
    from similarity_evaluation import hausdorff_distance
    from similarity_evaluation import rts_similarity
finally:
    #change directory back to the actual directory, even when an import fails;
    #otherwise every relative dataset path used afterwards would be resolved
    #against the functions directory instead of the notebook directory
    os.chdir(ACTUAL_DIR)
print('Functions imported!!')

Functions imported!!


## 1. Read real and synthetic datasets
In this part, the real dataset and the synthetic datasets are read.

In [3]:
#Define global variables
#Labels of the synthetic data generators under evaluation
SYNTHESIZERS = ['GM', 'SDV', 'CTGAN', 'WGANGP']
#Labels of every dataset: the real one first, then one per synthesizer
DATA_TYPES = ['Real'] + SYNTHESIZERS

#Location of the CSV file of each dataset, keyed by dataset label
FILEPATHS = {
    'Real': HOME_PATH + 'REAL DATASETS/TRAIN DATASETS/A_Diabetes_Data_Real_Train.csv',
    'GM': HOME_PATH + 'SYNTHETIC DATASETS/GM/A_Diabetes_Data_Synthetic_GM.csv',
    'SDV': HOME_PATH + 'SYNTHETIC DATASETS/SDV/A_Diabetes_Data_Synthetic_SDV.csv',
    'CTGAN': HOME_PATH + 'SYNTHETIC DATASETS/CTGAN/A_Diabetes_Data_Synthetic_CTGAN.csv',
    'WGANGP': HOME_PATH + 'SYNTHETIC DATASETS/WGANGP/A_Diabetes_Data_Synthetic_WGANGP.csv',
}

#Columns that are converted to integer category codes after loading
categorical_columns = [
    'gender',
    'age',
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
    'max_glu_serum',
    'A1Cresult',
    'change',
    'diabetesMed',
    'readmitted',
]

#Container for the loaded datasets, keyed by dataset label
data = {}

In [4]:
#iterate over all datasets filepaths and read each dataset
for name, path in FILEPATHS.items() :
    data[name] = pd.read_csv(path)

#encode every categorical column with ONE category list shared by all datasets.
#Encoding each dataset on its own would derive the codes from the labels present
#in that dataset only, so the same label could get a different integer code in
#the real and in a synthetic dataset whenever their label sets differ.
for col in categorical_columns :
    #labels seen in this column in any dataset (missing values are not categories)
    all_values = pd.concat([data[name][col] for name in FILEPATHS], ignore_index=True)
    shared_categories = all_values.astype('category').cat.categories
    for name in FILEPATHS :
        #missing values are encoded as -1, as .cat.codes does
        data[name][col] = pd.Categorical(data[name][col], categories=shared_categories).codes
data

{'Real':        encounter_id  patient_nbr  gender  age  admission_type_id  \
 0          81844290        94788       0    7                  0   
 1         396159158    135023315       1    5                  0   
 2          31258956     18397782       1    8                  0   
 3         210691074     67509558       1    8                  0   
 4         104902980     23272362       0    7                  0   
 ...             ...          ...     ...  ...                ...   
 81407      31296060      3344202       1    7                  0   
 81408     159139902     93611655       1    6                  4   
 81409     232191828     85600899       1    7                  2   
 81410       6740700      8208234       0    6                  5   
 81411      60115668     77943780       0    4                  5   
 
        discharge_disposition_id  admission_source_id  time_in_hospital  \
 0                             0                    6                 4   
 1          

## 2. Normalize data

In [5]:
#Scale the data
#The columns to scale are chosen from the real dataset and the same selection
#is then applied to every dataset, so all scaled frames share one column set.
#NOTE(review): the categorical columns encoded with .cat.codes do not appear in
#the scaled output (their codes are presumably stored as int8, which this dtype
#filter does not include) - confirm that leaving them out of the distance
#computations is intended.
num_cols = data['Real'].select_dtypes(include=['int64', 'float64']).columns
scaled_data = {name: scale_data(data[name][num_cols]) for name in DATA_TYPES}
scaled_data

{'Real':        encounter_id  patient_nbr  time_in_hospital  num_lab_procedures  \
 0          0.184364     0.000500          0.230769            0.358779   
 1          0.892534     0.712594          0.000000            0.312977   
 2          0.070393     0.097095          0.230769            0.328244   
 3          0.474664     0.356285          0.153846            0.404580   
 4          0.236317     0.122821          0.769231            0.259542   
 ...             ...          ...               ...                 ...   
 81407      0.070476     0.017649          0.076923            0.259542   
 81408      0.358516     0.494041          0.307692            0.473282   
 81409      0.523106     0.451764          0.153846            0.412214   
 81410      0.015152     0.043319          0.846154            0.580153   
 81411      0.135409     0.411353          0.000000            0.007634   
 
        num_procedures  num_medications  number_outpatient  number_emergency  \
 0        

## 3. Calculate the Euclidean distances between each pair of values

In [6]:
#Pairwise Euclidean distance result of each synthetic dataset against the real
#dataset, keyed by synthesizer label (synthetic data is passed first, real second)
distances_values = {
    name: pairwise_euclidean_distance(scaled_data[name].values, scaled_data['Real'].values)
    for name in SYNTHESIZERS
}

distances_values

{'GM': '0.6841 ± 0.2458',
 'SDV': '1.2019 ± 0.1669',
 'CTGAN': '0.8133 ± 0.2601',
 'WGANGP': '1.1408 ± 0.2914'}

## 4. Calculate the Hausdorff distance between synthetic data and real data

In [7]:
#Hausdorff distance result of each synthetic dataset against the real dataset,
#keyed by synthesizer label (synthetic data is passed first, real second)
hausdorff_values = {
    name: hausdorff_distance(scaled_data[name].values, scaled_data['Real'].values)
    for name in SYNTHESIZERS
}

hausdorff_values

{'GM': 0.6733, 'SDV': 1.1917, 'CTGAN': 1.3428, 'WGANGP': 1.5305}

## 5. Calculate RTS similarity (minimum, mean and maximum)

In [8]:
#RTS similarity result of each synthetic dataset against the real dataset,
#keyed by synthesizer label (synthetic data is passed first, real second)
str_values = {
    name: rts_similarity(scaled_data[name].values, scaled_data['Real'].values)
    for name in SYNTHESIZERS
}

str_values

{'GM': {'min': 0.0159, 'mean': 0.7764, 'max': 1.0},
 'SDV': {'min': 0.0342, 'mean': 0.69, 'max': 0.9869},
 'CTGAN': {'min': 0.0048, 'mean': 0.7621, 'max': 1.0},
 'WGANGP': {'min': 0.0016, 'mean': 0.6033, 'max': 0.9999}}