# Similarity Evaluation Analysis (SEA) Dataset B

In [1]:
#import libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os
print('Libraries imported!!')

Libraries imported!!


In [2]:
#define directory of functions and actual directory
HOME_PATH = '' #home path of the project
FUNCTIONS_DIR = 'EVALUATION FUNCTIONS/PRIVACY'
ACTUAL_DIR = os.getcwd() #remembered so that the working directory can be restored below

#change directory to functions directory
#(the imports below rely on the working directory being searched for modules)
os.chdir(HOME_PATH + FUNCTIONS_DIR)
try :
    #import functions for the similarity evaluation analysis
    from similarity_evaluation import scale_data
    from similarity_evaluation import pairwise_euclidean_distance
    from similarity_evaluation import hausdorff_distance
    from similarity_evaluation import rts_similarity
finally :
    #change directory to actual directory, even when an import fails: otherwise the kernel
    #would stay in the functions directory, the relative dataset paths used later would not
    #resolve and re-running this cell would store the wrong ACTUAL_DIR
    os.chdir(ACTUAL_DIR)
print('Functions imported!!')

Functions imported!!


## 1. Read real and synthetic datasets
In this part, the real and synthetic datasets are read.

In [3]:
#Global configuration shared by the following cells

#names of the synthetic data generators; DATA_TYPES is the same list preceded by the real data
SYNTHESIZERS = ['GM','SDV','CTGAN','WGANGP']
DATA_TYPES = ['Real', *SYNTHESIZERS]

#CSV file of each dataset, keyed by dataset name (paths are built from HOME_PATH)
FILEPATHS = {
    'Real' : HOME_PATH + 'REAL DATASETS/TRAIN DATASETS/B_Cardio_Data_Real_Train.csv',
    'GM' : HOME_PATH + 'SYNTHETIC DATASETS/GM/B_Cardio_Data_Synthetic_GM.csv',
    'SDV' : HOME_PATH + 'SYNTHETIC DATASETS/SDV/B_Cardio_Data_Synthetic_SDV.csv',
    'CTGAN' : HOME_PATH + 'SYNTHETIC DATASETS/CTGAN/B_Cardio_Data_Synthetic_CTGAN.csv',
    'WGANGP' : HOME_PATH + 'SYNTHETIC DATASETS/WGANGP/B_Cardio_Data_Synthetic_WGANGP.csv',
}

#columns that are converted to integer category codes when the datasets are read
categorical_columns = [
    'gender',
    'cholesterol',
    'gluc',
    'smoke',
    'alco',
    'active',
    'cardio',
]

#container of the loaded datasets, filled in by the reading cell
data = {}

In [4]:
#iterate over all datasets filepaths and read each dataset
for name, path in FILEPATHS.items() :
    data[name] = pd.read_csv(path)

#encode every categorical column with ONE list of categories shared by all the datasets;
#encoding each dataset on its own gives the same value different codes whenever a
#dataset does not contain every category that the other datasets contain
for col in categorical_columns :
    #categories found in the column over all the datasets, inferred by pandas as before
    shared_categories = pd.concat([data[name][col] for name in FILEPATHS]).astype('category').cat.categories
    for name in FILEPATHS :
        #integer codes as given by .astype('category').cat.codes (missing values -> -1),
        #but based on the shared categories
        data[name][col] = pd.Categorical(data[name][col], categories=shared_categories).codes
data

{'Real':           id    age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  \
 0      67617  21876       0     154    80.0    130     90            1     0   
 1      96320  16717       1     162    70.0    140     90            0     0   
 2      17571  21128       1     174    92.0    150    100            0     0   
 3      46492  23366       1     173    76.0    120     82            0     0   
 4        945  20281       0     160    60.0    120     80            0     0   
 ...      ...    ...     ...     ...     ...    ...    ...          ...   ...   
 55995  53137  16001       1     170    75.0    150     80            0     0   
 55996   8918  23209       1     162    73.0    160     90            0     0   
 55997  78302  23589       0     169    74.0    120     80            0     0   
 55998   1197  18227       0     167    70.0    120     80            0     0   
 55999  22556  15114       1     177    64.0    120     80            0     0   
 
        smoke  alc

## 2. Normalize data

In [5]:
#Scale the data
#the columns to scale are the int64/float64 columns of the real dataset
#NOTE(review): the category-coded columns are presumably left out only because their codes
#are not int64 - confirm, an explicit list of columns would be less fragile
#NOTE(review): the recorded output shows 'id' among the scaled columns - confirm that an
#identifier should take part in the distance calculations
num_cols = data['Real'].select_dtypes(include=['int64','float64']).columns

#apply scale_data to the selected columns of every dataset (real and synthetic)
scaled_data = {
    name : scale_data(data[name][num_cols])
    for name in DATA_TYPES
}
scaled_data

{'Real':              id       age    height    weight     ap_hi     ap_lo
 0      0.676177  0.857762  0.507692  0.325843  0.019068  0.008182
 1      0.963210  0.458304  0.548718  0.269663  0.019774  0.008182
 2      0.175712  0.799845  0.610256  0.393258  0.020480  0.009091
 3      0.464925  0.973132  0.605128  0.303371  0.018362  0.007455
 4      0.009450  0.734262  0.538462  0.213483  0.018362  0.007273
 ...         ...       ...       ...       ...       ...       ...
 55995  0.531375  0.402865  0.589744  0.297753  0.020480  0.007273
 55996  0.089181  0.960976  0.548718  0.286517  0.021186  0.008182
 55997  0.783028  0.990399  0.584615  0.292135  0.018362  0.007273
 55998  0.011970  0.575223  0.574359  0.269663  0.018362  0.007273
 55999  0.225562  0.334185  0.625641  0.235955  0.018362  0.007273
 
 [56000 rows x 6 columns],
 'GM':              id       age    height    weight     ap_hi     ap_lo
 0      0.035401  0.272422  0.520833  0.276029  0.385762  0.014169
 1      0.086702  0

## 3. Calculate the Euclidean distances between each pair of values

In [6]:
#compare the scaled values of each synthetic dataset with the scaled values of the real
#dataset using pairwise_euclidean_distance; results are keyed by synthesizer name
distances_values = {
    name : pairwise_euclidean_distance(scaled_data[name].values, scaled_data['Real'].values)
    for name in SYNTHESIZERS
}

distances_values

{'GM': '0.5923 ± 0.1652',
 'SDV': '0.8678 ± 0.137',
 'CTGAN': '0.8656 ± 0.144',
 'WGANGP': '0.8992 ± 0.1638'}

## 4. Calculate the Hausdorff distance between synthetic data and real data

In [7]:
#compare the scaled values of each synthetic dataset with the scaled values of the real
#dataset using hausdorff_distance; results are keyed by synthesizer name
hausdorff_values = {
    name : hausdorff_distance(scaled_data[name].values, scaled_data['Real'].values)
    for name in SYNTHESIZERS
}

hausdorff_values

{'GM': 0.6358, 'SDV': 1.0176, 'CTGAN': 0.9339, 'WGANGP': 0.9236}

## 5. Calculate maximum RTS similarity

In [8]:
#compare the scaled values of each synthetic dataset with the scaled values of the real
#dataset using rts_similarity; results are keyed by synthesizer name
str_values = {
    name : rts_similarity(scaled_data[name].values, scaled_data['Real'].values)
    for name in SYNTHESIZERS
}

str_values

{'GM': {'min': 0.1072, 'mean': 0.8684, 'max': 0.9997},
 'SDV': {'min': 0.1361, 'mean': 0.7451, 'max': 0.9941},
 'CTGAN': {'min': 0.1052, 'mean': 0.7282, 'max': 0.9998},
 'WGANGP': {'min': 0.0577, 'mean': 0.6797, 'max': 0.9907}}