# Similarity Evaluation Analysis (SEA) Dataset F

In [1]:
#import libraries
import warnings
#install a global "ignore" filter: warnings raised from here on are not shown in the cell outputs
#NOTE(review): this also hides deprecation notices from pandas/numpy - consider narrowing the filter
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os
#confirmation message so the reader can see that the cell finished
print('Libraries imported!!')

Libraries imported!!


In [2]:
#define directory of functions and actual directory
HOME_PATH = '' #home path of the project
FUNCTIONS_DIR = "EVALUATION FUNCTIONS/PRIVACY"
ACTUAL_DIR = os.getcwd()

#temporarily move into the functions directory so that the local module
#similarity_evaluation can be imported (relies on the current directory being on the import path)
os.chdir(HOME_PATH + FUNCTIONS_DIR)
try :
    #import the functions used for the similarity evaluation analysis
    from similarity_evaluation import scale_data
    from similarity_evaluation import pairwise_euclidean_distance
    from similarity_evaluation import hausdorff_distance
    from similarity_evaluation import rts_similarity
finally :
    #always return to the original directory, even when an import fails; otherwise the kernel
    #would stay inside the functions directory and re-running this cell would capture the wrong ACTUAL_DIR
    os.chdir(ACTUAL_DIR)
print('Functions imported!!')

Functions imported!!


## 1. Read real and synthetic datasets
In this section, the real dataset and the synthetic datasets generated by each synthesizer are read.

In [3]:
#Global settings shared by the cells of this notebook
#names of the synthetic datasets that are compared against the real one
SYNTHESIZERS = ['GM','SDV','CTGAN','WGANGP']
#every dataset handled by the notebook: the real one followed by the synthetic ones
DATA_TYPES = ['Real'] + SYNTHESIZERS
#CSV file of each dataset, keyed by data type
FILEPATHS = dict([
    ('Real', HOME_PATH + 'REAL DATASETS/TRAIN DATASETS/F_IndianLiverPatient_Real_Train.csv'),
    ('GM', HOME_PATH + 'SYNTHETIC DATASETS/GM/F_IndianLiverPatient_Synthetic_GM.csv'),
    ('SDV', HOME_PATH + 'SYNTHETIC DATASETS/SDV/F_IndianLiverPatient_Synthetic_SDV.csv'),
    ('CTGAN', HOME_PATH + 'SYNTHETIC DATASETS/CTGAN/F_IndianLiverPatient_Synthetic_CTGAN.csv'),
    ('WGANGP', HOME_PATH + 'SYNTHETIC DATASETS/WGANGP/F_IndianLiverPatient_Synthetic_WGANGP.csv'),
])
#columns that are converted to integer category codes once the datasets are read
categorical_columns = ['gender','class']
#container that receives one DataFrame per data type
data = {}

In [4]:
#iterate over all datasets filepaths and read each dataset
for name, path in FILEPATHS.items() :
    data[name] = pd.read_csv(path)

#encode the categorical columns with ONE category -> code mapping per column, shared by all datasets.
#Encoding each dataset on its own would assign codes from the categories observed in that dataset only,
#so the same code could stand for different categories in the real and in a synthetic dataset
#(e.g. when a synthesizer never generated one of the categories), which would distort the distances.
for col in categorical_columns :
    #categories observed in any of the datasets, ordered exactly as pandas orders them
    all_values = pd.concat([data[name][col] for name in FILEPATHS], ignore_index=True)
    shared_dtype = pd.CategoricalDtype(categories=all_values.astype('category').cat.categories)
    for name in FILEPATHS :
        #missing values are encoded as -1, as with a plain .astype('category')
        data[name][col] = data[name][col].astype(shared_dtype).cat.codes
data

{'Real':      age  gender   TB   DB  alkphos  sgpt   sgot   TP  ALB   A_G  class
 0     68       0  0.6  0.1   1620.0  95.0  127.0  4.6  2.1  0.80      0
 1     31       1  1.3  0.5    184.0  29.0   32.0  6.8  3.4  1.00      0
 2     28       1  0.8  0.3    190.0  20.0   14.0  4.1  2.4  1.40      0
 3     60       1  2.3  0.6    272.0  79.0   51.0  6.6  3.5  1.10      0
 4     48       0  0.9  0.2    173.0  26.0   27.0  6.2  3.1  1.00      0
 ..   ...     ...  ...  ...      ...   ...    ...  ...  ...   ...    ...
 461   75       0  0.8  0.2    188.0  20.0   29.0  4.4  1.8  0.60      0
 462   36       1  5.3  2.3    145.0  32.0   92.0  5.1  2.6  1.00      1
 463   37       1  0.7  0.2    235.0  96.0   54.0  9.5  4.9  1.00      0
 464   17       0  0.5  0.1    206.0  28.0   21.0  7.1  4.5  1.70      1
 465   17       1  0.9  0.2    224.0  36.0   45.0  6.9  4.2  1.55      0
 
 [466 rows x 11 columns],
 'GM':      age  gender        TB        DB     alkphos        sgpt        sgot  \
 0   

## 2. Normalize data

In [5]:
#Scale every dataset (real and synthetic) with the imported scale_data helper
#NOTE(review): each dataset is passed to scale_data separately - confirm that the helper
#uses a common scale for all of them, otherwise the distances below are not comparable
#names of the integer/float columns of the real dataset (not used by the cells shown in this notebook)
num_cols = data['Real'].select_dtypes(include=['int64','float64']).columns
scaled_data = {name : scale_data(data[name]) for name in DATA_TYPES}
scaled_data

{'Real':           age  gender        TB        DB   alkphos      sgpt      sgot  \
 0    0.790123     0.0  0.002681  0.000000  0.760625  0.042714  0.023587   
 1    0.333333     1.0  0.012064  0.020408  0.059111  0.009548  0.004270   
 2    0.296296     1.0  0.005362  0.010204  0.062042  0.005025  0.000610   
 3    0.691358     1.0  0.025469  0.025510  0.102101  0.034673  0.008133   
 4    0.543210     0.0  0.006702  0.005102  0.053737  0.008040  0.003253   
 ..        ...     ...       ...       ...       ...       ...       ...   
 461  0.876543     0.0  0.005362  0.005102  0.061065  0.005025  0.003660   
 462  0.395062     1.0  0.065684  0.112245  0.040059  0.011055  0.016470   
 463  0.407407     1.0  0.004021  0.005102  0.084025  0.043216  0.008743   
 464  0.160494     0.0  0.001340  0.000000  0.069858  0.009045  0.002033   
 465  0.160494     1.0  0.006702  0.005102  0.078652  0.013065  0.006913   
 
            TP       ALB       A_G  class  
 0    0.275362  0.260870  0.999980

## 3. Calculate the pairwise Euclidean distances between synthetic data and real data

In [6]:
#Euclidean distance summary of each synthetic dataset against the real dataset.
#The helper returns a 'value ± value' string (presumably mean ± standard deviation - confirm in similarity_evaluation)
distances_values = {
    name : pairwise_euclidean_distance(scaled_data[name].values, scaled_data['Real'].values)
    for name in SYNTHESIZERS
}
distances_values

{'GM': '1.1111 ± 0.3394',
 'SDV': '1.3592 ± 0.322',
 'CTGAN': '1.2628 ± 0.3276',
 'WGANGP': '1.6966 ± 0.2777'}

## 4. Calculate the Hausdorff distance between synthetic data and real data

In [7]:
#Hausdorff distance of each synthetic dataset against the real dataset (one number per synthesizer)
hausdorff_values = {
    name : hausdorff_distance(scaled_data[name].values, scaled_data['Real'].values)
    for name in SYNTHESIZERS
}
hausdorff_values

{'GM': 1.4503, 'SDV': 1.4792, 'CTGAN': 1.2404, 'WGANGP': 1.6209}

## 5. Calculate RTS similarity (minimum, mean and maximum)

In [8]:
#RTS similarity of each synthetic dataset against the real dataset;
#the helper returns a dictionary with the 'min', 'mean' and 'max' similarity values
str_values = {
    name : rts_similarity(scaled_data[name].values, scaled_data['Real'].values)
    for name in SYNTHESIZERS
}
str_values

{'GM': {'min': 0.1653, 'mean': 0.7587, 'max': 0.9989},
 'SDV': {'min': 0.09, 'mean': 0.643, 'max': 0.9902},
 'CTGAN': {'min': 0.0712, 'mean': 0.6899, 'max': 0.9959},
 'WGANGP': {'min': 0.0882, 'mean': 0.5332, 'max': 0.9236}}