In [1]:
import os
os.chdir('../..')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from sklearn.linear_model import LinearRegression
from collections import Iterable
import random
from os import listdir
from os.path import isfile, join
import matplotlib.patches as mpatches

from itertools import combinations

import seaborn as sns
sns.set_style("whitegrid")

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

  


Calculations for stats and plots are based on: https://github.com/stasmix/popsynth/blob/master/pop-synth-vae.ipynb

# Functions

In [3]:
def is_a_DATGAN(name):
    if 'TGAN' in name or 'CTGAN' in name:
        return False
    else:
        return True

def compute_stats(freq_list_orig, freq_list_synth):
    """
    Different statistics computed on the frequency list
    
    """
    freq_list_orig, freq_list_synth = np.array(freq_list_orig), np.array(freq_list_synth)
    corr_mat = np.corrcoef(freq_list_orig, freq_list_synth)
    corr = corr_mat[0, 1]
    if np.isnan(corr): corr = 0.0
    # MAE
    mae = np.absolute(freq_list_orig - freq_list_synth).mean()
    # RMSE
    rmse = np.linalg.norm(freq_list_orig - freq_list_synth) / np.sqrt(len(freq_list_orig))
    # SRMSE
    freq_list_orig_avg = freq_list_orig.mean()
    srmse = rmse / freq_list_orig_avg
    # r-square
    u = np.sum((freq_list_synth - freq_list_orig)**2)
    v = np.sum((freq_list_orig - freq_list_orig_avg)**2)
    r2 = 1.0 - u / v
    stat = {'mae': mae, 'rmse': rmse, 'r2': r2, 'srmse': srmse, 'corr': corr}
    
    return stat

# Load the files

In [4]:
dataset = 'LPMC'

input_folder = '../synth_data/{}/'.format(dataset)

files_ = {}
models = []

for f in listdir(input_folder):
    if isfile(join(input_folder, f)):
        m = f.split('.')[0]
        models.append(m)
        files_[m] = join(input_folder, f)

In [5]:
df_orig = pd.read_csv('../data/' + dataset + '/data.csv')

In [6]:
if dataset is 'Chicago':
    continuous_cols = ['distance', 'age', 'departure_time']
elif dataset is 'LPMC':
    continuous_cols = ['start_time_linear', 'age', 'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access', 'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'dur_driving', 'cost_transit', 'cost_driving_fuel', 'driving_traffic_percent']

In [7]:
bins_cont = {}

for c in continuous_cols:
    #bins_cont[c] = pd.qcut(df_orig[c], q=10, retbins=True)[1]
    bins_cont[c] = pd.cut(df_orig[c], bins=10, retbins=True)[1]
    bins_cont[c][0] = -np.inf
    bins_cont[c][-1] = np.inf
    df_orig[c] = pd.cut(df_orig[c], bins=bins_cont[c])

In [8]:
df_orig.head()

Unnamed: 0,travel_mode,purpose,fueltype,faretype,bus_scale,survey_year,travel_year,travel_month,travel_date,day_of_week,...,dur_pt_access,dur_pt_rail,dur_pt_bus,dur_pt_int,pt_n_interchanges,dur_driving,cost_transit,cost_driving_fuel,cost_driving_con_charge,driving_traffic_percent
0,drive,HBO,Petrol_Car,child,0.0,1,2012,4,1,7,...,"(0.106, 0.212]","(-inf, 0.137]","(-inf, 0.215]","(-inf, 0.0567]",0,"(-inf, 0.183]","(-inf, 1.17]","(-inf, 1.027]",0.0,"(0.104, 0.208]"
1,drive,HBO,Petrol_Car,free,0.0,1,2012,4,1,7,...,"(0.212, 0.318]","(-inf, 0.137]","(-inf, 0.215]","(-inf, 0.0567]",0,"(-inf, 0.183]","(-inf, 1.17]","(-inf, 1.027]",0.0,"(-inf, 0.104]"
2,drive,HBO,Petrol_Car,full,1.0,1,2012,4,1,7,...,"(0.212, 0.318]","(-inf, 0.137]","(0.859, 1.074]","(0.0567, 0.113]",1,"(0.362, 0.54]","(2.34, 3.51]","(1.027, 2.034]",0.0,"(0.313, 0.417]"
3,pt,HBW,Average_Car,full,1.0,1,2012,4,1,7,...,"(0.106, 0.212]","(-inf, 0.137]","(-inf, 0.215]","(0.0567, 0.113]",1,"(-inf, 0.183]","(2.34, 3.51]","(-inf, 1.027]",0.0,"(-inf, 0.104]"
4,pt,HBO,Average_Car,free,0.0,1,2012,4,1,7,...,"(0.106, 0.212]","(-inf, 0.137]","(0.215, 0.429]","(-inf, 0.0567]",0,"(0.183, 0.362]","(-inf, 1.17]","(-inf, 1.027]",0.0,"(-inf, 0.104]"


In [9]:
stats_str = ['mae', 'rmse', 'r2', 'srmse', 'corr']
orig_str = 'random-original'

# Stats per individual column

In [10]:
all_stats = {}

# Go through each model
for i, m in enumerate(models):
    
    print("Preparing stats for model \033[1m{}\033[0m ({}/{})".format(m, i+1, len(models)))
    
    all_stats[m] = {}
    
    # Load all dataframes for current model
    df = pd.read_csv(files_[m])
       
    # Discretize continuous columns
    for c in continuous_cols:
        df[c] = pd.cut(df[c], bins=bins_cont[c])

    # Go through each columns
    for c in df_orig.columns:

        agg_vars = [c]

        real = df_orig.copy()
        real['count'] = 1
        real = real.groupby(agg_vars, observed=True).count()
        real /= len(df_orig)

        synth = df.copy()
        synth['count'] = 1
        synth = synth.groupby(agg_vars, observed=True).count()
        synth /= len(df)

        real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer', indicator=True)
        real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

        sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])
        
        all_stats[m][c] = sts

Preparing stats for model [1mCTGAN[0m (1/29)
Preparing stats for model [1mSGAN_OR_BO[0m (2/29)
Preparing stats for model [1mSGAN_OR_NO[0m (3/29)
Preparing stats for model [1mSGAN_OR_OD[0m (4/29)
Preparing stats for model [1mSGAN_WI_BO[0m (5/29)
Preparing stats for model [1mSGAN_WI_NO[0m (6/29)
Preparing stats for model [1mSGAN_WI_OD[0m (7/29)
Preparing stats for model [1mSGAN_WO_BO[0m (8/29)
Preparing stats for model [1mSGAN_WO_NO[0m (9/29)
Preparing stats for model [1mSGAN_WO_OD[0m (10/29)
Preparing stats for model [1mTGAN[0m (11/29)
Preparing stats for model [1mWGAN_OR_BO[0m (12/29)
Preparing stats for model [1mWGAN_OR_NO[0m (13/29)
Preparing stats for model [1mWGAN_OR_OD[0m (14/29)
Preparing stats for model [1mWGAN_WI_BO[0m (15/29)
Preparing stats for model [1mWGAN_WI_NO[0m (16/29)
Preparing stats for model [1mWGAN_WI_OD[0m (17/29)
Preparing stats for model [1mWGAN_WO_BO[0m (18/29)
Preparing stats for model [1mWGAN_WO_NO[0m (19/29)
Preparing sta

In [11]:
stats_orig = {}

train = df_orig.sample(int(len(df_orig) * 0.5))
train.index = range(len(train))
test = df_orig[~df_orig.index.isin(train.index)]
test.index = range(len(test))

# Go through each columns
for c in df_orig.columns:

    agg_vars = [c]

    real = train.copy()
    real['count'] = 1
    real = real.groupby(agg_vars, observed=True).count()
    real /= len(df_orig)

    synth = test.copy()
    synth['count'] = 1
    synth = synth.groupby(agg_vars, observed=True).count()
    synth /= len(df)

    real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer', indicator=True)
    real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

    sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])
    
    stats_orig[c] = sts

In [12]:
all_stats[orig_str] = stats_orig

In [13]:
res = {}

for test in ['all', 'cont', 'cat']:
    
    res[test] = {}
    
    if test == 'all':
        cols = df_orig.columns
    elif test == 'cont':
        cols = continuous_cols
    elif test == 'cat':
        cols = set(df_orig.columns) - set(continuous_cols)
        
    for s in stats_str:
        res[test][s] = {}

    for m in all_stats.keys():

        for s in stats_str:
            
            tmp = []
            for c in cols:
                tmp.append(all_stats[m][c][s])
            
            res[test][s][m] = np.mean(tmp)

In [14]:
for test in ['all', 'cont', 'cat']:
    
    if test == 'all':
        str_ = 'on all columns'
    elif test == 'cont':
        str_ = 'on continuous columns'
    elif test == 'cat':
        str_ = 'on categorical columns'
        
    for s in ['srmse']:#stats_str:
        print('Ranking {} based on {}:'.format(str_, s.upper()))

        if s in ['r2', 'corr']:
            sorted_dct = {k: v for k, v in sorted(res[test][s].items(), key=lambda item: item[1])[::-1]}
        else:
            sorted_dct = {k: v for k, v in sorted(res[test][s].items(), key=lambda item: item[1])}

        for i, item in enumerate(sorted_dct):
            print('  {:>2}. {:<20} - {:.2e}'.format(i+1, item, sorted_dct[item]))
        print()


Ranking on all columns based on SRMSE:
   1. WGGP_WI_NO           - 8.25e-02
   2. random-original      - 9.67e-02
   3. WGGP_WI_OD           - 9.70e-02
   4. WGAN_WI_NO           - 1.03e-01
   5. WGGP_WO_NO           - 1.04e-01
   6. WGGP_WI_BO           - 1.04e-01
   7. WGGP_WO_OD           - 1.21e-01
   8. WGGP_OR_OD           - 1.22e-01
   9. WGGP_WO_BO           - 1.24e-01
  10. WGGP_OR_BO           - 1.27e-01
  11. WGGP_OR_NO           - 1.51e-01
  12. TGAN                 - 1.52e-01
  13. WGAN_WO_NO           - 1.72e-01
  14. SGAN_WI_NO           - 1.81e-01
  15. CTGAN                - 2.03e-01
  16. WGAN_WI_OD           - 2.15e-01
  17. WGAN_OR_NO           - 2.31e-01
  18. WGAN_OR_OD           - 2.32e-01
  19. WGAN_OR_BO           - 2.66e-01
  20. SGAN_WI_OD           - 2.66e-01
  21. WGAN_WO_OD           - 2.84e-01
  22. WGAN_WI_BO           - 2.95e-01
  23. SGAN_OR_NO           - 3.17e-01
  24. SGAN_WI_BO           - 3.55e-01
  25. SGAN_OR_OD           - 3.97e-01
  26. WGAN_

# Stats per couple columns

In [28]:
combs = []

for k in combinations(df_orig.columns, 2):
    combs.append(k[0] + '::' + k[1])
    
print('There are {} combinations!'.format(len(combs)))

There are 378 combinations!


In [16]:
all_stats = {}

# Go through each model
for i, m in enumerate(models):
    
    print("Preparing stats for model \033[1m{}\033[0m ({}/{})".format(m, i+1, len(models)))
    
    all_stats[m] = {}
    
    # Load all dataframes for current model
    df = pd.read_csv(files_[m])
       
    # Discretize continuous columns
    for c in continuous_cols:
        df[c] = pd.cut(df[c], bins=bins_cont[c])

    # Go through each columns
    for c in combs:

        agg_vars = c.split('::')

        real = df_orig.copy()
        real['count'] = 1
        real = real.groupby(agg_vars, observed=True).count()
        real /= len(df_orig)

        synth = df.copy()
        synth['count'] = 1
        synth = synth.groupby(agg_vars, observed=True).count()
        synth /= len(df)

        real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer', indicator=True)
        real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

        sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])
        
        all_stats[m][c] = sts

Preparing stats for model [1mCTGAN[0m (1/29)
Preparing stats for model [1mSGAN_OR_BO[0m (2/29)
Preparing stats for model [1mSGAN_OR_NO[0m (3/29)
Preparing stats for model [1mSGAN_OR_OD[0m (4/29)
Preparing stats for model [1mSGAN_WI_BO[0m (5/29)
Preparing stats for model [1mSGAN_WI_NO[0m (6/29)
Preparing stats for model [1mSGAN_WI_OD[0m (7/29)
Preparing stats for model [1mSGAN_WO_BO[0m (8/29)
Preparing stats for model [1mSGAN_WO_NO[0m (9/29)
Preparing stats for model [1mSGAN_WO_OD[0m (10/29)
Preparing stats for model [1mTGAN[0m (11/29)
Preparing stats for model [1mWGAN_OR_BO[0m (12/29)
Preparing stats for model [1mWGAN_OR_NO[0m (13/29)
Preparing stats for model [1mWGAN_OR_OD[0m (14/29)
Preparing stats for model [1mWGAN_WI_BO[0m (15/29)
Preparing stats for model [1mWGAN_WI_NO[0m (16/29)
Preparing stats for model [1mWGAN_WI_OD[0m (17/29)
Preparing stats for model [1mWGAN_WO_BO[0m (18/29)
Preparing stats for model [1mWGAN_WO_NO[0m (19/29)
Preparing sta

In [17]:
stats_orig = {}

train = df_orig.sample(int(len(df_orig) * 0.5))
train.index = range(len(train))
test = df_orig[~df_orig.index.isin(train.index)]
test.index = range(len(test))

# Go through each columns
for c in combs:

    agg_vars = c.split('::')

    real = train.copy()
    real['count'] = 1
    real = real.groupby(agg_vars, observed=True).count()
    real /= len(df_orig)

    synth = test.copy()
    synth['count'] = 1
    synth = synth.groupby(agg_vars, observed=True).count()
    synth /= len(df)

    real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer', indicator=True)
    real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

    sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])
    
    stats_orig[c] = sts

In [18]:
all_stats[orig_str] = stats_orig

In [25]:
res = {}

for s in stats_str:
    res[s] = {}
                    
for m in all_stats.keys():

    for s in stats_str:

        tmp = []
        for c in combs:
            tmp.append(all_stats[m][c][s])

        res[s][m] = np.mean(tmp)

In [32]:
for s in ['srmse']:#stats_str:
    print('Ranking on all coupled combinations based on {}:'.format(s.upper()))

    if s in ['r2', 'corr']:
        sorted_dct = {k: v for k, v in sorted(res[s].items(), key=lambda item: item[1])[::-1]}
    else:
        sorted_dct = {k: v for k, v in sorted(res[s].items(), key=lambda item: item[1])}

    for i, item in enumerate(sorted_dct):
        print('  {:>2}. {:<20} - {:.2e}'.format(i+1, item, sorted_dct[item]))
    print()

Ranking on all coupled combinations based on MAE:
   1. random-original      - 3.60e-03
   2. WGGP_WI_NO           - 3.87e-03
   3. WGGP_WI_OD           - 4.03e-03
   4. WGGP_WI_BO           - 4.27e-03
   5. WGGP_WO_NO           - 4.63e-03
   6. TGAN                 - 5.09e-03
   7. WGGP_WO_OD           - 5.11e-03
   8. WGGP_OR_OD           - 5.15e-03
   9. WGGP_WO_BO           - 5.24e-03
  10. WGGP_OR_BO           - 5.33e-03
  11. WGAN_WI_NO           - 5.78e-03
  12. SGAN_WI_NO           - 6.72e-03
  13. WGAN_WO_NO           - 7.13e-03
  14. CTGAN                - 7.86e-03
  15. WGGP_OR_NO           - 7.97e-03
  16. WGAN_WI_OD           - 9.60e-03
  17. WGAN_OR_OD           - 9.88e-03
  18. WGAN_OR_BO           - 1.01e-02
  19. WGAN_OR_NO           - 1.05e-02
  20. SGAN_WI_OD           - 1.06e-02
  21. WGAN_WI_BO           - 1.09e-02
  22. SGAN_WI_BO           - 1.18e-02
  23. WGAN_WO_OD           - 1.26e-02
  24. SGAN_OR_NO           - 1.28e-02
  25. WGAN_WO_BO           - 1.48e-02
