In [1]:
import os
# Move the working directory two levels up (presumably to the project root, so
# that `modules.table_evaluator` and the relative paths used later resolve --
# confirm). NOTE: the path is relative, so re-running this cell without
# restarting the kernel moves up two more levels each time.
os.chdir('../..')

In [2]:
from modules.table_evaluator import load_data, TableEvaluator
# From https://github.com/Baukebrenninkmeijer/table-evaluator

from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
import numpy as np

import pandas as pd

import pickle

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Functions

In [3]:
def is_a_DATGAN(name):
    """Tell whether a model name should be treated as a DATGAN variant.

    A name counts as a DATGAN unless it contains one of the tags listed
    below. Matching is by substring, so any name that merely contains a tag
    (for example 'DATGAN' itself, which contains 'TGAN') is rejected too.

    Parameters
    ----------
    name : str
        Model name to classify.

    Returns
    -------
    bool
        False when one of the tags occurs in `name`, True otherwise.
    """
    non_datgan_tags = ('TGAN', 'CTGAN', 'FULL', 'TRANSRED', 'LINEAR', 'NOLINKS', 'PREDICTION')

    for tag in non_datgan_tags:
        if tag in name:
            return False

    return True
    
def check_low_appearing_vars(df):
    """Print the rarely occurring categories of every low-cardinality column.

    For each column with fewer than 20 distinct values (the column named
    'choice' is skipped), every value that covers less than 1% of the rows is
    printed together with its share and its absolute count.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect. It is not modified.

    Returns
    -------
    None
        The report is written to stdout.
    """
    n_rows = len(df)

    for c in df.columns:
        counts = df[c].value_counts()

        # Columns with 20 or more distinct values are not reported.
        if len(counts) >= 20:
            continue

        shares = counts / n_rows
        if c == 'choice' or not (shares < 0.01).any():
            continue

        print('Variable {}: '.format(c))
        for idx, share, count in zip(counts.index, shares, counts):
            if share < 0.01:
                # Use the integer count from value_counts() directly: rebuilding
                # it as int(share * n_rows) can truncate to count - 1 because
                # of floating-point rounding.
                print('  {} - {:.2f}% ({:d})'.format(idx, 100*share, int(count)))
        print()
                
def _cap_column_values(df, column, threshold):
    """Relabel `df[column]` in place: values >= threshold become '<threshold>+', the rest str(value)."""
    mapping = {}
    for value in df[column].unique():
        if isinstance(value, str):
            # Already a label (e.g. the frame went through this function
            # before); comparing it with an int would raise, so leave it as is.
            continue
        mapping[value] = '{}+'.format(threshold) if value >= threshold else str(value)

    if mapping:
        # Assign the result back instead of `df[column].replace(..., inplace=True)`:
        # the chained in-place form acts on a temporary object, emits a
        # FutureWarning in recent pandas and has no effect under Copy-on-Write.
        df[column] = df[column].replace(mapping)


def replace_low_appearing_values(df, dataset):
    """Merge the high values of selected count columns into one open-ended label.

    The columns are converted to string labels in `df` itself:

    * dataset name containing 'Chicago': 'hh_vehicles' (5 and above -> '5+'),
      'hh_size' (6 and above -> '6+'), 'hh_bikes' (6 and above -> '6+');
    * otherwise, dataset name containing 'LPMC': 'pt_n_interchanges'
      (2 and above -> '2+').

    Any other dataset name leaves `df` untouched. Calling the function again
    on an already converted frame is a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to modify in place; it must contain the columns listed above
        for the selected dataset.
    dataset : str
        Dataset name; only tested for the substrings 'Chicago' and 'LPMC'.

    Returns
    -------
    None
    """
    if 'Chicago' in dataset:
        _cap_column_values(df, 'hh_vehicles', 5)
        _cap_column_values(df, 'hh_size', 6)
        _cap_column_values(df, 'hh_bikes', 6)

    elif 'LPMC' in dataset:
        _cap_column_values(df, 'pt_n_interchanges', 2)

        # NOTE(review): this mapping is built but never applied to any column,
        # so it currently has no effect. It looks like a fuel-type merge whose
        # `replace` call is missing -- confirm the target column before wiring
        # it in.
        dct_ = {
            'Diesel_LGV': 'LGV',
            'Petrol_LGV': 'LGV',
            'Hybrid_Car': 'Average_Car'
        }

In [4]:
# --- Run configuration ------------------------------------------------------
dataset = 'Chicago'
orig_str = 'random-original'
input_folder = '../synth_data/{}/'.format(dataset)
n_models = 5
n_data = 5

# Models for testing all DATGANS: the two baselines plus every combination of
# the three name parts below, joined with underscores.
models = ['CTGAN', 'TGAN']
models.extend(
    '{}_{}_{}'.format(a, b, c)
    for a in ['WGAN', 'SGAN', 'WGGP']
    for b in ['WI', 'OR', 'WO']
    for c in ['NO', 'BO', 'OD']
)

# Models for testing different DAGs
#models = ['FULL', 'TRANSRED', 'LINEAR', 'NOLINKS', 'PREDICTION']

models.sort()

# One (model number, data number) pair per expected CSV file, both 1-based,
# with the data number varying fastest.
run_ids = [(i + 1, j + 1) for i in range(n_models) for j in range(n_data)]

# files_ maps each model name to the list of its n_models * n_data CSV paths.
files_ = {}

for m in models:
    if is_a_DATGAN(m):
        # DATGAN file names carry the model number between the second and the
        # third part of the model name.
        spl = m.split('_')
        names = ['{}_{}_{:0>2}_{}_{:0>2}.csv'.format(spl[0], spl[1], a, spl[2], b) for a, b in run_ids]
    else:
        names = ['{}_{:0>2}_{:0>2}.csv'.format(m, a, b) for a, b in run_ids]

    files_[m] = [input_folder + name for name in names]

In [5]:
# Columns handled as continuous for the selected dataset. Note that
# `continuous_columns` is left undefined when `dataset` contains neither
# 'Chicago' nor 'LPMC'.
if 'Chicago' in dataset:
    continuous_columns = [
        "distance",
        "age",
        "departure_time",
    ]
elif 'LPMC' in dataset:
    continuous_columns = [
        'start_time_linear',
        'age',
        'distance',
        'dur_walking',
        'dur_cycling',
        'dur_pt_access',
        'dur_pt_rail',
        'dur_pt_bus',
        'dur_pt_int',
        'dur_driving',
        'cost_transit',
        'cost_driving_fuel',
        'driving_traffic_percent',
    ]

In [6]:
# Load the original dataset. Only the part of `dataset` before the first '_'
# is used to pick the folder under '../data/'.
df_orig = pd.read_csv('../data/' + dataset.split('_')[0] + '/data.csv')

In [7]:
# Called for its effect on `df_orig` (the function returns nothing): the
# dataset-specific count columns are relabelled inside the frame itself.
replace_low_appearing_values(df_orig, dataset)

In [8]:
# Print, for every column with fewer than 20 distinct values (except 'choice'),
# the categories that cover less than 1% of the rows.
check_low_appearing_vars(df_orig)

In [9]:
# Every column that is not listed as continuous is passed on as categorical.
# The list is built in the DataFrame's column order so that it is identical on
# every run; a set difference yields an order that can change between kernel
# sessions because string hashing is randomised per process.
cat_cols = [c for c in df_orig.columns if c not in continuous_columns]

In [10]:
# Location of the pickle file that caches evaluation results between runs.
filepath = './notebooks/results/{}/'.format(dataset)
filename = 'te_results.pickle'
te_results = {}
n = 7000  # NOTE(review): not used in any of the cells visible here -- confirm it is still needed

# Column later passed to the evaluator as `target_col`.
if 'Chicago' in dataset:
    target_col = 'choice'
elif 'LPMC' in dataset:
    target_col = 'mode_choice'

# SECURITY NOTE: pickle.load can execute arbitrary code. Only load cache files
# that were written by this notebook.
try:
    # `with` closes the file handle even when unpickling fails.
    with open(f'{filepath}{filename}', 'rb') as cache_file:
        te_results = pickle.load(cache_file)
    print('Found previous pickle file, using that')
except FileNotFoundError:
    # First run for this dataset: make sure the results folder exists so the
    # cache can be written later.
    print('No previous results found, starting fresh')
    os.makedirs(filepath, exist_ok=True)
except (EOFError, pickle.UnpicklingError) as err:
    # A truncated or corrupt cache used to be treated silently as "no cache";
    # still start fresh, but say why, because the file will be overwritten.
    te_results = {}
    print('Previous results file could not be read ({}), starting fresh'.format(err))

Found previous pickel file, using that


In [None]:
# Evaluate every synthetic file of every model against the original data.
# Results are appended to te_results[model] and the whole dictionary is saved
# after each file, so an interrupted run resumes at the first unprocessed file.
for i, model in enumerate(models):

    if model in te_results:
        print("Results for model \033[1m{}\033[0m ({}/{}) already exists".format(model, i+1, len(models)))
    else:
        print("Getting results for model \033[1m{}\033[0m ({}/{})".format(model, i+1, len(models)))
        te_results[model] = []

    # Files are processed in list order, so the number of stored results is
    # also the index of the next file to process.
    n_files_done = len(te_results[model])

    for j, f in enumerate(files_[model][n_files_done:]):
        print("  Processing file {}/{}".format(j+1+n_files_done, len(files_[model])) + " "*20)

        tmp_df = pd.read_csv(f)
        replace_low_appearing_values(tmp_df, dataset)

        te = TableEvaluator(tmp_df, df_orig, cat_cols, verbose=False)

        res = te.evaluate(target_col=target_col, kfold=True)

        te_results[model].append(res.content.to_dict()['result'])

        # Write to a temporary file and swap it in afterwards, so interrupting
        # the kernel in the middle of a dump cannot leave a truncated cache
        # behind. The `with` block also closes the file handle.
        tmp_path = filepath + filename + '.tmp'
        with open(tmp_path, 'wb') as cache_file:
            pickle.dump(te_results, cache_file)
        os.replace(tmp_path, filepath + filename)

Results for model [1mCTGAN[0m (1/29) already exists
  Processing file 2/25                    
  Processing file 3/25                    
  Processing file 4/25                    
  Processing file 5/25                    
  Processing file 6/25                    
  Processing file 7/25                    
  Processing file 8/25                    
  Processing file 9/25                    
  Processing file 10/25                    
  Processing file 11/25                    
  Processing file 12/25                    
  Processing file 13/25                    
  Processing file 14/25                    
  Processing file 15/25                    
  Processing file 16/25                    
  Processing file 17/25                    
  Processing file 18/25                    
  Processing file 19/25                    
  Processing file 20/25                    
  Processing file 21/25                    
  Processing file 22/25                    
  Processing file 23/25       

  Processing file 6/25                    
  Processing file 7/25                    
  Processing file 8/25                    
  Processing file 9/25                    
  Processing file 10/25                    
  Processing file 11/25                    
  Processing file 12/25                    
  Processing file 13/25                    
  Processing file 14/25                    
  Processing file 15/25                    
  Processing file 16/25                    
  Processing file 17/25                    
  Processing file 18/25                    
  Processing file 19/25                    
  Processing file 20/25                    
  Processing file 21/25                    
  Processing file 22/25                    
  Processing file 23/25                    
  Processing file 24/25                    
  Processing file 25/25                    
Getting results for model [1mSGAN_WO_NO[0m (9/29)
  Processing file 1/25                    
  Processing file 2/25       

In [None]:
# Baseline: evaluate one random half of the original data against the other
# half, n_models * n_data times. Results are stored under te_results[orig_str].
n_random_runs = n_models*n_data

if orig_str not in te_results:
    te_results[orig_str] = []

# Start after the runs that are already stored, so an interrupted baseline is
# completed instead of being left short.
for i in range(len(te_results[orig_str]), n_random_runs):
    print("Processing random dataset {}/{}".format(i+1, n_random_runs) + " "*20)

    train = df_orig.sample(int(len(df_orig) * 0.5))
    # Take the complement while `train` still carries the original row labels.
    # Resetting train's index first would compare against 0..len(train)-1 and
    # make `test` overlap with `train`.
    test = df_orig.loc[~df_orig.index.isin(train.index)]

    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    te = TableEvaluator(test, train, cat_cols, verbose=False)

    res = te.evaluate(target_col=target_col, kfold=True)

    te_results[orig_str].append(res.content.to_dict()['result'])

    # Write to a temporary file and swap it in afterwards, so an interrupted
    # dump cannot corrupt the cache; `with` closes the file handle.
    tmp_path = filepath + filename + '.tmp'
    with open(tmp_path, 'wb') as cache_file:
        pickle.dump(te_results, cache_file)
    os.replace(tmp_path, filepath + filename)

In [None]:
# Metric names: the keys of the first stored result of the random-original
# baseline. The cells below look these keys up in every model's results.
keys = te_results[orig_str][0].keys()

In [None]:
# Aggregate the stored results: res[metric][model] = {'mean': ..., 'std': ...}.
res = {}

# At most this many runs per model enter the statistics.
max_runs = n_models*n_data

for k in keys:
    res[k] = {}

    for m, runs in te_results.items():

        # Slice instead of indexing range(max_runs): a model whose evaluation
        # was interrupted has fewer stored runs, and indexing would raise
        # IndexError.
        tmp = [run[k] for run in runs[:max_runs]]

        if not tmp:
            # Nothing stored for this model yet; mean/std of an empty list
            # would only produce NaN and a warning.
            continue

        res[k][m] = {
            'mean': np.mean(tmp),
            'std': np.std(tmp)
        }

In [None]:
# Print one ranking per metric, ordered from the highest mean to the lowest.
for s in keys:
    print('Ranking on "{}":'.format(s))

    # Sort ascending by mean, then reverse the sorted sequence.
    ascending = sorted(res[s].items(), key=lambda item: item[1]['mean'])
    sorted_dct = dict(reversed(ascending))

    for position, (name, stats) in enumerate(sorted_dct.items(), start=1):
        print('  {:>2}. {:<20} - {:.2e} ± {:.2e}'.format(position, name, stats['mean'], stats['std']))
    print()
