In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import pandas as pd
import seaborn as sns
import pickle
from pandas import ExcelWriter
from pandas import ExcelFile
import sample_utils

%matplotlib inline

# Import Data

In [2]:
"""

<< TRAIN DATA >>

FOR GLOBAL USAGE

train_Y_mean : (output_dim,)
train_Y_std  : (output_dim,)
train_Y_min  : (output_dim,)
train_Y_max  : (output_dim,)

"""
# 'rdfwfv_wfv_rdf_train2020_RDFWFV_20201222_V10.xlsx'

# Training workbook name; the cached array lives next to it with a '.npy' suffix.
file_path = 'rdfwfv_wfv_rdf_train2020_RDFWFV_20201222_V10.xlsx'

num_of_cycle = 200
num_in_cycle = 50

data = np.load('../data_handler/'+file_path+'.npy', allow_pickle=True)
# The archive holds four arrays: all X rows, all Y rows, and the per-cycle X / Y.
X_all, Y_all, X_per_cycle, Y_per_cycle = (data[idx] for idx in range(4))
print(X_all.shape, Y_all.shape, X_per_cycle.shape, Y_per_cycle.shape)
print()

# Statistics are taken over the first num_in_cycle * num_of_cycle rows only;
# slice once and reuse it for every reduction.
_train_Y = Y_all[:num_in_cycle*num_of_cycle]

train_Y_mean = np.mean(_train_Y, axis=0, dtype=np.float32)
train_Y_std = np.std(_train_Y, axis=0, dtype=np.float32)

train_Y_min = np.min(_train_Y, axis=0)
train_Y_max = np.max(_train_Y, axis=0)

for _label, _stat in (
    ("mean:", train_Y_mean),
    ("std:", train_Y_std),
    ("min:", train_Y_min),
    ("max:", train_Y_max),
):
    print(_label, _stat)

(10000, 6) (10000, 6) (200, 6) (200, 6)

mean: [6.9403541e-12 3.4309983e-01 3.3316767e-01 1.4782393e-05 3.5800593e-05
 6.1970989e+01]
std: [1.2889904e-11 2.5909597e-02 2.7164428e-02 7.1507980e-06 1.3731923e-05
 1.3432132e+00]
min: [8.407e-14 2.400e-01 2.280e-01 3.816e-06 8.906e-06 6.030e+01]
max: [1.9850e-10 4.3600e-01 4.2800e-01 3.5650e-05 7.6300e-05 9.3065e+01]


In [3]:
"""

<< TEST DATA >>

1) test_real : (num_of_cycle, sample_num, output_dim) (ex) (16, 250, 6)
2) test_X_cycle : (num_of_cycle, input_dim)           (ex) (16, 6)

"""

file_path = '2021_RDFWFV_20210107.xlsx'

# NOTE: this cell rebinds file_path, data, X_all, Y_all, X_per_cycle, Y_per_cycle,
# num_of_cycle and num_in_cycle, which the train-data cell also assigns. Anything
# that needs the training values must be computed before this cell runs.
# NOTE(review): allow_pickle=True unpickles object arrays, which can execute
# arbitrary code; only load .npy files from a trusted source.
data = np.load('../data_handler/'+file_path+'.npy', allow_pickle=True)

X_all, Y_all, X_per_cycle, Y_per_cycle = data[0], data[1], data[2], data[3]
print(data[0].shape, data[1].shape, data[2].shape, data[3].shape)
print()

test_real = Y_all

output_dim = test_real.shape[1]
num_of_cycle = 16
real_bin_num = 10

# The rows are split evenly across cycles. Fail loudly if that is impossible:
# with a truncated quotient, reshape(..., -1) can otherwise succeed with a wrong
# trailing dimension (e.g. 24 rows x 6 columns, 16 cycles -> shape (16, 1, 9)).
total_test_rows = test_real.shape[0]
if total_test_rows % num_of_cycle != 0:
    raise ValueError(
        "test data has %d rows, which is not divisible by num_of_cycle=%d"
        % (total_test_rows, num_of_cycle)
    )
num_in_cycle = total_test_rows // num_of_cycle

print(" VARIABLES ")
print("output_dim", output_dim)
print("num_of_cycle", num_of_cycle)
print("num_in_cycle", num_in_cycle)
print("real_bin_num", real_bin_num)
print()

# for pair plot indexing
test_X_cycle = X_per_cycle

# Group the flat (rows, output_dim) array per cycle; the last axis is given
# explicitly so a size mismatch raises instead of being absorbed by -1.
test_real = test_real.reshape(num_of_cycle, num_in_cycle, output_dim)
# NOTE(review): if the two lines below are re-enabled, the last reshape dimension
# should be output_dim; a per-cycle mean/std has num_of_cycle * output_dim values.
# Y_mean = np.mean(test_real, axis=1, dtype=np.float32).reshape(num_of_cycle, 1, num_of_cycle)
# Y_std = np.std(test_real, axis=1, dtype=np.float32).reshape(num_of_cycle, 1, num_of_cycle)

print("!! check !! test real shape", test_real.shape)
print("!! check !! X value ( especially its precision )")
print()
for i in range(num_of_cycle):
    print(test_X_cycle[i])

(4000, 6) (4000, 6) (16, 6) (16, 6)

 VARIABLES 
output_dim 6
num_of_cycle 16
num_in_cycle 250
real_bin_num 10

!! check !! test real shape (16, 250, 6)
!! check !! X value ( especially its precision )

[0.00000000e+00 5.78895870e+20 3.59561006e+19 2.88838939e-02
 1.00000000e+00 0.00000000e+00]
[0.00000000e+00 1.57080658e+19 1.19778254e+18 1.18212834e-02
 1.00000000e+00 0.00000000e+00]
[0.00000000e+00 1.97195192e+19 1.27521313e+18 3.02198477e-02
 1.00000000e+00 0.00000000e+00]
[0.00000000e+00 9.15906057e+20 1.15458844e+20 2.43231726e-02
 1.00000000e+00 0.00000000e+00]
[0.00000000e+00 4.90884204e+20 6.96683015e+19 1.91663741e-02
 1.00000000e+00 0.00000000e+00]
[6.90006771e-03 8.51545870e+19 8.25700721e+18 2.12625635e-02
 0.00000000e+00 1.00000000e+00]
[6.57265186e-03 1.74466960e+20 1.03200793e+19 2.63799659e-02
 0.00000000e+00 1.00000000e+00]
[1.85497107e-02 9.10148759e+19 8.11373190e+18 1.33503512e-02
 0.00000000e+00 1.00000000e+00]
[1.51214486e-02 7.96001377e+19 1.34431524e+19 3.71338

# Load samples and calculate EMD

In [4]:
# ########################################  CHANGE HERE ####################################################################

# name = 'naive_date_day6_data_rdfwfv_wfv_rdf_train2020_RDFWFV_20201222_V10.xlsx_model_ccgan_seed_0_lr_0.0003_0.001_hidden_dim_250_batch_size_32_noise_d_100_sample_num_250_tr_num_in_cycle_50_layer_2_kappa_0.005_kernel_sigma_0.01.pkl'

# ##########################################################################################################################

# with (open(name, "rb")) as openfile:
#     result = pickle.load(openfile)

# test_sample = result['test sample']

# test_gen = test_sample
# print('test_sample shape:', test_sample.shape)

# num_of_cycle = test_gen.shape[0]
# num_in_cycle = test_real.shape[1]
# test_gen_sample_num = test_gen.shape[1]


# print("test data sample num:", test_gen_sample_num)
# print("test data shape: ", test_gen.shape)

# ###################### Calculate EMD ######################
# real_bin_num = 10
# minmax = 'train_real_global'
# test_EMD_score_list, test_sink_score_list = sample_utils.new_EMD_all_pair_each_X_integral(generated_samples = test_gen, real_samples = test_real, real_bin_num=real_bin_num, num_of_cycle=num_of_cycle, min_list = train_Y_min, max_list = train_Y_max, train_mean=train_Y_mean, train_std = train_Y_std, minmax=minmax, check=False) 
# print('mean of test_EMD_score_list', np.mean(test_EMD_score_list))

# ###################### Add 'EMD value' to file #####################
# result['test EMD'] = test_EMD_score_list
# with (open(name, "wb")) as openfile:
#     pickle.dump(result, openfile)



In [5]:
def load_samples_and_calcultae_EMD(filepath, real_bin_num = 10, minmax = 'train_real_global', real_samples = test_real, min_list = train_Y_min, max_list = train_Y_max, train_mean=train_Y_mean, train_std = train_Y_std):
    """Load generated test samples from a pickle, score them with EMD, and save the scores back.

    The pickle at ``filepath`` is read, its ``'test sample'`` entry is compared
    against ``real_samples`` via
    ``sample_utils.new_EMD_all_pair_each_X_integral``, the resulting score list
    is stored under the ``'test EMD'`` key, and the updated object is written
    back to the same ``filepath`` (the file is overwritten in place).

    The misspelling in the function name is kept so existing callers still work.

    Parameters
    ----------
    filepath : str
        Path of the pickle file to read and then overwrite.
    real_bin_num : int
        Forwarded unchanged to the EMD helper.
    minmax : str
        Forwarded unchanged to the EMD helper.
    real_samples, min_list, max_list, train_mean, train_std
        Forwarded unchanged to the EMD helper. Their defaults are the notebook
        globals as they were when this cell was executed; re-run this cell
        after reloading the data if the defaults should pick up new values.

    Returns
    -------
    The unpickled object with the additional ``'test EMD'`` entry.

    Raises
    ------
    FileNotFoundError
        If ``filepath`` does not exist.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # result files produced by a trusted run.
    with open(filepath, "rb") as openfile:
        result = pickle.load(openfile)
    test_gen = result['test sample']

    # The helper is told how many cycles there are from the generated samples'
    # leading axis.
    num_of_cycle = test_gen.shape[0]

    ###################### Calculate EMD ######################
    # The second returned list is not used here.
    test_EMD_score_list, _test_sink_score_list = sample_utils.new_EMD_all_pair_each_X_integral(
        generated_samples=test_gen,
        real_samples=real_samples,
        real_bin_num=real_bin_num,
        num_of_cycle=num_of_cycle,
        min_list=min_list,
        max_list=max_list,
        train_mean=train_mean,
        train_std=train_std,
        minmax=minmax,
        check=False,
    )
    print('mean of test_EMD_score_list', np.mean(test_EMD_score_list))

    ###################### Add 'EMD value' to file #####################
    result['test EMD'] = test_EMD_score_list
    with open(filepath, "wb") as openfile:
        pickle.dump(result, openfile)

    return result

In [6]:
# Pickle of generated samples to evaluate; a bare file name, so it is resolved
# relative to the notebook's working directory.
name = (
    'naive_date_day6_data_rdfwfv_wfv_rdf_train2020_RDFWFV_20201222_V10.xlsx_model_ccgan_seed_0_lr_0.0003_0.001_hidden_dim_250_batch_size_32_noise_d_100_sample_num_250_tr_num_in_cycle_50_layer_2_kappa_0.005_kernel_sigma_0.5.pkl'
)

In [7]:
# Run the EMD evaluation for the pickle referenced by `name`; the returned
# object is kept in `result`.
result = load_samples_and_calcultae_EMD(filepath=name)

FileNotFoundError: [Errno 2] No such file or directory: 'naive_date_day6_data_rdfwfv_wfv_rdf_train2020_RDFWFV_20201222_V10.xlsx_model_ccgan_seed_0_lr_0.0003_0.001_hidden_dim_250_batch_size_32_noise_d_100_sample_num_250_tr_num_in_cycle_50_layer_2_kappa_0.005_kernel_sigma_0.5.pkl'