# Import

In [1]:
# System
import os
import json

# Data processing
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib

# Config

In [2]:
# Location of the shared JSON configuration file
root = '/Volumes/Expansion/User_Backup/b08209033/111-2_IVT_analysis/'
file = 'config.json'
config_path = os.path.join(root, file)

# Load the current configuration (the with-block closes the handle on exit)
with open(config_path) as infile:
    config = json.load(infile)

# Settings owned by this notebook: timeline feature flag and the name of the
# dataset file that is written at the end
config.update({
    "Flag_timeline_feature": False,
    "ML_fname_dataset": "IVT_TS_dataset.npz",
})

# Write the updated configuration back to the same file, keys sorted
with open(config_path, 'w') as outfile:
    json.dump(config, outfile, sort_keys=True)

# Read SVD

In [3]:
# Switch to the IVT calculation directory so the SVD file name from the
# config resolves relative to it
os.chdir(config["Path_IVT_calculation"])

# Number of leading SVD structures to keep
feature_num = int(config["Var_Feature_num_SVD"])

with np.load(config["Fname_IVT_svd"]) as dataset:
    # Take the first `feature_num` entries along axis 0 of the 'time' array,
    # then transpose: (Timestep, feature) after transpose
    time_structure = dataset['time'][:feature_num].T

# Dataset

In [4]:
# Partition plan, expressed in sixteenths of the full record:
#   total 16/16 = train 12/16 + test 3/16 + valid 1/16
data_ratio = 16/16
train_ratio, test_ratio, valid_ratio = 12/16, 3/16, 1/16
split_ratio = [train_ratio, valid_ratio, test_ratio]

# Turn the ratios into row counts. Train and test are truncated to integers;
# validation takes whatever rows are left so the three sizes sum to data_size.
data_size = len(time_structure)
train_size = int(data_size * train_ratio)
test_size = int(data_size * test_ratio)
valid_size = data_size - train_size - test_size
split_size = [train_size, valid_size, test_size]

# Record the split (ordered train, valid, test) in the config and write it
# back to disk; the with-block closes the file on exit
config.update({
    "ML_split_ratio": split_ratio,
    "ML_split_size": split_size,
})
with open(config_path, 'w') as outfile:
    json.dump(config, outfile, sort_keys=True)

In [5]:
# Optional timeline feature: |sin(pi * t / 365)| for t = 0 .. data_size - 1.
# The value is continuous and repeats every 365 steps
# (the original note describes this as a 365-day period).
step_index = np.arange(data_size)
timeline_addfeat = np.abs(np.sin(step_index / 365 * np.pi)).reshape(-1, 1)

# Append the timeline column to the SVD features only when the config flag
# is set; otherwise the feature map is the SVD time structure itself
feature_map = (
    np.concatenate((time_structure, timeline_addfeat), axis=1)
    if config["Flag_timeline_feature"]
    else time_structure
)

In [6]:
# Split dataset
# Cut feature_map into three consecutive, non-overlapping row ranges in the
# order train -> valid -> test.
#
# The previous version sliced every subset from the start of feature_map
# (feature_map[:valid_size], feature_map[:test_size]) while advancing
# time_structure instead, so the valid and test sets were just the first rows
# of the training set. It also consumed time_structure, which made the cell
# non-idempotent. Explicit offsets fix both problems and leave the inputs
# untouched.
train_end = train_size
valid_end = train_end + valid_size
test_end = valid_end + test_size

train_set = feature_map[:train_end, :]
valid_set = feature_map[train_end:valid_end, :]
test_set = feature_map[valid_end:test_end, :]

# Sanity check: each subset has exactly the planned number of rows
assert (len(train_set), len(valid_set), len(test_set)) == (train_size, valid_size, test_size), \
    "split sizes do not match the available rows in feature_map"

In [7]:
"""
my_scaler = joblib.load('train_scaler.gz')
my_scaler.inverse_transform(scaled_data)
"""
# Rescale dataset
# Fit the min-max mapping to [-1, 1] on the training split ONLY, then apply
# that same mapping to the validation and test splits. Fitting a separate
# scaler on each split (as before) gives every split a different
# value -> scaled-value mapping and uses held-out statistics in preprocessing.
# Note: validation/test values outside the training min/max will fall
# slightly outside [-1, 1]; that is expected.
train_scaler = MinMaxScaler(feature_range=(-1, 1))

train_set_scaled = train_scaler.fit_transform(train_set)
valid_set_scaled = train_scaler.transform(valid_set)
test_set_scaled = train_scaler.transform(test_set)

# Keep the per-split scaler names bound so code that saves or inverts with
# `valid_scaler` / `test_scaler` keeps working; all three refer to the single
# train-fitted scaler.
valid_scaler = train_scaler
test_scaler = train_scaler

In [8]:
# All outputs are written into the IVT calculation directory
os.chdir(config["Path_IVT_calculation"])

# Persist each fitted scaler so scaled values can be inverted later
scaler_files = (
    (train_scaler, 'train_scaler.gz'),
    (valid_scaler, 'valid_scaler.gz'),
    (test_scaler, 'test_scaler.gz'),
)
for scaler_obj, gz_name in scaler_files:
    joblib.dump(scaler_obj, gz_name)

# Store the three scaled splits in one .npz archive under the keys
# 'train', 'valid' and 'test'
scaled_splits = {
    "train": train_set_scaled,
    "valid": valid_set_scaled,
    "test": test_set_scaled,
}
np.savez(config["ML_fname_dataset"], **scaled_splits)