# Import

In [1]:
# System
import os
import time
import json

# Data processing
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib

"""
# Tensor
import torch 
from torch.utils.data import Dataset, DataLoader, TensorDataset
"""

# Config

In [2]:
# Locate the shared JSON configuration file
root = '/Volumes/Expansion/User_Backup/b08209033/111-2_IVT_analysis/'
file = 'config.json'
config_path = os.path.join(root, file)

# Load the current configuration (the `with` block closes the handle on exit)
with open(config_path) as cfg_in:
    config = json.load(cfg_in)

# Record the settings decided in this notebook:
#   - Flag_timeline_feature: append the sinusoidal timeline column to the features
#   - ML_fname_dataset: file name of the exported train/valid/test archive
config.update({
    "Flag_timeline_feature": True,
    "ML_fname_dataset": "IVT_TS_dataset.npz",
})

# Write the updated configuration back to the same file, keys sorted
with open(config_path, 'w') as cfg_out:
    json.dump(config, cfg_out, sort_keys=True)

# Read time structure

In [3]:
# Work from the IVT calculation directory so the relative SVD file name resolves
os.chdir(config["Path_IVT_calculation"])

# Number of leading SVD structures to keep as features
feature_num = int(config["Var_Feature_num_SVD"])

with np.load(config["Fname_IVT_svd"]) as svd_archive:
    # Keep the first `feature_num` rows of the stored 'time' array, then
    # transpose so the result is laid out as (Timestep, feature)
    time_structure = svd_archive['time'][:feature_num].T

# Dataset

In [4]:
# Plan for this section:
#   1. Split the dataset into train / valid / test
#   2. Normalize (rescale) the data
#   3. Split observation and prediction
#   4. Pack into a Dataset

### Define data split method

In [5]:
# Partition plan, expressed in sixteenths of the full record:
#   total 16/16, train 12/16, test 3/16, valid 1/16
data_ratio  = 16/16
train_ratio, test_ratio, valid_ratio = 12/16, 3/16, 1/16
split_ratio = [train_ratio, valid_ratio, test_ratio]  # stored as [train, valid, test]

# Number of time steps per partition. Train and test are truncated to whole
# steps; valid takes whatever is left so the three sizes sum to data_size.
data_size  = len(time_structure)
train_size, test_size = (int(data_size * ratio) for ratio in (train_ratio, test_ratio))
valid_size = data_size - train_size - test_size
split_size = [train_size, valid_size, test_size]      # stored as [train, valid, test]

# Persist the split description in the shared configuration file
config.update({
    "ML_split_ratio": split_ratio,
    "ML_split_size":  split_size,
})
with open(config_path, 'w') as cfg_out:
    json.dump(config, cfg_out, sort_keys=True)

### Timeline feature (optional)

In [6]:
# Build an extra "position in time" column: a monotonic step counter mapped
# through a sine so the feature is periodic and continuous.
# NOTE(review): the period is 365 steps, presumably one year of daily data; confirm.
step_index = np.arange(data_size)
timeline_addfeat = np.sin(step_index/365*2*np.pi).reshape(-1, 1)  # column vector

# Append the timeline column only when the configuration asks for it
if not config["Flag_timeline_feature"]:
    feature_map = time_structure
else:
    feature_map = np.concatenate((time_structure, timeline_addfeat), axis = 1)

### Split dataset

In [7]:
# Carve `feature_map` into three contiguous, non-overlapping blocks along the
# time axis, in the order train -> valid -> test.
#
# Explicit offsets are used instead of repeatedly trimming a working array:
# every split is taken from the correct position of `feature_map`, and neither
# `feature_map` nor `time_structure` is modified, so this cell can be re-run
# safely. (Slicing each split from row 0 would make valid/test duplicate the
# first rows of the training data.)
valid_start = train_size
test_start  = valid_start + valid_size
test_stop   = test_start + test_size

train_set = feature_map[:valid_start, :]
valid_set = feature_map[valid_start:test_start, :]
test_set  = feature_map[test_start:test_stop, :]

# Guard against a feature map that is shorter than the planned split sizes
assert (len(train_set), len(valid_set), len(test_set)) == (train_size, valid_size, test_size), \
    "feature_map has fewer rows than train_size + valid_size + test_size"

### Rescale dataset 

In [8]:
# Fit the scaler on the training split ONLY, then apply that same mapping to the
# validation and test splits. Fitting a separate scaler per split would
#   (a) leak validation/test statistics into preprocessing, and
#   (b) map the same physical value to different scaled values in each split.
# Because the mapping comes from the training range, valid/test values may fall
# slightly outside [-1, 1]; that is expected.
train_scaler = MinMaxScaler(feature_range=(-1,1))
train_set_scaled = train_scaler.fit_transform(train_set)

# Kept as separate names (and files below) so existing downstream code that
# loads 'valid_scaler.gz' / 'test_scaler.gz' keeps working; all three are the
# train-fitted scaler.
valid_scaler = train_scaler
test_scaler  = train_scaler

valid_set_scaled = valid_scaler.transform(valid_set)
test_set_scaled  = test_scaler.transform(test_set)

# Persist the scaler(s) next to the IVT calculation outputs
os.chdir(config["Path_IVT_calculation"])
joblib.dump(train_scaler, 'train_scaler.gz')
joblib.dump(valid_scaler, 'valid_scaler.gz')
joblib.dump(test_scaler, 'test_scaler.gz')

# To map model outputs back to the original units later:
#   scaler = joblib.load('train_scaler.gz')
#   original = scaler.inverse_transform(scaled_data)

['test_scaler.gz']

In [9]:
# Export the three scaled splits into one .npz archive in the IVT calculation
# directory; the archive keys are 'train', 'valid' and 'test'.
os.chdir(config["Path_IVT_calculation"])
scaled_splits = {
    "train": train_set_scaled,
    "valid": valid_set_scaled,
    "test":  test_set_scaled,
}
np.savez(config["ML_fname_dataset"], **scaled_splits)

In [11]:
"""
# Define "available" data index
data_idx = np.arange(data_size)
train_data_idx = data_idx[:train_size]; data_idx = data_idx[train_size:]
valid_data_idx = data_idx[:valid_size]; data_idx = data_idx[valid_size:]
test_data_idx = data_idx[:test_size]; data_idx = data_idx[test_size:]


# Padding into sequence
X_train = np.zeros((train_size, config["ML_seq_len"], feature_num))
Y_train = np.zeros((train_size, 1, feature_num))
for i, idx in enumerate(train_data_idx):
    X_train[i,:,:] = time_structure[idx:idx+config["ML_seq_len"],:]
    Y_train[i,:,:] = time_structure[idx+config["ML_seq_len"]+config["ML_forecast_step"]-1,:].reshape(1, -1)
    
X_valid = np.zeros((valid_size, config["ML_seq_len"], feature_num))
Y_valid = np.zeros((valid_size, 1, feature_num))
for i, idx in enumerate(valid_data_idx):
    X_valid[i,:,:] = time_structure[idx:idx+config["ML_seq_len"],:]
    Y_valid[i,:,:] = time_structure[idx+config["ML_seq_len"]+config["ML_forecast_step"]-1,:].reshape(1, -1)
    

X_test = np.zeros((test_size, config["ML_seq_len"], feature_num))
Y_test = np.zeros((test_size, 1, feature_num))
for i, idx in enumerate(test_data_idx):
    X_test[i,:,:] = time_structure[idx:idx+config["ML_seq_len"],:]
    Y_test[i,:,:] = time_structure[idx+config["ML_seq_len"]+config["ML_forecast_step"]-1,:].reshape(1, -1)

# Data
X_train = torch.from_numpy(X_train).type(torch.FloatTensor)
Y_train = torch.from_numpy(Y_train).type(torch.FloatTensor)
X_valid = torch.from_numpy(X_valid).type(torch.FloatTensor)
Y_valid = torch.from_numpy(Y_valid).type(torch.FloatTensor)
X_test = torch.from_numpy(X_test).type(torch.FloatTensor)
Y_test = torch.from_numpy(Y_test).type(torch.FloatTensor)
train_data_idx = torch.from_numpy(train_data_idx.reshape(-1, 1, 1)).type(torch.IntTensor)
valid_data_idx = torch.from_numpy(valid_data_idx.reshape(-1, 1, 1)).type(torch.IntTensor)
test_data_idx = torch.from_numpy(test_data_idx.reshape(-1, 1, 1)).type(torch.IntTensor)
# Encapsulate to TensorDataset
train_set = TensorDataset(X_train, Y_train, train_data_idx)
valid_set = TensorDataset(X_valid, Y_valid, valid_data_idx)
test_set = TensorDataset(X_test, Y_test, test_data_idx)

print(train_size)
print(data_size)
"""

'\n# Define "available" data index\ndata_idx = np.arange(data_size)\ntrain_data_idx = data_idx[:train_size]; data_idx = data_idx[train_size:]\nvalid_data_idx = data_idx[:valid_size]; data_idx = data_idx[valid_size:]\ntest_data_idx = data_idx[:test_size]; data_idx = data_idx[test_size:]\n\n\n# Padding into sequence\nX_train = np.zeros((train_size, config["ML_seq_len"], feature_num))\nY_train = np.zeros((train_size, 1, feature_num))\nfor i, idx in enumerate(train_data_idx):\n    X_train[i,:,:] = time_structure[idx:idx+config["ML_seq_len"],:]\n    Y_train[i,:,:] = time_structure[idx+config["ML_seq_len"]+config["ML_forecast_step"]-1,:].reshape(1, -1)\n    \nX_valid = np.zeros((valid_size, config["ML_seq_len"], feature_num))\nY_valid = np.zeros((valid_size, 1, feature_num))\nfor i, idx in enumerate(valid_data_idx):\n    X_valid[i,:,:] = time_structure[idx:idx+config["ML_seq_len"],:]\n    Y_valid[i,:,:] = time_structure[idx+config["ML_seq_len"]+config["ML_forecast_step"]-1,:].reshape(1, -1)\