# Imports

In [1]:
# System
import os
import time
import json

# Data processing
import numpy as np

# Tensor
import torch 
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler

# Config

In [2]:
# Config path
# NOTE(review): this is an absolute, machine-specific location; the notebook
# only runs where this volume is mounted. Consider making `root` configurable.
root = '/Volumes/Expansion/User_Backup/b08209033/111-2_IVT_analysis/'
folder = '2023_0330'
file = 'src/config.json'
config_path = os.path.join(root, folder, file)

# Import config (the `with` block closes the file on exit, no explicit close needed)
with open(config_path) as infile:
    config = json.load(infile)

# Update config with the settings used below to build the ML datasets
config.update({
    "ML_seq_len": 7,          # number of consecutive time steps in one input window
    "ML_forecast_step": 1,    # how many steps after the window end the target row lies
    "ML_train_set_name": 'TensorDataset_train.pt',
    "ML_valid_set_name": 'TensorDataset_valid.pt',
    "ML_test_set_name": 'TensorDataset_test.pt',
})

# Export config: write the updated dictionary back to the same file
with open(config_path, 'w') as outfile:
    json.dump(config, outfile, sort_keys=True)

# Read time structure

In [3]:
# Work from the IVT data directory; the SVD file name is resolved relative to it.
# NOTE: os.chdir changes the working directory for every cell that runs afterwards.
os.chdir(config["IVTPath"])
with np.load(config["IVT_SVD_fname"]) as dataset:
    # 'feature_threshold' is read as a 2-D array whose [0][0] entry is the
    # number of leading structures to keep.
    feature_num = int(dataset['feature_threshold'][0][0])
    time_structure = dataset['time']

# Keep the first `feature_num` rows and transpose, so that axis 0 is the one
# the windowing code slices along and axis 1 holds the kept structures.
time_structure = time_structure[:feature_num].T

# Rescale every column to [-1, 1].
# NOTE(review): the scaler is fitted on the complete series, i.e. before any
# train/valid/test split, so min/max of the later portions influence the
# training data. Confirm this is intended.
scaler = MinMaxScaler(feature_range=(-1, 1))
time_structure = scaler.fit_transform(time_structure)
# To undo the scaling later: scaler.inverse_transform(scaled_data)

# Pack into Dataset

In [4]:
def make_windows(series, start_idx, seq_len, forecast_step):
    """Cut a 2-D series into (input window, target row) pairs.

    For every start position ``s`` in ``start_idx`` the input is
    ``series[s : s + seq_len]`` and the target is the single row
    ``series[s + seq_len + forecast_step - 1]``.

    Parameters
    ----------
    series : 2-D array, indexed as (step, feature).
    start_idx : 1-D integer array of window start positions.
    seq_len : number of consecutive rows per input window.
    forecast_step : distance (in rows) from the window end to the target;
        1 means the row immediately following the window.

    Returns
    -------
    X : float64 array of shape (len(start_idx), seq_len, n_features)
    Y : float64 array of shape (len(start_idx), 1, n_features)
    """
    start_idx = np.asarray(start_idx)
    # Row numbers of every window, built by broadcasting starts against offsets.
    window_rows = start_idx[:, None] + np.arange(seq_len)[None, :]
    target_rows = start_idx + seq_len + forecast_step - 1
    X = np.asarray(series[window_rows], dtype=np.float64)
    Y = np.asarray(series[target_rows], dtype=np.float64)[:, None, :]
    return X, Y


# Define train / valid / test split (valid receives whatever is left over)
train_ratio = 12/16
test_ratio = 3/16
valid_ratio = 1 - train_ratio - test_ratio

seq_len = config["ML_seq_len"]
forecast_step = config["ML_forecast_step"]

# Define "available" data length: the number of start positions for which
# both the full input window and its target row exist.
data_size = len(time_structure) - (seq_len + forecast_step) + 1
if data_size < 0:
    raise ValueError(
        f"time_structure has {len(time_structure)} rows, fewer than the "
        f"{seq_len + forecast_step} needed for a single window and target"
    )
train_size = int(data_size * train_ratio)
test_size = int(data_size * test_ratio)
valid_size = data_size - (train_size + test_size)

# Define "available" data index: contiguous, in order train -> valid -> test.
# `data_idx` keeps the full index range instead of being consumed slice by slice.
data_idx = np.arange(data_size)
train_data_idx = data_idx[:train_size]
valid_data_idx = data_idx[train_size:train_size + valid_size]
test_data_idx = data_idx[train_size + valid_size:]
assert len(train_data_idx) + len(valid_data_idx) + len(test_data_idx) == data_size

# Pack each split into sequences.
# NOTE(review): only the start positions are disjoint. Windows that start near
# the end of one split extend into rows that the next split also uses as input.
X_train, Y_train = make_windows(time_structure, train_data_idx, seq_len, forecast_step)
X_valid, Y_valid = make_windows(time_structure, valid_data_idx, seq_len, forecast_step)
X_test, Y_test = make_windows(time_structure, test_data_idx, seq_len, forecast_step)

assert X_train.shape == (train_size, seq_len, feature_num)
assert Y_train.shape == (train_size, 1, feature_num)
assert X_valid.shape == (valid_size, seq_len, feature_num)
assert X_test.shape == (test_size, seq_len, feature_num)

# Scaler, MinMax or Norm or else
def Normalizer(tensor, multiplier=3):
    """Centre `tensor` on its global mean and divide by `multiplier` standard deviations.

    Statistics are taken over all elements of `tensor` (no per-feature axis).

    Parameters
    ----------
    tensor : array-like object exposing ``.mean()`` and ``.std()``.
    multiplier : number of standard deviations that maps to 1 after scaling.
        Defaults to 3, the value that used to be hard-coded.

    Returns
    -------
    scaled : ``(tensor - mean) / (std * multiplier)``
    stats : ``np.array([mean, std * multiplier])``, enough to invert the scaling
        via ``scaled * stats[1] + stats[0]``.
    """
    mean = tensor.mean()
    scale = tensor.std() * multiplier
    # A constant input has zero spread. Dividing by 1 instead of 0 returns all
    # zeros rather than NaN, and the reported stats still invert correctly.
    divisor = scale if scale != 0 else 1
    return (tensor - mean) / divisor, np.array([mean, scale])
def MinMaxer(tensor):
    """Rescale `tensor` to the range [0, 1] using its global minimum and maximum.

    Extremes are taken over all elements of `tensor` (no per-feature axis).

    Parameters
    ----------
    tensor : array-like object exposing ``.min()`` and ``.max()``.

    Returns
    -------
    scaled : ``(tensor - minimum) / (maximum - minimum)``
    stats : ``np.array([minimum, maximum])``, enough to invert the scaling via
        ``scaled * (stats[1] - stats[0]) + stats[0]``.
    """
    minimum = tensor.min()
    maximum = tensor.max()
    span = maximum - minimum
    # A constant input has zero span. Dividing by 1 instead of 0 returns all
    # zeros rather than NaN, and the reported stats still invert correctly.
    divisor = span if span != 0 else 1
    return (tensor - minimum) / divisor, np.array([minimum, maximum])

# The two triple-quoted blocks below are bare string literals, so none of the
# calls inside them run. They are kept as disabled alternatives.
# NOTE: as written, every array (each split, and X separately from Y) would be
# scaled with its own statistics.

# Disabled: mean / std scaling of each array.
"""
X_train, X_train_dist = Normalizer(X_train)
Y_train, Y_train_dist = Normalizer(Y_train)
X_valid, X_valid_dist = Normalizer(X_valid)
Y_valid, Y_valid_dist = Normalizer(Y_valid)
X_test, X_test_dist = Normalizer(X_test)
Y_test, Y_test_dist = Normalizer(Y_test)
print(X_test_dist)
print(Y_test_dist)
"""

# Disabled: min / max scaling of each array.
"""
X_train, X_train_dist = MinMaxer(X_train)
Y_train, Y_train_dist = MinMaxer(Y_train)
X_valid, X_valid_dist = MinMaxer(X_valid)
Y_valid, Y_valid_dist = MinMaxer(Y_valid)
X_test, X_test_dist = MinMaxer(X_test)
Y_test, Y_test_dist = MinMaxer(Y_test)
"""
# Data: turn every windowed array into a float32 tensor.
X_train, Y_train, X_valid, Y_valid, X_test, Y_test = (
    torch.from_numpy(array).to(torch.float32)
    for array in (X_train, Y_train, X_valid, Y_valid, X_test, Y_test)
)

# Window start positions, reshaped to (n, 1, 1) and stored as int32 so each
# sample carries the index it was cut from.
train_data_idx, valid_data_idx, test_data_idx = (
    torch.from_numpy(positions.reshape(-1, 1, 1)).to(torch.int32)
    for positions in (train_data_idx, valid_data_idx, test_data_idx)
)

# Encapsulate to TensorDataset: each item is (input window, target, start index).
train_set, valid_set, test_set = (
    TensorDataset(inputs, targets, positions)
    for inputs, targets, positions in (
        (X_train, Y_train, train_data_idx),
        (X_valid, Y_valid, valid_data_idx),
        (X_test, Y_test, test_data_idx),
    )
)

# Report the number of training windows and the total number of windows.
print(train_size)
print(data_size)

11766
15688


In [5]:
# Save the three datasets into the IVT data directory under the file names
# stored in the config.
# NOTE(review): torch.save serialises the whole TensorDataset object, so
# loading presumably needs full unpickling rather than a weights-only load.
# Confirm against the torch version in use.
os.chdir(config["IVTPath"])
for split_set, name_key in (
    (train_set, "ML_train_set_name"),
    (valid_set, "ML_valid_set_name"),
    (test_set, "ML_test_set_name"),
):
    torch.save(split_set, config[name_key])