In [8]:
# Start from a clean namespace: -f skips the interactive confirmation prompt.
%reset -f
# Enable the autoreload extension so edits to imported project modules are picked up
# without restarting the kernel. Re-running this cell in a live kernel prints the
# "already loaded" notice seen in the cell output.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

import sys
import os

# Resolve the project's root folder (the parent of the current working directory)
# and make it importable, so project-level modules can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../'))
# sys.path survives `%reset -f`, so guard the append: otherwise every re-run of this
# cell would add another duplicate entry.
if project_root not in sys.path:
    sys.path.append(project_root)
from utils import multiple_split, load_parameters, split_helper, restrict_group_samples

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from functools import partial

import pandas as pd
import numpy as np

# Single seed value for the notebook, kept in a variable so it can be reused.
random_seed = 42
# Seed NumPy's global random state so np.random-based draws are repeatable across runs.
# NOTE(review): only NumPy is seeded here; if the utils helpers draw from another RNG
# (e.g. Python's `random`), that source is not covered — confirm.
np.random.seed(random_seed)

In [10]:
# load the parameters from file ---------------------------------------------------------
# The result is read through attribute access in the cells below
# (e.g. params.dataset, params.multi_split_size, params.num_restricted).
# NOTE(review): 'parameters.yaml' is a relative path, presumably resolved against the
# current working directory — confirm in load_parameters.
# NOTE(review): the cell output suggests load_parameters also prints the loaded values — confirm.
params = load_parameters('parameters.yaml')

dataset: nasa
features: ['voltage', 'temperature', 'discharge_voltage_rate']
labels: ['capacity']
data_groupby: ['cycle', 'capacity']
num_trials: 500
overlap_mode: all
split_size: 20
multi_split_size: [5, 10, 15, 20, 40, 50, 70, 100]
num_restricted: 20


In [11]:
# data read path
# Folder the train/valid/test CSVs of the selected dataset are read from.
folder_path = f'../data/data_{params.dataset}/'

# data save path
# Folder the processed CSVs are written to (relative to the current working directory).
data_save_path = f'data_{params.dataset}/'
# dirname() drops the trailing slash and leaves the folder name itself;
# exist_ok=True keeps this call from failing when the folder already exists.
os.makedirs(os.path.dirname(data_save_path), exist_ok=True)

In [12]:
# Load every dataset split from its CSV file.
# path_dict maps split name -> CSV path; data_dict maps split name -> loaded DataFrame.
path_dict = {split: f'{folder_path}{split}.csv' for split in ('train', 'valid', 'test')}
data_dict = {split: pd.read_csv(csv_file) for split, csv_file in path_dict.items()}

# One variable per split, for the processing cells that follow.
train, valid, test = (data_dict[split] for split in ('train', 'valid', 'test'))

In [13]:
# DATA_1, DATA_3 ----------------------------------------------------------------------
# process the training and validation data
# Both splits use exactly the same splitting configuration, so the partial is built once
# and shared under both names.
partial_train = partial_valid = partial(
    multiple_split,
    multiple_split_steps=params.multi_split_size,
    overlap_mode=params.overlap_mode,
)

# Read the inputs from the raw frames kept in `data_dict`, not from `train` / `valid`:
# those names are overwritten right here, so using them as inputs would feed
# already-processed data back into split_helper whenever this cell is re-run.
# NOTE(review): assumes split_helper returns a new frame and leaves its input untouched — confirm.
train = split_helper(data_dict['train'], params.data_groupby, params.features, params.labels, partial_train)
valid = split_helper(data_dict['valid'], params.data_groupby, params.features, params.labels, partial_valid)

# Persist the processed splits (to_csv defaults are kept, so the index is written too).
train.to_csv(f'{data_save_path}train_processed.csv')
valid.to_csv(f'{data_save_path}valid_processed.csv')
# DATA_1, DATA_3 ----------------------------------------------------------------------

In [14]:
# DATA_1, DATA_3 ----------------------------------------------------------------------
# process the testing data
# The test split is processed with restrict_group_samples, configured by params.num_restricted.
partial_test = partial(restrict_group_samples, num_restricted=params.num_restricted)

# Read the input from the raw frame kept in `data_dict`, not from `test`: that name is
# overwritten right here, so using it as input would feed already-processed data back
# into split_helper whenever this cell is re-run.
# NOTE(review): assumes split_helper returns a new frame and leaves its input untouched — confirm.
test = split_helper(data_dict['test'], params.data_groupby, params.features, params.labels, partial_test)
# The file name records the num_restricted value used (to_csv defaults are kept, so the index is written too).
test.to_csv(f'{data_save_path}test_processed_first{params.num_restricted}.csv')
# DATA_1, DATA_3 ----------------------------------------------------------------------