In [18]:
%reset -f
%load_ext autoreload
%autoreload 1

import sys
from pathlib import Path

dist_to_root = 0    # the distance to project root folder
_project_root = Path.cwd().parents[dist_to_root]
if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

from utils import multiple_split, load_parameters, split_helper, restrict_group_samples, split_without_overlap

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
from functools import partial
from pathlib import Path

import pandas as pd
import numpy as np

# Seed NumPy's global random state once so that repeated runs draw the same numbers.
random_seed = 42
np.random.seed(seed=random_seed)

In [20]:
# load the parameters from file ---------------------------------------------------------
# `load_parameters` comes from the project's `utils` module. The returned object is read by
# attribute in the following cells (e.g. params.dataset, params.features, params.test_wlen).
# NOTE(review): this cell's output shows `overlap_mode: no`. If the file is parsed by a
# YAML 1.1 loader, an unquoted `no` becomes boolean False rather than the string 'no' —
# confirm which of the two `multiple_split` expects.
params = load_parameters('parameters.yaml')

dataset: nasa
labels: ['capacity']
features: ['voltage', 'temperature', 'power', 'current', 'discharge_power_rate', 'discharge_current_rate', 'discharge_voltage_rate', 'sum_relativeTime', 'range_voltage', 'range_current', 'range_temperature', 'step_length', 'duration', 'delta_current', 'delta_voltage', 'delta_temperature']
data_groupby: ['cycle']
num_trials: 200
overlap_mode: no
split_size: 20
multi_split_size: [5, 10, 15, 20, 25, 30, 40, 50, 70, 100]
num_restricted: 20
test_wlen: 20


In [21]:
# Map each file-name prefix to the splitting routine applied to that data split.
# 'train' and 'valid' are both handled by multiple_split with the configured list of
# split sizes and overlap mode.
processors = {
    split_name: partial(
        multiple_split,
        multiple_split_steps=params.multi_split_size,
        overlap_mode=params.overlap_mode,
    )
    for split_name in ('train', 'valid')
}
# 'test' goes through split_without_overlap with a single size (params.test_wlen).
# To process it exactly like train/valid instead, add 'test' to the tuple above and
# drop the assignment below.
processors['test'] = partial(split_without_overlap, split_size=params.test_wlen)

In [22]:
# Input location: the dataset-specific folder under ../data
source_folder = Path('..', 'data', f'data_{params.dataset}')
print(f'load data from {source_folder}')

# Output location: one sub-folder per (number of features, number of split sizes) combination;
# it is created up front so the processing loop can write into it directly.
dest_folder = Path(
    f'data_{params.dataset}',
    f'{len(params.features)}features_{len(params.multi_split_size)}splits',
)
dest_folder.mkdir(parents=True, exist_ok=True)

load data from ../data/data_nasa


In [23]:
def sample_dataframe_by_cycle(df, cycle_col='cycle', step=5):
    """Downsample a DataFrame by keeping every ``step``-th row of each cycle.

    Rows are grouped by ``cycle_col``; within each group the rows at positions
    0, step, 2*step, ... are kept. All columns (including ``cycle_col``) and the
    original index labels are preserved. Groups are returned in sorted key order.

    Args:
        df: Input DataFrame containing the column ``cycle_col``.
        cycle_col: Name of the column that identifies a cycle.
        step: Stride between kept rows inside a group; must be >= 1.
            The default of 5 matches the previously hard-coded behaviour.

    Returns:
        A DataFrame holding the sampled rows. An input without any group
        yields an empty DataFrame with the same columns.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError(f'step must be a positive integer, got {step}')

    # Iterate over the groups rather than calling groupby.apply: apply on the grouping
    # column is deprecated in recent pandas and newer versions hide that column from the
    # callback, which would silently drop `cycle_col` from the result.
    sampled_groups = [group.iloc[::step] for _, group in df.groupby(cycle_col)]

    # pd.concat rejects an empty list, so handle "no groups" explicitly.
    if not sampled_groups:
        return df.iloc[0:0]

    return pd.concat(sampled_groups)

In [24]:
# Process every recognised CSV in the source folder and save the result into dest_folder.
# Files are visited in sorted order because Path.glob yields them in an arbitrary,
# filesystem-dependent order. NOTE(review): if the split helpers draw from NumPy's global
# random state, the visiting order also influences the generated samples — confirm.
num_processed = 0
for csv_file in sorted(source_folder.glob('*.csv')):
    # the part of the file name before the first '_' selects the processor
    file_name = csv_file.stem
    prefix = file_name.split('_')[0]    # expected to be a key of `processors`

    # files without a registered processor are skipped
    if prefix not in processors:
        continue
    partial_func = processors[prefix]

    # read the raw file
    raw_data = pd.read_csv(csv_file)
    # optional: thin out each cycle before splitting
    # raw_data = sample_dataframe_by_cycle(raw_data)

    # split the raw data with the processor chosen for this file
    print(f'{file_name}.csv processing')
    data = split_helper(raw_data, params.data_groupby, params.features, params.labels, partial_func)

    # save the processed data next to the other splits
    processed_data_path = dest_folder/f'{file_name}_processed.csv'
    data.to_csv(processed_data_path, index=False)
    print(f'{file_name} -> finished -> saved into {processed_data_path}')
    num_processed += 1

# make an empty or misnamed source folder visible instead of finishing silently
if num_processed == 0:
    print(f'no csv files with a known prefix found in {source_folder}')

valid.csv processing
valid -> finished -> saved into data_nasa/16features_10splits/valid_processed.csv
test.csv processing
test -> finished -> saved into data_nasa/16features_10splits/test_processed.csv
train.csv processing
train -> finished -> saved into data_nasa/16features_10splits/train_processed.csv
