<a href="https://colab.research.google.com/github/philippmatthes/tmd/blob/master/src/shl-power-transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fit and configure scalers

In [None]:
# Get needed auxiliary files for colab
!git clone https://github.com/philippmatthes/tmd
%cd /content/tmd/src
!mkdir shl-dataset
!wget -nc -O shl-dataset/challenge-2019-train_torso.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_torso.zip
!wget -nc -O shl-dataset/challenge-2019-train_bag.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_bag.zip
!wget -nc -O shl-dataset/challenge-2019-train_hips.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_hips.zip
!wget -nc -O shl-dataset/challenge-2020-train_hand.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2020/challenge-2020-train_hand.zip

In [None]:
# Switch to src dir and select tensorflow.
# The later cells use relative paths (shl-dataset/..., models/), so the working
# directory has to be the repository's src folder before they run.
%cd /content/tmd/src
# Colab magic requesting the TensorFlow 2.x runtime.
# NOTE(review): no cell visible in this notebook imports tensorflow — confirm this line is still needed.
%tensorflow_version 2.x

/content/tmd/src


In [None]:
# Create our scalers
from sklearn.preprocessing import PowerTransformer

# Input channels, three axes per sensor (four for the orientation channel).
# Keep this order in sync with `shl_dataset_files`: the loader pairs both lists
# position by position.
shl_dataset_X_attributes = (
    ['acc_x', 'acc_y', 'acc_z']
    + ['mag_x', 'mag_y', 'mag_z']
    + ['gyr_x', 'gyr_y', 'gyr_z']
    + ['gra_x', 'gra_y', 'gra_z']
    + ['lacc_x', 'lacc_y', 'lacc_z']
    + ['ori_x', 'ori_y', 'ori_z', 'ori_w']
)

# Target channel(s); these get no scaler.
shl_dataset_y_attributes = ['labels']

# Every attribute a loaded dataset carries: inputs first, then targets.
shl_dataset_attributes = [*shl_dataset_X_attributes, *shl_dataset_y_attributes]

# One independent, not yet fitted PowerTransformer per input channel.
scalers = {attribute: PowerTransformer() for attribute in shl_dataset_X_attributes}
scalers

{'acc_x': PowerTransformer(copy=True, method='yeo-johnson', standardize=True),
 'acc_y': PowerTransformer(copy=True, method='yeo-johnson', standardize=True),
 'acc_z': PowerTransformer(copy=True, method='yeo-johnson', standardize=True),
 'gra_x': PowerTransformer(copy=True, method='yeo-johnson', standardize=True),
 'gra_y': PowerTransformer(copy=True, method='yeo-johnson', standardize=True),
 'gra_z': PowerTransformer(copy=True, method='yeo-johnson', standardize=True),
 'gyr_x': PowerTransformer(copy=True, method='yeo-johnson', standardize=True),
 'gyr_y': PowerTransformer(copy=True, method='yeo-johnson', standardize=True),
 'gyr_z': PowerTransformer(copy=True, method='yeo-johnson', standardize=True),
 'lacc_x': PowerTransformer(copy=True, method='yeo-johnson', standardize=True),
 'lacc_y': PowerTransformer(copy=True, method='yeo-johnson', standardize=True),
 'lacc_z': PowerTransformer(copy=True, method='yeo-johnson', standardize=True),
 'mag_x': PowerTransformer(copy=True, method='yeo

In [None]:
from pathlib import Path

# Zip archives to load, relative to the current working directory.
DATASET_DIRS = list(map(Path, (
    'shl-dataset/challenge-2019-train_torso.zip',
    'shl-dataset/challenge-2019-train_bag.zip',
    'shl-dataset/challenge-2019-train_hips.zip',
    'shl-dataset/challenge-2020-train_hand.zip',
)))

In [None]:
# Load the datasets

import pathlib
import tempfile
import zipfile

import numpy as np
import pandas as pd

from tqdm import tqdm

# File names inside each recording folder. The order matters: entry i is loaded
# into the attribute at position i of `shl_dataset_attributes`.
shl_dataset_files = (
    ['Acc_x.txt', 'Acc_y.txt', 'Acc_z.txt']
    + ['Mag_x.txt', 'Mag_y.txt', 'Mag_z.txt']
    + ['Gyr_x.txt', 'Gyr_y.txt', 'Gyr_z.txt']
    + ['Gra_x.txt', 'Gra_y.txt', 'Gra_z.txt']
    + ['LAcc_x.txt', 'LAcc_y.txt', 'LAcc_z.txt']
    + ['Ori_x.txt', 'Ori_y.txt', 'Ori_z.txt', 'Ori_w.txt']
    + ['Label.txt']
)

class SHLDataset:
    """Bare attribute container for loaded SHL data.

    Instances start empty; the loader attaches one numpy array per name in
    `shl_dataset_attributes`.
    """

    def concat_inplace(self, other):
        """Append the arrays of `other` to this dataset along axis 0.

        Every attribute listed in `shl_dataset_attributes` must already be set
        on both objects. `self` is rebound to the merged arrays, `other` is
        left untouched.
        """
        for name in shl_dataset_attributes:
            merged = np.concatenate([getattr(self, name), getattr(other, name)], axis=0)
            setattr(self, name, merged)


def load_shl_dataset(dataset_dir: pathlib.Path, tqdm=None, nrows=None):
    dataset = SHLDataset()
    if tqdm is None:
        tqdm = lambda x, desc: x # passthrough
    for attribute, filename in tqdm(
        list(zip(shl_dataset_attributes, shl_dataset_files)),
        desc=f'Loading dataset subfiles'
    ):
        df = pd.read_csv(dataset_dir / filename, header=None, sep=' ', nrows=nrows, dtype=np.float16)
        np_arr = np.nan_to_num(df.to_numpy())
        setattr(dataset, attribute, np_arr)
    return dataset


def load_zipped_shl_dataset(zip_dir: pathlib.Path, tqdm=None, nrows=None, subdir_in_zip='train'):
    """Extract an SHL zip archive into a temporary folder and load its recordings.

    Args:
        zip_dir: Path of the zip archive to read (a file, despite the name).
        tqdm: Optional progress wrapper, called as ``tqdm(iterable, desc=...)``.
            When falsy the archive is extracted in one go without progress
            reporting. The value is also forwarded to `load_shl_dataset`.
        nrows: Forwarded to `load_shl_dataset` to limit the rows read per file.
        subdir_in_zip: Folder inside the archive whose immediate
            sub-directories are loaded as recordings.

    Returns:
        The dataset of the first recording folder with all further folders
        appended via ``concat_inplace``, or None when `subdir_in_zip` contains
        no sub-directory.
    """
    with tempfile.TemporaryDirectory() as unzip_dir:
        with zipfile.ZipFile(zip_dir, 'r') as zip_ref:
            if tqdm:
                for member in tqdm(zip_ref.infolist(), desc=f'Extracting {zip_dir}'):
                    zip_ref.extract(member, unzip_dir)
            else:
                zip_ref.extractall(unzip_dir)

        train_dir = pathlib.Path(unzip_dir) / subdir_in_zip
        # Filter on each entry (the original tested train_dir itself, which let
        # stray files through). Sorting makes the concatenation order
        # independent of the file system's listing order.
        sub_dirs = sorted(entry for entry in train_dir.iterdir() if entry.is_dir())

        result_dataset = None
        for sub_dir in sub_dirs:
            # iterdir() already yields paths that include train_dir.
            sub_dataset = load_shl_dataset(sub_dir, tqdm=tqdm, nrows=nrows)
            if result_dataset is None:
                result_dataset = sub_dataset
            else:
                result_dataset.concat_inplace(sub_dataset)
                del sub_dataset  # release the partial copy before loading the next one
        return result_dataset

# Merge all archives into a single dataset. Each archive is unpacked into its
# own temporary directory, which is removed again once it has been loaded.
dataset = None

for dataset_dir in DATASET_DIRS:
    partial_dataset = load_zipped_shl_dataset(dataset_dir, tqdm=tqdm)
    if dataset is not None:
        dataset.concat_inplace(partial_dataset)
        continue
    # The first archive becomes the accumulator.
    dataset = partial_dataset

Extracting shl-dataset/challenge-2019-train_torso.zip: 100%|██████████| 22/22 [03:06<00:00,  8.50s/it]
Loading dataset subfiles: 100%|██████████| 20/20 [06:03<00:00, 18.20s/it]
Extracting shl-dataset/challenge-2019-train_bag.zip: 100%|██████████| 22/22 [03:02<00:00,  8.29s/it]
Loading dataset subfiles: 100%|██████████| 20/20 [06:04<00:00, 18.21s/it]
Extracting shl-dataset/challenge-2019-train_hips.zip: 100%|██████████| 22/22 [03:11<00:00,  8.68s/it]
Loading dataset subfiles: 100%|██████████| 20/20 [06:22<00:00, 19.10s/it]
Extracting shl-dataset/challenge-2020-train_hand.zip: 100%|██████████| 23/23 [03:11<00:00,  8.33s/it]
Loading dataset subfiles: 100%|██████████| 20/20 [06:33<00:00, 19.67s/it]


In [None]:
import numpy as np

import json
import joblib

export_dir = 'models/'
num_random_samples = 10000
random_seed = 42  # fixed so a re-run fits the scalers on the same rows

# The exports below fail if the target folder is missing.
Path(export_dir).mkdir(parents=True, exist_ok=True)

rng = np.random.default_rng(random_seed)

for attribute, scaler in tqdm(scalers.items(), desc='Fitting scalers'):
    samples = getattr(dataset, attribute)
    # Sampling without replacement raises when more rows are requested than
    # exist, so cap the request at the available row count.
    sample_count = min(num_random_samples, samples.shape[0])
    random_samples_idx = rng.choice(samples.shape[0], sample_count, replace=False)
    random_samples = samples[random_samples_idx]
    # Fit in float64 rather than in the dtype the data was stored in.
    scaler.fit(random_samples.astype(np.float64))

    # Platform independent export
    # NOTE(review): only the fitted lambdas are written. The scalers are shown
    # with standardize=True, so the standardisation parameters are not part of
    # this JSON — confirm the consumer does not need them.
    transformer_params = {
        'lambdas': scaler.lambdas_.tolist(),
    }
    with open(export_dir + f'{attribute}.scaler.json', 'w') as f:
        json.dump(transformer_params, f)
    # Python export
    joblib.dump(scaler, export_dir + f'{attribute}.scaler.joblib')

In [None]:
# Pack the models folder into models.zip in the working directory so the
# updated scalers can be downloaded as a single file.
import shutil
shutil.make_archive(base_name='models', format='zip', root_dir='models')

'/content/tmd/src/models.zip'