<a href="https://colab.research.google.com/github/PhilippMatthes/diplom/blob/master/src/shl-deep-learning-timeseries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using a deep CNN to directly classify SHL timeseries data

In [1]:
# Free up some disk space on colab
!rm -rf /usr/local/lib/python2.7
!rm -rf /swift
!rm -rf /usr/local/lib/python3.6/dist-packages/torch
!rm -rf /usr/local/lib/python3.6/dist-packages/pystan
!rm -rf /usr/local/lib/python3.6/dist-packages/spacy
!rm -rf /tensorflow-1.15.2/

In [2]:
# Get needed auxiliary files for colab
!git clone https://github.com/philippmatthes/diplom

Cloning into 'diplom'...
remote: Enumerating objects: 1755, done.[K
remote: Counting objects: 100% (1092/1092), done.[K
remote: Compressing objects: 100% (734/734), done.[K
remote: Total 1755 (delta 542), reused 817 (delta 301), pack-reused 663[K
Receiving objects: 100% (1755/1755), 34.53 MiB | 20.57 MiB/s, done.
Resolving deltas: 100% (918/918), done.


In [3]:
# Change into src dir and load our datasets
%cd /content/diplom/src
!mkdir shl-dataset

/content/diplom/src


In [None]:
# Download training datasets
!wget -nc -O shl-dataset/challenge-2019-train_torso.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_torso.zip
!wget -nc -O shl-dataset/challenge-2019-train_bag.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_bag.zip
!wget -nc -O shl-dataset/challenge-2019-train_hips.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_hips.zip
!wget -nc -O shl-dataset/challenge-2020-train_hand.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2020/challenge-2020-train_hand.zip
# Download validation dataset
!wget -nc -O shl-dataset/challenge-2020-validation.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2020/challenge-2020-validation.zip

--2021-08-19 09:37:33--  http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_torso.zip
Resolving www.shl-dataset.org (www.shl-dataset.org)... 37.187.125.22
Connecting to www.shl-dataset.org (www.shl-dataset.org)|37.187.125.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5852446972 (5.5G) [application/zip]
Saving to: ‘shl-dataset/challenge-2019-train_torso.zip’


2021-08-19 09:49:39 (7.69 MB/s) - ‘shl-dataset/challenge-2019-train_torso.zip’ saved [5852446972/5852446972]

--2021-08-19 09:49:39--  http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_bag.zip
Resolving www.shl-dataset.org (www.shl-dataset.org)... 37.187.125.22
Connecting to www.shl-dataset.org (www.shl-dataset.org)|37.187.125.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5628524721 (5.2G) [application/zip]
Saving to: ‘shl-dataset/challenge-2019-train_bag.zip’

enge-2019-train_bag   8%[>                   

In [None]:
# Unzip training datasets
!unzip -n -d shl-dataset/challenge-2019-train_torso shl-dataset/challenge-2019-train_torso.zip
!rm shl-dataset/challenge-2019-train_torso.zip
!unzip -n -d shl-dataset/challenge-2019-train_bag shl-dataset/challenge-2019-train_bag.zip
!rm shl-dataset/challenge-2019-train_bag.zip
!unzip -n -d shl-dataset/challenge-2019-train_hips shl-dataset/challenge-2019-train_hips.zip
!rm shl-dataset/challenge-2019-train_hips.zip
!unzip -n -d shl-dataset/challenge-2020-train_hand shl-dataset/challenge-2020-train_hand.zip
!rm shl-dataset/challenge-2020-train_hand.zip
# Unzip validation dataset
!unzip -n -d shl-dataset/challenge-2020-validation shl-dataset/challenge-2020-validation.zip
!rm shl-dataset/challenge-2020-validation.zip

In [None]:
%cd /content/diplom/src
%tensorflow_version 2.x

In [2]:
# Check configuration and hardware resources

import distutils

import tensorflow as tf

if distutils.version.LooseVersion(tf.__version__) < '2.0':
    raise Exception('This notebook is compatible with TensorFlow 2.0 or higher.')

tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Define all datasets to train our model on

from pathlib import Path

TRAIN_DATASET_DIRS = [
    Path('shl-dataset/challenge-2019-train_torso/train/Torso'),
    Path('shl-dataset/challenge-2019-train_bag/train/Bag'),
    Path('shl-dataset/challenge-2019-train_hips/train/Hips'),
    Path('shl-dataset/challenge-2020-train_hand/train/Hand'),
]

VALIDATION_DATASET_DIRS = [
    Path('shl-dataset/challenge-2020-validation/validation/Torso'),         
    Path('shl-dataset/challenge-2020-validation/validation/Bag'),   
    Path('shl-dataset/challenge-2020-validation/validation/Hips'),   
    Path('shl-dataset/challenge-2020-validation/validation/Hand'),                  
]

In [4]:
# Define more useful constants about our dataset

LABEL_ORDER = [
    'Null',
    'Still',
    'Walking',
    'Run',
    'Bike',
    'Car',
    'Bus',
    'Train',
    'Subway',
]

SAMPLE_LENGTH = 500

In [5]:
# Results from data analysis

CLASS_WEIGHTS = {
    0: 0.0, # NULL label
    1: 1.0021671573438011, 
    2: 0.9985739895697523, 
    3: 2.8994439843842423, 
    4: 1.044135815617944, 
    5: 0.7723505499007343, 
    6: 0.8652474758172704, 
    7: 0.7842127155793044, 
    8: 1.0283208861290594
}

In [6]:
# Define features for our dataset

from collections import OrderedDict

import numpy as np

# Attributes to load from our dataset
X_attributes = [
    'acc_x', 'acc_y', 'acc_z',
    'mag_x', 'mag_y', 'mag_z',
    'gyr_x', 'gyr_y', 'gyr_z',
]

# Files within the dataset that contain our attributes
X_files = [
    'Acc_x.txt', 'Acc_y.txt', 'Acc_z.txt',
    'Mag_x.txt', 'Mag_y.txt', 'Mag_z.txt',
    'Gyr_x.txt', 'Gyr_y.txt', 'Gyr_z.txt',
]

# Features to generate from our loaded attributes
# Note that `a` is going to be a dict of attribute tracks
X_features = OrderedDict({
    'acc_mag': lambda a: np.sqrt(a['acc_x']**2 + a['acc_y']**2 + a['acc_z']**2),
    'mag_mag': lambda a: np.sqrt(a['mag_x']**2 + a['mag_y']**2 + a['mag_z']**2),
    'gyr_mag': lambda a: np.sqrt(a['gyr_x']**2 + a['gyr_y']**2 + a['gyr_z']**2),
})

# Define where to find our labels for supervised learning
y_file = 'Label.txt'
y_attribute = 'labels'

In [7]:
# Load pretrained power transformers for feature scaling

import joblib

X_feature_scalers = OrderedDict({})
for feature_name, _ in X_features.items():
    scaler_dir = f'models/shl-scalers/{feature_name}.scaler.joblib'
    scaler = joblib.load(scaler_dir)
    scaler.copy = False # Save memory
    X_feature_scalers[feature_name] = scaler
    print(f'Loaded scaler from {scaler_dir}.')

Loaded scaler from models/shl-scalers/acc_mag.scaler.joblib.
Loaded scaler from models/shl-scalers/mag_mag.scaler.joblib.
Loaded scaler from models/shl-scalers/gyr_mag.scaler.joblib.




In [9]:
# Load the training and validation data into a high performance datatype

import os
import shutil

from tqdm import tqdm

import pandas as pd

def read_chunks(n_chunks, X_attr_readers, y_attr_reader):
    for _ in range(n_chunks):
        # Load raw attribute tracks
        X_raw_attrs = OrderedDict({})
        for X_attribute, X_attr_reader in zip(X_attributes, X_attr_readers):
            X_attr_track = next(X_attr_reader)
            X_attr_track = np.nan_to_num(X_attr_track.to_numpy())
            X_raw_attrs[X_attribute] = X_attr_track

        # Calculate features
        X_feature_tracks = None
        for X_feature_name, X_feature_func in X_features.items():
            X_feature_track = X_feature_func(X_raw_attrs)
            X_feature_track = X_feature_scalers[X_feature_name] \
                .transform(X_feature_track)
            if X_feature_tracks is None:
                X_feature_tracks = X_feature_track
            else:
                X_feature_tracks = np.dstack((X_feature_tracks, X_feature_track))

        # Load labels
        y_attr_track = next(y_attr_reader) # dim (None, sample_length)
        y_attr_track = np.nan_to_num(y_attr_track.to_numpy()) # dim (None, sample_length)
        y_attr_track = y_attr_track[:, 0] # dim (None, 1)

        yield X_feature_tracks, y_attr_track

def count_samples(dataset_dir):
    # Every file in the dataset has the same length, use the labels file
    n_samples = 0
    with open(dataset_dir / y_file) as f:
        for _ in tqdm(f, desc=f'Counting samples in {dataset_dir}'):
            n_samples += 1
    return n_samples

def create_chunked_readers(
    dataset_dir,
    chunksize, 
    xdtype=np.float32, # Use np.float16 with caution, can lead to overflows
    ydtype=np.int
):
    # Initialize chunked csv readers
    read_csv_kwargs = { 'sep': ' ', 'header': None, 'chunksize': chunksize }

    X_attr_readers = [] # (dim datasets x readers)
    for filename in X_files:
        X_reader = pd.read_csv(dataset_dir / filename, dtype=xdtype, **read_csv_kwargs)
        X_attr_readers.append(X_reader)
    y_attr_reader = pd.read_csv(dataset_dir / y_file, dtype=ydtype, **read_csv_kwargs)

    return X_attr_readers, y_attr_reader

def export_tfrecords(
    dataset_dir,
    n_chunks=16, # Load dataset in parts to not overload memory
):
    target_dir = f'{dataset_dir}.tfrecord'
    if os.path.isfile(target_dir):
        print(f'{target_dir} already exists.')
        return

    print(f'Exporting to {target_dir}.')

    n_samples = count_samples(dataset_dir)
    chunksize = int(np.floor(n_samples / n_chunks))
    X_attr_readers, y_attr_reader = create_chunked_readers(dataset_dir, chunksize)    

    with tf.io.TFRecordWriter(str(target_dir)) as file_writer:
        with tqdm(total=n_samples, desc=f'Reading samples to {target_dir}') as pbar:
            for X_feature_tracks, y_attr_track in read_chunks(
                n_chunks, X_attr_readers, y_attr_reader
            ):
                for X, y in zip(X_feature_tracks, y_attr_track):
                    X_flat = X.flatten() # TFRecords don't support multidimensional arrays
                    record_bytes = tf.train.Example(features=tf.train.Features(feature={
                        'X': tf.train.Feature(float_list=tf.train.FloatList(value=X_flat)),
                        'y': tf.train.Feature(int64_list=tf.train.Int64List(value=[y])) 
                    })).SerializeToString()
                    file_writer.write(record_bytes)
                pbar.update(chunksize)

for dataset_dir in TRAIN_DATASET_DIRS + VALIDATION_DATASET_DIRS:
    export_tfrecords(dataset_dir)


shl-dataset/challenge-2019-train_torso/train/Torso.tfrecord already exists.
Exporting to shl-dataset/challenge-2019-train_bag/train/Bag.tfrecord.


Counting samples in shl-dataset/challenge-2019-train_bag/train/Bag: 196072it [00:02, 74285.74it/s]
Reading samples from shl-dataset/challenge-2019-train_bag/train/Bag.tfrecord: 100%|█████████▉| 196064/196072 [04:21<00:00, 750.42it/s]


Exporting to shl-dataset/challenge-2019-train_hips/train/Hips.tfrecord.


Counting samples in shl-dataset/challenge-2019-train_hips/train/Hips: 196072it [00:02, 72422.27it/s]
Reading samples from shl-dataset/challenge-2019-train_hips/train/Hips.tfrecord: 100%|█████████▉| 196064/196072 [04:26<00:00, 734.39it/s]


Exporting to shl-dataset/challenge-2020-train_hand/train/Hand.tfrecord.


Counting samples in shl-dataset/challenge-2020-train_hand/train/Hand: 196072it [00:02, 72511.64it/s]
Reading samples from shl-dataset/challenge-2020-train_hand/train/Hand.tfrecord: 100%|█████████▉| 196064/196072 [04:26<00:00, 736.54it/s]


Exporting to shl-dataset/challenge-2020-validation/validation/Torso.tfrecord.


Counting samples in shl-dataset/challenge-2020-validation/validation/Torso: 28789it [00:00, 90973.95it/s]
Reading samples from shl-dataset/challenge-2020-validation/validation/Torso.tfrecord: 100%|█████████▉| 28784/28789 [00:42<00:00, 673.88it/s]


Exporting to shl-dataset/challenge-2020-validation/validation/Bag.tfrecord.


Counting samples in shl-dataset/challenge-2020-validation/validation/Bag: 28789it [00:00, 91195.40it/s]
Reading samples from shl-dataset/challenge-2020-validation/validation/Bag.tfrecord: 100%|█████████▉| 28784/28789 [00:42<00:00, 675.77it/s]


Exporting to shl-dataset/challenge-2020-validation/validation/Hips.tfrecord.


Counting samples in shl-dataset/challenge-2020-validation/validation/Hips: 28789it [00:00, 91566.55it/s]
Reading samples from shl-dataset/challenge-2020-validation/validation/Hips.tfrecord: 100%|█████████▉| 28784/28789 [00:43<00:00, 669.26it/s]


Exporting to shl-dataset/challenge-2020-validation/validation/Hand.tfrecord.


Counting samples in shl-dataset/challenge-2020-validation/validation/Hand: 28789it [00:00, 91931.22it/s]
Reading samples from shl-dataset/challenge-2020-validation/validation/Hand.tfrecord: 100%|█████████▉| 28784/28789 [00:42<00:00, 679.48it/s]


In [10]:
BATCH_SIZE = 128
SHUFFLE_SIZE = 16384 # Must be larger than batch size

def decode_tfrecord(record_bytes):
    example = tf.io.parse_single_example(record_bytes, {
        'X': tf.io.FixedLenFeature([SAMPLE_LENGTH, len(X_features)], tf.float32),
        'y': tf.io.FixedLenFeature([1], tf.int64)
    })
    return example['X'], example['y']

def create_dataset_tensors(dataset_dirs):
    tfrecord_dirs = [f'{d}.tfrecord' for d in dataset_dirs]
    print(f'Creating dataset over {tfrecord_dirs}.')     
    dataset = tf.data.TFRecordDataset(tfrecord_dirs) \
        .map(decode_tfrecord, num_parallel_calls=tf.data.AUTOTUNE) \
        .shuffle(SHUFFLE_SIZE) \
        .batch(BATCH_SIZE)
    count = sum(1 for _ in dataset)
    print(f'Counted {count * BATCH_SIZE} samples in dataset.')
    return dataset

def create_train_tensors():
    return create_dataset_tensors(TRAIN_DATASET_DIRS)

def create_validation_tensors():
    return create_dataset_tensors(VALIDATION_DATASET_DIRS)

In [11]:
!pip install keras-tuner -q

In [12]:
# Define helper functions for model creation

from tensorflow import keras
from tensorflow.keras import layers, models

def make_resnet_block(input_layer, block_height):
    conv_kwargs = { 
        'filters': block_height, 
        'padding': 'same', 
        'kernel_regularizer': 'l2',
    }

    conv_x = layers.Conv1D(kernel_size=8, **conv_kwargs)(input_layer)
    conv_x = layers.BatchNormalization()(conv_x)
    conv_x = layers.LeakyReLU(alpha=0.2)(conv_x)

    conv_y = layers.Conv1D(kernel_size=5, **conv_kwargs)(conv_x)
    conv_y = layers.BatchNormalization()(conv_y)
    conv_y = layers.LeakyReLU(alpha=0.2)(conv_y)

    conv_z = layers.Conv1D(kernel_size=3, **conv_kwargs)(conv_y)
    conv_z = layers.BatchNormalization()(conv_z)

    shortcut = layers.Conv1D(kernel_size=1, **conv_kwargs)(input_layer)
    shortcut = layers.BatchNormalization()(shortcut)

    output_block = layers.add([shortcut, conv_z])
    output_block = layers.LeakyReLU(alpha=0.2)(output_block)

    return output_block


def make_resnet(hp):
    input_shape = (SAMPLE_LENGTH, len(X_features))
    input_layer = layers.Input(input_shape)

    endpoint_layer = input_layer # Will be built now
    for i in range(hp.Int('n_layers', 2, 10)):
        endpoint_layer = make_resnet_block(
            endpoint_layer, 
            hp.Int(f'block_{i}_maps', 64, 512, step=64),
        )
    
    gap_layer = layers.GlobalAveragePooling1D()(endpoint_layer)
    output_layer = layers.Dense(len(LABEL_ORDER), activation='softmax')(gap_layer)

    model = models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['acc']
    )

    return model

In [13]:
import keras_tuner as kt

tuner = kt.Hyperband(
    hypermodel=make_resnet, 
    objective='val_acc', 
    max_epochs=15, 
    overwrite=True,
    directory='models',
    project_name='shl-resnet-gridsearch',
)

tuner.search_space_summary()

Search space summary
Default search space size: 4
n_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 5, 'step': 1, 'sampling': None}
block_0_maps (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 256, 'step': 64, 'sampling': None}
block_0_regularizer (Boolean)
{'default': True, 'conditions': []}
block_0_activation (Choice)
{'default': 'lrelu', 'conditions': [], 'values': ['lrelu', 'relu'], 'ordered': False}


In [14]:
# Define callbacks for our training

from tensorflow.keras import callbacks

decay_lr = callbacks.ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5, 
    patience=5, # Epochs
    min_lr=0.0001, 
    verbose=1
)

stop_early = callbacks.EarlyStopping(
    monitor='val_acc', 
    patience=10, # Epochs
    verbose=1
)

In [None]:
# Keras tuner grid search training

tuner.search(
    create_train_tensors(),
    epochs=15,
    callbacks=[decay_lr, stop_early],
    validation_data=create_validation_tensors(),
    verbose=1,
    shuffle=False, # Shuffling doesn't work with our prefetching
    class_weight=CLASS_WEIGHTS,
)

Trial 21 Complete [01h 17m 57s]
val_acc: 0.5507139563560486

Best val_acc So Far: 0.5605545043945312
Total elapsed time: 15h 38m 46s

Search: Running Trial #22

Hyperparameter    |Value             |Best Value So Far 
n_layers          |4                 |2                 
block_0_maps      |256               |128               
block_0_regular...|False             |False             
block_0_activation|lrelu             |relu              
block_1_maps      |192               |128               
block_1_regular...|False             |False             
block_1_activation|lrelu             |lrelu             
block_2_maps      |192               |256               
block_2_regular...|False             |True              
block_2_activation|lrelu             |relu              
block_3_maps      |192               |None              
block_3_regular...|True              |None              
block_3_activation|lrelu             |None              
block_4_maps      |192               |Non

In [None]:
from google.colab import files

shutil.make_archive('models/shl-resnet-gridsearch', 'zip', 'models/shl-resnet-gridsearch')
files.download(zip_filename) # Download to control machine