<a href="https://colab.research.google.com/github/PhilippMatthes/diplom/blob/master/src/shl-deep-learning-timeseries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using a deep CNN to directly classify SHL timeseries data

In [1]:
# Free up some disk space on colab
!rm -rf /usr/local/lib/python2.7
!rm -rf /swift
!rm -rf /usr/local/lib/python3.6/dist-packages/torch
!rm -rf /usr/local/lib/python3.6/dist-packages/pystan
!rm -rf /usr/local/lib/python3.6/dist-packages/spacy
!rm -rf /tensorflow-1.15.2/

In [2]:
# Get needed auxiliary files for colab: clone the project repository into
# ./diplom (a later cell changes into /content/diplom/src)
!git clone https://github.com/philippmatthes/diplom

Cloning into 'diplom'...
remote: Enumerating objects: 1583, done.
remote: Counting objects: 100% (920/920), done.
remote: Compressing objects: 100% (612/612), done.
remote: Total 1583 (delta 438), reused 728 (delta 268), pack-reused 663
Receiving objects: 100% (1583/1583), 34.10 MiB | 23.05 MiB/s, done.
Resolving deltas: 100% (814/814), done.


In [3]:
# Change into src dir and load our datasets
%cd /content/diplom/src
!mkdir shl-dataset

/content/diplom/src


In [4]:
# Download training datasets (one archive each for Torso, Bag, Hips and Hand)
# -O sets the local file name, -nc skips the download if that file already exists
# NOTE(review): the unzip cell deletes the archives after extracting them, so
# re-running this cell afterwards downloads everything again
!wget -nc -O shl-dataset/challenge-2019-train_torso.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_torso.zip
!wget -nc -O shl-dataset/challenge-2019-train_bag.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_bag.zip
!wget -nc -O shl-dataset/challenge-2019-train_hips.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_hips.zip
!wget -nc -O shl-dataset/challenge-2020-train_hand.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2020/challenge-2020-train_hand.zip
# Download validation dataset (a single archive; the validation directories
# configured later all live inside it)
!wget -nc -O shl-dataset/challenge-2020-validation.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2020/challenge-2020-validation.zip

--2021-08-12 07:15:10--  http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_torso.zip
Resolving www.shl-dataset.org (www.shl-dataset.org)... 37.187.125.22
Connecting to www.shl-dataset.org (www.shl-dataset.org)|37.187.125.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5852446972 (5.5G) [application/zip]
Saving to: ‘shl-dataset/challenge-2019-train_torso.zip’


2021-08-12 07:23:31 (11.1 MB/s) - ‘shl-dataset/challenge-2019-train_torso.zip’ saved [5852446972/5852446972]

--2021-08-12 07:23:31--  http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_bag.zip
Resolving www.shl-dataset.org (www.shl-dataset.org)... 37.187.125.22
Connecting to www.shl-dataset.org (www.shl-dataset.org)|37.187.125.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5628524721 (5.2G) [application/zip]
Saving to: ‘shl-dataset/challenge-2019-train_bag.zip’


2021-08-12 07:31:33 (11.1 MB/s) - ‘shl-datas

In [5]:
# Unzip training datasets
!unzip -n -d shl-dataset/challenge-2019-train_torso shl-dataset/challenge-2019-train_torso.zip
!rm shl-dataset/challenge-2019-train_torso.zip
!unzip -n -d shl-dataset/challenge-2019-train_bag shl-dataset/challenge-2019-train_bag.zip
!rm shl-dataset/challenge-2019-train_bag.zip
!unzip -n -d shl-dataset/challenge-2019-train_hips shl-dataset/challenge-2019-train_hips.zip
!rm shl-dataset/challenge-2019-train_hips.zip
!unzip -n -d shl-dataset/challenge-2020-train_hand shl-dataset/challenge-2020-train_hand.zip
!rm shl-dataset/challenge-2020-train_hand.zip
# Unzip validation dataset
!unzip -n -d shl-dataset/challenge-2020-validation shl-dataset/challenge-2020-validation.zip
!rm shl-dataset/challenge-2020-validation.zip

Archive:  shl-dataset/challenge-2019-train_torso.zip
   creating: shl-dataset/challenge-2019-train_torso/train/Torso/
  inflating: shl-dataset/challenge-2019-train_torso/train/Torso/Acc_x.txt  
  inflating: shl-dataset/challenge-2019-train_torso/train/Torso/Acc_y.txt  
  inflating: shl-dataset/challenge-2019-train_torso/train/Torso/Acc_z.txt  
  inflating: shl-dataset/challenge-2019-train_torso/train/Torso/Gra_x.txt  
  inflating: shl-dataset/challenge-2019-train_torso/train/Torso/Gra_y.txt  
  inflating: shl-dataset/challenge-2019-train_torso/train/Torso/Gra_z.txt  
  inflating: shl-dataset/challenge-2019-train_torso/train/Torso/Gyr_x.txt  
  inflating: shl-dataset/challenge-2019-train_torso/train/Torso/Gyr_y.txt  
  inflating: shl-dataset/challenge-2019-train_torso/train/Torso/Gyr_z.txt  
  inflating: shl-dataset/challenge-2019-train_torso/train/Torso/Label.txt  
  inflating: shl-dataset/challenge-2019-train_torso/train/Torso/LAcc_x.txt  
  inflating: shl-dataset/challenge-2019-train

In [1]:
# NOTE(review): execution counts restart at 1 here, so this cell presumably
# begins a fresh kernel session - confirm before relying on earlier state.
# All relative paths below are resolved against the src dir of the repository
%cd /content/diplom/src
# Colab magic that selects TensorFlow 2.x for this runtime
%tensorflow_version 2.x

/content/diplom/src


In [2]:
# Import garbage collector to save memory here and there
import gc

In [3]:
# Define all datasets to train our model on

from pathlib import Path

# Directories of the extracted training datasets
TRAIN_DATASET_DIRS = [
    Path('shl-dataset/challenge-2019-train_torso/train/Torso'),
    Path('shl-dataset/challenge-2019-train_bag/train/Bag'),
    Path('shl-dataset/challenge-2019-train_hips/train/Hips'),
    Path('shl-dataset/challenge-2020-train_hand/train/Hand'),
]

# Directories of the extracted validation datasets
VALIDATION_DATASET_DIRS = [
    Path('shl-dataset/challenge-2020-validation/validation/Torso'),
    Path('shl-dataset/challenge-2020-validation/validation/Bag'),
    Path('shl-dataset/challenge-2020-validation/validation/Hips'),
    Path('shl-dataset/challenge-2020-validation/validation/Hand'),
]

# Class names; the length of this list is the number of model outputs.
# Presumably the list index equals the integer label stored in Label.txt
# - TODO confirm
shl_dataset_label_order = [
    'Null',
    'Still',
    'Walking',
    'Run',
    'Bike',
    'Car',
    'Bus',
    'Train',
    'Subway',
]

# Names of the sensor attributes; the length of this list is the number of
# input channels of the model and each name selects one scaler file
shl_dataset_X_attributes = [
    'acc_x', 'acc_y', 'acc_z',
    'mag_x', 'mag_y', 'mag_z',
    'gyr_x', 'gyr_y', 'gyr_z',
    'gra_x', 'gra_y', 'gra_z',
    'lacc_x', 'lacc_y', 'lacc_z',
    'ori_x', 'ori_y', 'ori_z', 'ori_w',
]

# First dimension of the model input, i.e. the number of values per sample
sample_length = 500

# One text file per sensor attribute, in the same order as the attributes
X_files = [
    'Acc_x.txt', 'Acc_y.txt', 'Acc_z.txt',
    'Mag_x.txt', 'Mag_y.txt', 'Mag_z.txt',
    'Gyr_x.txt', 'Gyr_y.txt', 'Gyr_z.txt',
    'Gra_x.txt', 'Gra_y.txt', 'Gra_z.txt',
    'LAcc_x.txt', 'LAcc_y.txt', 'LAcc_z.txt',
    'Ori_x.txt', 'Ori_y.txt', 'Ori_z.txt', 'Ori_w.txt',
]

# Derived from shl_dataset_X_attributes instead of being typed out a second
# time, so that the two lists can never drift apart
X_attributes = list(shl_dataset_X_attributes)

# Files and attributes are paired up by position when the data is read, so a
# missing or swapped entry would silently scale a file with the wrong scaler
assert [filename.lower() for filename in X_files] == [
    f'{attribute}.txt' for attribute in X_attributes
], 'X_files and X_attributes must list the same attributes in the same order'

y_file = 'Label.txt'

y_attribute = 'labels'

# Results from data analysis
# Passed to model.fit as class_weight; class 0 has weight 0.0 and therefore
# does not contribute to the training loss
class_weights = {
    0: 0.0,
    1: 1.0021671573438011,
    2: 0.9985739895697523,
    3: 2.8994439843842423,
    4: 1.044135815617944,
    5: 0.7723505499007343,
    6: 0.8652474758172704,
    7: 0.7842127155793044,
    8: 1.0283208861290594
}

In [4]:
from tensorflow import keras

# Check that we can use our GPU, to not wait forever during training:
# the listing displayed below should contain an entry with device_type "GPU"
# NOTE(review): tensorflow.python.* is not part of the public TensorFlow API;
# tf.config.list_physical_devices('GPU') is the public way to ask - consider switching
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 3496327479195429570, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 16183459840
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 8676984723890448601
 physical_device_desc: "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0"]

In [5]:
# Load power transformers for preprocessing

import joblib

from collections import OrderedDict

# One fitted scaler per sensor attribute, keyed by the attribute name and
# stored in the order of shl_dataset_X_attributes.
# NOTE(review): joblib.load unpickles the file and can therefore execute
# arbitrary code - only load scaler files from a trusted source.
scalers = OrderedDict()
for attribute_name in shl_dataset_X_attributes:
    scaler = joblib.load(f'models/shl-scalers/{attribute_name}.scaler.joblib')
    # Save memory (presumably lets the scaler work on its input in place
    # instead of on a copy - TODO confirm against the scaler's documentation)
    scaler.copy = False
    scalers[attribute_name] = scaler

In [8]:
# Create our model
from architectures.resnet import make_resnet

# Location handed to the ModelCheckpoint callback during training
MODEL_DIR = Path('models/shl-resnet-all-timeseries')

# Input: sample_length values for each of the sensor attributes,
# output size: one entry per label in shl_dataset_label_order
n_input_channels = len(shl_dataset_X_attributes)
n_output_classes = len(shl_dataset_label_order)
model = make_resnet((sample_length, n_input_channels), n_output_classes)

# Sparse loss, because the targets are integer labels and not one-hot vectors
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm

class SHLDatasetGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that streams shuffled SHL batches from text files.

    Every dataset directory has to contain the files listed in the
    module-level ``X_files`` as well as ``y_file``. All files are read as
    space-separated tables without header, in chunks of
    ``prefetch_size_per_dataset`` rows; every row is treated as one sample.
    The chunks of all datasets are scaled with the module-level ``scalers``,
    concatenated, shuffled and cut into batches served by ``__getitem__``.

    Batches have to be requested in ascending order (hence ``shuffle=False``
    in ``model.fit``): whenever a requested step lies outside the currently
    prefetched range, simply the *next* chunk of every file is read.

    Args:
        dataset_dirs: Directories (``pathlib.Path``) of the datasets to combine.
        batch_size: Number of samples per batch.
            ``prefetch_size_per_dataset * len(dataset_dirs)`` should be a
            multiple of it, otherwise a prefetched chunk cannot be cut into
            batches of exactly this size.
        prefetch_size_per_dataset: Rows read per dataset for one prefetch.
        xdtype: dtype used to parse the sensor files.
        ydtype: dtype used to parse the label file.
    """

    def __init__(
        self, 
        dataset_dirs, 
        batch_size=pow(2, 5),
        prefetch_size_per_dataset=pow(2, 14), 
        xdtype=np.float16, 
        # The builtin int is what the removed alias np.int used to point to
        ydtype=int
    ):
        self.dataset_dirs = dataset_dirs
        self.batch_size = batch_size

        self.prefetch_size_per_dataset = prefetch_size_per_dataset
        self.prefetched_X_batches = None
        self.prefetched_y_batches = None
        self.prefetched_base_step_idx = None

        self.xdtype = xdtype
        self.ydtype = ydtype

        # No readers are open yet; _setup_new_chunked_readers replaces these
        self.X_attr_readers_for_datasets = [] # (dim datasets x readers)
        self.y_attr_reader_for_datasets = [] # (dim datasets)

        # Count samples in datasets
        self.dataset_samples = [] # (dim datasets)
        for dataset_dir in dataset_dirs:
            # Every file in the dataset has the same length, use the labels file
            samples = 0
            with open(dataset_dir / y_file) as f:
                for _ in tqdm(f, desc=f'Counting samples in {dataset_dir}'):
                    samples += 1
            assert samples > prefetch_size_per_dataset
            self.dataset_samples.append(samples)

        self._setup_new_chunked_readers()

    def __len__(self):
        """Return the number of batches per epoch.

        Only complete prefetch chunks are used, so the sample count is
        truncated to a multiple of the combined prefetch size.
        """
        # Datasets should be of equal length, but in case some dataset
        # is shorter than the others, we need to truncate our samples
        max_n_samples = min(self.dataset_samples) * len(self.dataset_dirs)
        # Datasets need to be truncated by the prefetch size
        padding = self.prefetch_size_per_dataset * len(self.dataset_dirs)
        max_n_samples = max_n_samples - (max_n_samples % padding)
        return max_n_samples // self.batch_size

    def _close_chunked_readers(self):
        """Close the file handles held by the current chunked readers."""
        for X_readers in self.X_attr_readers_for_datasets:
            for X_reader in X_readers:
                X_reader.close()
        for y_reader in self.y_attr_reader_for_datasets:
            y_reader.close()

    def _setup_new_chunked_readers(self):
        """Open fresh chunked readers positioned at the start of every file."""
        # Throw away potentially existing readers, closing their files first
        # instead of leaving that to the garbage collector
        self._close_chunked_readers()
        self.X_attr_readers_for_datasets = [] # (dim datasets x readers)
        self.y_attr_reader_for_datasets = [] # (dim datasets)

        # Initialize new chunked csv readers
        read_csv_kwargs = { 'sep': ' ', 'header': None, 'chunksize': self.prefetch_size_per_dataset }
        for dirname in self.dataset_dirs:
            X_readers = [
                pd.read_csv(dirname / filename, dtype=self.xdtype, **read_csv_kwargs)
                for filename in X_files
            ]
            self.X_attr_readers_for_datasets.append(X_readers)
            y_reader = pd.read_csv(dirname / y_file, dtype=self.ydtype, **read_csv_kwargs)
            self.y_attr_reader_for_datasets.append(y_reader)

    def on_epoch_end(self):
        """Rewind all files so that the next epoch starts at the first chunk."""
        self._setup_new_chunked_readers()

    def _prefetch_batches(self):
        """Read the next chunk of every dataset and cut it into shuffled batches.

        Fills ``prefetched_X_batches`` and ``prefetched_y_batches`` with lists
        of equally long arrays; entry ``i`` of both lists belongs together.
        """
        X_per_dataset = [] # each of dim (None x sample_length x n_X_attributes)
        for X_attr_readers in self.X_attr_readers_for_datasets:
            X_attr_tracks = [] # each of dim (None x sample_length)
            for X_attribute, X_attr_reader in zip(X_attributes, X_attr_readers):
                X_attr_track = next(X_attr_reader)
                X_attr_track = np.nan_to_num(X_attr_track.to_numpy())
                X_attr_track = scalers[X_attribute].transform(X_attr_track)
                X_attr_tracks.append(X_attr_track)
            # Stack all attributes along a new last axis in a single step
            # instead of copying the growing array once per attribute
            X_per_dataset.append(np.dstack(X_attr_tracks))
        X_combined = np.concatenate(X_per_dataset, axis=0)

        y_per_dataset = [] # each of dim (None,)
        for y_attr_reader in self.y_attr_reader_for_datasets:
            y_attr_track = next(y_attr_reader) # dim (None, sample_length)
            y_attr_track = np.nan_to_num(y_attr_track.to_numpy()) # dim (None, sample_length)
            # The label in the first column is used as label of the whole sample
            y_per_dataset.append(y_attr_track[:, 0]) # dim (None,)
        y_combined = np.concatenate(y_per_dataset, axis=0)

        # Shuffle data points, using the same permutation for X and y
        assert len(X_combined) == len(y_combined)
        p = np.random.permutation(len(y_combined))
        X_combined = X_combined[p]
        y_combined = y_combined[p]

        # Pack the prefetched data into batches
        n_batches = len(y_combined) // self.batch_size
        self.prefetched_X_batches = np.split(X_combined, n_batches, axis=0)
        self.prefetched_y_batches = np.split(y_combined, n_batches, axis=0)

    def __getitem__(self, step_idx):
        """Return the batch ``(X, y)`` for the given step of the epoch.

        If the step is not covered by the currently prefetched batches, the
        next chunk is read and becomes the new prefetch window starting at
        ``step_idx``.
        """
        if self.prefetched_y_batches is None:
            is_prefetched = False
        else:
            first_prefetched_idx = self.prefetched_base_step_idx
            last_prefetched_idx = first_prefetched_idx + len(self.prefetched_y_batches) - 1
            is_prefetched = first_prefetched_idx <= step_idx <= last_prefetched_idx

        if not is_prefetched:
            self._prefetch_batches()
            self.prefetched_base_step_idx = step_idx

        scoped_idx = step_idx - self.prefetched_base_step_idx
        X = self.prefetched_X_batches[scoped_idx]
        y = self.prefetched_y_batches[scoped_idx]

        # Inputs and their labels - returning X twice would hand the model
        # its own inputs as training targets
        return X, y

# Stream batches from disk through generators, so that the whole dataset
# never has to be preprocessed at once
train_generator = SHLDatasetGenerator(
    TRAIN_DATASET_DIRS,
    prefetch_size_per_dataset=2 ** 16,
)
validation_generator = SHLDatasetGenerator(VALIDATION_DATASET_DIRS)

# Training callbacks; checkpointing, learning rate schedule and early stopping
# all watch the validation loss
callbacks = [
    # Write the metrics of every epoch to train.log (overwritten on each run)
    keras.callbacks.CSVLogger('train.log', append=False),
    # Keep only the model with the best validation loss so far
    keras.callbacks.ModelCheckpoint(
        str(MODEL_DIR),
        monitor='val_loss',
        save_best_only=True,
        verbose=1,
    ),
    # Multiply the learning rate by 0.25 after 25 epochs without improvement
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.25,
        patience=25,
        min_lr=0.0001,
        verbose=1,
    ),
    # Give up after 50 epochs without improvement
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=50,
        verbose=1,
    ),
]

# Train model
model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=200,
    class_weight=class_weights,
    callbacks=callbacks,
    shuffle=False, # Shuffling doesn't work with our prefetching
    verbose=1,
)