<a href="https://colab.research.google.com/github/PhilippMatthes/diplom/blob/master/src/shl-deep-learning-timeseries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using a deep CNN to directly classify SHL timeseries data

In [1]:
# Free up some disk space on colab
!rm -rf /usr/local/lib/python2.7
!rm -rf /swift
!rm -rf /usr/local/cuda-10.0
!rm -rf /usr/local/cuda-10.1
!rm -rf /usr/local/lib/python3.6/dist-packages/torch
!rm -rf /usr/local/lib/python3.6/dist-packages/pystan
!rm -rf /usr/local/lib/python3.6/dist-packages/spacy
!rm -rf /tensorflow-1.15.2/
!rm -rf /opt/nvidia

In [None]:
# Get needed auxiliary files for colab
!git clone https://github.com/philippmatthes/diplom

In [2]:
# Change into src dir and load our datasets
%cd /content/diplom/src
!mkdir shl-dataset

/content/diplom/src
mkdir: cannot create directory ‘shl-dataset’: File exists


In [None]:
# Download training datasets
!wget -nc -O shl-dataset/challenge-2019-train_torso.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_torso.zip
!wget -nc -O shl-dataset/challenge-2019-train_bag.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_bag.zip
!wget -nc -O shl-dataset/challenge-2019-train_hips.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2019/challenge-2019-train_hips.zip
!wget -nc -O shl-dataset/challenge-2020-train_hand.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2020/challenge-2020-train_hand.zip
# Download validation dataset
!wget -nc -O shl-dataset/challenge-2020-validation.zip http://www.shl-dataset.org/wp-content/uploads/SHLChallenge2020/challenge-2020-validation.zip

In [3]:
# Unzip training datasets
!unzip -n -d shl-dataset/challenge-2019-train_torso shl-dataset/challenge-2019-train_torso.zip
!rm shl-dataset/challenge-2019-train_torso.zip
!unzip -n -d shl-dataset/challenge-2019-train_bag shl-dataset/challenge-2019-train_bag.zip
!rm shl-dataset/challenge-2019-train_bag.zip
!unzip -n -d shl-dataset/challenge-2019-train_hips shl-dataset/challenge-2019-train_hips.zip
!rm shl-dataset/challenge-2019-train_hips.zip
!unzip -n -d shl-dataset/challenge-2020-train_hand shl-dataset/challenge-2020-train_hand.zip
!rm shl-dataset/challenge-2020-train_hand.zip
# Unzip validation dataset
!unzip -n -d shl-dataset/challenge-2020-validation shl-dataset/challenge-2020-validation.zip
!rm shl-dataset/challenge-2020-validation.zip

unzip:  cannot find or open shl-dataset/challenge-2019-train_torso.zip, shl-dataset/challenge-2019-train_torso.zip.zip or shl-dataset/challenge-2019-train_torso.zip.ZIP.
rm: cannot remove 'shl-dataset/challenge-2019-train_torso.zip': No such file or directory
Archive:  shl-dataset/challenge-2019-train_bag.zip
unzip:  cannot find or open shl-dataset/challenge-2019-train_hips.zip, shl-dataset/challenge-2019-train_hips.zip.zip or shl-dataset/challenge-2019-train_hips.zip.ZIP.
rm: cannot remove 'shl-dataset/challenge-2019-train_hips.zip': No such file or directory
Archive:  shl-dataset/challenge-2020-train_hand.zip
   creating: shl-dataset/challenge-2020-train_hand/train/
   creating: shl-dataset/challenge-2020-train_hand/train/Hand/
  inflating: shl-dataset/challenge-2020-train_hand/train/Hand/Acc_x.txt  
  inflating: shl-dataset/challenge-2020-train_hand/train/Hand/Acc_y.txt  
  inflating: shl-dataset/challenge-2020-train_hand/train/Hand/Acc_z.txt  
  inflating: shl-dataset/challenge-202

In [2]:
%cd /content/diplom/src
%tensorflow_version 2.x

/content/diplom/src


In [3]:
# Import garbage collector to save memory here and there
import gc

In [4]:
# Define all datasets to train our model on

from pathlib import Path

TRAIN_DATASET_DIRS = [
    Path('shl-dataset/challenge-2019-train_torso/train/Torso'),
    Path('shl-dataset/challenge-2019-train_bag/train/Bag'),
    Path('shl-dataset/challenge-2019-train_hips/train/Hips'),
    Path('shl-dataset/challenge-2020-train_hand/train/Hand'),
]

VALIDATION_DATASET_DIRS = [
    Path('shl-dataset/challenge-2020-validation/validation/Torso'),         
    Path('shl-dataset/challenge-2020-validation/validation/Bag'),   
    Path('shl-dataset/challenge-2020-validation/validation/Hips'),   
    Path('shl-dataset/challenge-2020-validation/validation/Hand'),                  
]

In [5]:
from tensorflow import keras

# Check that we can use our GPU, to not wait forever during training
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 1467716063153075211]

In [6]:
# Load power transformers for preprocessing

import joblib

from collections import OrderedDict

from tools.dataset import shl_dataset_X_attributes

scalers = OrderedDict({})
for a in shl_dataset_X_attributes:
    scaler = joblib.load(f'models/shl-scalers/{a}.scaler.joblib')
    scaler.copy = False # Save memory
    scalers[a] = scaler

In [7]:
# from sklearn.utils.class_weight import compute_class_weight

# Compute class weights for unbiased training
# labels_reduced = train_dataset.labels[:, 0].astype(np.int) # Only select first idx of each sample
# class_weights = compute_class_weight(
#     'balanced', 
#     classes=np.unique(labels_reduced), 
#     y=labels_reduced
# )
# class_weights = dict(zip(np.unique(labels_reduced), class_weights)) # Keras adaption
# del labels_reduced # Save memory
# gc.collect()
# Fill in NULL class for tf 2.x
# class_weights[0] = 0

# class_weights

In [8]:
# Create our model

from tensorflow.keras import layers

from architectures.resnet import make_resnet
from tools.dataset import shl_dataset_label_order

MODEL_DIR = Path('models/shl-resnet-all-attributes')

model = make_resnet(
    input_shape=[500, len(shl_dataset_X_attributes)], 
    output_classes=len(shl_dataset_label_order)
)

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['acc']
)

In [9]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 500, 19)]    0                                            
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 500, 64)      9792        input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 500, 64)      256         conv1d[0][0]                     
__________________________________________________________________________________________________
activation (Activation)         (None, 500, 64)      0           batch_normalization[0][0]        
______________________________________________________________________________________________

In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm

shl_dataset_X_files = [
    'Acc_x.txt', 'Acc_y.txt', 'Acc_z.txt',
    'Mag_x.txt', 'Mag_y.txt', 'Mag_z.txt',
    'Gyr_x.txt', 'Gyr_y.txt', 'Gyr_z.txt',
    'Gra_x.txt', 'Gra_y.txt', 'Gra_z.txt',
    'LAcc_x.txt', 'LAcc_y.txt', 'LAcc_z.txt',
    'Ori_x.txt', 'Ori_y.txt', 'Ori_z.txt', 'Ori_w.txt',
]

shl_dataset_y_file = 'Label.txt'

class DatasetGenerator(keras.utils.Sequence):
    def __init__(self, dataset_dirs, batch_size=128, prefetch_size=65536, xdtype=np.float32, ydtype=np.int):
        assert batch_size % len(dataset_dirs) == 0

        self.dataset_dirs = dataset_dirs
        self.batch_size = batch_size
        self.xdtype = xdtype
        self.ydtype = ydtype

        # Count samples in datasets
        self.n_samples = 0
        for dataset_dir in dataset_dirs:
            # Every file in the dataset has the same length
            with open(dataset_dir / shl_dataset_y_file) as f:
                for _ in tqdm(f, desc='Counting samples'):
                    self.n_samples += 1
        print(f'Total sample count: {self.n_samples}')

        self.prefetch_size = prefetch_size
        self.prefetched_window_start_idx = 0
        self.prefetched_window_end_idx = 0

    def __len__(self):
        return int(np.floor(self.n_samples / self.batch_size))
    
    def __getitem__(self, batch_idx):
        # Generate samples indexes of the batch
        requested_start_idx = int(batch_idx * self.batch_size / len(self.dataset_dirs))
        requested_end_idx = int((batch_idx + 1) * self.batch_size / len(self.dataset_dirs))

        # If we are outside the prefetched window, prefetch the next one
        if requested_start_idx < self.prefetched_window_start_idx or \
           requested_end_idx > self.prefetched_window_end_idx:
            self.prefetched_window_start_idx = requested_start_idx
            self.prefetched_window_end_idx = requested_end_idx + self.prefetch_size
            csv_skiprows = self.prefetched_window_start_idx
            csv_nrows = self.prefetched_window_end_idx - self.prefetched_window_start_idx

            # Read this part of the dataset
            for dataset_dir in self.dataset_dirs:
                # Load attributes
                self.window_X = None
                for attribute, filename in zip(shl_dataset_X_attributes, shl_dataset_X_files):
                    fpath = dataset_dir / filename
                    df = pd.read_csv(
                        fpath, header=None, sep=' ', dtype=self.xdtype, 
                        skiprows=csv_skiprows, nrows=csv_nrows
                    )
                    track = np.nan_to_num(df.to_numpy())
                    scaled_track = scalers[attribute].transform(track)

                    if self.window_X is None:
                        self.window_X = scaled_track
                    else:
                        self.window_X = np.dstack((self.window_X, scaled_track))
                # Load labels
                fpath = dataset_dir / shl_dataset_y_file
                df = pd.read_csv(
                    fpath, header=None, sep=' ', dtype=self.ydtype, 
                    skiprows=csv_skiprows, nrows=csv_nrows
                )
                track = np.nan_to_num(df.to_numpy())
                self.window_y = track[:, 0] # Only use first index
        scoped_start_idx = requested_start_idx - self.prefetched_window_start_idx
        scoped_end_idx = requested_end_idx - self.prefetched_window_end_idx
        X = self.window_X[scoped_start_idx:scoped_end_idx]
        y = self.window_y[scoped_start_idx:scoped_end_idx]
        return X, y

# Use batch generators to not preprocess the whole dataset at once   
train_generator = DatasetGenerator(TRAIN_DATASET_DIRS)
validation_generator = DatasetGenerator(VALIDATION_DATASET_DIRS)

# Train model
callbacks = [
    keras.callbacks.CSVLogger(f'train.log', append=False),
    keras.callbacks.ModelCheckpoint(
        str(MODEL_DIR), save_best_only=True, monitor='val_loss', verbose=1
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.25, patience=25, min_lr=0.0001, verbose=1
    ),
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=50, verbose=1),
]
model.fit(
    train_generator,
    epochs=200,
    callbacks=callbacks,
    validation_data=validation_generator,
    verbose=1,
    shuffle=False
    # class_weight=class_weights
)

Counting samples: 196072it [00:00, 944959.05it/s]
Counting samples: 196072it [00:00, 903914.11it/s]
Counting samples: 196072it [00:00, 914853.47it/s]
Counting samples: 196072it [00:00, 923459.10it/s]
Counting samples: 28789it [00:00, 861889.22it/s]
Counting samples: 28789it [00:00, 896268.83it/s]
Counting samples: 28789it [00:00, 897474.56it/s]
Counting samples: 28789it [00:00, 918123.89it/s]


Total sample count: 784288
Total sample count: 115156
