# Summary
- [Data](#Data)
- [Model](#Model)
- [Training](#Training)
    - [10-fold-cross-validation-(early-stopping)](#10-fold-cross-validation-(early-stopping))
- [Evaluation](#Evaluation)
- [Visualization](#Visualization)
    - [Interpretation](#Interpretation)
- [Implemented-but-not-used](#Implemented-but-not-used)
    - [Debug](#Debug)

# Dependencies


In [4]:

#visualization
import matplotlib.pyplot as plt
#math tools
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import resample
from scipy.signal import decimate
from fastdtw import fastdtw
#machine learning
from nltk.cluster.kmeans import KMeansClusterer

from sklearn.model_selection import StratifiedKFold
#utils
from time import time
import warnings
from os.path import join
from os import listdir
import pickle

#custom
from utils import *
from load_data import *
from training import *

# Utils
Cf `utils.py`

In [2]:

def plot(plot_i,train,valid,test,average=True):
    if average:
        print(results[:80].replace(";","|"))
        if early_stopping:
            title = "average {} over 10 folds over the {} first epochs".format(index2plot[plot_i],shortest_fold)
        else:
            title = "average {} over 10 folds over {} epochs".format(index2plot[plot_i],n_epochs)
    else :
        train,valid,test=np.asarray(train),np.asarray(valid),np.asarray(test)
        title=str((task_i,is_lstm,learning_rate,hidden_size,num_layers,bidirectional,
    dropout,clip,window_size))
    plt.figure()
    plt.title(title)
    plt.plot(train[:,plot_i],label="training")
    plt.plot(valid[:,plot_i],label="validation")
    plt.plot(test[:,plot_i],label="test")
    plt.xlabel("epochs")
    plt.ylabel(index2plot[plot_i])
    plt.legend()

def return_results(train_metrics,valid_metrics,test_metrics,early_stopping,flat_falses):
    train_metrics,valid_metrics,test_metrics=np.asarray(train_metrics),np.asarray(valid_metrics),np.asarray(test_metrics)
    model_name="LSTM" if is_lstm else "GRU"
    task_name=index2task[task_i] if task_i is not None else str(task_i)
    results="{} ; {} ; {} ; {}   ; {} ; {} ; {} ; {} ; {:.2f} (± {:.2f}) ; None ; {} ; TRUE ; TRUE ; {} ; TRUE ; {:.2f} (± {:.2f}) ; {:.2f} (± {:.2f}) ".format(
    task_name,model_name,learning_rate, hidden_size,num_layers,bidirectional,dropout,clip,
    np.mean(early_stopping),np.std(early_stopping),compute_movement,downsampling_factor,
     np.mean(train_metrics[:,1]),np.std(train_metrics[:,1]), np.mean(valid_metrics[:,1]),np.std(valid_metrics[:,1]))

    test_metrics=test_metrics.T
    for metric in test_metrics[1:]:#don't care about the loss
        mean,std=np.mean(metric),np.std(metric)
        results+="; {:.2f} (± {:.2f}) ".format(mean,std)
    results+=" ; "
    results+=" ; ".join(map(str, flat_falses))
    return results

# Data
Cf `load_data.py`
## Loading

## Task selection
set `task_i` to None if you want to train the model on all tasks at once (i.e. early fusion)  
Else set `task_i` to the desired task index (cf. task2index)

## Compute movement
Transforms data as Zhang et al. (cf Report #5)

## Scale then downsample (or not) then concatenate task id (or not)
Set `downsampling_factor` to `1` if you don't want to downsample
## Split in subsequence (or not)
Set `window_size` to `None` if you don't want to split data into subsequence of fixed length  
Set `paper_air_split` to `False` if you don't want to split data into strokes

In [3]:
## Loading
#Cf `load_data.py`
task_i=task2index["spiral"]
compute_movement=False
downsampling_factor=1
window_size=None
paper_air_split=False
try:
    assert window_size is None or not paper_air_split
except AssertionError:
    print("you have to choose between subsequences of fixed length and strokes !")
else:
    print("\nloading and massaging data, this might take a few seconds...")
    data_gen=load_data()
    data,targets=[],[]
    for subject,label in data_gen:
        data.append(subject)
        targets.append(label)
    print("(75-3 subjects, 8 tasks, X timesteps, 7 measures)")
    print(len(data),len(data[0]),len(data[0][0]),len(data[0][0][0]))
    data, targets= massage_data(data, targets,task_i, compute_movement, downsampling_factor, window_size,paper_air_split)


loading and massaging data, this might take a few seconds...
(75-3 subjects, 8 tasks, X timesteps, 7 measures)
72 8 1772 7

task index, name
0 spiral
len(data), len(targets), len(data[0]) :
72 72 1772

movement was not computed (i.e. data was not transformed)

scaling 
len(data), len(targets), len(data[0]) :
72 72 1772
the task is represented as one single sequence  (i.e. data was not transformed)


#  Training

In [8]:
max_len=3117
for i,task in enumerate(data):
    
    if len(task) > max_len:
        data[i]=task[:max_len]
    else:
        data[i]=np.concatenate((task,np.zeros(shape=(max_len-len(task),7))))

In [10]:
NUM_CLUSTERS = 2

kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=fastdtw, repeats=25)


In [11]:
assigned_clusters = kclusterer.cluster(data, assign_clusters=True)

TypeError: unsupported operand type(s) for +=: 'float' and 'tuple'