In [43]:
import glob
import os
# import librosa
# from librosa import display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
from random import shuffle
%matplotlib inline
plt.style.use('ggplot')
# import pydot
# import graphviz
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 11
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['legend.fontsize'] = 11
plt.rcParams['figure.titlesize'] = 13

from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
import time

dataset_path_prefix = 'dataset'
model_path_prefix = 'trained_models'

# Load Leapmotion dataset

In [42]:
leap_features = np.loadtxt(os.path.join(dataset_path_prefix, 'leap_merge_features.csv'), delimiter=',')
leap_labels = np.array(np.loadtxt(os.path.join(dataset_path_prefix, 'leap_merge_labels.csv'), delimiter=','), dtype=np.int)

In [8]:
X_leap_train, X_leap_test, y_leap_train, y_leap_test = train_test_split(
        leap_features, leap_labels, stratify=leap_labels, train_size=.75, random_state=round(time.time()))

print(X_leap_train.shape)
print(X_leap_test.shape)
print(y_leap_train.shape)
print(y_leap_test.shape)

num_rows, num_cols = 100, 87

X_leap_train = X_leap_train.reshape(X_leap_train.shape[0], num_rows, num_cols)
X_leap_test = X_leap_test.reshape(X_leap_test.shape[0], num_rows, num_cols)

print("After reshaping")
print(X_leap_train.shape)
print(X_leap_test.shape)
print(y_leap_train.shape)
print(y_leap_test.shape)

(887, 8700)
(296, 8700)
(887, 6)
(296, 6)




# Load Voice dataset

In [9]:
features = np.loadtxt('voice_merge_features.csv', delimiter=',')
labels = np.array(np.loadtxt('voice_merge_labels.csv', delimiter=','), dtype=np.int)

In [10]:
X_all = features

from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(sparse=False)
y_all = enc.fit_transform(labels.reshape((-1, 1)))

In [11]:
X_voice_train, X_voice_test, y_voice_train, y_voice_test = train_test_split(
        X_all, y_all, stratify=y_all, train_size=.75, random_state=round(time.time()))

print(X_voice_train.shape)
print(X_voice_test.shape)
print(y_voice_train.shape)
print(y_voice_test.shape)

num_rows, num_cols = 100, 87

X_leap_train = X_leap_train.reshape(X_leap_train.shape[0], num_rows, num_cols)
X_leap_test = X_leap_test.reshape(X_leap_test.shape[0], num_rows, num_cols)

print("After reshaping")
print(X_voice_train.shape)
print(X_voice_test.shape)
print(y_voice_train.shape)
print(y_voice_test.shape)



# Load Video dataset

In [None]:
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, CSVLogger
from models import ResearchModels
from data import DataSet
import time
import os.path

In [15]:
model_name = 'mlp_merge'
saved_model = None  # None or weights file
class_limit = 6  # int, can be 1-101 or None
seq_length = 40
load_to_memory = True  # pre-load the sequences into memory
batch_size = 32
nb_epoch = 10

# Chose images or features and image shape based on network.
if model_name in ['conv_3d', 'c3d', 'lrcn']:
    data_type = 'images'
    image_shape = (80, 80, 3)
elif model_name in ['lstm', 'mlp', 'mlp_merge']:
    data_type = 'features'
    image_shape = None
else:
    raise ValueError("Invalid model. See train.py for options.")

    
# Helper: Save the model.
checkpointer = ModelCheckpoint(
    filepath=os.path.join('data', 'checkpoints', model_name + '-' + data_type + \
        '.{epoch:03d}-{val_loss:.3f}.hdf5'),
    verbose=1,
    save_best_only=True)

# Helper: TensorBoard
tb = TensorBoard(log_dir=os.path.join('data', 'logs', model_name))

# Helper: Stop when we stop learning.
early_stopper = EarlyStopping(patience=5)

# Helper: Save results.
timestamp = time.time()
csv_logger = CSVLogger(os.path.join('data', 'logs', model_name + '-' + 'training-' + \
    str(timestamp) + '.log'))
    

# Get the data and process it.
if image_shape is None:
    data = DataSet(
        seq_length=seq_length,
        class_limit=class_limit
    )
else:
    data = DataSet(
        seq_length=seq_length,
        class_limit=class_limit,
        image_shape=image_shape
    )

# Get samples per epoch.
# Multiply by 0.7 to attempt to guess how much of data.data is the train set.
steps_per_epoch = (len(data.data) * 0.7) // batch_size

if load_to_memory:
    # Get data.
    X_video_train, y_video_train = data.get_all_sequences_in_memory('train', data_type)
    X_video_test, y_video_test = data.get_all_sequences_in_memory('test', data_type)
else:
    # Get generators.
    generator = data.frame_generator(batch_size, 'train', data_type)
    val_generator = data.frame_generator(batch_size, 'test', data_type)    

check the classes order
['right', 'left', 'up', 'down', 'start', 'no']
Loading 984 samples into memory for training.
Loading 389 samples into memory for testing.


In [33]:
print(X_video_train.shape)
print(X_video_test.shape)
print(y_video_train.shape)
print(y_video_test.shape)

(1029, 40, 2048)
(344, 40, 2048)
(1029, 6)
(344, 6)


In [52]:
X_leap_test[0][0]

array([-1.55454561e-01,  9.82281327e-01,  1.04676999e-01, -7.39377439e-01,
       -4.54269983e-02, -6.71756923e-01, -2.26899643e+01,  1.49956421e+02,
        1.20354774e+02, -4.77434736e+01,  9.60827789e+01, -1.71007025e+02,
       -3.05654907e+01,  1.00049690e+02,  1.24440170e+02, -3.05654907e+01,
        1.00049690e+02,  1.24440170e+02, -5.67678566e+01,  1.27429832e+02,
        1.40550293e+02, -6.92803574e+01,  1.51950836e+02,  1.34779343e+02,
       -7.44361649e+01,  1.68014343e+02,  1.25391136e+02, -1.56523962e+01,
        1.10663216e+02,  1.31320755e+02, -3.81087227e+01,  1.66541580e+02,
        1.38856659e+02, -5.16770096e+01,  1.99280258e+02,  1.38388321e+02,
       -6.04333153e+01,  2.17131866e+02,  1.36895081e+02, -6.72023621e+01,
        2.29374603e+02,  1.35174576e+02, -8.63309574e+00,  1.14827522e+02,
        1.25273323e+02, -2.49031429e+01,  1.70015961e+02,  1.26739426e+02,
       -3.76661148e+01,  2.06400635e+02,  1.17025009e+02, -4.74663773e+01,
        2.26412659e+02,  

# Define Leapmotion Model

In [35]:
#LSTM
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense

import numpy as np

data_dim = 87
timesteps = 100
num_classes = 6
leap_batch_size = 55

In [17]:
## Imports
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.utils import print_summary, plot_model
from keras import regularizers
from keras.layers import BatchNormalization
from keras.models import load_model

Using TensorFlow backend.


In [18]:
img_rows, img_cols = 40, 32

X_voice_train = X_voice_train.reshape(X_voice_train.shape[0], img_rows, img_cols, 1)
X_voice_test = X_voice_test.reshape(X_voice_test.shape[0], img_rows, img_cols, 1)

In [92]:
def extract_representation_layer(model_name, trainable=False, new_model=False):
    
    model = load_model(os.path.join(model_path_prefix, model_name))
    if new_model:
        model = Sequential.from_config(model.get_config())
    else:
        model.trainable = trainable
    
    return model, model.layers[-2]

In [101]:
leap_model, leapmotion_representation_output = extract_representation_layer("leap_model.h5", trainable=True, new_model=False)
voice_model, voice_representation_output = extract_representation_layer("voice_model.h5", trainable=True, new_model=False)
video_model, video_representation_output = extract_representation_layer("video_model.h5", trainable=True, new_model=False)
# leap_model = load_model("leap_model.h5")
# voice_model = load_model("voice_model.h5")
# video_model = ResearchModels(len(data.classes), model_name, seq_length, saved_model)

# Merge the models

In [113]:
from keras.layers import Merge, concatenate
from keras.optimizers import RMSprop, Adam
from keras.models import Model

# merge_layer = concatenate([leapmotion_representation_output, voice_representation_output])
# dense_layer = Dense(64, activation='relu', name="merge_dense")(merge_layer)
# dense_layer = Dropout(0.5, name="merge_dropout")(dense_layer)
# output_layer = Dense(num_classes, activation='softmax', name="merge_output")(dense_layer)

# merge_model = Model(inputs=[leap_model.input, voice_model.input], outputs=output_layer)

merge_model = Sequential()
merge_model.add(Merge([video_representation_output, voice_representation_output], mode='concat', concat_axis=-1))

merge_model.add(Dense(128, activation='relu', name="merge_dense_1"))
merge_model.add(Dropout(0.8, name="merge_dropout_1"))
# merge_model.add(Dense(128, activation='relu', name="merge_dense_2"))
# merge_model.add(Dropout(0.5, name="merge_dropout_2"))
merge_model.add(Dense(num_classes, activation='softmax', name="merge_output"))

  del sys.path[0]


In [114]:
optimizer = RMSprop(lr=1e-3)

merge_model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=optimizer,
                  metrics=['accuracy'])

In [None]:
merge_model.fit([X_merge_video_train, X_merge_voice_train],\
          y_merge_train,
          batch_size=32, shuffle=False,
          epochs=10,
          verbose=1,
          validation_data=([X_merge_video_val, X_merge_voice_val], y_merge_val))

Train on 4405 samples, validate on 881 samples
Epoch 1/10

In [103]:
score = merge_model.evaluate([X_merge_leap_test, X_merge_voice_test], y_merge_test, verbose=1)
print()
print('Test loss:', score[0])
print('Test accuracy:', score[1])


Test loss: 5.43571733797
Test accuracy: 0.222804054054


# Prepare the merged dataset

In [39]:
def generate_merge_data(X_voice, y_voice,\
                        X_leap, y_leap,\
                        X_video, y_video,\
                       augment_index=1, ):

    X_merge_voice = []
    X_merge_leap = []
    X_merge_video = []
    y_merge = []

    for i in range(6):

        voice_tmp = list(map(lambda d:d[0], filter(lambda d: np.argmax(d[1])==i, zip(X_voice,y_voice))))
        leap_tmp = list(map(lambda d:d[0], filter(lambda d: np.argmax(d[1])==i, zip(X_leap,y_leap))))
        video_tmp = list(map(lambda d:d[0], filter(lambda d: np.argmax(d[1])==i, zip(X_video,y_video))))
        
        for _ in range(augment_index):
            
            shuffle(voice_tmp)
            shuffle(leap_tmp)
            shuffle(video_tmp)

            for tuple_3 in zip(voice_tmp, leap_tmp, video_tmp):
                X_merge_voice.append(tuple_3[0])
                X_merge_leap.append(tuple_3[1])
                X_merge_video.append(tuple_3[2])

                y_merge.append(i)
                
    ohe = OneHotEncoder(sparse=False)
    
    return np.array(X_merge_voice), np.array(X_merge_leap), np.array(X_merge_video), ohe.fit_transform(np.array(y_merge).reshape(-1, 1))

In [40]:
X_merge_voice_train,\
X_merge_leap_train,\
X_merge_video_train,\
y_merge_train = generate_merge_data(X_voice_train,\
                                    y_voice_train,\
                                    X_leap_train,\
                                    y_leap_train,\
                                    X_voice_train,\
                                    y_voice_train,\
                                    augment_index=5)

X_merge_voice_val,\
X_merge_leap_val,\
X_merge_video_val,\
y_merge_val = generate_merge_data(X_voice_train,\
                                    y_voice_train,\
                                    X_leap_train,\
                                    y_leap_train,\
                                    X_voice_train,\
                                    y_voice_train,\
                                    augment_index=1)

In [41]:
print((X_merge_voice_train.shape))
print((X_merge_leap_train.shape))
print((X_merge_video_train.shape))
print((y_merge_train.shape))

(4435, 40, 32, 1)
(4435, 100, 87)
(4435, 40, 32, 1)
(4435, 6)


In [42]:
print((X_merge_voice_val.shape))
print((X_merge_leap_val.shape))
print((X_merge_video_val.shape))
print((y_merge_val.shape))

(887, 40, 32, 1)
(887, 100, 87)
(887, 40, 32, 1)
(887, 6)


In [43]:
X_merge_voice_test,\
X_merge_leap_test,\
X_merge_video_test,\
y_merge_test = generate_merge_data(X_voice_test,\
                                    y_voice_test,\
                                    X_leap_test,\
                                    y_leap_test,\
                                    X_leap_test,\
                                    y_leap_test,\
                                    augment_index=20)

In [44]:
print((X_merge_voice_test.shape))
print((X_merge_leap_test.shape))
print((X_merge_video_test.shape))
print((y_merge_test.shape))

(5920, 40, 32, 1)
(5920, 100, 87)
(5920, 100, 87)
(5920, 6)
