In [45]:
import glob
import os
# import librosa
# from librosa import display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
from random import shuffle
%matplotlib inline
plt.style.use('ggplot')
# import pydot
# import graphviz
# Notebook-wide matplotlib typography: serif "Ubuntu" fonts plus explicit
# sizes/weights for body text, axis labels, titles, ticks and the legend.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 12,
    'axes.labelsize': 11,
    'axes.labelweight': 'bold',
    'axes.titlesize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 11,
    'figure.titlesize': 13,
})
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
import time

# Directory that holds the merged feature/label CSV files read by the loading cells below.
dataset_path_prefix = 'dataset'
# Directory from which extract_representation_layer() loads the saved Keras models.
model_path_prefix = 'trained_models'

# Load Leap Motion dataset

In [4]:
# Leap Motion features and labels, read from comma-separated files in the dataset directory.
leap_features = np.loadtxt(os.path.join(dataset_path_prefix, 'leap_merge_features.csv'), delimiter=',')
# loadtxt parses the labels as floats; they are then cast to integers.
# The builtin `int` is used instead of the `np.int` alias, which was deprecated
# in NumPy 1.20 and removed in 1.24 (it was only ever an alias for the builtin).
leap_labels = np.array(np.loadtxt(os.path.join(dataset_path_prefix, 'leap_merge_labels.csv'), delimiter=','), dtype=int)

In [5]:
# Stratified 75/25 train/test split of the Leap Motion data.
# The seed comes from the wall clock, so the partition differs on every run.
X_leap_train, X_leap_test, y_leap_train, y_leap_test = train_test_split(
    leap_features,
    leap_labels,
    train_size=.75,
    stratify=leap_labels,
    random_state=round(time.time()),
)

for split_array in (X_leap_train, X_leap_test, y_leap_train, y_leap_test):
    print(split_array.shape)

# Every flat feature vector is unfolded into a (num_rows, num_cols) matrix.
num_rows, num_cols = 100, 87

X_leap_train = X_leap_train.reshape((len(X_leap_train), num_rows, num_cols))
X_leap_test = X_leap_test.reshape((len(X_leap_test), num_rows, num_cols))

print("After reshaping")
for split_array in (X_leap_train, X_leap_test, y_leap_train, y_leap_test):
    print(split_array.shape)



(887, 8700)
(296, 8700)
(887, 6)
(296, 6)
After reshaping
(887, 100, 87)
(296, 100, 87)
(887, 6)
(296, 6)


# Load Voice dataset

In [6]:
# Voice features and labels, read from comma-separated files in the dataset directory.
features = np.loadtxt(os.path.join(dataset_path_prefix, 'voice_merge_features.csv'), delimiter=',')
# loadtxt parses the labels as floats; they are then cast to integers.
# The builtin `int` is used instead of the `np.int` alias, which was deprecated
# in NumPy 1.20 and removed in 1.24 (it was only ever an alias for the builtin).
labels = np.array(np.loadtxt(os.path.join(dataset_path_prefix, 'voice_merge_labels.csv'), delimiter=','), dtype=int)

In [7]:
# Keep a stratified 10% subsample of the voice data; the other 90% returned by
# train_test_split is bound to `_` and not used.
# The seed comes from the wall clock, so the subsample differs on every run.
X_all, _, y_all, _ = train_test_split(
    features,
    labels,
    train_size=.1,
    stratify=labels,
    random_state=round(time.time()),
)



In [8]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the voice labels into a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so the new name is tried first and the old one is
# used as a fallback on releases that do not know it yet.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)
# The labels are reshaped into a single column because the encoder expects 2-D input.
y_all = enc.fit_transform(y_all.reshape((-1, 1)))

In [9]:
# Stratified 80/20 train/test split of the subsampled voice data.
# The seed comes from the wall clock, so the partition differs on every run.
X_voice_train, X_voice_test, y_voice_train, y_voice_test = train_test_split(
    X_all,
    y_all,
    train_size=.8,
    stratify=y_all,
    random_state=round(time.time()),
)

# A further train/validation split is currently disabled:
# X_voice_train, X_voice_val, y_voice_train, y_voice_val = train_test_split(
#         X_voice_train, y_voice_train, stratify=y_voice_train, train_size=.8, random_state=round(time.time()))

for split_array in (X_voice_train, X_voice_test, y_voice_train, y_voice_test):
    print(split_array.shape)
# print(y_voice_val.shape)
# num_rows, num_cols = 40, 32

# X_leap_train = X_leap_train.reshape(X_leap_train.shape[0], num_rows, num_cols)
# X_leap_test = X_leap_test.reshape(X_leap_test.shape[0], num_rows, num_cols)

# print("After reshaping")
# print(X_voice_train.shape)
# print(X_voice_test.shape)
# print(y_voice_train.shape)
# print(y_voice_test.shape)

(1133, 1280)
(284, 1280)
(1133, 6)
(284, 6)




# Load Video dataset

In [10]:
# Video features and labels, read from comma-separated files in the dataset directory.
video_features = np.loadtxt(os.path.join(dataset_path_prefix, 'video_merge_features.csv'), delimiter=',')
# loadtxt parses the labels as floats; they are then cast to integers.
# The builtin `int` is used instead of the `np.int` alias, which was deprecated
# in NumPy 1.20 and removed in 1.24 (it was only ever an alias for the builtin).
video_labels = np.array(np.loadtxt(os.path.join(dataset_path_prefix, 'video_merge_labels.csv'), delimiter=','), dtype=int)

In [11]:
# Show the dimensions of the video feature matrix and of its label matrix.
for video_array in (video_features, video_labels):
    print(video_array.shape)

(1373, 81920)
(1373, 6)


In [12]:
# Stratified 80/20 train/test split of the video data.
# The seed comes from the wall clock, so the partition differs on every run.
X_video_train, X_video_test, y_video_train, y_video_test = train_test_split(
    video_features,
    video_labels,
    train_size=.8,
    stratify=video_labels,
    random_state=round(time.time()),
)

# A further train/validation split is currently disabled:
# X_video_train, X_video_val, y_video_train, y_video_val = train_test_split(
#         X_video_train, y_video_train, stratify=y_video_train, train_size=.9, random_state=round(time.time()))

for split_array in (X_video_train, X_video_test, y_video_train, y_video_test):
    print(split_array.shape)
# print(y_video_val.shape)



(1098, 81920)
(275, 81920)
(1098, 6)
(275, 6)


In [13]:
# Drop the name bound to the full video feature matrix; the train/test
# splits taken from it above are what the rest of the notebook uses.
del video_features

# Define Leap Motion Model

In [32]:
#LSTM
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense

import numpy as np

# Same value as `num_cols` used when the Leap Motion features were reshaped.
data_dim = 87
# Same value as `num_rows` used when the Leap Motion features were reshaped.
timesteps = 100
# Number of units of the softmax output layer of the merged model.
num_classes = 6
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
leap_batch_size = 55

In [26]:
## Imports
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.utils import print_summary, plot_model
from keras import regularizers
from keras.layers import BatchNormalization
from keras.models import load_model

In [27]:
def extract_representation_layer(model_name, trainable=False, new_model=False):
    """Load a saved Keras model and return it with its second-to-last layer.

    Args:
        model_name: File name of the saved model, resolved relative to
            ``model_path_prefix``.
        trainable: Value assigned to the loaded model's ``trainable``
            attribute. Only applied when ``new_model`` is false.
        new_model: When true, the loaded model is used only as a template:
            a new ``Sequential`` is built from its configuration and returned
            instead. Nothing in this function copies weights into that model,
            and ``trainable`` is not applied to it.

    Returns:
        A ``(model, layer)`` tuple where ``layer`` is ``model.layers[-2]``.
    """
    saved_model = load_model(os.path.join(model_path_prefix, model_name))

    if not new_model:
        saved_model.trainable = trainable
        return saved_model, saved_model.layers[-2]

    rebuilt_model = Sequential.from_config(saved_model.get_config())
    return rebuilt_model, rebuilt_model.layers[-2]

In [28]:
# Load the three saved single-modality networks, all marked trainable,
# together with the second-to-last layer of each one.
leap_model, leapmotion_representation_output = extract_representation_layer(
    "leap_model.h5", trainable=True, new_model=False)

voice_model, voice_representation_output = extract_representation_layer(
    "voice_model.h5", trainable=True, new_model=False)

video_model, video_representation_output = extract_representation_layer(
    "video_model.h5", trainable=True, new_model=False)
# leap_model = load_model("leap_model.h5")
# voice_model = load_model("voice_model.h5")
# video_model = ResearchModels(len(data.classes), model_name, seq_length, saved_model)

In [29]:
# Evaluate the loaded voice network on the voice test split and report
# the two values it returns (loss first, then accuracy).
score = voice_model.evaluate(
    X_voice_test,
    y_voice_test,
    verbose=1,
)
print()
for caption, value in (('Test loss:', score[0]), ('Test accuracy:', score[1])):
    print(caption, value)


Test loss: 0.463952642931
Test accuracy: 0.855633802817


# Merge the models

In [33]:
from keras.layers import Merge, concatenate
from keras.optimizers import RMSprop, Adam
from keras.models import Model

In [34]:
# merge_layer = concatenate([leapmotion_representation_output, voice_representation_output])
# dense_layer = Dense(64, activation='relu', name="merge_dense")(merge_layer)
# dense_layer = Dropout(0.5, name="merge_dropout")(dense_layer)
# output_layer = Dense(num_classes, activation='softmax', name="merge_output")(dense_layer)

# merge_model = Model(inputs=[leap_model.input, voice_model.input], outputs=output_layer)

# Fusion network: the three representation layers are concatenated along the
# last axis, followed by two Dense(256, relu) + Dropout(0.5) stages and a
# softmax classifier with `num_classes` outputs.
# The branch order here is video, Leap Motion, voice; the input lists passed to
# fit()/evaluate() later in the notebook use the same order.
# NOTE(review): `Merge` is the legacy Keras merge layer, and the output recorded
# beneath this cell looks like a truncated warning -- presumably a deprecation
# notice; confirm against the installed Keras version.
merge_model = Sequential()
merge_model.add(Merge([video_representation_output, leapmotion_representation_output, voice_representation_output], mode='concat', concat_axis=-1))

merge_model.add(Dense(256, activation='relu', name="merge_dense_1"))
merge_model.add(Dropout(0.5, name="merge_dropout_1"))
merge_model.add(Dense(256, activation='relu', name="merge_dense_2"))
merge_model.add(Dropout(0.5, name="merge_dropout_2"))
merge_model.add(Dense(num_classes, activation='softmax', name="merge_output"))

  if __name__ == '__main__':


In [35]:
# Compile the fusion network: RMSprop with a learning rate of 5e-5,
# categorical cross-entropy as the loss and accuracy as the reported metric.
optimizer = RMSprop(lr=5e-5)

merge_model.compile(
    optimizer=optimizer,
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

In [40]:
# Train the fusion network. The input arrays are listed in the same order as
# the branches handed to the Merge layer: video, Leap Motion, voice.
#
# shuffle=True: generate_merge_data() appends its samples class by class
# (everything for class 0, then class 1, ...). With shuffle=False almost every
# 64-sample batch would therefore hold a single class, so the samples are
# reshuffled before each epoch instead.
#
# NOTE(review): the validation arrays are generated from the *training* splits
# (see the cell that builds X_merge_*_val), so the validation figures reported
# here are not a held-out estimate.
merge_model.fit([X_merge_video_train, X_merge_leap_train, X_merge_voice_train],
          y_merge_train,
          batch_size=64, shuffle=True,
          epochs=5,
          verbose=1,
          validation_data=([X_merge_video_val, X_merge_leap_val, X_merge_voice_val], y_merge_val))

Train on 4435 samples, validate on 887 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7f944bacf8>

In [43]:
# Evaluate the fusion network on the merged test set (inputs ordered video,
# Leap Motion, voice) and report the two values it returns.
score = merge_model.evaluate(
    [X_merge_video_test, X_merge_leap_test, X_merge_voice_test],
    y_merge_test,
    verbose=1,
)
print()
for caption, value in (('Test loss:', score[0]), ('Test accuracy:', score[1])):
    print(caption, value)


Test loss: 0.188800419798
Test accuracy: 0.964285714475


In [23]:
# Reload the fusion model from "merge_model.h5" and rebind `merge_model` to it.
# NOTE(review): the output recorded for this cell ends with
# "RuntimeError: The layer has never been called and thus has no defined output shape.",
# and the path is not built from model_path_prefix like the other load_model
# call in this notebook -- confirm both before relying on this cell.
merge_model = load_model("merge_model.h5")

  return cls(**config)


RuntimeError: The layer has never been called and thus has no defined output shape.

# Prepare the merged dataset

In [14]:
def _group_samples_by_class(X, y, num_classes):
    """Bucket the samples of ``X`` by the argmax of the matching row of ``y``."""
    buckets = [[] for _ in range(num_classes)]
    for sample, label_row in zip(X, y):
        label = int(np.argmax(label_row))
        # Samples whose class index lies outside the requested range are skipped.
        if label < num_classes:
            buckets[label].append(sample)
    return buckets


def generate_merge_data(X_voice, y_voice,
                        X_leap, y_leap,
                        X_video, y_video,
                        augment_index=1, num_classes=6):
    """Build a multi-modal dataset by pairing same-class samples at random.

    For every class, the voice, Leap Motion and video samples of that class
    are shuffled independently and zipped into (voice, leap, video) triples.
    This is repeated ``augment_index`` times, so each round yields a different
    random pairing. One round contributes as many triples per class as the
    smallest of the three modalities has samples of that class.

    The output is grouped by class (all triples of class 0 first, then
    class 1, ...); callers that need a mixed order must shuffle it themselves.

    Args:
        X_voice, X_leap, X_video: Per-modality samples, iterable row by row.
        y_voice, y_leap, y_video: Matching one-hot label rows; the class of a
            sample is the argmax of its row.
        augment_index: Number of shuffle-and-pair rounds per class.
        num_classes: Number of classes, which is also the width of the returned
            label matrix. Defaults to 6, the value that used to be hard-coded.

    Returns:
        Tuple ``(X_merge_voice, X_merge_leap, X_merge_video, y_merge)`` of
        NumPy arrays with one row per triple. ``y_merge`` is a float one-hot
        matrix that always has ``num_classes`` columns, even when some class
        produced no triple (and also when no triple was produced at all).
    """
    # One pass per modality instead of one filtering pass per class.
    voice_by_class = _group_samples_by_class(X_voice, y_voice, num_classes)
    leap_by_class = _group_samples_by_class(X_leap, y_leap, num_classes)
    video_by_class = _group_samples_by_class(X_video, y_video, num_classes)

    X_merge_voice = []
    X_merge_leap = []
    X_merge_video = []
    y_merge = []

    for label in range(num_classes):
        voice_pool = voice_by_class[label]
        leap_pool = leap_by_class[label]
        video_pool = video_by_class[label]

        for _ in range(augment_index):
            # Independent in-place shuffles give a fresh random pairing each round.
            shuffle(voice_pool)
            shuffle(leap_pool)
            shuffle(video_pool)

            # zip() stops at the shortest pool, so surplus samples are left out this round.
            for voice_sample, leap_sample, video_sample in zip(voice_pool, leap_pool, video_pool):
                X_merge_voice.append(voice_sample)
                X_merge_leap.append(leap_sample)
                X_merge_video.append(video_sample)
                y_merge.append(label)

    # Index rows of the identity matrix to one-hot encode the labels. Unlike an
    # encoder fitted on the labels that happen to be present, this keeps a
    # fixed column count and needs no scikit-learn keyword that varies by version.
    one_hot_labels = np.eye(num_classes)[np.asarray(y_merge, dtype=int)]

    return np.array(X_merge_voice), np.array(X_merge_leap), np.array(X_merge_video), one_hot_labels

In [15]:
# Merged training set: five shuffle-and-pair rounds over the training splits.
(X_merge_voice_train,
 X_merge_leap_train,
 X_merge_video_train,
 y_merge_train) = generate_merge_data(
    X_voice_train, y_voice_train,
    X_leap_train, y_leap_train,
    X_video_train, y_video_train,
    augment_index=5)

# Merged validation set: a single round over the *same* training splits, so it
# is made of samples that also appear in the merged training set.
(X_merge_voice_val,
 X_merge_leap_val,
 X_merge_video_val,
 y_merge_val) = generate_merge_data(
    X_voice_train, y_voice_train,
    X_leap_train, y_leap_train,
    X_video_train, y_video_train,
    augment_index=1)

In [16]:
# Dimensions of the merged training arrays, one per line.
for merged_array in (X_merge_voice_train, X_merge_leap_train, X_merge_video_train, y_merge_train):
    print(merged_array.shape)

(4435, 1280)
(4435, 100, 87)
(4435, 81920)
(4435, 6)


In [17]:
# Dimensions of the merged validation arrays, one per line.
for merged_array in (X_merge_voice_val, X_merge_leap_val, X_merge_video_val, y_merge_val):
    print(merged_array.shape)

(887, 1280)
(887, 100, 87)
(887, 81920)
(887, 6)


In [20]:
# Merged test set: five shuffle-and-pair rounds over the test splits.
(X_merge_voice_test,
 X_merge_leap_test,
 X_merge_video_test,
 y_merge_test) = generate_merge_data(
    X_voice_test, y_voice_test,
    X_leap_test, y_leap_test,
    X_video_test, y_video_test,
    augment_index=5)


In [22]:
# Dimensions of the merged test arrays, one per line.
for merged_array in (X_merge_voice_test, X_merge_leap_test, X_merge_video_test, y_merge_test):
    print(merged_array.shape)

(1260, 1280)
(1260, 100, 87)
(1260, 81920)
(1260, 6)


In [None]:
# Evaluate the fusion network num_test times, each time on a freshly re-paired
# merged test set, and collect the accuracy of every round in `acc`.
# The module-level X_merge_*_test / y_merge_test arrays are overwritten each round.
acc = []
num_test = 50

for i in range(num_test):

    # Progress message on every fifth round (but not on the very first one).
    if i and not i % 5:
        print("Test %d times" % i)

    (X_merge_voice_test,
     X_merge_leap_test,
     X_merge_video_test,
     y_merge_test) = generate_merge_data(
        X_voice_test, y_voice_test,
        X_leap_test, y_leap_test,
        X_video_test, y_video_test,
        augment_index=5)

    score = merge_model.evaluate(
        [X_merge_video_test, X_merge_leap_test, X_merge_voice_test],
        y_merge_test,
        verbose=1)
    acc.append(score[1])



In [None]:
# One point per evaluation round. The x positions are derived from `acc`
# itself so both sequences always have the same length; a fixed range(10)
# cannot be paired with the num_test = 50 accuracies collected above.
plt.scatter(list(range(len(acc))), acc)