In [1]:
import glob
import os
# import librosa
# from librosa import display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
from random import shuffle
%matplotlib inline
plt.style.use('ggplot')
# import pydot
# import graphviz

# Typography and label sizing applied to every figure drawn in this notebook.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 12,
    'axes.labelsize': 11,
    'axes.labelweight': 'bold',
    'axes.titlesize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 11,
    'figure.titlesize': 13,
})

from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
import time

# Load Leapmotion dataset

In [2]:
# Leap Motion features and labels are read from comma-separated numeric CSVs.
leap_features = np.loadtxt('leap_merge_features.csv', delimiter=',')
# Parse as float, then cast with the builtin `int`: the `np.int` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
leap_labels = np.loadtxt('leap_merge_labels.csv', delimiter=',').astype(int)

In [3]:
# Stratified 75/25 train/test split of the Leap Motion data.
# NOTE(review): the seed is the current wall-clock time, so the partition
# differs on every run.
leap_split = train_test_split(
    leap_features,
    leap_labels,
    train_size=.75,
    stratify=leap_labels,
    random_state=round(time.time()),
)
X_leap_train, X_leap_test, y_leap_train, y_leap_test = leap_split

# Report the shape of each partition, one per line.
for _split in leap_split:
    print(_split.shape)

(887, 8700)
(296, 8700)
(887, 6)
(296, 6)




# Load Voice dataset

In [4]:
# Voice features and labels are read from comma-separated numeric CSVs.
features = np.loadtxt('voice_merge_features.csv', delimiter=',')
# Parse as float, then cast with the builtin `int`: the `np.int` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
labels = np.loadtxt('voice_merge_labels.csv', delimiter=',').astype(int)

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Voice inputs are used as loaded; the labels are turned into a dense
# one-hot matrix (one column per distinct label value).
X_all = features

enc = OneHotEncoder(sparse=False)
label_column = labels.reshape(-1, 1)  # the encoder expects a 2-D column
y_all = enc.fit_transform(label_column)

In [6]:
# Stratified 75/25 train/test split of the voice data.
# NOTE(review): the seed is the current wall-clock time, so the partition
# differs on every run.
voice_split = train_test_split(
    X_all,
    y_all,
    train_size=.75,
    stratify=y_all,
    random_state=round(time.time()),
)
X_voice_train, X_voice_test, y_voice_train, y_voice_test = voice_split



In [7]:
# Shapes of the voice partitions, one per line (train X, test X, train y, test y).
for _split in (X_voice_train, X_voice_test, y_voice_train, y_voice_test):
    print(_split.shape)

(10633, 1280)
(3545, 1280)
(10633, 6)
(3545, 6)


# Load Video dataset

In [8]:
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, CSVLogger
from models import ResearchModels
from data import DataSet
import time
import os.path

Using TensorFlow backend.


ImportError: No module named 'models'

In [15]:
model_name = 'mlp_merge'
saved_model = None  # None or weights file
class_limit = 6  # int, can be 1-101 or None
seq_length = 40
load_to_memory = True  # pre-load the sequences into memory
batch_size = 32
nb_epoch = 10

# Choose images or features and image shape based on network.
if model_name in ['conv_3d', 'c3d', 'lrcn']:
    data_type = 'images'
    image_shape = (80, 80, 3)
elif model_name in ['lstm', 'mlp', 'mlp_merge']:
    data_type = 'features'
    image_shape = None
else:
    raise ValueError("Invalid model. See train.py for options.")

# Helper: Save the model.
# Only the best checkpoint so far is written (save_best_only=True); the file
# name embeds the epoch number and validation loss.
checkpointer = ModelCheckpoint(
    filepath=os.path.join('data', 'checkpoints', model_name + '-' + data_type +
        '.{epoch:03d}-{val_loss:.3f}.hdf5'),
    verbose=1,
    save_best_only=True)

# Helper: TensorBoard
tb = TensorBoard(log_dir=os.path.join('data', 'logs', model_name))

# Helper: Stop when we stop learning.
early_stopper = EarlyStopping(patience=5)

# Helper: Save results.
# The log file name carries the wall-clock timestamp of this cell's execution.
timestamp = time.time()
csv_logger = CSVLogger(os.path.join('data', 'logs', model_name + '-' + 'training-' +
    str(timestamp) + '.log'))

# Get the data and process it.
# `image_shape` is only forwarded to DataSet for the image-based models.
if image_shape is None:
    data = DataSet(
        seq_length=seq_length,
        class_limit=class_limit
    )
else:
    data = DataSet(
        seq_length=seq_length,
        class_limit=class_limit,
        image_shape=image_shape
    )

# Get samples per epoch.
# Multiply by 0.7 to attempt to guess how much of data.data is the train set.
# Truncate to int before the floor division so the step count is an integer
# rather than a float (same numeric value as before).
steps_per_epoch = int(len(data.data) * 0.7) // batch_size

if load_to_memory:
    # Get data.
    X_video_train, y_video_train = data.get_all_sequences_in_memory('train', data_type)
    X_video_test, y_video_test = data.get_all_sequences_in_memory('test', data_type)
else:
    # Get generators.
    generator = data.frame_generator(batch_size, 'train', data_type)
    val_generator = data.frame_generator(batch_size, 'test', data_type)

check the classes order
['right', 'left', 'up', 'down', 'start', 'no']
Loading 984 samples into memory for training.
Loading 389 samples into memory for testing.


In [17]:
# Shapes of the video partitions, one per line (train X, test X, train y, test y).
for _split in (X_video_train, X_video_test, y_video_train, y_video_test):
    print(_split.shape)

(984, 40, 2048)
(389, 40, 2048)
(984, 6)
(389, 6)


# Define Leapmotion Model

In [9]:
#LSTM
import keras
from keras.models import Sequential
# Dropout is used by build_leapmotion_model below, so import it here instead
# of relying on a later cell having imported it.
from keras.layers import LSTM, Dense, Dropout

import numpy as np

data_dim = 87
timesteps = 100
num_classes = 6
leap_batch_size = 55

def build_leapmotion_model(input_shape=(timesteps, data_dim)):
    """Build the Leap Motion feature-extractor branch (no classification head).

    Expected input batch shape: (batch_size, timesteps, data_dim). The LSTM
    layers are not stateful, so no fixed batch size is baked into the model.

    Args:
        input_shape: (timesteps, data_dim) of a single sample. Defaults to the
            module-level `timesteps` and `data_dim` values at definition time.

    Returns:
        An uncompiled `Sequential` model: LSTM(55) returning the full sequence,
        LSTM(55) returning only its last output, then Dense(55) with no
        activation argument. Each of the three layers is followed by
        Dropout(0.2).
    """
    model = Sequential()
    # First LSTM returns a sequence of 55-dimensional vectors (one per timestep).
    model.add(LSTM(55, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(0.2))
    # Second LSTM collapses the sequence into a single 55-dimensional vector.
    model.add(LSTM(55))
    model.add(Dropout(0.2))
    model.add(Dense(55))
    model.add(Dropout(0.2))

    return model

In [10]:
num_rows, num_cols = 100, 87

# Unflatten every Leap Motion sample from one flat row into a
# (num_rows, num_cols) sequence, keeping the number of samples unchanged.
leap_sample_shape = (num_rows, num_cols)
X_leap_train = X_leap_train.reshape((len(X_leap_train),) + leap_sample_shape)
X_leap_test = X_leap_test.reshape((len(X_leap_test),) + leap_sample_shape)

# Define Voice Model

In [12]:
## Imports
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.utils import print_summary, plot_model
from keras import regularizers
from keras.layers import BatchNormalization
from keras.models import load_model

In [13]:
img_rows, img_cols = 40, 32

# Unflatten every voice sample from one flat row into a single-channel
# (img_rows, img_cols, 1) image, keeping the number of samples unchanged.
voice_sample_shape = (img_rows, img_cols, 1)
X_voice_train = X_voice_train.reshape((len(X_voice_train),) + voice_sample_shape)
X_voice_test = X_voice_test.reshape((len(X_voice_test),) + voice_sample_shape)

In [14]:
def build_voice_model(input_shape=(img_rows, img_cols, 1)):
    """Assemble the voice feature-extractor CNN (no classification head).

    The stack is two convolutional stages (4 filters of 5x5, then 8 filters
    of 3x3), each followed by batch normalization, 2x2 max pooling and
    dropout; then a Flatten and two dense stages (128 and 64 ReLU units),
    each followed by batch normalization and dropout. Every dropout layer
    uses a rate of 0.2.

    Args:
        input_shape: shape of one input sample, (rows, cols, channels).

    Returns:
        An uncompiled `Sequential` model ending in the 64-unit dense stage.
    """
    stack = [
        # Convolutional stage 1
        Conv2D(4, kernel_size=(5, 5), activation='relu', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.2),
        # Convolutional stage 2
        Conv2D(8, (3, 3), activation='relu'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.2),
        # Dense stage 1
        Flatten(),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        # Dense stage 2
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
    ]

    voice_net = Sequential()
    for layer in stack:
        voice_net.add(layer)
    return voice_net

In [24]:
# Instantiate fresh (untrained) feature-extractor branches for the two
# modalities that get merged below.
leapmotion_model = build_leapmotion_model()
voice_model = build_voice_model()
# Alternatives kept for reference: load previously saved branch models, or
# build the video branch via ResearchModels.
# leap_model = load_model("leap_model.h5")
# voice_model = load_model("voice_model.h5")
# video_model = ResearchModels(len(data.classes), model_name, seq_length, saved_model)

# Merge the models

In [25]:
from keras.layers import concatenate
from keras.models import Model
from keras.optimizers import RMSprop, Adam

# Fuse the two branches with the functional API. The legacy `Merge` layer
# used previously was deprecated in Keras 2.0 and later removed, so importing
# it fails on current Keras; `concatenate` builds the same architecture.
# Input order is [leap, voice], so the arrays passed to fit/evaluate must use
# that order as well.
merged_features = concatenate(
    [leapmotion_model.output, voice_model.output], axis=-1)

# Classification head on top of the concatenated branch features.
hidden = Dense(64, activation='relu')(merged_features)
hidden = Dropout(0.5)(hidden)
predictions = Dense(num_classes, activation='softmax')(hidden)

model = Model(inputs=[leapmotion_model.input, voice_model.input],
              outputs=predictions)

  """


In [26]:
# RMSprop at a 1e-3 learning rate, categorical cross-entropy loss, and
# accuracy reported as the training metric.
optimizer = RMSprop(lr=1e-3)

model.compile(
    optimizer=optimizer,
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

In [32]:
# Train the merged model; inputs are ordered [leap, voice].
# generate_merge_data emits samples grouped class by class, so batches must be
# shuffled: with shuffle=False every batch would contain a single class.
# (shuffle=False is only needed for stateful RNNs, which this model is not.)
model.fit(
    [X_merge_leap_train, X_merge_voice_train],
    y_merge_train,
    batch_size=64,
    shuffle=True,
    epochs=20,
    verbose=1,
    validation_data=([X_merge_leap_val, X_merge_voice_val], y_merge_val),
)

Train on 4435 samples, validate on 887 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

KeyboardInterrupt: 

In [66]:
# Evaluate on the merged test set; inputs are ordered [leap, voice].
# `score` holds the loss followed by the compiled metric (accuracy).
test_inputs = [X_merge_leap_test, X_merge_voice_test]
score = model.evaluate(test_inputs, y_merge_test, verbose=1)
print()
for _caption, _value in (('Test loss:', score[0]), ('Test accuracy:', score[1])):
    print(_caption, _value)


Test loss: 0.23723840368724736
Test accuracy: 0.9260869564021311


# Prepare the merged dataset

In [27]:
def _class_ids(y):
    """Return one integer class id per entry of ``y``.

    Rows of a 2-D (one-hot or score) array are reduced with ``np.argmax``;
    a 1-D array is taken to already hold integer class ids.
    """
    y = np.asarray(y)
    if y.ndim == 1:
        return y.astype(int)
    return np.array([np.argmax(row) for row in y], dtype=int)


def generate_merge_data(X_voice, y_voice,
                        X_leap, y_leap,
                        X_video, y_video,
                        augment_index=1, num_classes=6):
    """Build aligned multi-modal samples by pairing same-class examples.

    For every class, the samples of each modality that belong to that class
    are shuffled independently and zipped together, so each merged sample is
    one (voice, leap, video) triple sharing a label. The shuffle-and-zip step
    is repeated `augment_index` times per class, producing different random
    pairings on each pass. Per class and per pass, the number of triples is
    the size of the smallest modality for that class.

    The output is grouped by class (all of class 0 first, then class 1, ...).

    Args:
        X_voice, X_leap, X_video: per-modality sample arrays (first axis
            indexes samples).
        y_voice, y_leap, y_video: matching labels, either one-hot rows (2-D)
            or integer class ids (1-D).
        augment_index: number of shuffle-and-zip passes per class.
        num_classes: number of classes; also the width of the returned
            one-hot label matrix.

    Returns:
        Tuple ``(X_merge_voice, X_merge_leap, X_merge_video, y_merge)`` of
        NumPy arrays; ``y_merge`` is a float one-hot matrix with exactly
        `num_classes` columns, even if some class produced no triples.
    """
    # Resolve class ids once per modality instead of once per class.
    modalities = [
        (X_voice, _class_ids(y_voice)),
        (X_leap, _class_ids(y_leap)),
        (X_video, _class_ids(y_video)),
    ]

    X_merge_voice, X_merge_leap, X_merge_video, y_merge = [], [], [], []

    for class_id in range(num_classes):
        voice_tmp, leap_tmp, video_tmp = (
            [sample for sample, sample_class in zip(X, ids) if sample_class == class_id]
            for X, ids in modalities
        )

        for _ in range(augment_index):
            # Independent in-place shuffles give a new random pairing each pass.
            shuffle(voice_tmp)
            shuffle(leap_tmp)
            shuffle(video_tmp)

            for voice_x, leap_x, video_x in zip(voice_tmp, leap_tmp, video_tmp):
                X_merge_voice.append(voice_x)
                X_merge_leap.append(leap_x)
                X_merge_video.append(video_x)
                y_merge.append(class_id)

    # Identity-matrix indexing yields a fixed-width one-hot encoding that does
    # not depend on which classes happen to be present.
    one_hot = np.eye(num_classes)[np.asarray(y_merge, dtype=int)]

    return (np.array(X_merge_voice), np.array(X_merge_leap),
            np.array(X_merge_video), one_hot)

In [29]:
# Merged training set: 5 shuffle-and-pair passes per class.
# NOTE(review): the video slot is filled with the voice training arrays, so
# X_merge_video_train is a placeholder rather than real video data.
(X_merge_voice_train,
 X_merge_leap_train,
 X_merge_video_train,
 y_merge_train) = generate_merge_data(
    X_voice_train, y_voice_train,
    X_leap_train, y_leap_train,
    X_voice_train, y_voice_train,
    augment_index=5,
)

# Merged validation set: a single pass.
# NOTE(review): this is drawn from the same training arrays as the merged
# training set above, so it is not an independent hold-out set.
(X_merge_voice_val,
 X_merge_leap_val,
 X_merge_video_val,
 y_merge_val) = generate_merge_data(
    X_voice_train, y_voice_train,
    X_leap_train, y_leap_train,
    X_voice_train, y_voice_train,
    augment_index=1,
)

In [30]:
# Shapes of the merged training arrays, one per line (voice, leap, video, labels).
for _merged in (X_merge_voice_train, X_merge_leap_train,
                X_merge_video_train, y_merge_train):
    print(_merged.shape)

(4435, 40, 32, 1)
(4435, 100, 87)
(4435, 40, 32, 1)
(4435, 6)


In [31]:
# Shapes of the merged validation arrays, one per line (voice, leap, video, labels).
for _merged in (X_merge_voice_val, X_merge_leap_val,
                X_merge_video_val, y_merge_val):
    print(_merged.shape)

(887, 40, 32, 1)
(887, 100, 87)
(887, 40, 32, 1)
(887, 6)


In [42]:
# Merged test set: 20 shuffle-and-pair passes per class over the held-out
# voice and Leap Motion test partitions.
# NOTE(review): the video slot is filled with the Leap Motion test arrays, so
# X_merge_video_test is a placeholder rather than real video data.
(X_merge_voice_test,
 X_merge_leap_test,
 X_merge_video_test,
 y_merge_test) = generate_merge_data(
    X_voice_test, y_voice_test,
    X_leap_test, y_leap_test,
    X_leap_test, y_leap_test,
    augment_index=20,
)

In [43]:
# Shapes of the merged test arrays, one per line (voice, leap, video, labels).
for _merged in (X_merge_voice_test, X_merge_leap_test,
                X_merge_video_test, y_merge_test):
    print(_merged.shape)

(5920, 40, 32, 1)
(5920, 100, 87)
(5920, 100, 87)
(5920, 6)


In [175]:
# Inspect the first feature column across all timesteps of the first
# training video sample.
X_video_train[0, :, 0]

array([0.04563421, 0.05056863, 0.05538939, 0.07386345, 0.01360721,
       0.02085993, 0.0332799 , 0.01394629, 0.01090135, 0.00746803,
       0.01606545, 0.00902444, 0.01460904, 0.01598709, 0.00903382,
       0.0153631 , 0.00978239, 0.01720455, 0.01530915, 0.01020595,
       0.01140743, 0.00635595, 0.00841652, 0.0112095 , 0.01028466,
       0.03030672, 0.0327414 , 0.02451475, 0.02913259, 0.01686394,
       0.01783792, 0.00886245, 0.0191673 , 0.02494109, 0.01072723,
       0.00994156, 0.01186314, 0.01294528, 0.01531062, 0.01087279],
      dtype=float32)