In [1]:
import tensorflow as tf
from collections import namedtuple
slim = tf.contrib.slim
keras = tf.keras
GRU = keras.layers.GRU
BIO_DIR = keras.layers.Bidirectional
import numpy as np
import os
import tqdm
import time

In [2]:
# Lightweight records describing one convolutional stage (Conv) and one
# recurrent stage (Gru) of the network.
Conv = namedtuple('Conv', 'kernel stride filters pooling')
Gru = namedtuple('Gru', 'nn')

In [3]:
# Four conv stages sharing the same 5x5 kernel, unit stride and 96 filters;
# only the first component of the pooling window differs between stages.
conv_config = [
    Conv(kernel=(5, 5), stride=(1, 1), filters=96, pooling=(pool, 1))
    for pool in (4, 4, 4, 2)
]
# Two stacked GRU layers with 96 units each.
rnn_config = [Gru(nn=96) for _layer in range(2)]

In [8]:
class CRNN(object):
    """Incrementally builds a conv -> GRU -> fully-connected graph on `input_images`.

    The current tensor is kept in `self.net`; each `build_*` method appends
    its layers to it, so the methods are meant to be called in the order
    `build_conv_layers`, `build_rnn_layer`, `build_pooling_fc`.

    Args:
        input_images: input tensor; its `.shape` is stored in `self.size`.
        conv_config: iterable of records with `kernel`, `stride`, `filters`
            and `pooling` attributes, one per convolutional stage.
        rnn_config: iterable of records with an `nn` attribute (GRU units).
        class_num: number of outputs of the final fully-connected layer.
        bio_direct: stored on the instance.
            NOTE(review): this flag is currently not used by any build
            method, so the recurrent layers are always unidirectional.
            TODO: decide whether to wrap the GRUs in Bidirectional.
    """

    def __init__(self,input_images, conv_config,rnn_config,class_num,bio_direct):
        self.net = input_images
        self.size = input_images.shape
        self.conv_config = conv_config
        self.rnn_config = rnn_config
        self.bio_direct = bio_direct
        self.class_num = class_num

    def build_conv_layers(self):
        """Append one conv2d (with batch norm) + max-pool pair per conv_config entry.

        The pooling stride equals the pooling window. After the last stage
        axis 1 is squeezed away, so it must have been reduced to size 1 by
        the pooling stages.
        """
        for config in self.conv_config:
            self.net = slim.conv2d(self.net,config.filters,
                                   kernel_size=config.kernel,
                                   stride=config.stride,
                                   normalizer_fn=slim.batch_norm)
            print(self.net.get_shape())
            self.net  = slim.max_pool2d(self.net,config.pooling,stride=config.pooling)
            print(self.net.get_shape())
        self.net = tf.squeeze(self.net,axis=[1])
        print(self.net.get_shape())

    def build_rnn_layer(self):
        """Append the stacked GRU layers, then max-pool over the sequence axis."""
        for config in self.rnn_config:
            self.net = GRU(config.nn,return_sequences=True)(self.net)

        # Global max over the sequence axis (axis 1). This replaces a
        # MaxPool1D whose window was the *input* width (self.size[2]); that
        # only worked while the sequence length coming out of the conv stack
        # happened to equal the input width.
        self.net = tf.reduce_max(self.net,axis=1)
        print(self.net.get_shape())

    def build_pooling_fc(self):
        """Append the softmax classification layer and expose it as `final_layer`."""
        # always name the last layer as final layer
        self.final_layer = slim.fully_connected(self.net,self.class_num,activation_fn=tf.nn.softmax)
        print(self.final_layer.get_shape())

In [9]:
# Root directory of the dataset. There is no trailing slash here, so sub-paths
# must be built with os.path.join rather than plain string concatenation.
data_root = "/home/philip/data/Keyword_spot"

In [10]:
# Mini-batch size handed to the directory iterators.
batch_size = 256
# Number of outputs of the classifier.
class_num = 31
# Length of the training run, and the step intervals at which the training
# loop validates and checkpoints.
total_step = 20000
valid_on_batch = 100
save_on_batch = 5000


In [11]:
# OHEM = online hard example mining: within each mini-batch only the examples
# with the top-k losses are kept and back-propagated.
ohem = True
# Initial fraction of each mini-batch that OHEM keeps.
ohem_ratio_start = 0.9
# Flag forwarded to the network constructor as `bio_direct`.
bio_direct = True

In [15]:
# Options shared by the training and validation directory iterators:
# one-hot labels, 128x63 single-channel images, common batch size.
_flow_options = dict(class_mode="categorical",
                     target_size=(128,63),
                     color_mode="grayscale",
                     batch_size=batch_size)

# Training images are augmented with width_shift_range=0.1; validation images
# go through a default ImageDataGenerator with no augmentation arguments.
training_data_gen = keras.preprocessing.image.ImageDataGenerator(width_shift_range=0.1)
training_data_set_dir = os.path.join(data_root,"train/train_on_all/")
# training_data_set_dir = os.path.join(data_root,"train_on_all")
training_gen = training_data_gen.flow_from_directory(training_data_set_dir, **_flow_options)

valid_dataset_dir = os.path.join(data_root,"train/valid_image")
valid_data_gen = keras.preprocessing.image.ImageDataGenerator().flow_from_directory(
    valid_dataset_dir, **_flow_options)

# Reset both iterators before they are consumed.
for _iterator in (training_gen, valid_data_gen):
    _iterator.reset()

Found 107488 images belonging to 31 classes.
Found 14428 images belonging to 31 classes.


In [16]:
def init_network(conv_config,rnn_config,class_num,bio_direct):
    """Create the image placeholder and build the full CRNN graph on it.

    Returns:
        (crnn, input_images): the CRNN instance, with `final_layer` and an
        L2-normalised `predictions` tensor attached, and the float32
        placeholder named 'input' with shape [None, 128, 63, 1].
    """
    images_ph = tf.placeholder(tf.float32, [None, 128, 63, 1], name='input')
    network = CRNN(images_ph, conv_config, rnn_config, class_num, bio_direct)
    # The build steps mutate network.net in place and must run in this order.
    for build_stage in (network.build_conv_layers,
                        network.build_rnn_layer,
                        network.build_pooling_fc):
        build_stage()
    network.predictions = tf.nn.l2_normalize(network.final_layer, 1, 1e-10, name='predicitons')
    return network, images_ph

In [17]:
# Build the graph once. The shapes printed under this cell come from the
# print calls inside the CRNN build_* methods.
crnn,input_images = init_network(conv_config,rnn_config,class_num,bio_direct)

(?, 128, 63, 96)
(?, 32, 63, 96)
(?, 32, 63, 96)
(?, 8, 63, 96)
(?, 8, 63, 96)
(?, 2, 63, 96)
(?, 2, 63, 96)
(?, 1, 63, 96)
(?, 63, 96)
(?, 96)
(?, 31)


In [18]:
# Switches read by the training loop: run validation / write checkpoints.
valid = 0
save_model = False
# Run length and the step intervals for validation and checkpointing.
total_step = 5000
valid_on_batch = 100
save_on_batch = 5000
# class_num = 30
# OHEM = online hard example mining: keep only the top-k losses of each
# mini-batch for back-propagation; the ratio sets how many examples are kept.
ohem = True
ohem_ratio_start = 0.6

In [19]:
# One-hot targets, fed together with the image placeholder.
labels = tf.placeholder(tf.float32,[None,class_num],name='label')
# Step counter driving the OHEM schedule below. The optimizer must be given
# this same variable in minimize(), otherwise the schedule never advances.
global_step = tf.Variable(0, trainable=False)
# Per-example cross-entropy; its plain mean (v_loss) is the validation loss.
diff = keras.losses.categorical_crossentropy(labels,crnn.predictions)
v_loss = tf.reduce_mean(diff)
if ohem:
    # Fraction of the batch to keep: multiplied by 0.9 every 1000 steps.
    ohem_ratio = tf.train.exponential_decay(ohem_ratio_start,global_step,1000,0.9,staircase=True)
    k = tf.cast(tf.multiply(ohem_ratio,tf.cast(tf.shape(diff)[0],tf.float32)),dtype=tf.int32)
    # The cast truncates, so a small ratio or a small batch can give k == 0;
    # the mean of an empty top-k result is NaN. Always keep at least one example.
    k = tf.maximum(k,1)
    # Keep only the k largest per-example losses for the training loss.
    diff,indices = tf.nn.top_k(diff,k=k)
loss =tf.reduce_mean(diff)
acc = tf.reduce_mean(keras.metrics.categorical_accuracy(labels, crnn.predictions))

In [20]:
starter_learning_rate = 1e-2
# Reuse the `global_step` variable that was created together with the loss.
# Creating a second counter here would leave every schedule bound to the first
# one (the OHEM ratio) frozen at step 0, because minimize() only increments
# the variable it is given.
# learning_rate = tf.train.natural_exp_decay(starter_learning_rate,global_step,5000,0.3)
# Staircase decay: multiply by 0.2 every 5000 steps, floored at 1e-5.
learning_rate = tf.train.exponential_decay(starter_learning_rate,global_step,5000,0.2,staircase=True)
# learning_rate = starter_learning_rate
learning_rate = tf.maximum(learning_rate,1e-5)
# train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss,global_step=global_step)
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss,global_step=global_step)

In [21]:
# Scalar summaries recorded on every training step.
train_loss_sum = tf.summary.scalar("loss",loss)
learning_rate_sum = tf.summary.scalar("learning_rate",learning_rate)
train_acc_sum = tf.summary.scalar("classification_accuarcy",acc)
training_summary_merge = tf.summary.merge([train_loss_sum,learning_rate_sum,train_acc_sum])

# Per-run sub-folder name: month_day_hour_minute_second of the local time.
now = time.localtime(time.time())
time_folder = "%d_%d_%d_%d_%d"%(now.tm_mon,now.tm_mday,now.tm_hour,now.tm_min,now.tm_sec)

log_dir = os.path.join(data_root,"CRNN/log/")
# Pass the components to os.path.join separately instead of concatenating
# them first; the resulting path is unchanged.
log_dir_training = os.path.join(log_dir,"train/")
# The identifier previously contained a full-width low line (U+FF3F), which
# Python 3 silently normalises to "_"; spell it in plain ASCII.
tb_writer_training = tf.summary.FileWriter(logdir=log_dir_training+time_folder)

In [22]:
# Validation summaries go to a sibling "valid/" folder with the same per-run name.
log_dir_valid = os.path.join(log_dir,"valid/")
# The identifier previously contained a full-width low line (U+FF3F), which
# Python 3 silently normalises to "_"; spell it in plain ASCII.
tb_writer_valid = tf.summary.FileWriter(logdir=log_dir_valid+time_folder)
# valid_loss_sum = tf.summary.scalar("loss",v_loss)
# valid_acc_sum = tf.summary.scalar("classification_accuarcy",acc)
# valid_summary_merge = tf.summary.merge([valid_loss_sum,valid_acc_sum])

In [23]:
# Open the session and run the local and global variable initialisers as one op.
sess = tf.InteractiveSession()
_initializers = (tf.local_variables_initializer(),
                 tf.global_variables_initializer())
init_op = tf.group(*_initializers)
sess.run(init_op)

In [24]:
def write_new_sum(writer,value,name,g_step):
    """Write a single scalar to TensorBoard without a graph summary op.

    Args:
        writer: summary writer to write to (anything with `add_summary`).
        value: scalar to record.
        name: tag under which the scalar is recorded.
        g_step: global step attached to the record.
    """
    summary = tf.Summary()
    new_sum = summary.value.add()
    new_sum.simple_value = value
    new_sum.tag = name
    # Use the writer that was passed in; the parameter used to be ignored in
    # favour of a module-level writer.
    writer.add_summary(summary,g_step)

In [27]:
# for test
valid_on_batch = 200
total_step = 100


In [29]:
# Checkpointing is only set up when it is enabled, so the save branch below
# never references undefined names.
if save_model:
    model_save_folder = os.path.join(data_root,"CRNN/model",time_folder)
    saver = tf.train.Saver()

for step in tqdm.tqdm_notebook(range(total_step)):
    # One optimisation step on the next training batch.
    image_batch,label_batch = next(training_gen)
    loss_get,_train_result,training_summary = sess.run(
        [loss,train_op,training_summary_merge],
        feed_dict={input_images:image_batch,labels:label_batch})
    g_step = tf.train.global_step(sess,global_step)
    tb_writer_training.add_summary(training_summary,g_step)

    if valid and (g_step+1)%valid_on_batch == 0:
        valid_losses = []
        valid_acces = []
        # Ceiling division: enough batches to see every validation sample
        # once, without an extra batch when the sample count divides evenly.
        valid_batches = (valid_data_gen.samples+valid_data_gen.batch_size-1)//valid_data_gen.batch_size
        for i in range(valid_batches):
            valid_images,valid_labels = next(valid_data_gen)
            valid_loss, valid_acc= sess.run([v_loss,acc],feed_dict={input_images:valid_images,labels:valid_labels})
            valid_losses.append(valid_loss)
            valid_acces.append(valid_acc)
#         print(np.mean(valid_losses))
        # Record the mean of the per-batch validation metrics.
        write_new_sum(tb_writer_valid,np.mean(valid_losses),"loss",g_step)
        write_new_sum(tb_writer_valid,np.mean(valid_acces),"classification_accuarcy",g_step)

    if save_model and (g_step+1)%save_on_batch == 0:
        save_path = os.path.join(model_save_folder,str(g_step+1),"mobilenet.ckpt")
        # makedirs also creates the run folder itself on the first checkpoint.
        os.makedirs(os.path.dirname(save_path),exist_ok=True)
        save_path = saver.save(sess,save_path)




In [20]:
# Class names ordered by label index, so class_indice[i] is the name of class i.
# Sorting by the index value avoids relying on the key order of the
# class_indices dict.
class_indice = sorted(training_gen.class_indices,key=training_gen.class_indices.get)

In [24]:
import cv2
import glob
# Build the path with os.path.join so it is valid whether or not data_root
# ends with a slash (plain concatenation silently produced a wrong path).
sample_path = os.path.join(data_root,"test/test_image/1aed7c6d_nohash_0.png")
im = cv2.imread(sample_path,0)
# cv2.imread returns None instead of raising when the file cannot be read.
if im is None:
    raise IOError("could not read image: %s" % sample_path)
# Add the batch and channel axes expected by the input placeholder.
im = im.reshape((1,128,63,1))

In [25]:
# Run the network on the single sample image. The fetch is wrapped in a list,
# so `a` is a one-element list holding the predictions array.
a = sess.run([crnn.predictions],feed_dict={input_images:im})

In [26]:
# np.argmax works on the flattened input by default, so this is the index of
# the largest score in `a`, mapped back to a class name.
class_indice[np.argmax(a)]

'nine'

In [46]:
# Labels allowed in the submission file; any prediction outside this list is
# written as "unknown" by replace_and_save_result.
valid_command = [
    'yes', 'no',
    'up', 'down',
    'left', 'right',
    'on', 'off',
    'stop', 'go',
    'silence', 'unknown',
]

In [47]:
import os
import cv2
import glob
import tqdm
import csv
# Batch size used for inference on the test images.
batch_size = 128
fieldnames = ['fname', 'label']
# newline='' is what the csv module expects for files it writes; without it
# platforms that translate "\n" end up with an extra "\r" on every row.
csvfile = open("new_submission.csv",'w',newline='')
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()

In [1]:
# Root directory of the dataset, this time WITH a trailing slash. The slash
# matters to any code that builds paths by plain string concatenation.
data_root = "/home/philip/data/Keyword_spot/"

In [49]:
def replace_and_save_result(file_name,predict):
    """Append one prediction row to the submission CSV.

    Args:
        file_name: path of the image file the prediction belongs to.
        predict: predicted class name; anything not in `valid_command` is
            written as "unknown".

    The row's `fname` is the base name of `file_name` with a ".png"
    extension swapped for ".wav"; other names are written unchanged.
    """
    if predict not in valid_command:
        predict = "unknown"
    base_name = os.path.basename(file_name)
    stem, extension = os.path.splitext(base_name)
    # Only the extension is rewritten; a "png" inside the stem is left alone.
    filen = stem + ".wav" if extension == ".png" else base_name
    writer.writerow({"fname":filen,"label":predict})

In [50]:
# Build the glob pattern with os.path.join so it is valid whether or not
# data_root ends with a slash; plain concatenation can silently match nothing.
test_files = glob.glob(os.path.join(data_root,"test/test_image/*.png"))
total_file = len(test_files)
for start in tqdm.tqdm_notebook(range(0,total_file,batch_size)):
    # Slicing past the end is safe, so the last batch is simply shorter.
    batch_files = test_files[start:start+batch_size]
    images = []
    for path in batch_files:
        image = cv2.imread(path,0)
        # cv2.imread returns None instead of raising on an unreadable file.
        if image is None:
            raise IOError("could not read image: %s" % path)
        images.append(image)
    # Stack into one array and add the trailing channel axis.
    im = np.expand_dims(np.array(images),-1)
    result = sess.run(crnn.predictions,feed_dict={input_images:im})
    # One CSV row per file, labelled with the highest-scoring class.
    for path, scores in zip(batch_files,result):
        replace_and_save_result(path,class_indice[np.argmax(scores)])




In [51]:
# Close the submission file so that all buffered rows are written to disk.
csvfile.close()

## Thoughts at this point
- CRNN composed of CNN feature extractor and RNN, try different number of layers and layer configurations
- Still unclear how to convert the CNN output into the RNN input (i.e. how to turn the feature map into a sequence for the RNN)
- eliminate the audio files that were not recorded correctly
- try online hard example mining and offline
    - online: select the top-k loss to bp
    - offline: examples could not be detected by existing model
- try a bidirectional RNN
- try Pure CNN
- find keyword spotting dataset

In [2]:
import keras
from keras import models
from keras.preprocessing.image import ImageDataGenerator
# model = model.load(os.path.join(data_root,"saved_model/12_27/CRNN_20000.pb"))

In [None]:
# NOTE(review): placeholder cell. Model() is created with no inputs/outputs and
# predict_generator() is called with no generator, so this presumably cannot
# run as written (the cell has no execution count).
# TODO: load the saved model and pass a real generator, or remove the cell.
model = models.Model()
model.predict_generator()

In [10]:
# def training_network(network,input_images,class_num,training_step):
#     labels = tf.placeholder(tf.float32,[None,class_num],name='label')
#     loss = tf.reduce_mean(keras.losses.categorical_crossentropy(labels,network.predictions),name='training_loss')
#     acc = tf.reduce_mean(keras.metrics.categorical_accuracy(labels, network.predictions))
# #     network.predictions = tf.cast(network.predictions,tf.float32)
#     labels_logit = tf.argmax(labels,axis=1)
#     predictions_logits = tf.argmax(network.predictions,axis=1)
#     acc = tf.reduce_mean(tf.cast(tf.equal(labels_logit,predictions_logits),tf.float32))
# #     acc,_ = tf.metrics.accuracy(labels,network.predictions)
# #     train_acc = tf.summary.scalar("classification_accurcy/training",acc)
#     train_op = tf.train.AdamOptimizer(0.01).minimize(loss)
    
#     sess = tf.InteractiveSession()
#     init_op = tf.group(
#             tf.local_variables_initializer(),
#             tf.global_variables_initializer())
#     sess.run(init_op)
    
#     loss_his = []
#     acc_his = []
#     for step in tqdm.tqdm(range(training_step)):
#         batch_image , batch_label = training_gen.__next__()
#         _,pre_loss,t_acc = sess.run([train_op,loss,acc],feed_dict={input_images:batch_image,labels:batch_label})
# #         print(t_acc)
#         loss_his.append(pre_loss)
#         acc_his.append(t_acc)
#     return loss_his,acc_his