In [96]:
!nvidia-smi | head -31

Sun May  6 14:40:53 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.111                Driver Version: 384.111                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:04:00.0 Off |                    0 |
| N/A   57C    P0    62W / 149W |   8465MiB / 11439MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           Off  | 00000000:05:00.0 Off |                    0 |
| N/A   60C    P0    90W / 149W |  10969MiB / 11439MiB |     18%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla K80           Off  | 00000000:08:00.0 Off |                    

In [None]:
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
import scipy
import tflearn
import tensorflow as tf
import librosa
from scipy.io import wavfile
import os
import IPython
%matplotlib inline

In [None]:
GPUS = ['/gpu:0', '/gpu:3']
BATCH_SIZE = 16 * 2
SINGLE_SIZE = BATCH_SIZE // len(GPUS)
MODEL_NAME = 'CHINESE_CHARACTERS'

In [None]:
import time
import sys
class ProcessBar():
    def reset(self, length):
        self._length = length
        self._start = time.time()
    def show(self, i, msg=""):
        percents = (i + 1) / self._length
        equal_length = int(50 * percents) * "="
        empty_length = (49 - int(50 * percents)) * " "
        elapsed_time = time.time() - self._start
        eta_time = elapsed_time / percents * (1 - percents)
        line_str = "[{}>{}] {}/{} {:.1f}% {:.2f}s ETA:{:.2f}s {}"\
            .format(equal_length, empty_length, i, self._length,
                    100 * percents, eta_time, elapsed_time, msg)
        sys.stdout.write("\r" + line_str)
    def summary(self, i, msg=""):
        line_str = "[{}] {} {} {:.2f}s {}".format(50 * "=", i, self._length, time.time() - self._start, msg)
        sys.stdout.write("\r{}\n".format(line_str))
pb = ProcessBar()

In [None]:
aishell = "/data/icb/chinese_voice_recong/data/data_aishell/wav_uncompress/"

In [None]:
with open("/data/icb/chinese_voice_recong/data/data_aishell/transcript/aishell_transcript_v0.8.txt") as f:
    lines = f.readlines()

In [None]:
prefixs = ['train', 'test', 'dev']
prefixs_s = {k: next(os.walk(aishell + k))[1] for k in prefixs}
all_in_law = prefixs_s['train'] + prefixs_s['test'] + prefixs_s['dev']

In [None]:
def add_prefix(file_id):
    folder_id = file_id[6:11]
    prefix = [i for i in prefixs if folder_id in prefixs_s[i]][0]
    return "{}/{}/{}".format(prefix, folder_id, file_id)

In [None]:
X2y = {line.split(' ')[0]: "".join(line.replace('\n', '').split(' ')[1:]) for line in lines}

In [None]:
def is_no_exception(index, file_path):
    pb.show(index)
    try:
        wavfile.read(file_path)
        return True
    except:
        return False

In [None]:
X = []
for prefix in prefixs:
    r, fos, _ = next(os.walk(aishell + prefix + '/'))
    for fo in fos:
        r2, _, fs = next(os.walk(r + fo + '/'))
        X += [r2 + f for f in fs]
pb.reset(len(X))
X = [i for index, i in enumerate(X) if i.split('/')[-1].replace('.wav', '') in X2y.keys() and is_no_exception(index, i)]

In [None]:
y = [X2y[i.split('/')[-1].replace('.wav', '')] for i in X]

In [None]:
# hanzi2id = {v:k for k, v in enumerate(list(set(''.join(y))))}
# id2hanzi = {k:v for k, v in enumerate(list(set(''.join(y))))}
# id2hanzi[len(hanzi2id)] = '\n'
# import pickle
# with open('dictionary.pkl', 'wb') as f:
#     pickle.dump((hanzi2id, id2hanzi), f)

In [93]:
import pickle
with open('dictionary.pkl', 'rb') as f:
    hanzi2id, id2hanzi = pickle.load(f)

In [None]:
def get_length(index, file_id):
    wav = wavfile.read(file_id)
    pb.show(index)
    return len(wav[1])

In [None]:
pb.reset(len(X))
wav_lengths = [get_length(index, i) for index, i in enumerate(X)]

In [None]:
def fft_feature_extract(sample, n_fft=320, hop_length=160, win_length=320,
                        window=scipy.signal.hamming, normalize=True):
    D = librosa.stft(sample, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=window)
    spect, phase = librosa.magphase(D)
    spect = np.log1p(spect)
    if normalize:
        mean = np.mean(spect)
        std = np.std(spect)
        spect -= mean
        spect /= std
    spect[np.isnan(spect)] = 0.
    return spect

def sparse_tuple_from(sequences, dtype=np.int32):
    indices = []
    values = []
    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)
    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64)
    return indices, values, shape

def embedding_labels(label):
    label_code = [hanzi2id[ch] for ch in label]
    return label_code

class BaseVoiceFlow():
    def __init__(self, X, y):
        self._X = X
        self._y = y
    def __getitem__(self, ids):
        specs, label_codes, len_seq = [], [], []
        for index in ids:
            sample = wavfile.read(self._X[index])[1] * 65536.0
            spec = fft_feature_extract(sample)
            specs.append(spec)
            label_codes.append(embedding_labels(self._y[index]))
            len_seq.append(spec.shape[1])
        max_length = np.max(len_seq)
        np_specs = np.zeros([len(ids), specs[0].shape[0], max_length, 1])
        for index, spec in enumerate(specs):
            np_specs[index, :, :spec.shape[1], 0] = spec
        sparse_labels = sparse_tuple_from(label_codes)
        return np_specs, sparse_labels, len_seq
    def __len__(self):
        return len(self._X)

from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid, train_lengths, _ =\
    train_test_split(X, y, wav_lengths, test_size=0.1, random_state=42)
def get_flow(data_flow, shuffle = True, batch_size = 128,
             order=False, threads=4, max_queue=8):
    coord = tf.train.Coordinator()
    flow = tflearn.data_flow.FeedDictFlow({
        "data": data_flow
    },  coord, 
        batch_size=batch_size, 
        shuffle=shuffle, 
        continuous=True, num_threads=threads, max_queue=max_queue,
        ensure_data_order=order)
    return flow
ten_stages = [i for i in range(0, 100, 10)]
ten_indics = [np.where((train_lengths >= np.percentile(train_lengths, i))
                       & (train_lengths < np.percentile(train_lengths, i + 10)))[0]
              for i in ten_stages]
ten_sets = [BaseVoiceFlow(np.take(X_train, i), np.take(y_train, i)) for i in ten_indics]
ten_flows = [get_flow(i, batch_size=BATCH_SIZE) for i in ten_sets]
[i.start() for i in ten_flows]

valid_set = BaseVoiceFlow(X_valid, y_valid)
valid_flow = get_flow(valid_set, batch_size=BATCH_SIZE)
valid_flow.start()

In [None]:
def conv_layer(input_tensor,training,kernel_size=(3,3),filters=32,dropout=None
               ,clip_net=20.,batch_normalization=True,strides=(1,1,1,1),name="conv_layer"):
    with tf.variable_scope(name):
        net = tf.layers.conv2d(input_tensor,filters=filters,kernel_size=kernel_size,padding='same',strides=strides, )
        if batch_normalization:
            net = tf.layers.batch_normalization(net,training=training)
        if dropout is not None:
            net = tf.layers.dropout(net,dropout=1.-dropout)
        net = tf.nn.relu(net)
    with tf.variable_scope("minimal"):
        if clip_net is not None and clip_net != False and clip_net > 0:
            net = tf.minimum(net, tf.constant(clip_net))
    return net

def rnn_layer(input_tensor, layer_num, rnn_size, len_squence, training, name="rnn_layer"):
    with tf.variable_scope(name):
        net = input_tensor
        final_state = None
        for i in range(layer_num):
            with tf.variable_scope("{}_layer{}".format(name,i + 1)):
                forward_cell = tf.contrib.rnn.GRUCell(rnn_size)
                backward_cell = tf.contrib.rnn.GRUCell(rnn_size)
                bi_outputs, bi_encoder_state = tf.nn.bidirectional_dynamic_rnn(
                    forward_cell, backward_cell, net,
                    sequence_length = len_squence, time_major = False,dtype=tf.float32)
                outputs_fw, outputs_bw = bi_outputs
                rnn_outputs = outputs_fw + outputs_bw
                # THE FOLLOWING LINE IS BECAUSE THE FINAL LAYER GRADIENT IS NOT DEPENDED BY GRADIENT IN CLASSIFIER
                rnn_outputs = tf.layers.batch_normalization(rnn_outputs, training=training)
                net = rnn_outputs
                final_state = bi_encoder_state
        return net, final_state

def average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        grads = []
        for g, _ in grad_and_vars:
            expanded_g = tf.expand_dims(g, 0)
            grads.append(expanded_g)

        grad = tf.concat(grads,0)
        grad = tf.reduce_mean(grad, 0)

        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads

In [None]:
import tensorflow as tf
def create_epoch_input(name):
    input=tf.placeholder(tf.float32,name='{}_input'.format(name))
    op=tf.summary.scalar(name,input)
    return input,op
class TensorBoardLogger:
    def __init__(self,sess,names,path):
        self._sess=sess
        self._feed={}
        self._values={}
        with tf.name_scope('train'):
            for name in names:
                self._feed['train_{}'.format(name)]=create_epoch_input(name)
        with tf.name_scope('epoch'):
            for name in names:
                self._feed['epoch_{}'.format(name)]=create_epoch_input(name)
                self._values['epoch_{}'.format(name)]=0.
        with tf.name_scope('valid'):
            for name in names:
                self._feed['valid_{}'.format(name)]=create_epoch_input(name)
                self._values['valid_{}'.format(name)]=0.
        self._file_summary=tf.summary.FileWriter(path,sess.graph)
    def log(self,params,step):
        ops=[]
        feed_dict={}
        for name in params.keys():
            input,op=self._feed['train_'+name]
            summary=self._sess.run(op,feed_dict={input:params[name]})
            self._file_summary.add_summary(summary,step)
    def update(self,params,mode):
        for name in params.keys():
            self._values['{}_{}'.format(mode,name)]+=params[name]
    def summary(self,mode,steps,epoch):
        return_dict={}
        for key in self._values.keys():
            if key.startswith(mode):
                input,op=self._feed[key]
                if self._values[key] == 0.0:
                    continue
                value=self._values[key]/steps
                return_dict[key.replace(mode+'_','')]=value
                self._values[key]=0.
                summary=self._sess.run(op,feed_dict={input:value})
                self._file_summary.add_summary(summary,epoch)
        return return_dict

In [None]:
tf.reset_default_graph()
graph = tf.Graph()
with graph.as_default(), tf.device('/cpu:0'):
    input_specs = tf.placeholder(tf.float32, shape=[BATCH_SIZE, 161, None, 1], name='sound_spects')
    input_sparse_labels = tf.sparse_placeholder(tf.int32, name='sparse_labels')
    input_len_sequences = tf.placeholder(tf.int32, shape=[BATCH_SIZE], name='sound_lengths')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    is_training = tf.placeholder(tf.bool, name='is_training')
    global_step_tensor = tf.Variable(1, dtype=tf.int64, trainable=False, name="global_step")
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    
    tower_gradients = []
    with tf.variable_scope(tf.get_variable_scope()):
        for index, core_name in enumerate(GPUS):
            start_index = index * SINGLE_SIZE
            end_index = index * SINGLE_SIZE + SINGLE_SIZE
            print('init tower_{}'.format(index))
            with tf.name_scope('tower_{}'.format(index)) as vscope, tf.device(core_name):
                input_spec = input_specs[start_index: end_index]
                input_len_sequence = input_len_sequences[start_index: end_index]
                # Assume the max length of labels not larger than 100...
                input_sparse_label = tf.sparse_slice(input_sparse_labels,
                                                     [start_index, 0], [SINGLE_SIZE, 100])
                net = conv_layer(input_spec, is_training, kernel_size=(3,3),
                                   filters=32, strides=(1,1), name="deepspeech_conv_layer1/conv1")
                net = conv_layer(net, is_training, kernel_size=(3,3),
                                   filters=32, strides=(2,2), name="deepspeech_conv_layer1/conv2")
                net = conv_layer(net, is_training, kernel_size=(3,3),
                                   filters=64, strides=(1,1), name="deepspeech_conv_layer2/conv1")
                net = conv_layer(net, is_training, kernel_size=(3,3),
                                   filters=64, strides=(2,2), name="deepspeech_conv_layer2/conv2")
                net = conv_layer(net, is_training, kernel_size=(3,3),
                                   filters=96, strides=(1,1), name="deepspeech_conv_layer3/conv1")
                net = conv_layer(net, is_training, kernel_size=(3,3),
                                   filters=96, strides=(2,2), name="deepspeech_conv_layer3/conv2")
                net = conv_layer(net, is_training, kernel_size=(3,3),
                                   filters=128, strides=(1,1), name="deepspeech_conv_layer4/conv1")
                net = conv_layer(net, is_training, kernel_size=(3,3),
                                   filters=128, strides=(2,2), name="deepspeech_conv_layer4/conv2")
                input_len_sequence = tf.div(input_len_sequence, 16)
                net = tf.transpose(net, [0, 2, 1, 3])
                net_shape = net.get_shape().as_list()
                net = tf.reshape(net, [SINGLE_SIZE, -1, net_shape[-1] * net_shape[-2]])
                rnn_out, out_state = rnn_layer(net, 5, 400, 
                                                       len_squence=input_len_sequence,
                                                       training = is_training, 
                                                       name="deepspeech_rnn_layer")
                net_code = tf.layers.dense(rnn_out, 4328 + 1, activation=None, name="deepspeech_dense")
                tf.get_variable_scope().reuse_variables()
                ctc_loss = tf.nn.ctc_loss(labels=input_sparse_label, inputs=net_code,
                                                sequence_length=input_len_sequence,
                                                time_major=False)
                ctc_loss = tf.reduce_mean(ctc_loss)
                net_code = tf.transpose(net_code, [1, 0, 2])
                decoded, log_prob = tf.nn.ctc_greedy_decoder(net_code, sequence_length=input_len_sequence,
                                                                  merge_repeated=False)
                dense_decoded = tf.cast(tf.sparse_tensor_to_dense(decoded[0], default_value=0), tf.int32)
                edit_dis = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), input_sparse_label))
                
                trainable_params = tf.trainable_variables()
                grads = optimizer.compute_gradients(ctc_loss, var_list=tf.trainable_variables())
                tower_gradients.append(grads)
    grads = average_gradients(tower_gradients)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        global_step = tf.train.get_global_step()
        train_op = optimizer.apply_gradients(grads, global_step=global_step)
with graph.as_default():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    tf.train.global_step(sess, global_step_tensor)
    LOGGER=TensorBoardLogger(sess,['ctc_loss', 'edist'], 'logs/{}'.format(MODEL_NAME))

In [None]:
ops_in_train = [train_op, ctc_loss, edit_dis, global_step_tensor]
ops_in_valid = [ctc_loss, edit_dis]

LR = 3e-4
RESTORE_EPOCH = -1
DECAY_EPOCH = 14
DECAY_EPOCH_LEN = 3
with graph.as_default():
    saver=tf.train.Saver(max_to_keep=400)
if RESTORE_EPOCH > -1:
    saver.restore(sess,"models/{}/{}_{}".format(MODEL_NAME, MODEL_NAME, RESTORE_EPOCH))
epochs = 20
batch_per_epoch = int(len(X_train)/BATCH_SIZE)
batch_per_valid = int(len(X_valid)/BATCH_SIZE)
train_flows = ten_flows

for epoch in range(RESTORE_EPOCH + 1, epochs):
    if epoch >= DECAY_EPOCH:
        LR=LR/np.power(2,(epoch-DECAY_EPOCH)//DECAY_EPOCH_LEN+1)
    pb.reset(batch_per_epoch)
    for i in range(batch_per_epoch):
        train_flow = train_flows[i % len(train_flows)]
        X_batch, y_batch, len_seq_batch = train_flow.next()['data']
        if len(X_batch)<BATCH_SIZE:
            i -= 1
            continue
        feed_dict={input_specs: X_batch,
                   input_sparse_labels: y_batch, input_len_sequences: len_seq_batch}
        feed_dict[is_training]=True
        feed_dict[learning_rate]=LR
        _, step_loss, step_edist, step_value =\
            sess.run(ops_in_train, feed_dict=feed_dict)
        log_dict = {'ctc_loss': step_loss, 'edist': step_edist}
        LOGGER.log(log_dict, step_value)
        LOGGER.update(log_dict, 'epoch')
        pb.show(i, msg="loss:{:.2f}".format(step_loss))
    dicts=LOGGER.summary('epoch', batch_per_epoch, epoch)
    pb.summary(epoch)
    pb.reset(batch_per_valid)
    for i in range(batch_per_valid):
        X_batch, y_batch, len_seq_batch = valid_flow.next()['data']
        if len(X_batch)<BATCH_SIZE:
            continue
        feed_dict={input_specs: X_batch,
                   input_sparse_labels: y_batch, input_len_sequences: len_seq_batch}
        feed_dict[is_training] = False
        step_loss, step_edist = sess.run(ops_in_valid, feed_dict=feed_dict)
        log_dict = {'ctc_loss': step_loss, 'edist': step_edist}
        LOGGER.update(log_dict, 'valid')
        pb.show(i,msg="loss: {:.3f}".format(step_loss))
    dicts=LOGGER.summary('valid',batch_per_valid,epoch)
    pb.summary(epoch)
    saver.save(sess,"models/{}/{}_{}".format(MODEL_NAME, MODEL_NAME, epoch))

In [95]:
with graph.as_default():
    saver = tf.train.Saver(max_to_keep=400, var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))
    saver.restore(sess,"models/{}/{}_{}".format(MODEL_NAME, MODEL_NAME, 19))
rand_indexes = np.random.choice(len(valid_set), BATCH_SIZE)
X_valid_np = np.asarray(X_valid)
y_valid_np = np.asarray(y_valid)
X_batch, y_batch, len_seq_batch = valid_set[rand_indexes]
feed_dict = feed_dict={input_specs: X_batch, 
                       input_sparse_labels: y_batch, 
                       input_len_sequences: len_seq_batch}
feed_dict[is_training] = False
step_net_code = sess.run(dense_decoded, feed_dict=feed_dict)
# predict_0 = np.argmax(np.transpose(step_net_code, [1, 0, 2]), axis=2)
predict_0 = ["".join([id2hanzi[i] for i in j]).strip('净').replace('\n', '') for j in step_net_code]
real_0 = y_valid_np[rand_indexes][SINGLE_SIZE:]
real_X = X_valid_np[rand_indexes][SINGLE_SIZE:]
display_tuples = ["{} ---> {}".format(predict_0[i], real_0[i]) for i in range(SINGLE_SIZE)]
for i in range(SINGLE_SIZE):
    print(display_tuples[i])
    IPython.display.display(IPython.display.Audio(real_X[i]))

INFO:tensorflow:Restoring parameters from models/CHINESE_CHARACTERS/CHINESE_CHARACTERS_19


INFO:tensorflow:Restoring parameters from models/CHINESE_CHARACTERS/CHINESE_CHARACTERS_19


目前促进农民工和外来流动人口的社会融目城市 ---> 目前促进农民工和外来流动人口的社会融入城市


他将自编自导字眼 ---> 他将自编自导自演


其中提出八项革新措施 ---> 其中提出八项革新措施


为了一址房价过会上涨 ---> 为了抑制房价过快上涨


调病检象用至虎头海表示危员和崇高 ---> 调兵遣将用金制虎头牌表示威严和崇高


只要在民天的最后一战中赢下东道主日本 ---> 只要在明天的最后一战中赢下东道主日本


并保持优势最先触比 ---> 并保持优势最先触壁


新疆皮山发生四零级地制震源深度一零千米 ---> 新疆皮山发生四零级地震震源深度一零千米


整体价格已经基本企稳 ---> 整体价格已经基本企稳


宝龙山至打造起标杆作品的重要意义 ---> 宝龙深知打造其标杆作品的重要意义


保证不动产交易安全 ---> 保证不动产交易安全


在俄俄罗斯世界杯预选赛上评中国香港复卡塔尔 ---> 在俄罗斯世界杯预选赛上平中国香港负卡塔尔


对于年龄超过三十五岁的女性而言 ---> 对于年龄超过三十五岁的女性而言


对于上证所防控新股炒作的后续工作安排 ---> 对于上证所防控新股炒作的后续工作安排


证监会下发会人制证券投资资咨询业务管理暂行规定 ---> 证监会下发会员制证券投资咨询业务管理暂行规定


海南省则提出增加经营性房地产产开发 ---> 海南省则提出增加经营性房地产开发
