In [1]:
import tensorflow as tf
import time
import numpy as np
import os
import shutil
from numpy import matrix
from tensorflow.python.ops import ctc_ops as ctc
# Network dimensions.
n_features = 13  # size of the last axis of the `inputs` placeholder
n_classes = 63   # number of columns of the output layer (one score per class)

# Hyperparameters
num_layers = 1   # stacked-layer count hyperparameter
n_hidden = 300   # units of the LSTM cell / rows of the output weight matrix
batch_size = 1   # utterances per training step (train() feeds one file at a time)
n_epochs = 100   # passes over the training files performed by train()

# Target log path
logs_path = '/tmp/tensorflow/timit_speech_recognition'

def variable_summaries(var):
    """Register TensorBoard summaries describing the tensor `var`.

    Creates scalar summaries for the mean, standard deviation, maximum and
    minimum of `var`, plus a histogram of its values. All ops are created
    inside a 'summaries' name scope. Nothing is returned.
    """
    with tf.name_scope('summaries'):
        avg = tf.reduce_mean(var)
        tf.summary.scalar('mean', avg)
        with tf.name_scope('stddev'):
            # Root of the mean squared deviation from the mean.
            squared_dev = tf.square(var - avg)
            spread = tf.sqrt(tf.reduce_mean(squared_dev))
        tf.summary.scalar('stddev', spread)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)

def sparse_tuple_from(sequences, dtype=np.int32):
    """Build the (indices, values, shape) triple used to feed a sparse tensor.

    Every element of `sequences` contributes ``len(matrix(element))`` index
    rows (a single row for a scalar or a flat list) and is appended whole to
    the values. The two index columns are swapped afterwards, so an element's
    position in `sequences` ends up in column 1 of `indices`.

    Args:
        sequences: sized iterable of label entries.
        dtype: numpy dtype of the returned values array.
    Returns:
        A tuple (indices, values, shape) of numpy arrays; `indices` and
        `shape` are int32, `shape` is
        ``[len(sequences), max(indices[:, 1]) + 1]``.
    """
    coords = []
    payload = []

    for position, item in enumerate(sequences):
        n_rows = len(matrix(item))
        coords += [(position, row) for row in range(n_rows)]
        payload.append(item)

    indices = np.asarray(coords, dtype=np.int32)
    # Exchange the two columns: (position, row) -> (row, position).
    indices = indices[:, [1, 0]]
    values = np.asarray(payload, dtype=dtype)
    last_position = indices.max(0)[1]
    shape = np.asarray([len(sequences), last_position + 1], dtype=np.int32)

    return indices, values, shape


# Feature input: float32 with two unconstrained leading axes and n_features
# values on the last axis (presumably [batch, time, n_features] — confirm
# against the feeding code).
inputs = tf.placeholder(tf.float32,[None,None,n_features])
# The three components of the sparse label tensor are fed separately and
# assembled into `targets` below.
target_idx = tf.placeholder(tf.int64)
target_vals = tf.placeholder(tf.int32)
target_shape = tf.placeholder(tf.int64)
targets = tf.SparseTensor(target_idx, target_vals, target_shape)
# Sequence lengths, int32; no static shape is declared.
seq_len = tf.placeholder(tf.int32)

# RNN output node weights and biases: parameters of the linear layer that
# turns each n_hidden-sized RNN output into n_classes scores.
weights = {
    # Weights shape = n_hidden x n_classes
    'out': tf.Variable(tf.random_normal([n_hidden, n_classes]), dtype=tf.float32),
}
biases = {
    # dtype is passed by keyword on purpose: the second positional parameter
    # of tf.Variable is `trainable`, so a positional tf.float32 would be
    # taken as the trainable flag instead of the dtype.
    'out': tf.Variable(tf.random_normal([n_classes]), dtype=tf.float32),
}
        
def LSTM_cell(keep_prob=0.4):
    """Build one LSTM cell of `n_hidden` units with dropout on its outputs.

    Args:
        keep_prob: probability of keeping each output activation, forwarded
            to DropoutWrapper as `output_keep_prob`. Defaults to 0.4, the
            value that used to be hard-coded, so existing calls behave as
            before. Pass 1.0 to build a cell without dropout (e.g. for
            evaluation graphs).
    Returns:
        The BasicLSTMCell wrapped in a DropoutWrapper.
    """
    cell = tf.contrib.rnn.BasicLSTMCell(n_hidden, reuse=tf.get_variable_scope().reuse)
    return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)

def RNN_Model(inputs,seq_len,weights,biases):
    """Build the recurrent network and return its per-frame class scores.

    Args:
        inputs: float tensor fed to tf.nn.dynamic_rnn with its default
            batch-major layout, i.e. [batch, time, features].
        seq_len: sequence lengths passed to dynamic_rnn.
        weights: dict whose 'out' entry is the [n_hidden, n_classes] matrix
            of the output layer.
        biases: dict whose 'out' entry is the [n_classes] output bias.
    Returns:
        Logits transposed to time-major order: [time, batch, n_classes].
    """
    if num_layers > 1:
        # Stack independent cells; MultiRNNCell needs a list, not a generator.
        rnn_cell = tf.contrib.rnn.MultiRNNCell(
            [LSTM_cell() for _ in range(num_layers)])
    else:
        # Single layer: use the bare cell so variable names stay unchanged.
        rnn_cell = LSTM_cell()
    outputs, _ = tf.nn.dynamic_rnn(rnn_cell, inputs, seq_len, dtype=tf.float32)

    # Flatten batch and time so one matmul applies the output layer to every frame.
    outputs = tf.reshape(outputs, [-1, n_hidden])
    logits = tf.matmul(outputs, weights['out']) + biases['out']

    with tf.name_scope('Weights'):
        variable_summaries(weights['out'])

    with tf.name_scope('Biases'):
        variable_summaries(biases['out'])

    with tf.name_scope('Activations'):
        tf.summary.histogram('Activations', logits)

    # Restore the batch axis using the batch size of the tensor actually fed,
    # rather than a module-level constant, so any batch size works.
    fed_batch_size = tf.shape(inputs)[0]
    logits = tf.reshape(logits, [fed_batch_size, -1, n_classes])
    # Swap to time-major order: [time, batch, n_classes].
    logits = tf.transpose(logits, (1, 0, 2))
    return logits

# Build the network; RNN_Model returns the per-frame class scores.
logits=RNN_Model(inputs,seq_len,weights,biases)
# CTC loss between the sparse targets and the logits.
loss = ctc.ctc_loss(targets,logits,seq_len)
with tf.name_scope("CTC_Loss"):
    # Average the loss values and expose the result to TensorBoard.
    cost = tf.reduce_mean(loss)
    tf.summary.scalar('CTC_Loss',cost)
        
# Despite its name, `optimizer` holds the op returned by minimize(): running
# it applies one momentum update that reduces `cost`.
optimizer = tf.train.MomentumOptimizer(learning_rate=0.005, momentum=0.9).minimize(cost)
# Greedy CTC decoding of the logits; decoded[0] is the hypothesis used below.
decoded, log_prob = ctc.ctc_greedy_decoder(logits, seq_len)
        
# Label error rate: mean edit distance between the decoded hypothesis and the targets.
# tf.edit_distance computes the Levenshtein distance between variable-length
# sequences (hypothesis and truth), each provided as a SparseTensor.
# Its `normalize` argument controls whether the distance is divided by the
# length of the truth.
label_error_rate = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                              targets)) 
def convert_to_sequence(val, type='phn'):
    """Render a sequence of class ids as a space-separated phoneme string.

    Ids greater than or equal to the size of the phoneme table are dropped;
    every other id is used to index the table. `type` is accepted but not
    used.
    """
    phonemes = (
        'aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h',
        'axr', 'ay', 'b', 'bcl', 'ch', 'd', 'dcl',
        'dh', 'dx', 'eh', 'el', 'em', 'en', 'eng',
        'epi', 'er', 'ey', 'f', 'g', 'gcl', 'h#',
        'hh', 'hv', 'ih', 'ix', 'iy', 'jh', 'k',
        'kcl', 'l', 'm', 'n', 'ng', 'nx', 'ow',
        'oy', 'p', 'pau', 'pcl', 'q', 'r', 's',
        'sh', 't', 'tcl', 'th', 'uh', 'uw', 'ux',
        'v', 'w', 'y', 'z', 'zh',
    )
    table_size = len(phonemes)
    # Keep only ids that are not at or beyond the end of the table.
    kept = [phonemes[idx] for idx in val if not idx >= table_size]
    return ' '.join(kept)
    
# One op that evaluates every summary registered up to this point.
merged = tf.summary.merge_all()
# Event-file writer for TensorBoard, writing under logs_path.
writer = tf.summary.FileWriter(logs_path)
config = tf.ConfigProto()
# Let a session grow its GPU memory on demand instead of reserving it all up front.
config.gpu_options.allow_growth=True
# Saver covering the global variables that exist at this point.
saver = tf.train.Saver(tf.global_variables())
# Op that initializes all global variables; run once at the start of a session.
init=tf.global_variables_initializer()

def _npy_paths(top_dir):
    """Return the paths of all .npy files found under `top_dir`, in os.walk order."""
    return [os.path.join(root, name)
            for root, _dirs, files in os.walk(top_dir)
            for name in files
            if name.endswith('.npy')]


def _file_stem(path):
    """Return the file name of `path` without its directory and extension."""
    return os.path.splitext(os.path.basename(path))[0]


def train():
    """Train the model one utterance at a time and checkpoint after every epoch.

    Feature files are read from 'mfccnpyFiles' and label files from
    'phonemeLabels' (both relative to the working directory). A feature file
    is used only when a label file with the same base name exists. For every
    utterance the features are standardized, one optimizer step is run, and
    the truth / greedy-decoded phoneme strings and the batch cost are printed.
    After each epoch the mean cost and label error rate are printed, a
    'Train_Cost' summary is written, and the model is saved to
    './saver/model.ckpt'.
    """
    mfcc_files = _npy_paths('mfccnpyFiles')
    # Index the labels once instead of scanning the whole list per feature
    # file. setdefault keeps the first path seen for a name, which is the
    # one a first-match linear search would have picked.
    label_by_stem = {}
    for label_path in _npy_paths('phonemeLabels'):
        label_by_stem.setdefault(_file_stem(label_path), label_path)

    with tf.Session(config=config) as sess:
        sess.run(init)
        writer.add_graph(sess.graph)

        # Summary step that keeps increasing across epochs, so later epochs
        # do not reuse the step numbers of earlier ones.
        global_step = 0
        for curr_epoch in range(n_epochs):
            train_cost = train_ler = 0
            count_files = 0
            for mfcc_path in mfcc_files:
                label_path = label_by_stem.get(_file_stem(mfcc_path))
                if label_path is None:
                    # No label file with this name: skip the utterance.
                    continue

                # Load from the discovered path itself (portable across
                # platforms, and correct for files inside sub-directories).
                train_inputs = np.load(mfcc_path)
                # Add a leading batch axis of size 1.
                train_inputs = np.asarray(train_inputs[np.newaxis, :])
                # Standardize the utterance to zero mean and unit variance.
                train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
                train_seq_len = [train_inputs.shape[1]]

                train_targets = np.load(label_path)
                target_index, target_values, target_shapes = sparse_tuple_from(train_targets)
                feed = {inputs: train_inputs,
                        target_idx: target_index,
                        target_vals: target_values,
                        target_shape: target_shapes,
                        seq_len: train_seq_len}

                # Fetch everything in a single run: the decoded output and
                # the error rate then come from the same forward pass as the
                # reported cost, and the network is evaluated once, not three
                # times. NOTE(review): the graph's dropout is still active
                # for the decoded output.
                summary, batch_cost, _, decoded_ids, batch_ler = sess.run(
                    [merged, cost, optimizer, decoded[0].values, label_error_rate],
                    feed)
                writer.add_summary(summary, global_step)
                global_step += 1

                train_cost += batch_cost * batch_size
                train_ler += batch_ler * batch_size
                count_files += 1

                print('Truth:\n' + convert_to_sequence(train_targets, type='phn'))
                print('Output:\n' + convert_to_sequence(decoded_ids, type='phn'))
                print("Batch Cost {0} after file {1} \n".format(batch_cost, count_files))

            if count_files:
                # Average over the utterances actually processed rather than
                # over a fixed, assumed dataset size.
                train_cost /= count_files
                train_ler /= count_files
                # Write the epoch value as a plain protobuf summary; creating
                # a tf.summary op here would add a node to the graph on
                # every call.
                epoch_summary = tf.Summary(value=[
                    tf.Summary.Value(tag='Train_Cost', simple_value=float(train_cost))])
                writer.add_summary(epoch_summary, curr_epoch)

            print(" Train Cost: {0} in Epoch {1}".format(train_cost, curr_epoch))
            print(" Train LER: {0} in Epoch {1}".format(train_ler, curr_epoch))
            saver.save(sess, "./saver/model.ckpt")
            print("Model saved ")

In [None]:
# Run the training loop; the text that follows in this notebook is its captured output.
train()

Truth:
h# sh iy hv ae dcl d y axr dcl d aa r kcl k s ux tcl t q ix n gcl g r iy s iy w ao sh epi w ao dx axr q ao l y ih axr h#
Output:
dx iy hv dcl axr b hv iy hv b pau b hv iy aa b axr hv d b ow hv b y ih iy d b uh b ah d hv w hv ow uh hv b hv hv hv iy ao hv hv d uh hv b ao b hv hv ao hv
Batch Cost 423.5744934082031 after file 1 

Truth:
h# d ow n tcl t ae s kcl m iy tcl t ix kcl k ih r iy eh n oy l iy r ae gcl g l ay kcl k dh ae tcl h#
Output:

Batch Cost 898.9334716796875 after file 2 

Truth:
h# b r ih kcl k s q aa r axr n ao l tcl t er nx ix tcl t ih v h#
Output:
k n er n ae hh r w eh m n h# n z zh r ch tcl ch ux n aa tcl aa n tcl n dh r eh k eh h# ch n w ae k epi eh ae h# n ae er n jh n tcl ch n tcl ch er l m r ih ey n ch pcl eh m k r
Batch Cost 1298.345947265625 after file 3 

Truth:
h# f ae tcl sh ow dcl d q ix n l uw s r ow l z bcl b ix n iy th ix sh er tcl h#
Output:
ow ey k
Batch Cost 687.2442016601562 after file 4 

Truth:
h# ix tcl s ah f axr z f r ax m ax l ae kcl k ix v

Truth:
h# dh ix kcl k aw w aa n d axr dcl f ah m dh ax f aa r m l ae n dcl d ix n bcl b ix kcl k ey m l ao s tcl h#
Output:
h# h# h# h# h# ae h# q axr h# h# uh h# h# h# q h# h# h# h# h# h# r h# h# h# k h# h#
Batch Cost 202.12957763671875 after file 29 

Truth:
h# k ah tcl t ix s m ao l kcl k ao nx axr q ao f q iy tcl ch q eh dcl jh h#
Output:
h# h# h# dh h# h# h# r h# h# h# h# ae h# h# kcl h# h# h# h# h# h# h# h# h# ae
Batch Cost 164.8043212890625 after file 30 

Truth:
h# sh iy hv ae dcl d y er dcl d aa r kcl k s uw dx ih ng gcl g r iy s iy w aa sh epi w aa dx er q ao l y iy axr h#
Output:
h# h# h# ae h# h# h# h# ix h# h# h# h# h# h# h# h# h# h# h# h# h# h#
Batch Cost 196.1984100341797 after file 31 

Truth:
h# d ow n ae s epi m iy tcl t ix kcl k eh r iy ix n oy l iy r ae gcl g l ay kcl k dh ae tcl h#
Output:
h# l h# ae h# h# ae ae h# aa h# h# kcl h# h# h# h# ae h# h#
Batch Cost 146.11187744140625 after file 32 

Truth:
h# hh ih z kcl k ae pcl t ix n w ax s th ih n ae n hv ae gcl g er

Truth:
h# dh ey r iy m ey n dcl d epi l ay f l ah ng epi f r eh n dcl z q ae n kcl k em pcl p ae n y ix n tcl s h#
Output:
h# iy ix h# iy tcl k iy n iy ix
Batch Cost 166.53427124023438 after file 66 

Truth:
h# b ay q iy dcl d iy ng y ow gcl g er dcl pau y ux m ey l ih v l ao ng gcl g axr h#
Output:
ix h#
Batch Cost 130.5674591064453 after file 67 

Truth:
h# dh ix f r ay tcl en tcl ch ay l dcl w ix z dcl jh eh n tcl l iy s ah bcl d ux dcl d bcl b ay hv ix z bcl b ih gcl b r ah dh axr h#
Output:
iy ix
Batch Cost 184.5084686279297 after file 68 

Truth:
h# dh iy q ow v ax w ey tcl ch aa r m ax kcl k ux dcl d s epi l ih pcl p oy z ih n q ih n tcl t uw q eh n iy w ah n z tcl t iy h#
Output:
ao h# s l
Batch Cost 197.25192260742188 after file 69 

Truth:
h# k l ih f s dcl d ih s pcl p l ey epi w ah s epi m ix s pcl p l ey s tcl t q aa n dh ix s kcl k r iy n h#
Output:
z
Batch Cost 166.9940948486328 after file 70 

Truth:
h# sh iy hv ae dcl d y uh dcl d aa kcl k s ux tcl t pau ih n gcl g r i

Truth:
h# d ow q ae s kcl k m iy tcl t ix kcl k ih r iy ix nx oy l iy r ae gcl g l ay kcl dh ae tcl t h#
Output:
h#
Batch Cost 140.30453491210938 after file 112 

Truth:
h# y axr v oy s ix z dcl d ax l ay tcl f el hh iy ix pcl p r ux v dcl d w ix dh ax w ao r m s m ay l h#
Output:
h#
Batch Cost 165.18353271484375 after file 113 

Truth:
h# t er bcl b y ih l eh n tcl t ay dcl d z r ow z ax z epi m ah tcl ch ix z f ih f tcl t iy f iy tcl t h#
Output:

Batch Cost 162.11134338378906 after file 114 

Truth:
h# s ah m tcl t ay m z epi s ow dcl jh er z epi r ow q l ae dx er z epi w aa l bcl b uh l ih tcl s epi w axr w ih z ix n ah bcl b aw tcl dh eh r hh eh dcl d z h#
Output:
iy ao m n m s
Batch Cost 244.47000122070312 after file 115 

Truth:
h# s ih kcl k l ax kcl k ow pcl p r ow gcl er m z w el n eh v axr kcl k ax m pcl p ay l h#
Output:

Batch Cost 142.0706787109375 after file 116 

Truth:
h# k l ih r pcl p r ax n ah n tcl s ih ey sh ix n eh z ix pcl p r ix sh ix ey dx ix dcl h#
Output:
tc

Truth:
h# q ax m ao th z ih gcl g z ae gcl d ax l ao ng dh ax pcl p ae th r uw q ao t ow z gcl aa r dx en h#
Output:
h# ih aa h# h# h#
Batch Cost 181.06216430664062 after file 159 

Truth:
h# w iy gcl g aa tcl dcl d r eh n tcl ch tcl t f r ax m dh ih q ah nx ix nx ix r ah pcl t ix dcl r ey n h#
Output:

Batch Cost 158.28225708007812 after file 160 

Truth:
h# sh ix hv ae dcl y axr dcl d aa r kcl k s ux tcl en gcl g r iy z ix w aa sh epi w ao dx axr q ao l y ix axr h#
Output:

Batch Cost 159.5186767578125 after file 161 

Truth:
h# d ow nx ae s epi m ix dx ix kcl k eh r iy ix n ao l iy r ae gcl g l ay kcl dh ae tcl h#
Output:

Batch Cost 127.3191909790039 after file 162 

Truth:
h# b aa dh ax l uh kcl k ah v ih m hv ix w ah z ix n tcl dh ae tcl f aa r gcl g ao n h#
Output:
h#
Batch Cost 132.72642517089844 after file 163 

Truth:
h# w axr y ux ix n l ah v w ax th ae tcl g er el h#
Output:
n
Batch Cost 88.77705383300781 after file 164 

Truth:
h# hh ih r dh eh dx ih dcl n aa tcl n iy dcl 

Truth:
h# ey r ow l ow epi w ay r hh l ey n ih axr dh ax w ao l h#
Output:

Batch Cost 91.57411193847656 after file 207 

Truth:
h# hh aw dcl d uw q oy s tcl t ax z m ey kcl p er l s h#
Output:
p
Batch Cost 98.61529541015625 after file 208 

Truth:
h# ih n q eh v r iy m ey dcl jh axr kcl k l ow v ax l iy f tcl t r ae f ix kcl k s ah m tcl t ay ng gcl g eh s bcl b ae kcl t ah pcl h#
Output:

Batch Cost 206.8291015625 after file 209 

Truth:
h# w ih m ix n m ey n eh v ax bcl b iy kcl k ah m kcl k ax m pcl p l iy tcl t l iy q iy kcl k w uh tcl t uw m eh n h#
Output:
h# ix ix ay epi tcl ix iy tcl
Batch Cost 186.21888732910156 after file 210 

Truth:
h# sh iy eh dcl d y er dcl d aa r kcl k s ux tcl q ih ng g r iy s iy w ao r sh epi w ao dx er q ao l y ih axr h#
Output:

Batch Cost 166.91517639160156 after file 211 

Truth:
h# d ow n ae s kcl m iy dcl d ih kcl k eh r iy ix n q oy l iy r ae gcl g l ay kcl dh ae tcl h#
Output:

Batch Cost 128.80056762695312 after file 212 

Truth:
h# dh iy tcl

Truth:
h# t r eh s pcl p ae s ih ng ix z f axr bcl b ih dx en ix n s ah bcl jh ih kcl t ix pcl p eh nx el tcl t iy h#
Output:

Batch Cost 159.9615020751953 after file 256 

Truth:
h# ow n l ix dh ix bcl b eh s tcl p l ey er z ix n dcl jh oy pcl p aa pcl p y ix l eh er tcl t iy h#
Output:

Batch Cost 142.69906616210938 after file 257 

Truth:
h# w iy l ay kcl b l ux tcl ch iy z bcl ix tcl v ih kcl t axr pcl p axr f er s w ih s tcl ch iy z h#
Output:
h# h#
Batch Cost 148.1240997314453 after file 258 

Truth:
h# ey gcl g uh dx ae dx ix tcl t ux dx ix z ah n bcl b iy dx ix bcl b el h#
Output:
iy
Batch Cost 107.11498260498047 after file 259 

Truth:
h# dh ey ao l ay kcl k l ao ng hv aa tcl t sh aw er z h#
Output:

Batch Cost 89.29225158691406 after file 260 

Truth:
h# sh iy hv ae dcl jh axr dcl d aa r kcl k s ux dx ix ng gcl g r iy z iy w aa sh epi w ao dx axr q ao l y ih axr h#
Output:

Batch Cost 169.76170349121094 after file 261 

Truth:
h# t ow n ae s kcl m iy dx ih kcl k eh r iy ix nx

Truth:
h# k l ae pcl ix ng s pcl er z tcl t ix dh ix bcl b r aa ng kcl k hh iy s eh dx ao f ix dx ix sh aa r pcl k ae n tcl t axr pau w ix th gcl g r ow ix ng q ax l aa r m h#
Output:

Batch Cost 250.59356689453125 after file 304 

Truth:
h# eh v r iy th iy eng w eh n q r ih l s epi m uw dh ix sh er ix f s eh dcl h#
Output:

Batch Cost 120.22505187988281 after file 305 

Truth:
h# w ay eh l tcl s w ax dcl d ae nx iy ax l aw ah dh axr z tcl t ix gcl g ow h#
Output:

Batch Cost 126.41519165039062 after file 306 

Truth:
h# dh iy q ae ng g r ix bcl b oy q ae n s axr dcl bcl b ah tcl d ih dcl en tcl l uh kcl k ah pcl h#
Output:

Batch Cost 139.27490234375 after file 307 

Truth:
h# dh ax kcl k aw w aa n axr dcl f axr m dh ax f aa r m l ae n q ix n bcl b iy kcl k ey m l ao s tcl t h#
Output:

Batch Cost 154.2710418701172 after file 308 

Truth:
h# s iy m epi s tcl t r ix s ix z q ix tcl t ae tcl ch z ih pcl p axr z epi w ax th ey th ih m bcl el pau n iy dx el q ix n dcl th r eh dcl h#
Outpu

Truth:
h# d ow n q ae s epi m iy dx ih kcl k ae r iy ix n oy l iy r ae gcl g l ay kcl th ae tcl h#
Output:

Batch Cost 127.47306060791016 after file 352 

Truth:
h# hh ah s kcl k iy y ah ng m ae n hv iy s eh dcl pau w ih dh epi m aa kcl pau d ih s tcl t ey s tcl t h#
Output:

Batch Cost 147.59893798828125 after file 353 

Truth:
h# aa hv aa hh iy th ao tcl q ey epi l ah sh dcl d ix v ao r s iy pau q ae q l ae s tcl t h#
Output:

Batch Cost 133.2965850830078 after file 354 

Truth:
h# dh ah s tcl t eh kcl n ix kcl k el q ax f ih sh ix n s iy q ih z ax tcl ch iy v pau q ae tcl dh iy ih kcl k s pcl p eh n tcl t s ah v ae tcl ch uw ax l q ix kcl k s pcl p ih r iy ix n tcl t s h#
Output:

Batch Cost 313.01531982421875 after file 355 

Truth:
h# aa bcl jh ih kcl s epi m ey dx ax v pcl p y uw dx axr r axr bcl b y uw dx ax f uh l h#
Output:

Batch Cost 138.7455596923828 after file 356 

Truth:
h# dh ix m ao nx iy ng dcl d uw q ah n dh uh s pcl p ay dx ih w eh bcl pau g l ih s ih n dcl d ax-h p

In [None]:
# Reload the checkpoint written by train() and print the output-layer weights.
# The session uses the shared `config` (GPU memory growth), like train() does.
with tf.Session(config=config) as session:
    # No initializer run is needed first: restore() assigns a value to every
    # variable tracked by `saver`.
    saver.restore(session,"./saver/model.ckpt")
    print("Restored")
    print(session.run(weights['out']))

# **NOTE**
When using any of Tensorflow’s rnn functions with padded inputs it is important to pass the sequence_length parameter. In my opinion this parameter should be required, not optional. sequence_length serves two purposes: 1. Save computational time and 2. Ensure Correctness.
Let’s say you have a batch of two examples, one is of length 13, and the other of length 20. Each one is a vector of 128 numbers. The length 13 example is 0-padded to length 20. Then your RNN input tensor is of shape [2, 20, 128]. The dynamic_rnn function returns a tuple of (outputs, state), where outputs is a tensor of size [2, 20, ...] with the last dimension being the RNN output at each time step. state is the last state for each example, and it’s a tensor of size [2, ...] where the last dimension also depends on what kind of RNN cell you’re using.
So, here’s the problem: Once you reach time step 13, your first example in the batch is already “done” and you don’t want to perform any additional calculation on it. The second example isn’t and must go through the RNN until step 20. By passing sequence_length=[13,20] you tell Tensorflow to stop calculations for example 1 at step 13 and simply copy the state from time step 13 to the end. The output will be set to 0 for all time steps past 13. You’ve just saved some computational cost. But more importantly, if you didn’t pass sequence_length you would get incorrect results! Without passing sequence_length, Tensorflow will continue calculating the state until T=20 instead of simply copying the state from T=13. This means you would calculate the state using the padded elements, which is not what you want.