In [None]:
import tensorflow as tf
import numpy as np
from math import ceil

In [None]:
def init_weights(CNN,FC):
    """Create the trainable weights and biases for the conv and fully connected layers.

    Args:
        CNN: list of layer specs; each spec's 'conv' entry is
            [filter_size, strides, num_filters]. Only filter_size and
            num_filters are read here.
        FC: list of dicts with a 'units' entry. Consecutive entries define one
            fully connected layer each, so len(FC) - 1 weight matrices are created.

    Returns:
        (wconv, bconv, wfc, bfc): four lists of tf.Variable.
        wconv[i] has shape [filter_size, filter_size, ch_in, ch_out] and
        bconv[i] has shape [ch_out]; wfc[j] has shape
        [FC[j]['units'], FC[j+1]['units']] and bfc[j] has shape [FC[j+1]['units']].
    """

    #Weights Initializer
    fc_initializer = tf.contrib.layers.xavier_initializer()
    conv_initializer = tf.contrib.layers.xavier_initializer_conv2d()

    #All theses params are created and returned...
    wconv = [] ; bconv = []
    wfc = [] ; bfc = []

    #Convolution filters and biases
    #Number of input channels = 1 in 1st conv layer
    ch_in = 1
    for layer in CNN:
        filter_size = layer['conv'][0]

        #Number of output_channels/filters
        ch_out = layer['conv'][2]

        wconv.append(tf.Variable(conv_initializer([filter_size,filter_size,ch_in,ch_out])))
        #One bias per output channel
        bconv.append(tf.Variable(tf.zeros([ch_out])))

        #Output channels of this layer feed the next one
        ch_in = ch_out

    #Fully connected weights and biases: layer j maps fc_units[j] -> fc_units[j+1]
    fc_units = [layer['units'] for layer in FC]
    for units_in,units_out in zip(fc_units[:-1],fc_units[1:]):
        wfc.append(tf.Variable(fc_initializer([units_in,units_out])))
        bfc.append(tf.Variable(tf.zeros([units_out])))

    return wconv,bconv,wfc,bfc

In [None]:
#Notebook-level debugging flag (module-level boolean, enabled here)
debug = True

In [None]:
#Model Params
#CNN layer format:
#   'conv'     : [filter_size, strides, num_filters]
#   'activate' : name of the activation applied after the convolution
#   'pool'     : pooling window/stride for that layer, or None for no pooling
CNN = [
    {'conv': [3, 1, 25],  'activate': 'relu', 'pool': 2},
    {'conv': [3, 1, 50],  'activate': 'relu', 'pool': 2},
    {'conv': [3, 1, 100], 'activate': 'relu', 'pool': 2},
    {'conv': [3, 1, 200], 'activate': 'relu', 'pool': 3},
    {'conv': [3, 1, 400], 'activate': 'relu', 'pool': None},
]

#Recurrent part: number of stacked layers and hidden units per LSTM cell
BRNN = {
    'layers': 5,
    'hidden_units': 256,
}

#Fully connected unit counts, in order; the first entry is 2*hidden_units
#and the last one is vocab_size
FC = [
    {'units': 2 * BRNN['hidden_units'], 'activate': None},
    {'units': 2 * vocab_size, 'activate': None},
    {'units': vocab_size, 'activate': None},
]

In [None]:
# with tf.device('/gpu:0'):

#Model
#----------------------------------------------------------------------------#
#NOTE: this cell calls conv_layer, max_pool, calc_out_dims, lstm_layer and fc_layer,
#which are defined in cells further down the notebook; those cells must be run first.

wconv,bconv,wfc,bfc = init_weights(CNN,FC)

#Scalar dropout values fed at run time; they are handed unchanged to
#tf.nn.dropout / DropoutWrapper(output_keep_prob=...) by the layer helpers
dropout_conv = tf.placeholder(tf.float32,shape=[])
dropout_lstm = tf.placeholder(tf.float32,shape=[])
dropout_fc = tf.placeholder(tf.float32,shape=[])

#Input 'Image'
inputs = tf.placeholder(tf.float32,shape=[None,img_height,img_width])

#Add the single input channel expected by the first convolution
X = tf.reshape(inputs,(-1,img_height,img_width,1))

#-------------------Convolution-----------------------#

conv = [None] * len(CNN)

for i in range(len(CNN)):
    #The first layer reads the image; every later layer reads the previous layer's output
    #(wconv[i] is built with ch_in equal to the previous layer's num_filters)
    conv_input = X if i == 0 else conv[i-1]

    #Stride comes from the layer spec: 'conv':[filter_size,strides,num_filters]
    conv[i] = conv_layer(conv_input,wconv[i],bconv[i],CNN[i]['conv'][1],CNN[i]['activate'],dropout_conv)

    if CNN[i]['pool']:
        conv[i] = max_pool(conv[i],CNN[i]['pool'])

#Calculate height and width of output from CNN
conv_out_height,conv_out_width = calc_out_dims(CNN,img_height,img_width)
print('Convolution_Output_size:({},{})'.format(conv_out_height,conv_out_width))

#----------------LSTM--------------------------#
#Treat a single pixel from each filter or feature map as an individual feature
#So number of features = number of filters in the last conv layer
#length_of_sequence = width * height of the output from the last conv layer

filters_in_last_conv = CNN[-1]['conv'][2]
lstm_inputs = tf.reshape(conv[-1],(-1,conv_out_height*conv_out_width,filters_in_last_conv))

#Number of time_steps to unroll for..
seq_len = conv_out_height * conv_out_width

#Per-example sequence lengths, passed to ctc_loss and the CTC decoder;
#the batch dimension is left open so that we can use a different batch size during testing...
time_steps = tf.placeholder(tf.int32,shape = [None])
targets = tf.sparse_placeholder(tf.int32,name='targets')

lstm_initializer = tf.contrib.layers.xavier_initializer()
fw_layer = lstm_layer(BRNN['layers'],BRNN['hidden_units'],lstm_initializer,dropout=dropout_lstm)
bw_layer = lstm_layer(BRNN['layers'],BRNN['hidden_units'],lstm_initializer,dropout=dropout_lstm)
(outputs_fw,outputs_bw),_ = tf.nn.bidirectional_dynamic_rnn(fw_layer,bw_layer,lstm_inputs,dtype=tf.float32)

# outputs,_,_ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(cells_fw,cells_bw,inputs = lstm_inputs,dtype=tf.float32)

print('LSTM_Output_size:({},{})'.format(outputs_fw,outputs_bw))

#Concatenate the output from both cells (forward and backward)
blstm_outputs = tf.concat([outputs_fw,outputs_bw], 2)

#flatten out all except the last dimension
fc_inputs  = tf.reshape(blstm_outputs,[-1,2*BRNN['hidden_units']])

#Feed into the fully connected layers
#No activation because the output of the last layer is fed into the CTC layer as logits
fc_outputs_1 = fc_layer(fc_inputs,wfc[0],bfc[0],activation=None,dropout=dropout_fc)
fc_outputs_2 = fc_layer(fc_outputs_1,wfc[1],bfc[1],activation=None,dropout=dropout_fc)

#Reshape back to batch_size, seq_len,vocab_size
logits = tf.reshape(fc_outputs_2,[-1,seq_len,vocab_size])

#convert them to time major
logits = tf.transpose(logits,[1,0,2])

#Calculate loss
loss = tf.nn.ctc_loss(targets, logits, time_steps)
cost = tf.reduce_mean(loss)

#Optimize
#NOTE(review): this minimizes the per-example `loss` vector rather than the mean `cost`;
#left unchanged to keep the training behaviour, confirm which one is intended.
optimizer = tf.train.RMSPropOptimizer(learning_rate=alpha)
train = optimizer.minimize(loss)

# CTC decoder.
decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, time_steps)
label_error_rate = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),targets))

In [None]:
def conv_layer(x,w,b,s,activation=None,dropout=1.0):
    """2-D convolution with 'SAME' padding plus bias, optional activation, then dropout.

    Args:
        x: input passed to tf.nn.conv2d.
        w: convolution filter.
        b: bias added to the convolution result.
        s: stride used for both middle entries of strides=[1,s,s,1].
        activation: one of 'relu', 'leaky_relu', 'elu', 'tanh'; any other
            value (including None) applies no activation.
        dropout: forwarded as the second positional argument of tf.nn.dropout.

    Returns:
        The result of tf.nn.dropout applied to the (optionally activated) output.
    """
    supported = ('relu','leaky_relu','elu','tanh')

    result = tf.nn.conv2d(input=x,filter=w,padding='SAME',strides=[1,s,s,1]) + b

    #Each supported name matches the tf.nn function of the same name;
    #it is looked up only when actually requested
    if activation in supported:
        result = getattr(tf.nn,activation)(result)

    return tf.nn.dropout(result,dropout)

In [None]:
def fc_layer(x,w,b,activation=None,dropout=1.0):
    """Fully connected layer: matmul(x, w) + b, optional activation, then dropout.

    Args:
        x: input passed as the first operand of tf.matmul.
        w: weight matrix (second operand of tf.matmul).
        b: bias added to the product.
        activation: one of 'relu', 'leaky_relu', 'elu', 'tanh'; any other
            value (including None) applies no activation.
        dropout: forwarded as the second positional argument of tf.nn.dropout.

    Returns:
        The result of tf.nn.dropout applied to the (optionally activated) output.
    """

    out = tf.matmul(x,w) + b

    if activation == 'relu':
        out = tf.nn.relu(out)

    elif activation == 'leaky_relu':
        out = tf.nn.leaky_relu(out)

    elif activation == 'elu':
        out = tf.nn.elu(out)

    elif activation == 'tanh':
        #Previously this branch applied elu by mistake
        out = tf.nn.tanh(out)

    out = tf.nn.dropout(out,dropout)

    return out

In [None]:
def lstm_layer(num_layers,hidden_units,initializer,dropout=1.0):
    """Build a MultiRNNCell of `num_layers` dropout-wrapped LSTM cells.

    Args:
        num_layers: number of LSTM cells to stack.
        hidden_units: unit count given to each LSTMCell.
        initializer: initializer given to each LSTMCell.
        dropout: forwarded as output_keep_prob of each DropoutWrapper.

    Returns:
        A tf.contrib.rnn.MultiRNNCell wrapping the created cells.
    """
    rnn = tf.contrib.rnn

    def wrapped_cell():
        #One LSTM cell with dropout applied to its output
        base = rnn.LSTMCell(hidden_units,initializer=initializer)
        return rnn.DropoutWrapper(base,output_keep_prob=dropout,dtype=tf.float32)

    stacked = [wrapped_cell() for _ in range(num_layers)]

    return rnn.MultiRNNCell(stacked)

In [None]:
def max_pool(x,s):
    """Max-pool `x` with an s-by-s window and stride s, using 'SAME' padding."""
    window = [1,s,s,1]
    #Window size and stride are identical; pass separate list objects as before
    return tf.nn.max_pool(x,ksize=window,strides=list(window),padding='SAME')

In [None]:
def calc_out_dims(CNN,height,width):
    """Compute the height and width left after all conv/pool layers in `CNN`.

    For every layer both dimensions are divided by the conv stride
    (CNN[i]['conv'][1]) and rounded up; if the layer's 'pool' value is truthy
    they are divided by it and rounded up again.

    Args:
        CNN: list of layer specs with 'conv' and 'pool' entries.
        height: input height.
        width: input width.

    Returns:
        (height, width) after the last layer; the inputs unchanged if CNN is empty.
    """
    def shrink(size,factor):
        #Ceiling division, done in floating point as in a 'SAME'-padded layer
        return ceil(float(size) / float(factor))

    for layer in CNN:
        stride = layer['conv'][1]
        height,width = shrink(height,stride),shrink(width,stride)

        pool = layer['pool']
        if pool:
            height,width = shrink(height,pool),shrink(width,pool)

    return height,width