### Optical character recognition using Bidirectional RNNs

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import os
import gzip
import csv

In [3]:
import numpy as np
import tensorflow as tf

  return f(*args, **kwds)


In [4]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

In [5]:
from six.moves import urllib

In [6]:
print(np.__version__)
print(tf.__version__)

1.16.2
1.13.1


In [7]:
URL_PATH = 'http://ai.stanford.edu/~btaskar/ocr/letter.data.gz'
DOWNLOADED_FILENAME = 'letter.data.gz'

def download_data():
    """Fetch the OCR letter archive unless a local copy already exists.

    The archive at ``URL_PATH`` is saved as ``DOWNLOADED_FILENAME`` in the
    current working directory. When that file is already present no network
    request is made. Two status lines are printed in either case.

    Returns:
        None
    """
    if not os.path.exists(DOWNLOADED_FILENAME):
        # Download under a scratch name and rename only after the transfer
        # finished, so an interrupted download never leaves a truncated
        # archive that later runs would mistake for a complete one.
        partial_filename = DOWNLOADED_FILENAME + '.part'
        urllib.request.urlretrieve(URL_PATH, partial_filename)
        os.rename(partial_filename, DOWNLOADED_FILENAME)

    print('Found and verified file from this path: ', URL_PATH)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

In [8]:
download_data()

Found and verified file from this path:  http://ai.stanford.edu/~btaskar/ocr/letter.data.gz
Downloaded file:  letter.data.gz


In [9]:
def read_lines():
    """Load every record of the downloaded archive into memory.

    The gzip file named by ``DOWNLOADED_FILENAME`` is opened in text mode and
    parsed as tab-separated values.

    Returns:
        list: one list of string fields per row of the file.
    """
    with gzip.open(DOWNLOADED_FILENAME, 'rt') as archive:
        return [row for row in csv.reader(archive, delimiter='\t')]

In [10]:
lines = read_lines()

### Format of every line

* id
* letter
* next_id
* word_id
* position
* fold
* 16x8 columns of pixel values

In [11]:
lines[0][:8]

['1', 'o', '2', '1', '1', '0', '0', '0']

In [12]:
len(lines)

52152

In [13]:
def get_features_labels(lines):
    """Group per-letter records into words.

    Records are processed in ascending order of their integer id (column 0).
    For each record the letter is taken from column 1 and columns 6..133 are
    parsed as 128 integers and reshaped to a 16x8 array. A record whose
    next_id (column 2) equals -1 closes the current word.

    Args:
        lines: iterable of rows, each a sequence of string fields.

    Returns:
        tuple: ``(data, target)`` where ``data`` is a list of words, each a
        list of 16x8 integer arrays, and ``target`` is the parallel list of
        words, each a list of letters. Records after the last closing record
        are not returned.
    """
    ordered_records = sorted(lines, key=lambda record: int(record[0]))

    all_word_pixels = []
    all_word_letters = []
    current_pixels = []
    current_letters = []

    for record in ordered_records:
        # next_id == -1 marks the final letter of a word.
        ends_word = int(record[2]) == -1

        bitmap = np.array([int(value) for value in record[6:134]]).reshape((16, 8))
        current_pixels.append(bitmap)
        current_letters.append(record[1])

        if not ends_word:
            continue

        all_word_pixels.append(current_pixels)
        all_word_letters.append(current_letters)
        current_pixels = []
        current_letters = []

    return all_word_pixels, all_word_letters

In [14]:
data, target = get_features_labels(lines)

In [15]:
def pad_features_labels(data, target):
    """Pad every word to the length of the longest word and convert to arrays.

    Pixel sequences are extended with all-zero 16x8 arrays and letter
    sequences with empty strings until each has as many entries as the
    longest word in ``target``.

    Args:
        data: list of words, each a list of 16x8 arrays.
        target: list of words, each a list of letters.

    Returns:
        tuple: ``(np.array(padded_data), np.array(padded_target))``.
    """
    longest = max(map(len, target))
    blank_image = np.zeros((16, 8))

    padded_images = []
    for word_pixels in data:
        missing = longest - len(word_pixels)
        padded_images.append(word_pixels + [blank_image] * missing)

    padded_letters = []
    for word in target:
        missing = longest - len(word)
        padded_letters.append(word + [''] * missing)

    return np.array(padded_images), np.array(padded_letters)

In [16]:
padded_data, padded_target = pad_features_labels(data, target)

In [17]:
padded_target[:10]

array([['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', '']],
      dtype='<U1')

#### The length of each sequence

We've padded all words so that their lengths are all equal to the length of the longest word

In [18]:
sequence_length = len(padded_target[0])

In [19]:
sequence_length

14

In [20]:
padded_data.shape

(6877, 14, 16, 8)

In [21]:
padded_data.shape[:2] + (-1,)

(6877, 14, -1)

In [22]:
reshaped_data = padded_data.reshape(padded_data.shape[:2] + (-1,))

In [23]:
reshaped_data.shape

(6877, 14, 128)

In [24]:
padded_target.shape

(6877, 14)

In [25]:
padded_target.shape + (26,)

(6877, 14, 26)

In [26]:
one_hot_target = np.zeros(padded_target.shape + (26,))

In [27]:
# Set a single 1 per real character; padded positions (empty strings) are
# skipped and keep their all-zero vector.
for position, char in np.ndenumerate(padded_target):
    if not char:
        continue
    one_hot_target[position + (ord(char) - ord('a'),)] = 1

#### One-hot representation of the letter 'o'

* The letter 'o' is represented by a 1 at index 14 (`ord('o') - ord('a')`)
* Index positions start at 0

In [28]:
one_hot_target[0][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [29]:
# Draw one random ordering and apply it to both arrays so every word stays
# paired with its own labels.
shuffled_indices = np.random.permutation(reshaped_data.shape[0])

shuffled_data, shuffled_target = (reshaped_data[shuffled_indices],
                                  one_hot_target[shuffled_indices])

In [30]:
# The first 66% of the shuffled words are used for training, the rest for testing.
split = int(len(shuffled_data) * 0.66)

train_data, test_data = shuffled_data[:split], shuffled_data[split:]
train_target, test_target = shuffled_target[:split], shuffled_target[split:]

In [31]:
train_data.shape

(4538, 14, 128)

In [32]:
_, num_steps, num_inputs = train_data.shape

In [33]:
train_target.shape

(4538, 14, 26)

In [34]:
num_classes = train_target.shape[2]

In [35]:
tf.reset_default_graph()

In [36]:
X = tf.placeholder(tf.float64, [None, num_steps, num_inputs])

y = tf.placeholder(tf.float64, [None, num_steps, num_classes])

#### Sequence length calculation

In [37]:
# A time step counts as "used" when its input vector has any non-zero value:
# sign(max(|X|)) is 1 for such steps and 0 for all-zero (padding) steps.
# `axis` replaces the deprecated `reduction_indices` alias.
used = tf.sign(tf.reduce_max(tf.abs(X), axis=2))

# Summing the 0/1 flags along the time axis gives the number of used steps
# per word. Note that this rebinds `sequence_length`, which an earlier cell
# set to the padded word length (a plain Python int).
length = tf.reduce_sum(used, axis=1)
sequence_length = tf.cast(length, tf.int64)

In [38]:
sequence_length

<tf.Tensor 'Cast:0' shape=(?,) dtype=int64>

#### RNN for training and prediction

In [39]:
num_neurons = 300

#### Forward RNN to feed in each word in the right order

Make sure you specify a scope for each RNN so you can initialize multiple RNNs in the same graph (the default scope is *'rnn'* which will clash across the two RNNs we set up)

In [40]:
forward, _ = tf.nn.dynamic_rnn(tf.nn.rnn_cell.GRUCell(num_neurons), X,
                               dtype=tf.float64, sequence_length=sequence_length,
                               scope='rnn-forward')

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Colocations handled automatically by placer.


#### Reverse the characters in each word and feed in to another forward RNN

* Reverse along axis 1, i.e. the order of the characters within each word
* Note that only the first `sequence_length` characters of each word are reversed; the trailing padding stays in place

In [41]:
# Flip only the first `sequence_length` steps of each word along the time
# axis. `seq_axis` replaces the deprecated `seq_dim` keyword.
X_reversed = tf.reverse_sequence(X, sequence_length, seq_axis=1)

# Second GRU, run over the reversed input under its own variable scope.
backward, _ = tf.nn.dynamic_rnn(tf.nn.rnn_cell.GRUCell(num_neurons), X_reversed,
                                dtype=tf.float64, sequence_length=sequence_length,
                                scope='rnn-backward')

Instructions for updating:
seq_dim is deprecated, use seq_axis instead


#### Get output back in the original order

In [42]:
# Undo the earlier reversal so step t of `backward` lines up with step t of
# `forward`. `seq_axis` replaces the deprecated `seq_dim` keyword.
backward = tf.reverse_sequence(backward, sequence_length, seq_axis=1)

In [43]:
backward, forward

(<tf.Tensor 'ReverseSequence_1:0' shape=(?, 14, 300) dtype=float64>,
 <tf.Tensor 'rnn-forward/transpose_1:0' shape=(?, 14, 300) dtype=float64>)

In [44]:
output = tf.concat([forward, backward], axis=2)

In [45]:
output.shape

TensorShape([Dimension(None), Dimension(14), Dimension(600)])

#### Shared softmax layer

In [46]:
weight = tf.Variable(tf.truncated_normal([num_neurons * 2, num_classes], stddev=0.01, dtype=tf.float64))

In [47]:
bias = tf.Variable(tf.constant(0.1, shape=[num_classes], dtype=tf.float64))

In [48]:
flattened_output = tf.reshape(output, [-1, num_neurons * 2])

In [49]:
flattened_output

<tf.Tensor 'Reshape:0' shape=(?, 600) dtype=float64>

In [50]:
logits = tf.matmul(flattened_output, weight) + bias

In [51]:
logits_reshaped = tf.reshape(logits, [-1, num_steps, num_classes])

#### Cost calculation

In [52]:
# Per-character cross entropy. `logits_reshaped` is used (rather than the
# flattened `logits`) so that logits and labels have the same
# [batch, num_steps, num_classes] shape, and the `_v2` op replaces the
# deprecated `softmax_cross_entropy_with_logits`. `y` is a fed placeholder,
# so letting gradients flow into the labels (the v2 behaviour) changes nothing.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits_reshaped)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [53]:
loss = tf.reduce_mean(cross_entropy)

#### Error calculation

In [54]:
# Flag every time step whose predicted class differs from the labelled class.
mistakes = tf.not_equal(tf.argmax(y, 2), tf.argmax(logits_reshaped, 2))
mistakes = tf.cast(mistakes, tf.float64)

# Steps whose label vector is all zeros get a mask of 0, so they never count
# as mistakes. `axis` replaces the deprecated `reduction_indices` alias.
mask = tf.sign(tf.reduce_max(tf.abs(y), axis=2))
mistakes *= mask

In [55]:
# Count the mistakes in each word, then divide by that word's length to get
# a per-word error fraction. `axis` replaces the deprecated
# `reduction_indices` alias.
mistakes = tf.reduce_sum(mistakes, axis=1)
mistakes /= tf.cast(sequence_length, tf.float64)

In [56]:
error = tf.reduce_mean(mistakes)

#### Optimizer

In [57]:
optimizer = tf.train.RMSPropOptimizer(0.002)

In [58]:
gradient = optimizer.compute_gradients(loss)

In [59]:
optimize = optimizer.apply_gradients(gradient)

In [60]:
def batched(data, target, batch_size):
    """Endlessly yield mini-batches together with the current epoch number.

    A window of ``batch_size`` rows slides over ``data`` and ``target``. The
    offset is advanced *before* each slice is taken and wraps modulo
    ``target.shape[0] - batch_size``, so the very first batch starts at row
    ``batch_size``. Each time the offset wraps, both arrays are re-indexed
    with one shared random permutation and the epoch counter is incremented.

    When the dataset has no more rows than ``batch_size`` (where the sliding
    window would divide by zero or produce a negative modulus), every batch is
    the whole dataset and each yield counts as one epoch.

    Args:
        data: array indexed by sample along axis 0 (at least 2-D).
        target: array with the same number of rows as ``data`` (at least 2-D).
        batch_size: positive number of rows per batch.

    Yields:
        tuple: ``(batch_data, batch_target, epoch)``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got {}'.format(batch_size))

    epoch = 0
    num_samples = target.shape[0]

    if num_samples <= batch_size:
        # Not enough rows to slide a window: hand out everything once per epoch.
        while True:
            yield data[0:batch_size, :], target[0:batch_size, :], epoch

            shuffled_indices = np.random.permutation(len(data))
            data = data[shuffled_indices]
            target = target[shuffled_indices]
            epoch += 1

    window = num_samples - batch_size
    offset = 0
    while True:
        old_offset = offset
        offset = (offset + batch_size) % window

        # Offset wrapped around to the beginning so new epoch
        if offset < old_offset:
            # New epoch, need to shuffle data; the same permutation is applied
            # to both arrays so rows stay aligned.
            shuffled_indices = np.random.permutation(len(data))

            data = data[shuffled_indices]
            target = target[shuffled_indices]

            epoch += 1

        batch_data = data[offset:(offset + batch_size), :]

        batch_target = target[offset:(offset + batch_size), :]

        yield batch_data, batch_target, epoch

In [61]:
batch_size = 10
batches = batched(train_data, train_target, batch_size)

In [62]:
epochs = 5

In [63]:
with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    # Each item produced by `batches` is (batch_data, batch_target, epoch).
    for index, (batch_data, batch_target, epoch) in enumerate(batches):
        # The generator never ends on its own; stop after `epochs` full passes.
        if epoch >= epochs:
            break

        feed = {X: batch_data, y: batch_target}
        train_error, _ = sess.run([error, optimize], feed)

        print('{}: {:3.6f}%'.format(index + 1, 100 * train_error))

    # Evaluation only: fetching `optimize` here would apply a gradient step
    # computed from the held-out data, i.e. train on the test set.
    test_feed = {X: test_data, y: test_target}
    test_error = sess.run(error, test_feed)

    print('Test error: {:3.6f}%'.format(100 * test_error))

1: 100.000000%
2: 95.961538%
3: 97.222222%
4: 96.638889%
5: 100.000000%
6: 99.285714%
7: 89.444444%
8: 95.090909%
9: 97.888889%
10: 95.210317%
11: 94.285714%
12: 97.857143%
13: 94.079365%
14: 97.222222%
15: 98.333333%
16: 96.888889%
17: 89.532967%
18: 96.750000%
19: 92.154762%
20: 96.000000%
21: 96.333333%
22: 87.829365%
23: 92.906926%
24: 91.107753%
25: 94.238095%
26: 87.940476%
27: 90.555556%
28: 95.813492%
29: 90.336219%
30: 93.809524%
31: 90.142857%
32: 96.333333%
33: 88.238095%
34: 89.772727%
35: 90.246642%
36: 90.309524%
37: 83.531746%
38: 87.209596%
39: 84.935786%
40: 91.710317%
41: 90.027056%
42: 78.704545%
43: 84.523810%
44: 92.500000%
45: 90.959596%
46: 87.916667%
47: 90.642857%
48: 80.159951%
49: 80.652015%
50: 86.666667%
51: 82.090909%
52: 86.110390%
53: 91.000000%
54: 88.924242%
55: 84.230769%
56: 85.119048%
57: 91.765873%
58: 81.757576%
59: 88.836580%
60: 89.960317%
61: 91.944444%
62: 83.261905%
63: 77.987374%
64: 82.448662%
65: 81.564103%
66: 83.666667%
67: 95.000000%
68

528: 10.037879%
529: 10.000000%
530: 7.380952%
531: 1.623377%
532: 7.428571%
533: 3.611111%
534: 3.333333%
535: 6.524476%
536: 6.444444%
537: 2.909091%
538: 4.444444%
539: 2.020202%
540: 2.833333%
541: 5.213675%
542: 2.500000%
543: 0.714286%
544: 4.285714%
545: 4.761905%
546: 2.929293%
547: 10.476190%
548: 7.095238%
549: 4.523810%
550: 5.095238%
551: 5.000000%
552: 2.833333%
553: 0.000000%
554: 1.666667%
555: 8.666667%
556: 10.909091%
557: 5.634921%
558: 6.666667%
559: 10.357143%
560: 5.512821%
561: 8.088578%
562: 4.444444%
563: 1.111111%
564: 1.000000%
565: 11.238095%
566: 3.095238%
567: 2.111111%
568: 5.650794%
569: 3.031136%
570: 0.769231%
571: 5.059524%
572: 7.222222%
573: 0.000000%
574: 0.000000%
575: 3.750000%
576: 3.750000%
577: 7.222222%
578: 5.694444%
579: 4.583333%
580: 0.000000%
581: 0.714286%
582: 0.909091%
583: 4.761905%
584: 4.761905%
585: 2.769231%
586: 1.000000%
587: 4.444444%
588: 3.690476%
589: 8.412698%
590: 1.623377%
591: 4.500000%
592: 6.752137%
593: 9.583333%
594:

1071: 1.111111%
1072: 2.000000%
1073: 0.769231%
1074: 3.111111%
1075: 0.000000%
1076: 1.666667%
1077: 0.000000%
1078: 0.000000%
1079: 0.000000%
1080: 0.000000%
1081: 0.000000%
1082: 3.111111%
1083: 0.000000%
1084: 0.000000%
1085: 0.000000%
1086: 0.000000%
1087: 0.000000%
1088: 0.714286%
1089: 2.111111%
1090: 0.000000%
1091: 2.000000%
1092: 7.380952%
1093: 1.250000%
1094: 0.000000%
1095: 0.000000%
1096: 0.000000%
1097: 0.000000%
1098: 3.333333%
1099: 0.000000%
1100: 0.000000%
1101: 0.000000%
1102: 0.000000%
1103: 0.000000%
1104: 1.428571%
1105: 0.000000%
1106: 3.333333%
1107: 0.000000%
1108: 0.000000%
1109: 0.000000%
1110: 0.000000%
1111: 3.333333%
1112: 5.000000%
1113: 1.250000%
1114: 0.000000%
1115: 0.000000%
1116: 0.000000%
1117: 4.107143%
1118: 0.000000%
1119: 1.111111%
1120: 3.333333%
1121: 0.000000%
1122: 2.000000%
1123: 6.666667%
1124: 0.000000%
1125: 0.000000%
1126: 0.000000%
1127: 2.000000%
1128: 0.000000%
1129: 0.000000%
1130: 2.539683%
1131: 0.000000%
1132: 0.000000%
1133: 0.

1585: 0.000000%
1586: 0.000000%
1587: 0.000000%
1588: 0.000000%
1589: 0.000000%
1590: 0.000000%
1591: 0.000000%
1592: 1.000000%
1593: 0.000000%
1594: 3.333333%
1595: 0.000000%
1596: 0.000000%
1597: 0.000000%
1598: 0.000000%
1599: 0.000000%
1600: 0.000000%
1601: 0.000000%
1602: 0.000000%
1603: 0.000000%
1604: 0.000000%
1605: 0.000000%
1606: 0.000000%
1607: 0.000000%
1608: 0.000000%
1609: 0.000000%
1610: 6.666667%
1611: 0.000000%
1612: 0.000000%
1613: 0.000000%
1614: 0.000000%
1615: 0.000000%
1616: 1.111111%
1617: 0.000000%
1618: 0.000000%
1619: 0.000000%
1620: 1.428571%
1621: 0.000000%
1622: 0.000000%
1623: 0.000000%
1624: 0.000000%
1625: 0.000000%
1626: 0.000000%
1627: 0.000000%
1628: 1.250000%
1629: 0.000000%
1630: 0.000000%
1631: 3.333333%
1632: 0.000000%
1633: 0.000000%
1634: 0.000000%
1635: 1.666667%
1636: 0.000000%
1637: 0.000000%
1638: 1.666667%
1639: 0.000000%
1640: 0.000000%
1641: 0.714286%
1642: 0.000000%
1643: 0.000000%
1644: 0.000000%
1645: 0.000000%
1646: 0.000000%
1647: 0.

2101: 0.000000%
2102: 0.000000%
2103: 0.000000%
2104: 0.000000%
2105: 0.000000%
2106: 5.333333%
2107: 0.000000%
2108: 1.666667%
2109: 0.000000%
2110: 0.000000%
2111: 0.000000%
2112: 0.000000%
2113: 0.000000%
2114: 0.000000%
2115: 0.000000%
2116: 0.000000%
2117: 0.000000%
2118: 0.000000%
2119: 0.000000%
2120: 0.000000%
2121: 0.000000%
2122: 0.000000%
2123: 0.000000%
2124: 0.000000%
2125: 0.000000%
2126: 1.111111%
2127: 0.000000%
2128: 0.000000%
2129: 0.000000%
2130: 0.000000%
2131: 0.000000%
2132: 0.000000%
2133: 0.000000%
2134: 0.000000%
2135: 0.000000%
2136: 0.000000%
2137: 0.000000%
2138: 0.000000%
2139: 0.000000%
2140: 0.000000%
2141: 0.000000%
2142: 0.000000%
2143: 0.000000%
2144: 0.000000%
2145: 0.000000%
2146: 0.000000%
2147: 0.000000%
2148: 0.000000%
2149: 0.000000%
2150: 0.000000%
2151: 0.000000%
2152: 0.000000%
2153: 0.000000%
2154: 0.000000%
2155: 0.000000%
2156: 0.000000%
2157: 0.000000%
2158: 0.000000%
2159: 0.000000%
2160: 0.000000%
2161: 0.000000%
2162: 0.000000%
2163: 0.