### Optical character recognition using RNNs

In [1]:
!pip install --upgrade numpy
!pip install --upgrade tensorflow

Collecting numpy
  Using cached https://files.pythonhosted.org/packages/85/51/ba4564ded90e093dbb6adfc3e21f99ae953d9ad56477e1b0d4a93bacf7d3/numpy-1.15.0-cp27-cp27mu-manylinux1_x86_64.whl
Installing collected packages: numpy
  Found existing installation: numpy 1.14.5
    Uninstalling numpy-1.14.5:
      Successfully uninstalled numpy-1.14.5
Successfully installed numpy-1.15.0
[33mYou are using pip version 9.0.3, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Requirement already up-to-date: tensorflow in /home/nbuser/anaconda2_20/lib/python2.7/site-packages
Requirement already up-to-date: enum34>=1.1.6 in /home/nbuser/anaconda2_20/lib/python2.7/site-packages (from tensorflow)
Requirement already up-to-date: mock>=2.0.0 in /home/nbuser/anaconda2_20/lib/python2.7/site-packages (from tensorflow)
Requirement already up-to-date: protobuf>=3.6.0 in /home/nbuser/anaconda2_20/lib/python2.7/site-packages (from tensorflow)
Require

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [3]:
import os
import gzip
import csv

In [4]:
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [5]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

In [6]:
from six.moves import urllib

In [7]:
print(np.__version__)
print(tf.__version__)

1.14.5
1.10.0


In [8]:
URL_PATH = 'http://ai.stanford.edu/~btaskar/ocr/letter.data.gz'
DOWNLOADED_FILENAME = 'letter.data.gz'

def download_data(url=None, filename=None):
    """Fetch the gzipped OCR letter dataset unless it is already on disk.

    Args:
        url: Address to download from. Defaults to URL_PATH.
        filename: Local path to save to. Defaults to DOWNLOADED_FILENAME.

    The download is skipped when `filename` already exists. The existing file
    is not inspected in any way, so a truncated or stale file is reused as-is.
    """
    # Resolve defaults at call time so later changes to the module-level
    # constants are still picked up, as they were before the parameters existed.
    if url is None:
        url = URL_PATH
    if filename is None:
        filename = DOWNLOADED_FILENAME

    if not os.path.exists(filename):
        urllib.request.urlretrieve(url, filename)

    print('Found and verified file from this path: ', url)
    print('Downloaded file: ', filename)

In [9]:
download_data()

Found and verified file from this path:  http://ai.stanford.edu/~btaskar/ocr/letter.data.gz
Downloaded file:  letter.data.gz


In [10]:
def read_lines(filename=None):
    """Read a gzipped, tab-separated file and return all of its rows.

    Args:
        filename: Path of the gzip file to read. Defaults to
            DOWNLOADED_FILENAME.

    Returns:
        A list with one entry per line; each entry is the list of column
        strings produced by splitting that line on tabs.
    """
    if filename is None:
        filename = DOWNLOADED_FILENAME

    # 'rt' opens the archive in text mode so csv.reader receives strings.
    with gzip.open(filename, 'rt') as f:
        return list(csv.reader(f, delimiter='\t'))

In [11]:
lines = read_lines()

### Format of every line

* id
* letter
* next_id
* word_id
* position
* fold
* 128 columns of pixel values (one 16x8 letter image, flattened)

In [12]:
lines[0][:8]

['1', 'o', '2', '1', '1', '0', '0', '0']

In [13]:
len(lines)

52152

In [14]:
def get_features_labels(lines):
    """Group the per-letter rows into words.

    Rows are processed in ascending order of their integer id (column 0).
    For every row, column 1 is taken as the letter and columns 6..133 are
    parsed as integers and reshaped into a 16x8 array. A row whose next_id
    (column 2) equals -1 closes the current word.

    Args:
        lines: Iterable of rows, each an indexable sequence of strings.

    Returns:
        (data, target): `data` holds, per word, a list of 16x8 integer
        arrays; `target` holds, per word, the matching list of letters.
        Rows that follow the last -1 terminator are not returned.
    """
    ordered_rows = sorted(lines, key=lambda row: int(row[0]))

    data = []
    target = []
    letters = []
    images = []

    for row in ordered_rows:
        ends_word = int(row[2]) == -1  # next_id column; -1 marks the last letter

        image = np.array([int(value) for value in row[6:134]]).reshape((16, 8))

        images.append(image)
        letters.append(row[1])

        if ends_word:
            data.append(images)
            target.append(letters)
            letters, images = [], []

    return data, target

In [15]:
data, target = get_features_labels(lines)

In [16]:
def pad_features_labels(data, target):
    """Pad every word to the length of the longest word.

    Args:
        data: List of words, each a list of 16x8 arrays.
        target: List of words, each a list of letters.

    Returns:
        (padded_data, padded_target) as numpy arrays. Missing image slots are
        filled with a 16x8 array of zeros and missing letter slots with ''.
        The input lists are not modified.
    """
    longest = max(len(word) for word in target)
    blank_image = np.zeros((16, 8))

    padded_data = []
    for images in data:
        missing = longest - len(images)
        padded_data.append(images + [blank_image] * missing)

    padded_target = []
    for letters in target:
        missing = longest - len(letters)
        padded_target.append(letters + [''] * missing)

    return np.array(padded_data), np.array(padded_target)

In [17]:
padded_data, padded_target = pad_features_labels(data, target)

In [18]:
padded_target[:10]

array([['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', '']],
      dtype='|S1')

#### The length of each sequence

We've padded every word to the length of the longest word, so all sequences now have the same length.

In [19]:
sequence_length = len(padded_target[0])

In [20]:
sequence_length

14

In [21]:
padded_data.shape

(6877, 14, 16, 8)

In [22]:
padded_data.shape[:2] + (-1,)

(6877, 14, -1)

In [23]:
reshaped_data = padded_data.reshape(padded_data.shape[:2] + (-1,))

In [24]:
reshaped_data.shape

(6877, 14, 128)

In [25]:
padded_target.shape

(6877, 14)

In [26]:
padded_target.shape + (26,)

(6877, 14, 26)

In [27]:
one_hot_target = np.zeros(padded_target.shape + (26,))

In [28]:
# Mark each real letter with a 1 at its alphabet offset ('a' -> 0). Padding
# entries are empty strings and keep their all-zero rows.
for position, char in np.ndenumerate(padded_target):
    if not char:
        continue
    one_hot_target[position][ord(char) - ord('a')] = 1

#### One-hot representation of the letter 'o'

* The letter 'o' is represented by a 1 at index 14, i.e. `ord('o') - ord('a')`
* Index positions start at 0, so this is the 15th entry of the vector

In [29]:
one_hot_target[0][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [30]:
# Draw one random ordering of the row indices and apply it to both arrays so
# that each word's features stay paired with its one-hot labels.
# NOTE(review): no random seed is set in the cells shown, so this ordering
# (and the train/test split derived from it) presumably differs between runs.
shuffled_indices = np.random.permutation(len(reshaped_data))

shuffled_data = reshaped_data[shuffled_indices]
shuffled_target = one_hot_target[shuffled_indices]

In [31]:
# The first 66% of the shuffled rows form the training set; the remainder is
# held out as the test set.
split = int(0.66 * len(shuffled_data))

train_data, test_data = shuffled_data[:split], shuffled_data[split:]
train_target, test_target = shuffled_target[:split], shuffled_target[split:]

In [32]:
train_data.shape

(4538, 14, 128)

In [33]:
_, num_steps, num_inputs = train_data.shape

In [34]:
train_target.shape

(4538, 14, 26)

In [68]:
# Keras implementation of the bidirectional RNN.
# Flatten each word's per-step one-hot labels into a single row.
# NOTE(review): no later cell shown here reads train_target_reshaped; the
# Keras fit call is given train_target directly.

train_target_reshaped=train_target.reshape(train_target.shape[0],-1)
train_target_reshaped.shape

(4538, 364)

In [69]:
# Flatten each test word's per-step one-hot labels into a single row.
# NOTE(review): no later cell shown here reads test_target_reshaped; the
# Keras fit call is given test_target directly.
test_target_reshaped=test_target.reshape(test_target.shape[0],-1)
test_target_reshaped.shape

(2339, 364)

In [76]:
!pip install keras

Collecting keras
  Downloading https://files.pythonhosted.org/packages/34/7d/b1dedde8af99bd82f20ed7e9697aac0597de3049b1f786aa2aac3b9bd4da/Keras-2.2.2-py2.py3-none-any.whl (299kB)
[K    100% |ââââââââââââââââââââââââââââââââ| 307kB 1.6MB/s ta 0:00:01
Collecting keras-preprocessing==1.0.2 (from keras)
  Downloading https://files.pythonhosted.org/packages/71/26/1e778ebd737032749824d5cba7dbd3b0cf9234b87ab5ec79f5f0403ca7e9/Keras_Preprocessing-1.0.2-py2.py3-none-any.whl
Collecting keras-applications==1.0.4 (from keras)
  Downloading https://files.pythonhosted.org/packages/54/90/8f327deaa37a71caddb59b7b4aaa9d4b3e90c0e76f8c2d1572005278ddc5/Keras_Applications-1.0.4-py2.py3-none-any.whl (43kB)
[K    100% |ââââââââââââââââââââââââââââââââ| 51kB 8.4MB/s eta 0:00:01
Installing collected packages: keras-preprocessing, keras-applications, keras
Successfully installed keras

In [106]:
from keras.layers import GRU,LSTM,Dense,Dropout,Bidirectional,Input
from keras import optimizers 
from  keras.preprocessing  import sequence
from keras.models import Sequential
from keras.callbacks import EarlyStopping

# Bidirectional LSTM tagger: one 26-way letter prediction per time step.
# Input is (14 steps, 128 pixels) per word; return_sequences=True keeps an
# output for every step rather than only the last one.
model=Sequential()
lstm=LSTM(input_shape=(14,128),units=150,return_sequences=True)
# merge_mode='concat' joins the forward and backward outputs feature-wise.
model.add(Bidirectional(lstm, merge_mode='concat', weights=None))
model.add(Dropout(0.25))
# Exactly one letter is correct at each step, so the output is a softmax
# distribution over the 26 classes rather than 26 independent sigmoids.
model.add(Dense(26,activation='softmax'))

#forward pass


In [107]:
# The targets are one-hot over 26 mutually exclusive letters, so use the
# multi-class loss. With 'binary_crossentropy', Keras resolves 'accuracy' to
# element-wise binary accuracy over all 26 outputs, which is inflated by the
# many correct zeros and says little about letter recognition.
# NOTE(review): padded time steps have all-zero target rows; they add nothing
# to this loss, but the accuracy metric presumably still counts them (argmax
# of an all-zero row is index 0) - confirm before quoting the number.
model.compile(loss='categorical_crossentropy',
              optimizer='RMSProp',
              metrics=['accuracy'])

In [109]:
BATCH_SIZE = 24
EPOCHS = 8

# Halt training when validation accuracy has stopped improving (patience of
# two epochs). The held-out test split doubles as the validation set here.
cbk_early_stopping = EarlyStopping(monitor='val_acc', mode='max', patience=2)

model.fit(train_data,
          train_target,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_data=(test_data, test_target),
          callbacks=[cbk_early_stopping])

Train on 4538 samples, validate on 2339 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7faa9eedea90>

In [110]:
#tensorflow starts here

In [111]:
num_classes = train_target.shape[2]

In [112]:
tf.reset_default_graph()

In [113]:
X = tf.placeholder(tf.float64, [None, num_steps, num_inputs])

y = tf.placeholder(tf.float64, [None, num_steps, num_classes])

#### Sequence length calculation

In [114]:
# A time step counts as used when at least one of its input values is
# non-zero: max(|X|) over the feature axis is > 0 there, and sign() maps that
# to 1 (0 for all-zero steps). A real step whose inputs are all zero would be
# treated as padding by this test.
used = tf.sign(tf.reduce_max(tf.abs(X), axis=2))

# Summing the 0/1 flags over the time axis gives the number of used steps.
# Note: this rebinds sequence_length, which an earlier cell set to a plain
# Python int; from here on it is an int64 tensor with one entry per example.
length = tf.reduce_sum(used, axis=1)
sequence_length = tf.cast(length, tf.int64)

In [115]:
sequence_length

<tf.Tensor 'Cast:0' shape=(?,) dtype=int64>

#### RNN for training and prediction

In [116]:
num_neurons = 300

#### Forward RNN to feed in each word in the right order

Make sure you specify a scope for each RNN so you can initialize multiple RNNs in the same graph (the default scope is *'rnn'* which will clash across the two RNNs we set up)

In [117]:
# GRU with num_neurons units run over X in its original step order, under its
# own variable scope. Only the per-step outputs are kept; the final state
# (second return value) is discarded.
# NOTE(review): per the TF docs, sequence_length should zero the outputs past
# each example's length - confirm for this TF version.
forward, _ = tf.nn.dynamic_rnn(tf.nn.rnn_cell.GRUCell(num_neurons), X,
                               dtype=tf.float64, sequence_length=sequence_length,
                               scope='rnn-forward')

#### Reverse the characters in each word and feed in to another forward RNN

* Reverse along axis 1 (the time axis), i.e. the order of the characters in each word
* Note that only the first `sequence_length` characters of each word are reversed; the trailing padding stays where it is

In [118]:
# Reverse each example along axis 1, limited to its own sequence_length.
X_reversed = tf.reverse_sequence(X, sequence_length, seq_dim=1)

# A second, separately scoped GRU reads the reversed input. As with the
# forward RNN, only the per-step outputs are kept.
backward, _ = tf.nn.dynamic_rnn(tf.nn.rnn_cell.GRUCell(num_neurons), X_reversed,
                               dtype=tf.float64, sequence_length=sequence_length,
                               scope='rnn-backward')

#### Get output back in the original order

In [119]:
backward = tf.reverse_sequence(backward, sequence_length, seq_dim=1)

In [120]:
backward, forward

(<tf.Tensor 'ReverseSequence_1:0' shape=(?, 14, 300) dtype=float64>,
 <tf.Tensor 'rnn-forward/transpose_1:0' shape=(?, 14, 300) dtype=float64>)

In [121]:
output = tf.concat([forward, backward], axis=2)

In [122]:
output.shape

TensorShape([Dimension(None), Dimension(14), Dimension(600)])

#### Shared softmax layer

In [123]:
weight = tf.Variable(tf.truncated_normal([num_neurons * 2, num_classes], stddev=0.01, dtype=tf.float64))

In [124]:
bias = tf.Variable(tf.constant(0.1, shape=[num_classes], dtype=tf.float64))

In [125]:
flattened_output = tf.reshape(output, [-1, num_neurons * 2])

In [126]:
flattened_output

<tf.Tensor 'Reshape:0' shape=(?, 600) dtype=float64>

In [127]:
logits = tf.matmul(flattened_output, weight) + bias

In [128]:
logits_reshaped = tf.reshape(logits, [-1, num_steps, num_classes])

#### Cost calculation

In [129]:
# Feed the per-step logits ([batch, num_steps, num_classes]) so that logits
# and labels have matching shapes, instead of pairing the flattened 2-D logits
# with the 3-D labels. The result has one loss value per (example, step).
# Padded steps have all-zero label rows and therefore contribute zero loss.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits_reshaped, labels=y)

In [130]:
loss = tf.reduce_mean(cross_entropy)

#### Error calculation

In [131]:
# 1.0 wherever the predicted class differs from the labelled class.
mistakes = tf.cast(
    tf.not_equal(tf.argmax(y, 2), tf.argmax(logits_reshaped, 2)),
    tf.float64)

# Steps whose label row is all zeros (padding) get mask 0, so they are never
# counted as mistakes; labelled steps get mask 1.
mask = tf.sign(tf.reduce_max(tf.abs(y), axis=2))
mistakes *= mask

In [132]:
# Count the mistakes in each example, then divide by that example's length
# to obtain a per-example error rate.
mistakes = tf.reduce_sum(mistakes, axis=1)
mistakes /= tf.cast(sequence_length, tf.float64)

In [133]:
error = tf.reduce_mean(mistakes)

#### Optimizer

In [134]:
optimizer = tf.train.RMSPropOptimizer(0.002)

In [135]:
gradient = optimizer.compute_gradients(loss)

In [136]:
optimize = optimizer.apply_gradients(gradient)

In [137]:
def batched(data, target, batch_size):
    """Yield (batch_data, batch_target, epoch) tuples indefinitely.

    Batches are consecutive, non-overlapping slices of `batch_size` rows,
    starting with rows [0:batch_size]. When fewer than `batch_size` rows
    remain, both arrays are reshuffled with the same permutation, the epoch
    counter is incremented and slicing restarts from row 0. Leftover rows at
    the end of an epoch are therefore not yielded in that epoch.

    Args:
        data: numpy array of features, indexed by example on axis 0.
        target: numpy array of labels with the same number of rows as data.
        batch_size: Number of rows per batch; must be between 1 and the
            number of rows inclusive.

    Yields:
        Tuples (batch_data, batch_target, epoch), where epoch starts at 0.

    Raises:
        ValueError: If batch_size is outside the valid range (raised when the
            first batch is requested).
    """
    num_samples = target.shape[0]
    if batch_size <= 0 or batch_size > num_samples:
        raise ValueError(
            'batch_size must be between 1 and {} (got {})'.format(
                num_samples, batch_size))

    epoch = 0
    offset = 0
    while True:
        # Not enough rows left for a full batch: start a new epoch.
        if offset + batch_size > num_samples:
            shuffled_indices = np.random.permutation(len(data))

            # Indexing creates new arrays, so the caller's arrays are untouched.
            data = data[shuffled_indices]
            target = target[shuffled_indices]

            epoch += 1
            offset = 0

        batch_data = data[offset:(offset + batch_size)]
        batch_target = target[offset:(offset + batch_size)]

        yield batch_data, batch_target, epoch

        # Advance only after yielding so the first batch starts at row 0.
        offset += batch_size

In [138]:
batch_size = 10
batches = batched(train_data, train_target, batch_size)

In [139]:
epochs = 5

In [140]:
# Train for `epochs` passes over the batch generator, then report the error on
# the held-out test set.
# NOTE(review): `batches` is a generator created in an earlier cell; re-running
# only this cell continues from wherever that generator stopped, including
# its epoch counter.
with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    for index, batch in enumerate(batches):
        batch_data = batch[0]
        batch_target = batch[1]

        epoch = batch[2]

        if epoch >= epochs:
            break

        feed = {X: batch_data, y: batch_target}
        # Fetching `optimize` applies one gradient step on this batch.
        train_error, _ = sess.run([error, optimize], feed)

        print('{}: {:3.6f}%'.format(index + 1, 100 * train_error))

    # Evaluation only: fetch `error` without `optimize`, so the weights are
    # not updated using the test data.
    test_feed = {X: test_data, y: test_target}
    test_error = sess.run(error, test_feed)

    print('Test error: {:3.6f}%'.format(100 * test_error))

1: 99.090909%
2: 96.750000%
3: 100.000000%
4: 95.238095%
5: 94.333333%
6: 92.564103%
7: 93.412698%
8: 100.000000%
9: 98.888889%
10: 99.000000%
11: 95.071429%
12: 95.202020%
13: 97.181818%
14: 94.841270%
15: 94.416667%
16: 90.555556%
17: 97.638889%
18: 94.432789%
19: 94.404762%
20: 95.691087%
21: 94.282246%
22: 91.158009%
23: 96.135531%
24: 93.960567%
25: 95.777778%
26: 86.861111%
27: 90.190476%
28: 92.397436%
29: 96.904762%
30: 89.713564%
31: 91.250000%
32: 92.699634%
33: 86.327839%
34: 95.638889%
35: 93.273810%
36: 91.468254%
37: 92.460317%
38: 91.515873%
39: 88.154762%
40: 90.445055%
41: 91.750000%
42: 84.360750%
43: 83.852092%
44: 87.904762%
45: 82.619658%
46: 89.671856%
47: 96.497253%
48: 86.472222%
49: 96.666667%
50: 84.547619%
51: 89.293651%
52: 93.397436%
53: 86.435786%
54: 90.354645%
55: 88.000000%
56: 86.100427%
57: 88.614358%
58: 92.015873%
59: 81.021062%
60: 92.643468%
61: 90.818071%
62: 92.365690%
63: 87.460317%
64: 84.255189%
65: 88.975469%
66: 94.350427%
67: 86.912587%
68

529: 7.857143%
530: 0.000000%
531: 10.277778%
532: 1.111111%
533: 3.333333%
534: 17.222222%
535: 6.949495%
536: 12.535714%
537: 12.579365%
538: 5.000000%
539: 1.964286%
540: 5.000000%
541: 2.857143%
542: 18.364469%
543: 5.587662%
544: 2.083333%
545: 1.623377%
546: 0.000000%
547: 3.333333%
548: 1.111111%
549: 0.000000%
550: 14.088023%
551: 4.047619%
552: 10.000000%
553: 7.619048%
554: 9.935897%
555: 4.242424%
556: 13.250000%
557: 2.020202%
558: 0.000000%
559: 2.500000%
560: 5.833333%
561: 10.769231%
562: 4.671717%
563: 5.000000%
564: 2.500000%
565: 3.333333%
566: 2.909091%
567: 13.333333%
568: 0.769231%
569: 0.833333%
570: 2.857143%
571: 7.916667%
572: 2.307692%
573: 4.761905%
574: 0.000000%
575: 8.888889%
576: 1.547619%
577: 5.416667%
578: 3.678322%
579: 0.000000%
580: 1.666667%
581: 1.000000%
582: 7.664336%
583: 1.111111%
584: 1.818182%
585: 1.000000%
586: 2.083333%
587: 3.750000%
588: 6.250000%
589: 8.095238%
590: 4.333333%
591: 2.777778%
592: 0.000000%
593: 6.666667%
594: 4.242424%


1069: 0.000000%
1070: 1.666667%
1071: 1.111111%
1072: 2.000000%
1073: 2.142857%
1074: 0.000000%
1075: 6.666667%
1076: 4.435897%
1077: 0.000000%
1078: 1.000000%
1079: 0.000000%
1080: 0.000000%
1081: 0.000000%
1082: 0.000000%
1083: 0.000000%
1084: 3.333333%
1085: 3.333333%
1086: 0.000000%
1087: 10.714286%
1088: 1.538462%
1089: 0.000000%
1090: 1.111111%
1091: 0.000000%
1092: 4.102564%
1093: 0.000000%
1094: 0.000000%
1095: 0.000000%
1096: 3.333333%
1097: 0.000000%
1098: 6.666667%
1099: 0.000000%
1100: 0.000000%
1101: 5.000000%
1102: 0.000000%
1103: 3.214286%
1104: 0.000000%
1105: 2.000000%
1106: 0.000000%
1107: 1.111111%
1108: 0.000000%
1109: 0.000000%
1110: 0.000000%
1111: 0.000000%
1112: 1.111111%
1113: 0.000000%
1114: 3.333333%
1115: 0.000000%
1116: 1.111111%
1117: 0.000000%
1118: 0.000000%
1119: 1.742424%
1120: 0.000000%
1121: 6.666667%
1122: 0.000000%
1123: 0.000000%
1124: 0.000000%
1125: 0.000000%
1126: 0.000000%
1127: 0.000000%
1128: 0.714286%
1129: 0.000000%
1130: 7.333333%
1131: 4

1581: 0.000000%
1582: 0.000000%
1583: 0.000000%
1584: 5.444444%
1585: 0.000000%
1586: 0.000000%
1587: 0.000000%
1588: 0.000000%
1589: 1.111111%
1590: 3.333333%
1591: 0.000000%
1592: 0.000000%
1593: 0.000000%
1594: 0.000000%
1595: 0.000000%
1596: 0.000000%
1597: 0.000000%
1598: 0.000000%
1599: 0.000000%
1600: 0.000000%
1601: 0.000000%
1602: 0.000000%
1603: 0.000000%
1604: 0.000000%
1605: 0.000000%
1606: 3.333333%
1607: 0.000000%
1608: 3.333333%
1609: 0.000000%
1610: 0.000000%
1611: 0.000000%
1612: 0.000000%
1613: 0.000000%
1614: 0.000000%
1615: 0.000000%
1616: 0.000000%
1617: 0.000000%
1618: 0.000000%
1619: 1.250000%
1620: 0.000000%
1621: 2.000000%
1622: 0.000000%
1623: 0.000000%
1624: 0.000000%
1625: 0.000000%
1626: 0.000000%
1627: 0.000000%
1628: 0.000000%
1629: 0.000000%
1630: 0.000000%
1631: 0.000000%
1632: 0.000000%
1633: 0.000000%
1634: 0.000000%
1635: 0.000000%
1636: 0.000000%
1637: 0.000000%
1638: 0.000000%
1639: 1.111111%
1640: 1.250000%
1641: 0.000000%
1642: 0.000000%
1643: 0.

2093: 0.000000%
2094: 0.000000%
2095: 0.000000%
2096: 0.000000%
2097: 0.000000%
2098: 0.000000%
2099: 0.000000%
2100: 0.000000%
2101: 0.000000%
2102: 0.000000%
2103: 0.000000%
2104: 0.000000%
2105: 0.000000%
2106: 0.000000%
2107: 0.000000%
2108: 0.000000%
2109: 0.000000%
2110: 0.000000%
2111: 0.000000%
2112: 0.000000%
2113: 0.000000%
2114: 3.333333%
2115: 0.000000%
2116: 0.000000%
2117: 0.000000%
2118: 0.000000%
2119: 1.111111%
2120: 0.000000%
2121: 0.000000%
2122: 0.833333%
2123: 0.000000%
2124: 0.000000%
2125: 0.000000%


KeyboardInterrupt: 