# Performing OCR for Hindi Letters

## Importing all the required files

In [1]:
# Normal ML data analysis
import numpy as np

# File manipulation
import os

# Image manipulation
from skimage.io import imread

# DL Training and evaluation
import tensorflow as tf

## Preprocessing by loading the data

In [2]:
def getChars(PATH):
    chars = os.listdir(PATH)
    new_chars = []
    for char in chars:
        new_chars.append(char)
    return new_chars

def getImages(PATH, categories):
    X = []
    y = []
    
    for i in categories:
        path = os.path.join(PATH,i) # Copying path of a specific image
        for img in os.listdir(path):
            img_array = imread(os.path.join(path,img))/255 # Taking input of image as an array
            X.append(img_array) # Storing image in the form of array
            y.append(categories[i]) # Storing its corresponding category
    
    # Converting to numpy array form    
    X = np.array(X)
    y = np.array(y)
    
    # Reshape X to pass into CNN without issues
    X = np.reshape(X, (X.shape[0], X.shape[1], X.shape[2], 1))
    return X, y

In [3]:
traindir = '../data/DevanagariHandwrittenCharacterDataset/Train'
testdir = '../data/DevanagariHandwrittenCharacterDataset/Test'

In [4]:
characters = getChars(traindir)
categories = {k: v for v, k in enumerate(characters)}
categories

{'character_10_yna': 0,
 'character_11_taamatar': 1,
 'character_12_thaa': 2,
 'character_13_daa': 3,
 'character_14_dhaa': 4,
 'character_15_adna': 5,
 'character_16_tabala': 6,
 'character_17_tha': 7,
 'character_18_da': 8,
 'character_19_dha': 9,
 'character_1_ka': 10,
 'character_20_na': 11,
 'character_21_pa': 12,
 'character_22_pha': 13,
 'character_23_ba': 14,
 'character_24_bha': 15,
 'character_25_ma': 16,
 'character_26_yaw': 17,
 'character_27_ra': 18,
 'character_28_la': 19,
 'character_29_waw': 20,
 'character_2_kha': 21,
 'character_30_motosaw': 22,
 'character_31_petchiryakha': 23,
 'character_32_patalosaw': 24,
 'character_33_ha': 25,
 'character_34_chhya': 26,
 'character_35_tra': 27,
 'character_36_gya': 28,
 'character_3_ga': 29,
 'character_4_gha': 30,
 'character_5_kna': 31,
 'character_6_cha': 32,
 'character_7_chha': 33,
 'character_8_ja': 34,
 'character_9_jha': 35,
 'digit_0': 36,
 'digit_1': 37,
 'digit_2': 38,
 'digit_3': 39,
 'digit_4': 40,
 'digit_5': 41,
 

### Training Data

In [5]:
X, y = getImages(traindir, categories)

In [6]:
X.shape

(78200, 32, 32, 1)

In [7]:
y.shape

(78200,)

### Testing Data

In [8]:
X_test, y_test = getImages(testdir, categories)

In [9]:
X_test.shape

(13800, 32, 32, 1)

In [10]:
y_test

array([ 0,  0,  0, ..., 45, 45, 45])

## Model Building and Evaluation

In [11]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu', input_shape=(32, 32, 1)))
model.add(tf.keras.layers.MaxPooling2D((2,2)))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D((2,2)))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(64))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(len(characters)))


loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer_fn = tf.keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer=optimizer_fn,
              loss=loss_fn,
              metrics=['accuracy'])

In [12]:
model.fit(X, y, batch_size=32, epochs=100, use_multiprocessing=True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x2a657dd7040>

In [13]:
model.evaluate(X_test, y_test)



[0.0876215249300003, 0.9877536296844482]

We have a 40+ class classification and our model is more than 95% accurate. This shows the effectivensess of CNNs in general and in this case, its effectiveness in learning character objects.

### Save the CNN model (H5) to disk

In [14]:
filename = '../models/NN_model.h5'
model.save(filename)