# Handwritten text recognition

In [None]:
# For Google Colaboratory
# Everything inside the `if` only runs when the notebook is executed in Colab;
# elsewhere this cell just imports sys and os.
import sys, os
if 'google.colab' in sys.modules:
    # mount Google Drive under /content/gdrive
    from google.colab import drive
    drive.mount('/content/gdrive')
    # Optionally find the folder containing "file_name" automatically
    # (currently disabled -- see the commented-out lines below):
    import subprocess
    file_name = "cnn_solution.ipynb"
    # path_to_file = subprocess.check_output('find . -type f -name ' + str(file_name), shell=True).decode("utf-8")
    # path_to_file = path_to_file.replace(file_name,"").replace('\n',"")
    # The path is written down manually instead; re-enable the two lines above
    # to search for it automatically.
    path_to_file = '/content/gdrive/My Drive/CS5242 Project/Solutions/CNN Solution/'
    print(path_to_file)
    # Change the working directory to that folder so the relative paths used
    # later in the notebook (e.g. '../../ready_data/...') resolve from there.
    os.chdir(path_to_file)
    # IPython shell escape: print the resulting working directory.
    !pwd

Mounted at /content/gdrive
/content/gdrive/My Drive/CS5242 Project/CNN Solution/
/content/gdrive/.shortcut-targets-by-id/1fYoPc4cTre2OE-DPuGBLLQYb7zuw8sxY/CS5242 Project/CNN Solution


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from random import randint
import os
import time

### Use GPU

In [None]:
# Prefer the GPU, but fall back to the CPU when CUDA is not available so the
# notebook still runs end to end on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
import numpy as np
import utils

### Prepare the dataset

In [None]:
# Each split lives in its own folder containing 'data.pt' (images) and
# 'labels.pt' (targets), both stored as serialized torch objects.
train_data_folder, test_data_folder = '../../ready_data/2/', '../../ready_data/1/'

train_data_file, train_labels_file = train_data_folder + 'data.pt', train_data_folder + 'labels.pt'
test_data_file, test_labels_file = test_data_folder + 'data.pt', test_data_folder + 'labels.pt'

# Images are divided by 255 and inverted (1 - x).
# NOTE(review): this assumes the stored pixel values lie in 0..255 -- confirm.
train_raw = torch.load(train_data_file)
train_data = 1 - train_raw / 255
train_labels = torch.load(train_labels_file)

test_raw = torch.load(test_data_file)
test_data = 1 - test_raw / 255
test_labels = torch.load(test_labels_file)



### Augment the training data. 

Apply random rotation, translation, scaling and color (brightness/contrast) transformations to each image 60 times.

In [None]:
from torchvision import transforms

# Number of randomly transformed copies generated from every training image.
NUM_AUGMENTATIONS = 60

# Random geometric perturbations (rotation within +/-20 degrees, translation
# up to 10% of the image size, scaling in [0.9, 1.1]) followed by a random
# brightness/contrast jitter. The transforms operate on PIL images, hence the
# ToPILImage / ToTensor conversions at both ends of the pipeline.
# NOTE(review): RandomAffine pads the regions exposed by the transform with
# its default fill value (0). Confirm that 0 is the background intensity of
# the un-inverted image passed in below; otherwise set `fill` explicitly.
train_transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.RandomAffine(degrees=20, translate=(0.1, 0.1), scale=(0.9, 1.1)),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
        transforms.ToTensor(),
    ]
)

augmented_images = []
augmented_labels = []

for image, image_label in zip(train_data, train_labels):
    # train_data was stored inverted (1 - x); undo the inversion once per
    # image so the transforms see the original intensities.
    original_image = 1 - image

    for _ in range(NUM_AUGMENTATIONS):
        # Re-apply the inversion so augmented samples follow the same
        # convention as train_data / test_data.
        augmented_images.append(1 - train_transform(original_image))
        augmented_labels.append(image_label)

# ToTensor returns channel-first images, so every stacked sample already
# carries its own channel dimension.
train_augmented_data = torch.stack(augmented_images)
train_augmented_labels = torch.stack(augmented_labels)


### Make a convnet-based class. 

Three convolutional layers (each followed by max pooling) and two linear layers.

In [None]:
class HTR_convnet(nn.Module):
    """Small convolutional classifier for single-channel 64 x 64 images.

    Architecture (shapes are channels x height x width):

        input                       1 x 64 x 64
        conv1 3x3 + ReLU    -->    64 x 64 x 64
        pool1 2x2           -->    64 x 32 x 32   (then dropout, p=0.25)
        conv2 3x3 + tanh    -->    32 x 32 x 32
        pool2 2x2           -->    32 x 16 x 16   (then dropout, p=0.25)
        conv3 3x3 + tanh    -->    16 x 16 x 16
        pool3 2x2           -->    16 x  8 x  8
        flatten             -->    1024
        linear1 + ReLU      -->    100
        linear2             -->    48 raw class scores (no softmax)
    """

    def __init__(self):
        super().__init__()

        # Convolution / pooling stages. padding=1 with a 3x3 kernel keeps the
        # spatial size, so only the pooling layers halve height and width.
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)

        self.conv2 = nn.Conv2d(64, 32, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)

        self.conv3 = nn.Conv2d(32, 16, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(2, 2)

        # Classifier head: 16 x 8 x 8 = 1024 features --> 100 --> 48 classes.
        self.linear1 = nn.Linear(1024, 100)
        self.linear2 = nn.Linear(100, 48)

        # Dropout applied after the first and second pooling stages.
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.25)

    def forward(self, x):
        """Return the 48 unnormalized class scores for a batch of images.

        `x` is expected to be a batch of shape (batch, 1, 64, 64); the output
        has shape (batch, 48).
        """
        # Stage 1: conv + ReLU, pool, dropout.
        stage1 = self.dropout1(self.pool1(F.relu(self.conv1(x))))

        # Stage 2: conv + tanh, pool, dropout.
        stage2 = self.dropout2(self.pool2(torch.tanh(self.conv2(stage1))))

        # Stage 3: conv + tanh, pool (no dropout here).
        stage3 = self.pool3(torch.tanh(self.conv3(stage2)))

        # Flatten each 16 x 8 x 8 feature map into a 1024-vector.
        features = stage3.view(-1, 1024)

        hidden = F.relu(self.linear1(features))
        return self.linear2(hidden)

### Build the net. How many parameters in total?

In [None]:
# Instantiate the network, show its layer structure, and report the total
# number of parameters via the project's utils helper.
net = HTR_convnet()
print(net)
utils.display_num_param(net)

HTR_convnet(
  (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(32, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (linear1): Linear(in_features=1024, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=48, bias=True)
  (dropout1): Dropout(p=0.25, inplace=False)
  (dropout2): Dropout(p=0.25, inplace=False)
)
There are 131076 (0.13 million) parameters in this neural network


### Send the weights of the network to the GPU

In [None]:
# Move the network's parameters to the selected device; the mini-batches fed
# to it later are moved to the same device before each forward pass.
net = net.to(device)

### Choose the criterion, learning rate, and batch size.

In [None]:
# Loss applied to the raw class scores produced by the network.
criterion = nn.CrossEntropyLoss()

# Initial learning rate (decayed by the training loop) and mini-batch size
# (shared by training and evaluation).
my_lr = 0.3
bs = 50

### Function to evaluate the network on the test set

In [None]:
def eval_on_test_set():
    """Print the error rate of the global `net` on the test set.

    The test set (`test_data`, `test_labels`) is processed in mini-batches of
    size `bs`. The reported value is the mean of the per-batch errors returned
    by `utils.get_error`, expressed as a percentage.

    The network is put in evaluation mode for the duration of the call so that
    dropout is disabled, and gradient tracking is turned off since no backward
    pass is needed. The previous train/eval mode is restored on exit, so
    calling this between training epochs does not disturb training.
    """
    running_error = 0
    num_batches = 0

    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            for i in range(0, test_data.shape[0], bs):

                # Test images are stored without a channel axis; add one so
                # the batch matches the layout expected by the first conv layer.
                minibatch_data = test_data[i:i+bs].unsqueeze(dim=1)
                minibatch_label = test_labels[i:i+bs]

                minibatch_data = minibatch_data.to(device)
                minibatch_label = minibatch_label.to(device)

                scores = net(minibatch_data)

                error = utils.get_error(scores, minibatch_label)
                running_error += error.item()

                num_batches += 1
    finally:
        # Restore whatever mode the caller had the network in.
        net.train(was_training)

    total_error = running_error/num_batches
    print( 'error rate on test set =', total_error*100 ,'percent')

### Do 50 passes through the training set. Divide the learning rate by 1.5 every 5 epochs.

In [None]:
# Mini-batch SGD over the augmented training set, with a step-wise decaying
# learning rate and an evaluation on the test set after every epoch.
# NOTE: `my_lr` is updated in place, so re-running this cell without first
# re-running the hyper-parameter cell continues from the already decayed value.
start = time.time()

N = train_augmented_data.shape[0]

for epoch in range(50):

    # Make sure dropout is active during training, whatever mode the network
    # was left in by other cells.
    net.train()

    # Divide the learning rate by 1.5 every 5 epochs. Epoch 0 satisfies the
    # condition as well, so the very first epoch already runs at my_lr / 1.5.
    if epoch % 5 == 0:
        my_lr = my_lr / 1.5

    # The optimizer is rebuilt every epoch so that it uses the current my_lr.
    optimizer = torch.optim.SGD(net.parameters(), lr=my_lr)

    running_loss = 0
    running_error = 0
    num_batches = 0

    # Visit the samples in a fresh random order every epoch.
    shuffled_indices = torch.randperm(N)

    for count in range(0, N, bs):

        # FORWARD AND BACKWARD PASS

        optimizer.zero_grad()

        indices = shuffled_indices[count:count+bs]
        minibatch_data = train_augmented_data[indices].to(device)
        minibatch_label = train_augmented_labels[indices].to(device)

        # Only the parameters need gradients; the inputs do not, so they are
        # fed to the network as they are.
        scores = net(minibatch_data)

        loss = criterion(scores, minibatch_label)

        loss.backward()

        optimizer.step()

        # COMPUTE STATS

        running_loss += loss.detach().item()

        error = utils.get_error(scores.detach(), minibatch_label)
        running_error += error.item()

        num_batches += 1

    # AVERAGE STATS THEN DISPLAY
    total_loss = running_loss/num_batches
    total_error = running_error/num_batches
    elapsed = (time.time()-start)/60

    print('epoch=',epoch, '\t time=', elapsed,'min', '\t lr=', my_lr  ,'\t loss=', total_loss , '\t error=', total_error*100 ,'percent')
    eval_on_test_set()
    print(' ')
    
    

epoch= 0 	 time= 0.576301884651184 min 	 lr= 0.19999999999999998 	 loss= 2.249062064759893 	 error= 61.76923163865054 percent
error rate on test set = 73.16666722297668 percent
 
epoch= 1 	 time= 1.1468231638272604 min 	 lr= 0.19999999999999998 	 loss= 0.8378577594846056 	 error= 26.283322312136413 percent
error rate on test set = 66.73333466053009 percent
 
epoch= 2 	 time= 1.7166314323743184 min 	 lr= 0.19999999999999998 	 loss= 0.5470426089079982 	 error= 17.54112959089664 percent
error rate on test set = 67.40000009536743 percent
 
epoch= 3 	 time= 2.2872519214948017 min 	 lr= 0.19999999999999998 	 loss= 0.4554595733565042 	 error= 14.5030489198506 percent
error rate on test set = 64.10000085830688 percent
 
epoch= 4 	 time= 2.857238761583964 min 	 lr= 0.19999999999999998 	 loss= 0.41265225469475925 	 error= 12.977915657265715 percent
error rate on test set = 61.30000114440918 percent
 
epoch= 5 	 time= 3.42788698275884 min 	 lr= 0.13333333333333333 	 loss= 0.21163742203600222 	 er

### Choose an image at random from the test set and see how good/bad the predictions are

In [None]:
# Class names indexed by integer label: entries 0-23 are the capitalized
# Greek letter names, entries 24-47 the same names in lowercase.
label = [
    'Alpha', 'Beta', 'Gamma', 'Delta', 'Epsilon', 'Zeta',
    'Eta', 'Theta', 'Iota', 'Kappa', 'Lambda', 'Mu',
    'Nu', 'Xi', 'Omicron', 'Pi', 'Rho', 'Sigma',
    'Tau', 'Upsilon', 'Phi', 'Chi', 'Psi', 'Omega',
    'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta',
    'eta', 'theta', 'iota', 'kappa', 'lambda', 'mu',
    'nu', 'xi', 'omicron', 'pi', 'rho', 'sigma',
    'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega',
]

In [None]:
# choose a picture at random from the whole test set
# (randint is inclusive at both ends, hence the -1)
idx = randint(0, test_data.shape[0] - 1)
im = test_data[idx]

# ground-truth class name of the chosen picture
answer = label[int(test_labels[idx])]

print(answer)

# display the picture
utils.show(im)

# send to device and view as a batch of one single-channel image: 1 x 1 x 64 x 64
im = im.to(device)
im = im.view(1, 64, 64).unsqueeze(dim=1)

# feed it to the net and display the confidence scores
# (evaluation mode disables dropout; no gradients are needed for inference)
net.eval()
with torch.no_grad():
    scores = net(im)
    probs = F.softmax(scores, dim=1)
utils.show_prob_greek(probs.cpu())