In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets,transforms
from torch.utils.data import Dataset,DataLoader,SequentialSampler,SubsetRandomSampler
import matplotlib.pyplot as plt
import numpy as np

import cv2,os, warnings,joblib
from skimage import io,transform
import pandas as pd
from PIL import Image
warnings.filterwarnings('ignore')

### DataLoading

In [2]:
class IIITDataset(Dataset):
    def __init__(self, csv_filename, image_dir, transform=None):
        self.dataset = pd.read_csv(csv_filename)
        self.root_dir = image_dir
        self.transform = transform
    
    def __getitem__(self,index):
        if(torch.is_tensor(index)):
            index = index.tolist()
        label = self.dataset.iloc[index,1] # here label column = 1
        
        image_name = self.dataset.iloc[index,0] #here image path column=0
        path = os.path.join(self.root_dir,image_name)
        image = Image.open(path)
        if self.transform:
            image = self.transform(image)
        
        return image, label
        
    def __len__(self):
        return len(self.dataset)
    
class LabelTransformer(object):
    """
    encoder and decoder
    Args:
        letters (str): Letters contained in the data
    """

    def __init__(self, letters):
        self.encode_map = {letter: idx+1 for idx, letter in enumerate(letters)}
        self.decode_map = ' ' + letters

    def encode(self, text):
        if isinstance(text, str):
            length = [len(text)]
            result = [self.encode_map[letter] for letter in text]
        else:
            length = []
            result = []
            for word in text:
                length.append(len(word))
                result.extend([self.encode_map[letter] for letter in word])
        return torch.IntTensor(result), torch.IntTensor(length)

    def decode(self, text_code):
        result = []
        for code in text_code:
#             print(':',code,':')
            word = []
            for i in range(len(code)):
                if code[i] != 0 and (i == 0 or code[i] != code[i-1]):
                    word.append(self.decode_map[code[i]])
            result.append(''.join(word))
#         print(result)


In [3]:
train_path = r"E:\Sem3\AML\Assignments & Quiz\Assignment_5\Dataset\IIIT5K-Word_V3.0\IIIT5K"
test_path = r"E:\Sem3\AML\Assignments & Quiz\Assignment_5\Dataset\IIIT5K-Word_V3.0\IIIT5K"

# train_path = r"IIIT5K"
# test_path = r"IIIT5K"

device='cuda'
# device = 'cpu'
n_workers = 0
batch_size = 10

transform = transforms.Compose([
                                transforms.Resize((32,100),Image.BICUBIC),
                                transforms.Grayscale(),
                                transforms.ToTensor()
                               ])

train_data = IIITDataset(csv_filename="train.csv",image_dir=train_path,transform=transform)
test_data = IIITDataset(csv_filename="test.csv",image_dir=test_path,transform=transform)

train_loader = DataLoader(train_data, batch_size=batch_size, num_workers=n_workers)
test_loader = DataLoader(test_data, batch_size=batch_size, num_workers=n_workers)

### Architecture

In [10]:
class Architecture(nn.Module):
    def __init__(self, in_channels, out_size):
        super(Architecture,self).__init__()
        
        hidden_size = 256
        self.conv_1 = nn.Conv2d(in_channels, 64, 3 ,padding=1, stride=1)
        self.pool_1 = nn.MaxPool2d((2,2),stride=(2,2))
        
        self.conv_2 = nn.Conv2d(64, 128, 3 ,padding=1, stride=1)
        self.pool_2 = nn.MaxPool2d((2,2),stride=(2,2))
        
        self.conv_3 = nn.Conv2d(128, 256, 3 ,padding=1, stride=1)
        self.conv_4 = nn.Conv2d(256, 256, 3 ,padding=1, stride=1)
        self.pool_3 = nn.MaxPool2d((2,1),stride=(2,1))
        
        self.conv_5 = nn.Conv2d(256, 512, 3 ,padding=1, stride=1)
        self.b_1 = nn.BatchNorm2d(512)
        
        self.conv_6 = nn.Conv2d(512, 512, 3 ,padding=1, stride=1)
        self.b_2 = nn.BatchNorm2d(512)
        self.pool_4 = nn.MaxPool2d((2,1),stride=(2,1))
        
        self.conv_7 = nn.Conv2d(512, 512, 2 ,padding=0, stride=1)
        
        self.L_1 = nn.LSTM(input_size = 512, hidden_size = hidden_size, bidirectional=True) 
        self.L_2 = nn.LSTM(input_size = hidden_size*2, hidden_size = 256, bidirectional=True)# input = inputsize*direction
        
        self.fc1 = nn.Linear(hidden_size*2,out_size)
        
        self.relu = nn.ReLU()
    
    def forward(self, x):
#         print(x.shape)
        x = self.relu(self.conv_1(x))
#         print('conv_1:',x.shape)
        x = self.pool_1(x)
#         print('pool_1:',x.shape)
        x = self.relu(self.conv_2(x))
#         print('conv_2:',x.shape)
        x = self.pool_2(x)
#         print('pool_2:',x.shape)
        x = self.relu(self.conv_3(x))
#         print('conv_3:',x.shape)
        x = self.relu(self.conv_4(x))
#         print('conv_4:',x.shape)
        x = self.pool_3(x)
#         print('pool_3:',x.shape)
        x = self.relu(self.conv_5(x))
#         print('conv_5:',x.shape)
        x = self.b_1(x)
#         print('b_1:',x.shape)
        x = self.relu(self.conv_6(x))
#         print('conv_6:',x.shape)
        x = self.b_2(x)
#         print('b_2:',x.shape)
        x = self.pool_4(x)
#         print('pool_4:',x.shape)
        
        x = self.relu(self.conv_7(x))
#         print('conv_7:',x.shape)
    
        x = x.squeeze(2)
#         print('sq:',x.shape)
        
        x = x.permute(2, 0, 1)
#         print('permute:',x.shape)
        
        results = self.L_1(x)
        out, hidden = results[0], results[1]
#         print('L_1:',out.shape,hidden[0].shape, hidden[1].shape)
        
        results = self.L_2(out)
        out, hidden = results[0], results[1]
#         print('L_2:',out.shape,hidden[0].shape, hidden[1].shape)
        
        x = out
        
        l, b, h = x.size()
#         print('width,batch,height(channels):',l,b,h)
        
        x = x.view(l*b,h) # length*batch, hidden_size*2
#         print('view:',x.shape)
        
        x = self.fc1(x)
#         print('fc:',x.shape)
        
        x = x.view(l, b, -1)   # length>=24, batch, output_size
#         print('view:',x.shape)
        return x
    

letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
model = Architecture(1,len(letters) + 1)
model.to(device)

criterion = nn.CTCLoss()
criterion.to(device)
optimizer = optim.Adagrad(model.parameters(), lr = 0.01)

In [11]:
# labelTransformer.decode_map

# labelTransformer.encode_map

In [13]:
labelTransformer = LabelTransformer(letters)
train_loss_plt, valid_loss_plt = [], []
test_accuracy_plt = []
test_loss_plt = []
n_epochs = 10
for epoch in range(n_epochs):
    train_loss, test_loss = 0, 0
    model.train()
    for images, labels in train_loader:
        optimizer.zero_grad()

        images = images.to(device)
        label, label_length = labelTransformer.encode(labels)
        
        label.to(device)
        label_length.to(device)
        
        output = model(images)
        output_length = torch.tensor([output.size(0)]*output.size(1)).to(device)
        
#         print(label, label_length, output_length)
        loss = criterion(output, label, output_length, label_length)
        
        result = output.max(2)[1].transpose(0, 1)  # batch × length
        result = labelTransformer.decode(result.data)
        
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * images.size(0)

    
    train_loss = train_loss / len(train_loader.dataset)
    train_loss_plt.append(train_loss)
    print('Epoch:', (epoch + 1), '\tTrain Loss:', train_loss)

tensor([25, 15, 21, 18,  5, 19,  3, 21,  5, 13,  9, 19, 19,  9, 15, 14,  8, 15,
        13,  5,  2, 15, 18,  4,  5, 18,  4, 15,  4,  1,  2,  1, 20, 15, 20,  5,
        28, 31, 30, 11, 13, 28, 36, 30, 11, 13, 30, 28, 32, 11, 13],
       dtype=torch.int32) tensor([3, 6, 7, 4, 6, 4, 6, 5, 5, 5], dtype=torch.int32) tensor([24, 24, 24, 24, 24, 24, 24, 24, 24, 24], device='cuda:0')
tensor([16, 12,  5,  1, 19,  5, 18, 15,  1,  4, 19, 15, 18,  7,  1, 14,  9, 19,
         1, 20,  9, 15, 14, 23,  9, 19,  8,  5, 19,  3,  8,  9, 14,  7,  1, 13,
        11,  9, 19,  8, 20, 23,  1, 18, 11, 21, 13,  1, 18,  3,  1, 18,  4, 31,
        30, 34, 34, 27, 28, 29, 30], dtype=torch.int32) tensor([ 6,  5, 12,  6,  7,  8,  5,  4,  4,  4], dtype=torch.int32) tensor([24, 24, 24, 24, 24, 24, 24, 24, 24, 24], device='cuda:0')
tensor([15,  6,  9, 14,  4,  9,  1, 19, 20,  1, 20,  5,  9, 14,  4,  9,  1, 19,
         1, 22,  5, 20,  8,  5, 12, 15, 19,  5, 15,  6,  9, 14,  4,  9,  1, 19,
         1, 10,  9, 20,  8], dt

KeyboardInterrupt: 