In [1]:
import os
import glob
import torch
import pprint
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from PIL import Image
from tqdm import tqdm

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from sklearn import metrics
from sklearn import preprocessing
from sklearn import model_selection

In [2]:
'''
    OCR DATASET CLASS
    Dataset Used = BanglaWriting
    Dataset Manual = https://arxiv.org/pdf/2011.07499.pdf
    Dataset Download Link - https://data.mendeley.com/datasets/r43wkvdk4w/1
'''

class  OCRDataset(Dataset):
    
    def __init__(self, img_dir, targets):
        self.img_dir = img_dir
        self.targets = targets

    def __len__(self):
        return len(self.img_dir)
    
    def __getitem__(self, item):
        image = Image.open(self.img_dir[item])
        image = image.resize((128, 64), resample=Image.BILINEAR)

        targets = self.targets[item]

        image = np.array(image)
        image = np.stack((image,)*1, axis=-1)

        # Reshape to tensor format supported by Pytorch (C, H, W)
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        return {
            "images": torch.tensor(image, dtype=torch.float),
            "targets": torch.tensor(targets, dtype=torch.long),
        }



In [3]:
# defining the model

class OCRModel(nn.Module):
    def __init__(self, num_chars):
        super(OCRModel, self).__init__()
        self.conv_1 = nn.Conv2d(1, 128, kernel_size=(3, 6), padding=(1, 1))
        self.pool_1 = nn.MaxPool2d(kernel_size=(2, 2))
        self.conv_2 = nn.Conv2d(128, 64, kernel_size=(3, 6), padding=(1, 1))
        self.pool_2 = nn.MaxPool2d(kernel_size=(2, 2))
        
        self.linear_1 = nn.Linear(1024, 64) # 1024 = 64*16
        self.drop_1 = nn.Dropout(0.2)
        self.gru = nn.GRU(64, 32, bidirectional=True, num_layers=2, dropout=0.25, batch_first=True)
        self.output = nn.Linear(64, num_chars + 1)

    def forward(self, images, targets=None):
        bs, c, h, w = images.size()
        # print("bs, c, h, w = ", bs, c, h, w)
        x = F.relu(self.conv_1(images))
        # print(x.size())
        x = self.pool_1(x)
        # print(x.size())
        x = F.relu(self.conv_2(x))
        # print(x.size())
        x = self.pool_2(x) # [8, 64, 16, 29] (bs, c, h, w)
        # print(x.size())
        
        x = x.permute(0, 3, 1, 2) # bs, w, c, h
        # print(x.size())           # 8, 29, 64, 16 
        x = x.view(bs, x.size(1), -1)
        # print(x.size())
        x = F.relu(self.linear_1(x))
        x = self.drop_1(x)
        # print(x.size())
        
        x, _ = self.gru(x)
        # print(x.size())
        x = self.output(x)
        # print(x.size())
        
        x = x.permute(1, 0, 2)

        if targets is not None:
            log_probs = F.log_softmax(x, 2).to(torch.float64)
            input_lengths = torch.full(
                size=(bs,), fill_value=log_probs.size(0), dtype=torch.int32
            )
            # print(input_lengths)
            target_lengths = torch.full(
                size=(bs,), fill_value=targets.size(1), dtype=torch.int32
            )
            # print(target_lengths)
            loss = nn.CTCLoss(blank=0)(
                log_probs, targets, input_lengths, target_lengths
            )
#             print(loss)
            return x, loss

        return x, None


#
if __name__ == "__main__":
    cm = OCRModel(115)
    img = torch.rand((32, 1, 64, 128))
    x, _ = cm(img, torch.rand((32, 15)))

In [4]:
def remove_duplicates(x):
    if len(x) < 2:
        return x
    fin = ""
    for j in x:
        if fin == "":
            fin = j
        else:
            if j == fin[-1]:
                continue
            else:
                fin = fin + j
    return fin


def decode_predictions(preds, encoder):
    preds = preds.permute(1, 0, 2)
    preds = torch.softmax(preds, 2)
    preds = torch.argmax(preds, 2)
    preds = preds.detach().cpu().numpy()
    word_preds = []
    for j in range(preds.shape[0]):
        temp = []
        for k in preds[j, :]:
            k = k - 1
            if k == -1:
                temp.append("°")
            else:
                p = encoder.inverse_transform([k])[0]
                temp.append(p)
        tp = "".join(temp)
        word_preds.append(remove_duplicates(tp))
    return word_preds


In [5]:
# define train and test functions

def train_fn(model, data_loader, optimizer):
    model.train()
    fin_loss = 0
    tk0 = tqdm(data_loader, total=len(data_loader))
    

    for data in tk0:
        for key, value in data.items():
            data[key] = value.to("cuda" if torch.cuda.is_available() else "cpu")
        optimizer.zero_grad()
        _, loss = model(**data)
        loss.backward()
        optimizer.step()
        fin_loss += loss.item()
    return fin_loss / len(data_loader)


def eval_fn(model, data_loader):
    model.eval()
    fin_loss = 0
    fin_preds = []
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for data in tk0:
            for key, value in data.items():
                data[key] = value.to("cuda" if torch.cuda.is_available() else "cpu")
            batch_preds, loss = model(**data)
            fin_loss += loss.item()
            fin_preds.append(batch_preds)
        return fin_preds, fin_loss / len(data_loader)


In [6]:
filepath = './img' 

In [11]:
def train():
    print('train function is running')
    image_files = glob.glob(os.path.join(filepath, '*jpg'))
    targets_orig = [x.split("/")[1].split(" ")[0] for x in image_files]
#     print(targets_orig)
    targets = [[c for c in x] for x in targets_orig]
    targets_flat = [c for clist in targets for c in clist]
    
    lbl_enc = preprocessing.LabelEncoder()
    lbl_enc.fit(targets_flat)
    targets_enc = [lbl_enc.transform(x) for x in targets]
    targets_enc = np.array(targets_enc) + 1
#     print(targets_enc)
    
    #############################################################################################
#     num = 3635
#     print(targets[num])  # target length (# 12650 = 9)
#     print("Target label length =", len(targets_enc[num]))
    #############################################################################################
    
    
    # add padding to labels to make the target length equal for every target/label
    maxlen = len(max(targets, key=len)) # to get the length of the largest label
    # print(maxlen)
    # print(max(targets, key=len))
    
    # iterating over every target and adding 0 at the last
    for item in range(len(targets_enc)):
        difference = maxlen - len(targets_enc[item]) 
        for i in range(difference):
            targets_enc[item] = np.append(targets_enc[item], 0)
#             np.pad(targets_enc[item], (0, difference), 'constant')

    
    print("Total unique classes/characters:", len(lbl_enc.classes_))
#     print(lbl_enc.classes_[114])
#     print(np.unique(targets_flat))
    
    # divide into train test 
    (
        train_imgs,
        test_imgs,
        train_targets,
        test_targets,
        train_orig_targets,
        test_orig_targets,
    ) = model_selection.train_test_split (
        image_files, targets_enc, targets_orig, test_size = 0.2, random_state = 42
    )
    
    # loading images and their corresponding labels to train and test dataset
    train_dataset = OCRDataset(img_dir = train_imgs, targets = train_targets)
    test_dataset = OCRDataset(img_dir = test_imgs, targets = test_targets)
    
    # defining the data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)
    
    
    # model goes here
    model = OCRModel(len(lbl_enc.classes_))
    model.to("cuda" if torch.cuda.is_available() else "cpu")
    
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.8, patience=5, verbose=True
    )

    # define number of epoch and start training
    num_epoch = 10
    for epoch in range(num_epoch):
        train_loss = train_fn(model, train_loader, optimizer)
        valid_preds, test_loss = eval_fn(model, test_loader)
        valid_word_preds = []
        
        for vp in valid_preds:
            current_preds = decode_predictions(vp, lbl_enc)
            valid_word_preds.extend(current_preds)
        combined = list(zip(test_orig_targets, valid_word_preds))
        print(combined[:10])
        test_dup_rem = [remove_duplicates(c) for c in test_orig_targets]
        accuracy = metrics.accuracy_score(test_dup_rem, valid_word_preds)
        pprint.pprint(list(zip(test_orig_targets, valid_word_preds))[6:11])
        print(
            f"Epoch={epoch}, Train Loss={train_loss}, Test Loss={test_loss} Accuracy={accuracy}"
        )
        scheduler.step(test_loss)




    import matplotlib.pyplot as plt
    import numpy as np
    %matplotlib inline

    npimg = train_dataset[200]['images'].numpy()
    print(npimg.shape) # print current shape (torch style)

    # change the orientation of the image to display
    npimg = np.transpose(npimg, (1, 2, 0)).astype(np.float32)
    print(npimg.shape)

    plt.imshow(npimg)

train()

train function is running


UnboundLocalError: cannot access local variable 'np' where it is not associated with a value

In [None]:

# Visualize train data and its shape

# import matplotlib.pyplot as plt
# import numpy as np
# %matplotlib inline

# npimg = train_dataset[200]['images'].np()
# print(npimg.shape) # print current shape (torch style)

# # change the orientation of the image to display
# npimg = np.transpose(npimg, (1, 2, 0)).astype(np.float32)
# print(npimg.shape)

# plt.imshow(npimg)


NameError: name 'train_dataset' is not defined