In [11]:
import PIL
import json
import pandas as pd
import os
import ast
import numpy as np
import cv2
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torch.nn as nn

import albumentations as A
from albumentations.pytorch.transforms import ToTensor

In [2]:
# Compute device shared by the whole notebook: CUDA when available, otherwise the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
class HandwrittenDataset(Dataset):
    """Detection dataset yielding ``(image, targets)`` pairs for handwritten LaTeX images.

    Each row of ``df`` describes one image: a ``filename`` plus the parallel
    per-character sequences ``visible_latex_chars``, ``xmins``, ``ymins``,
    ``xmaxs`` and ``ymaxs``.

    Parameters
    ----------
    df : DataFrame
        One row per image, with the columns listed above.
    visible_char_mapping : dict
        Maps every visible LaTeX character to its integer class id.
    transform : callable, optional
        Called as ``transform(image=..., bboxes=..., labels=...)`` and expected
        to return a dict with ``'image'``, ``'bboxes'`` and ``'labels'`` entries.
    image_resize : tuple
        Stored on the instance; no resizing is applied by this class.
    """

    def __init__(self, df, visible_char_mapping, transform = None, image_resize = (1000,500)):
        # Materialise the rows once so __getitem__ is a plain list lookup.
        self.data = list(df.itertuples(index=False))
        self.transform = transform
        self.image_resize = image_resize
        self.visible_char_mapping = visible_char_mapping

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.data)

    def _load_image(self, path):
        """Read the image file at ``path`` and return it as an RGB numpy array."""
        # Import the submodule explicitly: a bare ``import PIL`` does not
        # guarantee that ``PIL.Image`` has been loaded.
        from PIL import Image

        with Image.open(path) as img:
            # np.array (not asarray) gives a writable copy that outlives the file handle.
            return np.array(img.convert("RGB"))

    def _encode_labels(self, chars):
        """Translate characters to class ids, failing loudly on unknown characters."""
        try:
            return [self.visible_char_mapping[char] for char in chars]
        except KeyError as err:
            raise KeyError(
                f"character {err.args[0]!r} is missing from visible_char_mapping"
            ) from None

    def __getitem__(self, index):
        """Return ``(image, targets)`` for row ``index``.

        ``targets`` is a dict with ``'boxes'`` (float32 tensor of shape
        ``(N, 4)``, rows ordered ``[xmin, ymin, xmax, ymax]``) and ``'labels'``
        (int64 tensor of shape ``(N,)``). Without a transform the image is
        returned as a numpy array; with one, it is whatever the transform
        returns under ``'image'``.

        Raises
        ------
        KeyError
            If a character of the row is absent from ``visible_char_mapping``.
        """
        sample = self.data[index]
        image = self._load_image(sample.filename)

        # NOTE(review): coordinates are passed through in whatever units the
        # DataFrame stores. torchvision detection models expect absolute pixel
        # coordinates, while the 'albumentations' bbox format expects values
        # normalised to [0, 1] -- confirm which of the two `xmins`... hold.
        boxes = [
            [float(coord) for coord in box]
            for box in zip(sample.xmins, sample.ymins, sample.xmaxs, sample.ymaxs)
        ]
        labels = self._encode_labels(sample.visible_latex_chars)

        if self.transform:
            # Plain Python lists go in; boxes AND labels are read back so the two
            # stay aligned when the transform drops a box.
            transformed_sample = self.transform(image = image,
                                                bboxes = boxes,
                                                labels = labels)
            image = transformed_sample['image']
            boxes = transformed_sample['bboxes']
            labels = transformed_sample['labels']

        targets = {
            # reshape keeps the (N, 4) / (N,) layout even when N == 0.
            'boxes': torch.as_tensor(boxes, dtype = torch.float32).reshape(-1, 4),
            'labels': torch.as_tensor(labels, dtype = torch.int64).reshape(-1)
        }

        return image, targets
    
def collate_fn(batch):
    """
    Transpose a batch of per-sample tuples into one tuple per field.

    A list of ``(image, targets)`` samples becomes ``(images, targets)``, each
    a plain tuple, so samples with different numbers of objects or different
    tensor sizes never have to be stacked.
    """
    per_field = zip(*batch)
    return (*per_field,)

In [4]:
def create_data_frame(raw_data, image_path):
    """
    Flatten the per-image annotation records into a Pandas DataFrame.

    Parameters
    ----------
    raw_data : list
        One dict per image with the keys 'latex', 'filename' and 'image_data';
        'image_data' holds the character lists, the box coordinates and the
        image width/height.
    image_path : str
        Directory that is joined in front of every image filename.

    Returns
    ----------
    df: DataFrame
        One row per image. 'seq_len' is the length of 'full_latex_chars' and
        'filename' is the joined path; all other columns are copied as-is.
    """
    box_fields = ('xmins_raw', 'xmaxs_raw', 'ymins_raw', 'ymaxs_raw',
                  'xmins', 'xmaxs', 'ymins', 'ymaxs')
    # Insertion order of this dict fixes the column order of the result.
    column_order = ('latex', 'seq_len', 'latex_string', 'visible_latex_chars',
                    'filename', 'width', 'height') + box_fields
    columns = {name: [] for name in column_order}

    for entry in raw_data:
        meta = entry['image_data']
        tokens = meta['full_latex_chars']

        columns['latex_string'].append(entry['latex'])
        columns['latex'].append(tokens)
        columns['seq_len'].append(len(tokens))
        columns['visible_latex_chars'].append(meta['visible_latex_chars'])
        columns['filename'].append(os.path.join(image_path, entry['filename']))
        for field in box_fields:
            columns[field].append(meta[field])
        columns['width'].append(meta['width'])
        columns['height'].append(meta['height'])

    return pd.DataFrame.from_dict(columns)

In [5]:
def load_data(path = 'data/all_data.csv', data_dir = 'data', num_batches = 10):
    """
    Load the annotation table and the visible-character map.

    If ``path`` exists it is read as a CSV cache whose list-valued columns are
    stored as text and are parsed back into Python lists. Otherwise the table
    is assembled from the per-batch JSON files under ``data_dir``, whose
    values are already Python objects and therefore need no parsing.

    Parameters
    ----------
    path : str
        Location of the cached CSV.
    data_dir : str
        Root directory holding ``batch_<i>/`` and ``extras/``.
    num_batches : int
        Number of ``batch_<i>`` directories (numbered from 1) to read when the
        CSV cache is absent; must be at least 1.

    Returns
    ----------
    df: DataFrame
        One row per image.
    visible_char_map: dict
        Contents of ``<data_dir>/extras/visible_char_map.json``.
    """
    loaded_from_csv = os.path.isfile(path)

    if loaded_from_csv:
        # errors='ignore': the index column only exists if the CSV was saved with its index.
        df = pd.read_csv(path).drop(columns = 'Unnamed: 0', errors = 'ignore')
    else:
        frames = []
        for i in range(1, num_batches + 1):
            json_path = f'{data_dir}/batch_{i}/JSON/kaggle_data_{i}.json'
            print(json_path)
            with open(file=json_path) as f:
                raw_data = json.load(f)
            frames.append(create_data_frame(raw_data, f'{data_dir}/batch_{i}/background_images'))
        # A single concat replaces the removed, quadratic DataFrame.append;
        # ignore_index gives the same 0..n-1 index the CSV branch produces.
        df = pd.concat(frames, ignore_index = True)

    if loaded_from_csv:
        # Only the CSV round-trip turns lists into text; parsing the JSON-built
        # frame would fail because its cells are already lists.
        list_cols = ['xmins_raw', 'xmaxs_raw', 'ymins_raw', 'ymaxs_raw', 'xmins', 'xmaxs', 'ymins', 'ymaxs']
        for c in list_cols:
            df[c] = df[c].apply(json.loads)

        # NOTE(review): the previous version first called
        # Series.replace("'\\\\", "'\\") on these columns. With plain strings
        # that matches whole cell values only, so it never changed anything
        # and is omitted. If a substring un-escape was intended, confirm it is
        # wanted before adding it -- literal_eval already decodes escapes.
        for c in ('latex', 'visible_latex_chars'):
            df[c] = df[c].apply(ast.literal_eval)

    with open(file=f'{data_dir}/extras/visible_char_map.json') as f:
        visible_char_map = json.load(f)

    return df, visible_char_map

In [23]:
def split_dataframe(df):
    """Split ``df`` 80/20 into (train, test) parts using the fixed seed 4995."""
    train_part, test_part = train_test_split(df, random_state=4995, test_size=0.20)
    return train_part, test_part

def prepare_data(batch_size = 32):
    """
    Load the annotations and derive the number of detector classes.

    ``batch_size`` is accepted but not used by this function.

    Returns
    ----------
    df: DataFrame
        The table returned by ``load_data()``.
    visible_char_map: dict
        The character map returned by ``load_data()``.
    num_classes: int
        One more than the number of entries in ``visible_char_map``
        (presumably the extra id is the detector's background class -- confirm).
    """
    df, visible_char_map = load_data()
    return df, visible_char_map, 1 + len(visible_char_map)

def build_dataloaders(df, visible_char_map, batch_size = 32):
    """
    Split ``df`` and wrap both parts in DataLoaders.

    The training set gets random augmentations and is shuffled; the
    validation set is only converted to tensors so its loss is comparable
    between runs.

    Parameters
    ----------
    df : DataFrame
        Annotation table, one row per image.
    visible_char_map : dict
        Character -> class id mapping handed to ``HandwrittenDataset``.
    batch_size : int
        Number of samples per batch for both loaders.

    Returns
    ----------
    train_loader, val_loader : DataLoader
        Loaders yielding ``(images, targets)`` tuples via ``collate_fn``.
    """
    # NOTE(review): the 'albumentations' bbox format expects coordinates
    # normalised to [0, 1] -- confirm the xmins/ymins/xmaxs/ymaxs columns are.
    bbox_params = {
        'format': 'albumentations',
        'label_fields': ['labels']
    }

    train_transforms = A.Compose([
        # The probability is passed by keyword: given positionally, 0.5 lands on
        # the transform's first parameter, which is not `p` on the albumentations
        # releases that ship this ToTensor, so the transform ran on every sample.
        A.Flip(p=0.5),
        A.RandomRotate90(p=0.5),
        A.MotionBlur(p=0.2),
        A.MedianBlur(blur_limit=3, p=0.1),
        A.Blur(blur_limit=3, p=0.1),
        ToTensor()],
        bbox_params=dict(bbox_params))

    # Validation uses no random augmentation, only the tensor conversion.
    val_transforms = A.Compose([ToTensor()], bbox_params=dict(bbox_params))

    train_df, val_df = split_dataframe(df)

    train_dataset = HandwrittenDataset(train_df, visible_char_map, transform = train_transforms)
    # Reshuffle the training samples every epoch.
    train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, collate_fn = collate_fn)

    val_dataset = HandwrittenDataset(val_df, visible_char_map, transform = val_transforms)
    val_loader = DataLoader(val_dataset, batch_size = batch_size, collate_fn = collate_fn)

    return train_loader, val_loader

In [15]:
def build_model(num_classes):
    """
    Create a pretrained Faster R-CNN (ResNet-50 FPN) with a fresh box predictor.

    The stock box predictor is replaced by a ``FastRCNNPredictor`` with
    ``num_classes`` outputs, and the model is moved to ``DEVICE``.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained = True, pretrained_backbone = True)

    # The new head must accept the same feature width as the one it replaces.
    head_in_features = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_in_features, num_classes)

    return detector.to(DEVICE)

In [16]:
# Load the annotation table and the character map, and derive the class count.
df, visible_char_map, num_classes = prepare_data()

NameError: name 'data_transforms' is not defined

In [24]:
# Split df into train/validation parts and wrap each in a DataLoader (32 samples per batch).
train_loader, val_loader = build_dataloaders(df, visible_char_map, batch_size = 32)

In [25]:
# Build the detector with a num_classes-way box predictor; build_model moves it to DEVICE.
model = build_model(num_classes)

In [26]:
def train_loop(model, train_loader, optimizer):
    """
    Train ``model`` for one pass over ``train_loader``.

    Parameters
    ----------
    model : torch.nn.Module
        Detection model that, in training mode, returns a dict of loss tensors
        when called as ``model(images, targets)``.
    train_loader : DataLoader
        Yields ``(images, targets)`` batches; ``targets`` is a sequence of
        dicts of tensors.
    optimizer : torch.optim.Optimizer
        Optimizer stepped once per batch.

    Returns
    ----------
    float
        Mean of the per-batch summed losses.
    """
    model = model.train()

    train_loss_list = []

    for i, data in enumerate(tqdm(train_loader)):
        optimizer.zero_grad()
        images, targets = data

        images = [image.to(DEVICE) for image in images]
        targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]

        # The model returns one tensor per loss component; their sum is the
        # quantity that is back-propagated.
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        loss_value = losses.item()
        train_loss_list.append(loss_value)

        losses.backward()
        optimizer.step()

        # Print the running loss every 50 batches.
        if i % 50 == 0:
            print(loss_value)

    return np.mean(train_loss_list)

In [27]:
def val_loop(model, val_loader):
    """
    Compute the mean validation loss of ``model`` over ``val_loader``.

    Parameters
    ----------
    model : torch.nn.Module
        Detection model that, in training mode, returns a dict of loss tensors
        when called as ``model(images, targets)``.
    val_loader : DataLoader
        Yields ``(images, targets)`` batches; ``targets`` is a sequence of
        dicts of tensors.

    Returns
    ----------
    float
        Mean of the per-batch summed losses (also printed).
    """
    # torchvision detection models only return the loss dict in training mode
    # (in eval mode they return detections), so the model stays in train mode
    # and torch.no_grad() is what prevents any gradient tracking here.
    # NOTE(review): train mode also keeps layers such as dropout or trainable
    # batch-norm in their training behaviour, if the model has any -- confirm
    # this is acceptable for the model in use.
    model = model.train()

    val_loss_list = []

    with torch.no_grad():
        for i, data in enumerate(tqdm(val_loader)):
            images, targets = data

            images = [image.to(DEVICE) for image in images]
            targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)

            losses = sum(loss for loss in loss_dict.values())
            loss_value = losses.item()
            val_loss_list.append(loss_value)

    loss_mean = np.mean(val_loss_list)
    print(loss_mean)

    return loss_mean

In [28]:
def train(model, train_loader, val_loader, optimizer, num_epochs = 1):
    """
    Alternate training and validation passes and collect their mean losses.

    Parameters
    ----------
    model : torch.nn.Module
        Model handed to ``train_loop`` and ``val_loop``.
    train_loader, val_loader : DataLoader
        Batches for the training and the validation pass.
    optimizer : torch.optim.Optimizer
        Optimizer used by ``train_loop``.
    num_epochs : int
        Number of train + validation rounds; the default of 1 reproduces the
        original single-pass behaviour.

    Returns
    ----------
    train_losses, val_losses : list
        One mean loss per epoch for each phase.
    """
    train_losses = []
    val_losses = []

    for _ in range(num_epochs):
        train_losses.append(train_loop(model, train_loader, optimizer))
        val_losses.append(val_loop(model, val_loader))

    return train_losses, val_losses

In [None]:
# Adam is given only the parameters under model.roi_heads, so optimizer.step()
# never updates any parameter outside that sub-module.
optimizer = torch.optim.Adam(model.roi_heads.parameters(), lr = 0.001, weight_decay = 0.01)
# Run training and validation; train() returns (train_losses, val_losses).
train(model, train_loader, val_loader, optimizer)

  0%|                                                                     | 0/2500 [00:00<?, ?it/s]