In [6]:
import PIL
import json
import pandas as pd
import os
import ast
import numpy as np
import cv2
from tqdm import tqdm
import random

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torch.nn as nn

import albumentations as A
from albumentations.pytorch import ToTensorV2

In [7]:
def seed_everything(seed_value=4995):
    """
    Seed every random number generator used in this notebook.

    Exports ``seed_value`` as ``PYTHONHASHSEED``, seeds ``random``, NumPy and
    PyTorch (CPU and all CUDA devices), and puts cuDNN into deterministic
    mode with benchmarking turned off.

    Parameters
    ----------
    seed_value : int
        Seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # Seeding order: random -> NumPy -> torch (CPU) -> torch (all CUDA devices).
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

    # Trade cuDNN autotuning for repeatable kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed_everything()

In [8]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [9]:
# Show which device was selected above.
DEVICE

device(type='cuda')

In [10]:
class HandwrittenDataset(Dataset):
    """
    Detection dataset yielding ``(image, targets)`` pairs.

    Every row of ``df`` must provide the attributes ``filename``,
    ``visible_latex_chars``, ``xmins``, ``ymins``, ``xmaxs`` and ``ymaxs``.
    One bounding box is built per position of the four coordinate sequences
    and one integer label per entry of ``visible_latex_chars``.

    NOTE(review): boxes are forwarded exactly as stored in the frame. The
    transforms built in ``build_dataloaders`` declare the ``'albumentations'``
    bbox format, while torchvision detection models document absolute-pixel
    ``[x1, y1, x2, y2]`` targets - confirm which convention the
    ``xmins``/``ymins``/``xmaxs``/``ymaxs`` columns follow.
    """

    def __init__(self, df, visible_char_mapping, transform = None, image_resize = (1000,500)):
        """
        Parameters
        ----------
        df : DataFrame
            One row per image; rows are materialised once via ``itertuples``.
        visible_char_mapping : dict
            Maps each visible LaTeX token to its integer class id.
        transform : callable, optional
            Called as ``transform(image=..., bboxes=..., labels=...)`` and
            expected to return a mapping with those same three keys.
        image_resize : tuple
            Accepted for backward compatibility; currently unused (resizing
            is expected to happen inside ``transform``).
        """
        self.data = list(df.itertuples(index=False))
        self.transform = transform
        self.to_tensor = torchvision.transforms.ToTensor()
        self.visible_char_mapping = visible_char_mapping

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.data)

    def _encode_labels(self, chars):
        """Map tokens to class ids, failing loudly on tokens missing from the map."""
        ids = [self.visible_char_mapping.get(ch) for ch in chars]
        unknown = sorted({ch for ch, idx in zip(chars, ids) if idx is None})
        if unknown:
            raise KeyError(f"tokens missing from visible_char_mapping: {unknown}")
        return ids

    @staticmethod
    def _boxes_to_tensor(boxes):
        """Return boxes as a float32 ``(N, 4)`` tensor, including the ``N == 0`` case."""
        # reshape keeps an empty box list as (0, 4) rather than (0,).
        return torch.as_tensor(boxes, dtype = torch.float32).reshape(-1, 4)

    def __getitem__(self, index):
        """
        Load one sample.

        Returns
        ----------
        image : Tensor
            Output of ``torchvision.transforms.ToTensor`` on the RGB image.
        targets : dict
            ``'boxes'`` (float32, ``(N, 4)``, ordered xmin, ymin, xmax, ymax)
            and ``'labels'`` (int64, ``(N,)``).

        Raises
        ----------
        KeyError
            If a token of the sample is absent from ``visible_char_mapping``.
        """
        sample = self.data[index]

        # The context manager releases the file handle as soon as the pixel
        # data has been copied into the array.
        with PIL.Image.open(sample.filename) as raw_image:
            image = np.asarray(raw_image.convert("RGB"))

        boxes = [
            list(box)
            for box in zip(
                np.array(sample.xmins),
                np.array(sample.ymins),
                np.array(sample.xmaxs),
                np.array(sample.ymaxs),
            )
        ]
        labels = self._encode_labels(sample.visible_latex_chars)

        if self.transform:
            transformed_sample = self.transform(image = image,
                                                bboxes = boxes,
                                                labels = labels)
            image = transformed_sample['image']
            boxes = transformed_sample['bboxes']
            # The transform may drop boxes (e.g. ones moved out of the frame);
            # read the labels back so they stay aligned with the boxes.
            labels = transformed_sample['labels']

        targets = {
            'boxes': self._boxes_to_tensor(boxes),
            'labels': torch.as_tensor(labels, dtype = torch.int64)
        }

        return self.to_tensor(image), targets
    
def collate_fn(batch):
    """
    Regroup a list of per-sample tuples into one tuple per field.

    The fields are left as tuples instead of being stacked, because images
    can differ in size and in their number of annotated objects.
    """
    per_field = zip(*batch)
    return tuple(per_field)

In [11]:
def create_data_frame(raw_data, image_path):
    """
    Flatten the raw per-image annotation records into a Pandas DataFrame

    Parameters
    ----------
    raw_data : list
        One dict per image with the keys ``'latex'``, ``'filename'`` and
        ``'image_data'`` (the latter holding the per-image annotation fields)
    image_path : str
        Directory that is joined in front of every image's ``'filename'``

    Returns
    ----------
    df: DataFrame
        One row per image; ``seq_len`` is the length of ``full_latex_chars``
    """
    # Fields copied unchanged from image['image_data'].
    copied_fields = (
        'visible_latex_chars', 'width', 'height',
        'xmins_raw', 'xmaxs_raw', 'ymins_raw', 'ymaxs_raw',
        'xmins', 'xmaxs', 'ymins', 'ymaxs',
    )
    # Column order of the resulting frame.
    column_order = (
        'latex', 'seq_len', 'latex_string', 'visible_latex_chars', 'filename',
        'width', 'height',
        'xmins_raw', 'xmaxs_raw', 'ymins_raw', 'ymaxs_raw',
        'xmins', 'xmaxs', 'ymins', 'ymaxs',
    )

    data = {name: [] for name in column_order}

    for image in raw_data:
        info = image['image_data']
        tokens = info['full_latex_chars']

        row = {
            'latex': tokens,
            'seq_len': len(tokens),
            'latex_string': image['latex'],
            'filename': os.path.join(image_path, image['filename']),
        }
        row.update((field, info[field]) for field in copied_fields)

        for name, values in data.items():
            values.append(row[name])

    return pd.DataFrame.from_dict(data)

In [None]:
# Gather the visible LaTeX tokens of every record, duplicates included.
# NOTE(review): `a` is not defined in any visible cell - presumably the raw
# JSON records; confirm and define it before this cell.
lab = []
for o in a:
    lab.extend(o["image_data"]["visible_latex_chars"])

In [None]:
# Distinct tokens collected in `lab`, in sorted order.
labels = sorted(set(lab))

In [46]:
# Flatten the per-image token lists into one flat list of tokens.
# NOTE(review): in file order `df` is only created further down by
# prepare_data(); this cell fails on a top-to-bottom run.
l = [token for tokens in df['visible_latex_chars'].tolist() for token in tokens]

In [48]:
# Number of distinct visible tokens found in the frame.
len(set(l))

54

In [34]:
def load_data(path = 'data/all_data.csv'):
    """
    Load the combined annotation table and the token -> class-id map.

    When ``path`` does not exist yet, the ten ``data/batch_{i}`` JSON files
    are parsed with ``create_data_frame``, concatenated and cached as CSV at
    ``path``. The frame is then always read back from that CSV, so both the
    cold and the cached run return the same thing.

    Parameters
    ----------
    path : str
        Location of the CSV cache.

    Returns
    ----------
    df : DataFrame
        Annotation table with the list-valued columns parsed back into lists.
    visible_char_map : dict
        Contents of ``data/extras/visible_char_map.json``.
    """
    if not os.path.isfile(path):
        batch_frames = []
        for i in range(1,11):
            json_path = f'data/batch_{i}/JSON/kaggle_data_{i}.json'
            print(json_path)
            with open(file=json_path) as f:
                raw_data = json.load(f)
            batch_frames.append(create_data_frame(raw_data, f'data/batch_{i}/background_images'))
        # DataFrame.append was removed in pandas 2.0 (and re-copies the frame
        # on every call); concatenate all batches in one go instead.
        pd.concat(batch_frames).to_csv(path)

    # The cache is written with its index, which comes back as 'Unnamed: 0'.
    df = pd.read_csv(path).drop(columns = 'Unnamed: 0')

    # Numeric list columns are stored as text such as "[1, 2]".
    list_cols = ['xmins_raw', 'xmaxs_raw', 'ymins_raw', 'ymaxs_raw', 'xmins', 'xmaxs', 'ymins', 'ymaxs']
    for c in list_cols:
        df[c] = df[c].apply(json.loads)

    # Token columns are stored as Python list reprs, which literal_eval
    # unescapes on its own. The former Series.replace("'\\\\", "'\\") calls
    # only matched whole cells, so they never changed anything; a substring
    # version would strip escapes that literal_eval needs. Hence no replace.
    for c in ('latex', 'visible_latex_chars'):
        df[c] = df[c].apply(ast.literal_eval)

    with open(file='data/extras/visible_char_map.json') as f:
        visible_char_map = json.load(f)

    return df, visible_char_map

In [35]:
def split_dataframe(df, test_size = 0.20, random_state = 4995):
    """
    Split ``df`` into a training part and a held-out part.

    Parameters
    ----------
    df : DataFrame
        Frame to split row-wise.
    test_size : float
        Passed to ``train_test_split``; the default keeps the previous
        80/20 split.
    random_state : int
        Seed for the split; the default keeps the previous fixed seed.

    Returns
    ----------
    X_train, X_test : DataFrame
        The two parts, in that order.
    """
    X_train, X_test = train_test_split(df, test_size=test_size, random_state=random_state)

    return X_train, X_test

def prepare_data(batch_size = 32):
    """
    Load the annotations and derive the number of detector classes.

    Parameters
    ----------
    batch_size : int
        Not used by this function.

    Returns
    ----------
    df : DataFrame
        Annotation table returned by ``load_data``.
    visible_char_map : dict
        Token -> class-id map returned by ``load_data``.
    num_classes : int
        ``len(visible_char_map) + 1``.
    """
    annotations, char_map = load_data()

    # One class more than there are mapped tokens - presumably the background
    # class of the detector; confirm that the ids in the map start at 1.
    class_count = len(char_map) + 1

    return annotations, char_map, class_count

def build_dataloaders(df, visible_char_map, batch_size = 32):
    """
    Split ``df`` and wrap both parts in ``DataLoader`` objects.

    Training samples are resized to 896x896 and randomly
    shifted/scaled/rotated. Validation samples get the same resize but no
    random augmentation, so validation losses are comparable across epochs.

    Parameters
    ----------
    df : DataFrame
        Annotation table accepted by ``HandwrittenDataset``.
    visible_char_map : dict
        Token -> class-id map handed to the datasets.
    batch_size : int
        Batch size of both loaders.

    Returns
    ----------
    train_loader, val_loader : DataLoader
        The training loader is shuffled, the validation loader is not.
    """
    def make_bbox_params():
        # A fresh dict per pipeline so the two Compose objects share no state.
        return {
            'format': 'albumentations',
            'label_fields': ['labels']
        }

    train_transforms = A.Compose([
        A.Resize(896, 896),
        A.ShiftScaleRotate(rotate_limit = 10)
    ], bbox_params=make_bbox_params())

    # Deterministic: resize only. The validation pipeline previously also
    # contained ShiftScaleRotate and was never handed to the dataset.
    val_transforms = A.Compose([
        A.Resize(896, 896),
    ], bbox_params=make_bbox_params())

    train_df, val_df = split_dataframe(df)

    train_dataset = HandwrittenDataset(train_df, visible_char_map, transform = train_transforms)
    train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, num_workers=0, collate_fn = collate_fn)

    val_dataset = HandwrittenDataset(val_df, visible_char_map, transform = val_transforms)
    val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False, num_workers=0, collate_fn = collate_fn)

    return train_loader, val_loader

In [36]:
def build_model(num_classes, model_path = None):
    """
    Build a Faster R-CNN (ResNet-50 FPN) whose box predictor has ``num_classes`` outputs.

    Parameters
    ----------
    num_classes : int
        Number of classes of the replacement ``FastRCNNPredictor`` head.
    model_path : str, optional
        Checkpoint to restore. When omitted, the pretrained torchvision
        weights are used and only the new head starts untrained.

    Returns
    ----------
    model
        The detector, moved to ``DEVICE``.
    """
    if not model_path:
        model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained = True, pretrained_backbone = True)
    else:
        model = torchvision.models.detection.fasterrcnn_resnet50_fpn()

    # Swap the stock head for one sized to this label set.
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    if model_path:
        # map_location keeps loading independent of the device the
        # checkpoint was written from.
        checkpoint = torch.load(model_path, map_location = 'cpu')
        # train() in this notebook writes a wrapped dict (under
        # 'model_state_dict') on its first save and a bare state_dict on
        # later saves; accept both layouts.
        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
            checkpoint = checkpoint['model_state_dict']
        model.load_state_dict(checkpoint)

    model = model.to(DEVICE)
    return model

In [37]:
# Load the full annotation table, the token -> id map and the class count.
df, visible_char_map, num_classes = prepare_data()

In [38]:
# Shuffle the rows with a fixed seed before taking a subset.
# NOTE(review): this import sits mid-notebook; it belongs in the import cell at the top.
from sklearn.utils import shuffle
red_df = shuffle(df, random_state = 1)

In [39]:
# Keep only the first 5000 shuffled rows as a reduced working set.
red_df = red_df.iloc[:5000]

In [18]:
# Build the loaders on the reduced frame.
# NOTE(review): the recorded output of the training cell is a CUDA
# out-of-memory error on the first batch with this batch size; consider
# lowering it.
train_loader, val_loader = build_dataloaders(red_df, visible_char_map, batch_size = 16)

In [41]:
# Class count derived in prepare_data() (len(visible_char_map) + 1).
num_classes

92

In [40]:
# No model_path given, so this starts from the pretrained weights with a fresh box-predictor head.
model = build_model(num_classes)

In [51]:
# Freeze every parameter outside the box-predictor head, so that head is the
# only part of the model that still receives gradients.
for param_name, parameter in model.named_parameters():
    belongs_to_head = "roi_heads.box_predictor" in param_name
    if not belongs_to_head:
        parameter.requires_grad = False

In [29]:
def train_loop(model, train_loader, optimizer, scheduler):
    """
    Run one training epoch.

    Parameters
    ----------
    model
        Detector that returns a dict of losses when called as
        ``model(images, targets)``.
    train_loader
        Iterable of ``(images, targets)`` batches.
    optimizer
        Stepped once per batch.
    scheduler
        Also stepped once per batch, not once per epoch.

    Returns
    ----------
    float
        Mean of the summed per-batch losses.
    """
    model = model.train()

    batch_losses = []

    for step, (images, targets) in enumerate(tqdm(train_loader)):
        optimizer.zero_grad()

        images = [image.to(DEVICE) for image in images]
        targets = [
            {key: value.to(DEVICE) for key, value in target.items()}
            for target in targets
        ]

        # The model returns one entry per loss component; optimise their sum.
        loss_dict = model(images, targets)
        total_loss = sum(loss_dict.values())

        batch_loss = total_loss.item()
        batch_losses.append(batch_loss)

        total_loss.backward()
        optimizer.step()
        scheduler.step()

        # Progress print on every tenth batch.
        if step % 10 == 0:
            print(batch_loss)

    return np.mean(batch_losses)

In [30]:
def val_loop(model, val_loader):
    """
    Compute the mean validation loss without updating any weights.

    Parameters
    ----------
    model
        Detector that returns a dict of losses when called as
        ``model(images, targets)``.
    val_loader
        Iterable of ``(images, targets)`` batches.

    Returns
    ----------
    float
        Mean of the summed per-batch losses (also printed).
    """
    # Deliberately train mode: torchvision detection models return the loss
    # dict only in training mode (eval mode returns predictions). Gradients
    # are disabled below; the model is left in train mode afterwards.
    model = model.train()
    batch_losses = []

    with torch.no_grad():
        for images, targets in tqdm(val_loader):
            images = [image.to(DEVICE) for image in images]
            targets = [
                {key: value.to(DEVICE) for key, value in target.items()}
                for target in targets
            ]

            loss_dict = model(images, targets)
            total_loss = sum(loss_dict.values())
            batch_losses.append(total_loss.item())

    loss_mean = np.mean(batch_losses)
    print("Eval loss:", loss_mean)

    return loss_mean

In [21]:
def train(model, train_loader, val_loader, optimizer, scheduler, epochs = 5,  model_name = 'fastrcnn', save_path = 'models'):
    """
    Train for ``epochs`` epochs and keep the weights with the lowest validation loss.

    After each epoch the validation loss is compared with the best one so
    far. On improvement, the model's ``state_dict`` is written to
    ``<save_path>/<model_name>.pt``. The file always holds a bare
    ``state_dict``, which is what ``build_model(model_path=...)`` passes to
    ``load_state_dict``.

    Parameters
    ----------
    model, train_loader, val_loader, optimizer, scheduler
        Forwarded to ``train_loop`` / ``val_loop``.
    epochs : int
        Number of epochs to run.
    model_name : str
        File stem of the saved weights.
    save_path : str
        Directory for the saved weights; created if it does not exist.

    Returns
    ----------
    train_losses, val_losses : list
        Mean loss per epoch for training and validation.
    """
    train_losses = []
    val_losses = []

    best_val_loss = None

    if save_path:
        os.makedirs(save_path, exist_ok=True)
    checkpoint_path = os.path.join(save_path, model_name + '.pt')

    for epoch in range(epochs):
        train_losses.append(train_loop(model, train_loader, optimizer, scheduler))
        print("Train loss:",train_losses[-1])
        val_loss = val_loop(model, val_loader)
        val_losses.append(val_loss)

        # `is None` rather than truthiness, so a best loss of exactly 0.0 is
        # not mistaken for "nothing recorded yet".
        if best_val_loss is None or val_loss < best_val_loss:
            best_val_loss = val_loss

            # Save from the CPU so the file loads on machines without a GPU,
            # and always return the model to DEVICE, even if saving fails.
            model.to('cpu')
            try:
                torch.save(model.state_dict(), checkpoint_path)
            finally:
                model.to(DEVICE)

    return train_losses, val_losses

In [None]:
# One constant drives both the scheduler horizon and the epoch count, so the
# two can no longer drift apart.
NUM_EPOCHS = 5

# Only the box-predictor head is handed to the optimizer.
optimizer = torch.optim.Adam(model.roi_heads.box_predictor.parameters(), lr = 0.005, weight_decay = 0.01)
# train_loop steps the scheduler once per batch, hence T_max is counted in
# batches: batches per epoch times number of epochs.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = len(train_loader) * NUM_EPOCHS)
v = train(model, train_loader, val_loader, optimizer, scheduler, epochs = NUM_EPOCHS)

  0%|                                                                                                                                   | 0/250 [00:01<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 626.00 MiB (GPU 0; 14.76 GiB total capacity; 12.85 GiB already allocated; 499.75 MiB free; 13.10 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [18]:
# Mean of the `width` and `height` columns over the whole frame.
df.width.mean(), df.height.mean()

(1401.68117, 376.78104)