In [None]:
import PIL
import json
import pandas as pd
import os
import ast
import numpy as np
import cv2
from tqdm import tqdm
import random
import time
import matplotlib.pyplot as plt

from sklearn import metrics

from PIL import Image, ImageDraw, ImageFont

from sklearn.model_selection import train_test_split

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torch.nn as nn

import albumentations as A
from albumentations.pytorch import ToTensorV2

import gensim
from gensim.models import Word2Vec, KeyedVectors
import multiprocessing

import torchmetrics

In [None]:
def seed_everything(seed_value=4995):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (default generator plus all CUDA devices), exports the seed as
    ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode with its
    benchmark auto-tuner switched off.

    Args:
        seed_value: Seed applied to all generators.
    """
    # cuDNN: favour reproducible kernels over auto-tuned ones.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_value)

    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
# Seed all RNGs once, up front, using the default seed (4995).
seed_everything()

In [None]:
def create_data_frame(raw_data, image_path):
    """Flatten a list of annotation records into a DataFrame, one row each.

    Every record must provide the keys ``'latex'``, ``'filename'`` and
    ``'image_data'``; the latter is a mapping holding the token lists
    (``full_latex_chars``, ``visible_latex_chars``), the image ``width`` /
    ``height`` and the eight bounding-box coordinate entries.

    Args:
        raw_data: Iterable of annotation records (e.g. a parsed JSON batch).
        image_path: Directory that is joined in front of each record's
            ``'filename'``.

    Returns:
        pandas.DataFrame with the columns ``latex``, ``seq_len`` (length of
        ``full_latex_chars``), ``latex_string``, ``visible_latex_chars``,
        ``filename``, ``width``, ``height`` followed by the eight box columns.
    """
    # Entries copied verbatim from 'image_data' under the same name.
    box_keys = [
        'xmins_raw', 'xmaxs_raw', 'ymins_raw', 'ymaxs_raw',
        'xmins', 'xmaxs', 'ymins', 'ymaxs',
    ]
    leading_columns = [
        'latex', 'seq_len', 'latex_string', 'visible_latex_chars',
        'filename', 'width', 'height',
    ]
    columns = {name: [] for name in leading_columns + box_keys}

    for record in raw_data:
        meta = record['image_data']
        tokens = meta['full_latex_chars']

        columns['latex'].append(tokens)
        columns['seq_len'].append(len(tokens))
        columns['latex_string'].append(record['latex'])
        columns['visible_latex_chars'].append(meta['visible_latex_chars'])
        columns['filename'].append(os.path.join(image_path, record['filename']))
        columns['width'].append(meta['width'])
        columns['height'].append(meta['height'])
        for key in box_keys:
            columns[key].append(meta[key])

    return pd.DataFrame.from_dict(columns)

In [None]:
def load_data(path = 'data/all_data.csv'):
    """Load the combined dataset, building and caching it as a CSV on first use.

    When ``path`` does not exist, the ten batch files
    ``data/batch_{i}/JSON/kaggle_data_{i}.json`` (i = 1..10) are read,
    converted with ``create_data_frame``, concatenated and written to
    ``path``. The frame is then always read back from the CSV, so a fresh
    build and a cache hit return identically typed columns.

    The CSV round trip turns list-valued cells into text; they are parsed
    back here: the bounding-box columns with ``json.loads`` and the token
    columns (``latex``, ``visible_latex_chars``) with ``ast.literal_eval``.

    Args:
        path: Location of the cached CSV.

    Returns:
        Tuple ``(df, visible_char_map)`` where ``visible_char_map`` is the
        parsed content of ``data/extras/visible_char_map.json``.
    """
    if not os.path.isfile(path):
        batch_frames = []
        for i in range(1, 11):
            json_path = f'data/batch_{i}/JSON/kaggle_data_{i}.json'
            print(json_path)
            with open(file=json_path) as f:
                raw_data = json.load(f)
            batch_frames.append(create_data_frame(raw_data, f'data/batch_{i}/background_images'))
        # DataFrame.append no longer exists in pandas >= 2.0; one concat also
        # avoids re-copying the accumulated frame for every batch.
        pd.concat(batch_frames).to_csv(path)

    # The cache is written with its index, which comes back as 'Unnamed: 0'.
    df = pd.read_csv(path).drop(columns = 'Unnamed: 0')

    list_cols = ['xmins_raw', 'xmaxs_raw', 'ymins_raw', 'ymaxs_raw', 'xmins', 'xmaxs', 'ymins', 'ymaxs']
    for c in list_cols:
        df[c] = df[c].apply(json.loads)

    # Token cells are parsed as Python literals. No backslash pre-processing
    # is done: a plain Series.replace(old, new) only matches whole cell
    # values, so it cannot rewrite escapes inside these strings anyway.
    for c in ['latex', 'visible_latex_chars']:
        df[c] = df[c].apply(ast.literal_eval)

    with open(file='data/extras/visible_char_map.json') as f:
        visible_char_map = json.load(f)

    return df, visible_char_map

In [None]:
def split_dataframe(df):
    """Split ``df`` into an 80 % training part and a 20 % held-out part.

    The split is delegated to scikit-learn's ``train_test_split`` with a
    fixed ``random_state`` of 4995.

    Args:
        df: Frame to partition.

    Returns:
        Tuple ``(train_part, held_out_part)``.
    """
    split_options = {'test_size': 0.20, 'random_state': 4995}
    train_part, held_out_part = train_test_split(df, **split_options)
    return train_part, held_out_part

def prepare_data(batch_size = 32, caption_task = False):
    """Load the dataset and derive the symbol vocabulary for the chosen task.

    The vocabulary is the sorted set of all tokens found in the ``'latex'``
    column when ``caption_task`` is truthy, otherwise in the
    ``'visible_latex_chars'`` column. The character map returned by
    ``load_data`` is discarded and rebuilt from that vocabulary.

    Args:
        batch_size: Accepted for interface compatibility; not used here.
        caption_task: Selects which token column defines the vocabulary.

    Returns:
        Tuple ``(df, visible_char_map, num_classes, classes)`` where
        ``classes`` is the sorted list of distinct symbols, ``num_classes``
        its length and ``visible_char_map`` maps each symbol to its 1-based
        position in ``classes`` (id 0 is never assigned).
    """
    df, _ = load_data()

    token_column = 'latex' if caption_task else 'visible_latex_chars'
    distinct_symbols = {
        symbol
        for sequence in df[token_column].tolist()
        for symbol in sequence
    }

    classes = sorted(distinct_symbols)
    num_classes = len(distinct_symbols)
    visible_char_map = {
        symbol: position
        for position, symbol in enumerate(classes, start=1)
    }

    return df, visible_char_map, num_classes, classes

def build_dataloaders(df, visible_char_map, test_set = False, df2 = None,  batch_size = 32, bad_classes = None, caption = False, pad_index = 0):
    """Build the PyTorch data loaders for the caption or the symbol task.

    Args:
        df: Source frame. Used whole for the caption test loader; otherwise
            it is the training frame when ``df2`` is given, or is split by
            ``split_dataframe`` into train / validation parts when it is not.
        visible_char_map: Symbol-to-id mapping handed to the dataset classes.
        test_set: Together with ``caption=True``, return a single
            non-shuffled test loader over ``df`` (dataset created with
            ``return_file_name=True``). Ignored when ``caption`` is False.
        df2: Optional validation frame.
        batch_size: Batch size of every loader.
        bad_classes: Forwarded to ``HandwrittenDataset`` (non-caption task).
        caption: Use ``HandwrittenCaptionDataset`` with padding collate
            functions instead of ``HandwrittenDataset``.
        pad_index: Padding index forwarded to the collate functions.

    Returns:
        ``test_loader`` when ``caption and test_set``; otherwise the tuple
        ``(train_loader, val_loader)``.
    """
    image_size = (896, 896)
    # Per-channel mean / std handed to Normalize.
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    data_transforms = {
        # NOTE(review): RandomHorizontalFlip mirrors the image only; confirm
        # that mirrored handwriting (and untouched targets) is intended.
        'train': transforms.Compose([
            transforms.Resize(image_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(channel_mean, channel_std),
        ]),
        'val': transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(channel_mean, channel_std),
        ]),
    }

    if caption and test_set:
        # Evaluation must be deterministic: use the augmentation-free 'val'
        # pipeline so test images are never randomly flipped.
        test_dataset = HandwrittenCaptionDataset(df, visible_char_map, transform = data_transforms['val'], return_file_name = True, train_mode = False)
        test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False, num_workers=1, collate_fn=lambda x: pad_batch_rnn_validation_file_name(x, pad_index=pad_index))
        return test_loader

    if df2 is None:
        train_df, val_df = split_dataframe(df)
    else:
        train_df, val_df = df, df2

    if caption:
        train_dataset = HandwrittenCaptionDataset(train_df, visible_char_map, transform = data_transforms['train'])
        train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, num_workers=1, collate_fn=lambda x: pad_batch_rnn(x, pad_index=pad_index))

        val_dataset = HandwrittenCaptionDataset(val_df, visible_char_map, transform = data_transforms['val'], train_mode = False)
        val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False, num_workers=1, collate_fn=lambda x: pad_batch_rnn_validation(x, pad_index=pad_index))

    else:
        train_dataset = HandwrittenDataset(train_df, visible_char_map, transform = data_transforms['train'], bad_classes = bad_classes)
        train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, num_workers=1)

        val_dataset = HandwrittenDataset(val_df, visible_char_map, transform = data_transforms['val'], bad_classes = bad_classes)
        val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False, num_workers=1)

    return train_loader, val_loader

In [None]:
# Load the data and build the vocabulary from the 'latex' token column
# (caption_task=True); ids in visible_char_map start at 1.
df, visible_char_map, num_classes, classes = prepare_data(caption_task = True)

In [None]:
def train_embedding(data, num_classes):
    """Train and save a grid of Word2Vec token-embedding models.

    One model is trained for every combination of architecture
    (``sg`` = 1 / 0), context window (2, 3, 4) and epoch count (15, 20),
    all with 150-dimensional vectors and ``sample=0``. The ``KeyedVectors``
    of each model are saved to
    ``embedding_models/skipgram_<bool>_window_<w>_iterations_<n>_vocabsize_<num_classes>_.model``.

    Args:
        data: Corpus handed to ``gensim.models.Word2Vec`` for every model.
            It is consumed once per grid point, so it must be re-iterable
            (e.g. a list of token lists).
        num_classes: Only used to tag the output file names; it is not
            passed to Word2Vec.
    """
    # Leave one core free, but never request zero worker threads
    # (cpu_count() - 1 is 0 on a single-core host).
    cores = max(1, multiprocessing.cpu_count() - 1)
    vectorsize = 150

    # Create the output directory up front so a missing folder cannot fail
    # the save after a model has already been trained.
    os.makedirs('embedding_models', exist_ok=True)

    for skipgram in [True, False]:
        for window in [2, 3, 4]:
            for iterations in [15, 20]:
                print(skipgram, window, iterations)

                mod_name = f"skipgram_{skipgram}_window_{window}_iterations_{iterations}_vocabsize_{num_classes}_"
                print(mod_name)
                # NOTE(review): min_count is left at gensim's default, so
                # tokens rarer than that threshold get no vector - confirm
                # this is intended for rare symbols.
                model = gensim.models.Word2Vec(
                    data,
                    vector_size=vectorsize,
                    window=window,
                    workers=cores,
                    sg=int(skipgram),
                    epochs=iterations,
                    sample=0,
                    # Without this the loss reported below is always 0.
                    compute_loss=True,
                )

                print(model)
                print(model.total_train_time)
                print(model.get_latest_training_loss())

                filename = f'embedding_models/{mod_name}.model'
                model.wv.save(filename)


In [None]:
# Corpus for the embedding: one token list per row of the 'latex' column.
sentences = df.latex.tolist()    

# Train the word embedding on these token sequences, without special tokens.

In [None]:
# Runs the whole hyper-parameter grid; each model's vectors are saved under
# embedding_models/.
train_embedding(sentences, num_classes)