## Check number of images and ensure no duplicate paths or duplicate file names exist

In [None]:
import os
from glob import glob

# Collect the path of every .jpg background image from batches 1 through 10.
images = []
for batch_num in range(1, 11):
    pattern = os.path.join(f'archive/batch_{batch_num}/background_images/', '*.jpg')
    images += glob(pattern)

### Check for duplicate paths

In [None]:
# Total number of image paths collected across all batches.
len(images)

In [None]:
# True only when no path occurs more than once in `images`
# (a set drops repeats, so the lengths differ if any exist).
len(images) == len(set(images))

### Check for duplicate base names

In [None]:
# File name (final path component) of each collected image, in the same order.
basenames = [os.path.basename(path) for path in images]

In [None]:
# Number of base names; one was derived for each entry in `images`.
len(basenames)

In [None]:
# True only when every image has a distinct file name. This matters because
# the images are later moved into a single flat directory, where two files
# with the same name would collide.
len(basenames) == len(set(basenames))

## Move all files to data/images

In [None]:
from tqdm import tqdm


def move_batch(batch_num):
    """Move one batch's background images into the flat ``data/images`` folder.

    Every ``*.jpg`` file under ``archive/batch_{batch_num}/background_images/``
    is moved to ``data/images/`` under its original file name. The destination
    directory is created if it does not exist yet.

    Args:
        batch_num: Number of the batch directory to process.

    Raises:
        OSError: If a file cannot be moved. The previous shell-based ``mv``
            call discarded the exit status, so failures went unnoticed.
    """
    # Local import keeps this cell self-contained.
    import shutil

    images = glob(f"archive/batch_{batch_num}/background_images/*.jpg")
    os.makedirs("data/images", exist_ok=True)

    for image in tqdm(images, desc=f"Moving batch {batch_num}", unit="image"):
        destination = os.path.join("data/images", os.path.basename(image))
        # shutil.move needs no shell, so paths containing spaces or shell
        # metacharacters are handled correctly, and no process is spawned
        # per file.
        shutil.move(image, destination)

In [None]:
# Relocate the images of batches 1 through 10, one batch at a time.
for batch_num in range(1, 11):
    move_batch(batch_num)

## Build a new annotations file keyed by image filename instead of a JSON array

In [None]:
import json
# Fields copied unchanged from each source record into the new mapping.
annotation_fields = (
    "latex",
    "uuid",
    "unicode_str",
    "unicode_less_curlies",
    "image_data",
    "font",
)

# Maps an image file name to its annotation fields. A file name that occurs
# in more than one record keeps only the last record read.
annotations = {}

for i in range(1, 11):
    json_paths = glob(f"archive/batch_{i}/JSON/*.json")
    if not json_paths:
        # Fail with a clear message rather than a bare IndexError.
        raise FileNotFoundError(
            f"No annotation JSON file found in archive/batch_{i}/JSON/"
        )

    # Only the first matching file of each batch is read. The context manager
    # closes the handle promptly, and the explicit encoding keeps the read
    # independent of the platform's locale.
    with open(json_paths[0], encoding="utf-8") as batch_file:
        batch_annotations = json.load(batch_file)

    for annotation in batch_annotations:
        annotations[annotation["filename"]] = {
            field: annotation[field] for field in annotation_fields
        }

In [None]:
# Write the filename-keyed annotations to disk. The context manager ensures
# the file is flushed and closed as soon as the dump finishes.
with open("data/annotations.json", "w") as annotations_file:
    json.dump(annotations, annotations_file, indent=4)