In [None]:
# Loading the training data set

import os
import json

path = "../../data/wb_recognition_dataset/train"

# Each sub-directory of `path` is named after an integer label id.
# Entries that are not directories, or whose name is not an integer
# (e.g. ".DS_Store", ".ipynb_checkpoints"), are skipped instead of crashing
# the int() conversion or the per-label listing below.
char_id_list = []
for entry_name in os.listdir(path):
    if not os.path.isdir(path + "/" + entry_name):
        continue
    try:
        char_id_list.append(int(entry_name))
    except ValueError:
        continue
char_id_list.sort()

# Frequency of each label in the training set:
# {label_id: number of entries in that label's directory}
sample_count_dict = {
    label_id: len(os.listdir(path + "/" + str(label_id)))
    for label_id in char_id_list
}

In [None]:
# Build the training lookup {image path: label id}.
# Each label directory is listed once and all of its files are registered
# under that directory's integer label.

path_to_label_dict = {}
for class_id in char_id_list:
    class_dir = f"{path}/{class_id}"
    path_to_label_dict.update(
        (f"{class_dir}/{file_name}", class_id)
        for file_name in os.listdir(class_dir)
    )

In [None]:
# Rank training labels from most to least frequent.
# Note: despite its name, sorted_sample_count_dict is a list of
# (label_id, count) tuples; train_x holds the label ids as strings in that order.

sorted_sample_count_dict = sorted(
    sample_count_dict.items(),
    key=lambda item: item[1],
    reverse=True,
)
train_x = [str(label_id) for label_id, _count in sorted_sample_count_dict]

In [None]:
# Creating the train manifest: {"<label>": [image paths]}.
# Keys follow the order of train_x (most frequent label first); each list keeps
# the insertion order of path_to_label_dict.

train_manifest = {label: [] for label in train_x}

# Single pass over the images instead of rescanning every image for every
# label. The loop variable is deliberately NOT called `path`, so the dataset
# root stored in the global `path` is not overwritten and earlier cells stay
# re-runnable.
for image_path, label_id in path_to_label_dict.items():
    bucket = train_manifest.get(str(label_id))
    # Images whose label is not listed in train_x are left out.
    if bucket is not None:
        bucket.append(image_path)

In [None]:
# Loading the validation dataset

import csv

csv_path = "../../data/wb_recognition_dataset/val/labels.csv"

# The csv module expects its file object to be opened with newline="" so that
# it can handle line endings (including newlines inside quoted fields) itself.
with open(csv_path, "r", newline="") as csv_file:
    reader = csv.reader(csv_file)

    # The first row is the column header, not data; None if the file is empty.
    header = next(reader, None)

    # {first column: second column} for every data row. Presumably this maps
    # an image id to its label -- confirm against the CSV header.
    # Blank lines are parsed as [] and skipped rather than raising IndexError.
    data_dict = {row[0]: row[1] for row in reader if row}

In [None]:
# Creating a dictionary of {"path": "label"} for validation set

img_path = "../../data/wb_recognition_dataset/val/images"
img_path_list = os.listdir(img_path)
val_path_to_label = {}

for img in img_path_list:
    # The CSV key is the file name without its extension. splitext works for
    # any extension length (".png", ".jpeg", ...), unlike slicing off a fixed
    # four characters.
    image_id = os.path.splitext(img)[0]
    # A file with no entry in data_dict raises KeyError, as before.
    val_path_to_label[img_path + "/" + img] = data_dict[image_id]

# Parallel lists of labels and paths, in the same (insertion) order.
val_image_labels = list(val_path_to_label.values())
val_image_paths = list(val_path_to_label.keys())

In [None]:
# Count how often each label occurs in the validation set.
# Keys appear in order of first occurrence in val_image_labels.

val_label_distribution = {}

for val_label in val_image_labels:
    val_label_distribution[val_label] = val_label_distribution.get(val_label, 0) + 1

In [None]:
# Rank validation labels from most to least frequent.
# val_sorted_sample_count_dict is a list of (label, count) tuples;
# val_x holds the labels as strings in that order.

val_sorted_sample_count_dict = sorted(
    val_label_distribution.items(),
    key=lambda item: item[1],
    reverse=True,
)
val_x = [str(val_label) for val_label, _count in val_sorted_sample_count_dict]

In [None]:
# Creating the validation set's manifest: {"<label>": [image paths]}.
# Keys follow the order of val_x (most frequent label first).

# Group the image paths by integer label in one pass instead of rescanning
# every image for every label. The loop variable is deliberately NOT called
# `path`, so the training-set root stored in the global `path` is not
# overwritten.
paths_by_label_id = {}
for image_path, raw_label in val_path_to_label.items():
    paths_by_label_id.setdefault(int(raw_label), []).append(image_path)

# Labels are matched by integer value, exactly as the original comparison
# int(val_path_to_label[path]) == int(label) did. Each key gets its own list.
val_manifest = {
    label: list(paths_by_label_id.get(int(label), []))
    for label in val_x
}

In [None]:
# Merge the train and validation manifests under their split names and
# persist the result as pretty-printed JSON.

total_manifest = dict(train=train_manifest, val=val_manifest)

manifest_json = json.dumps(total_manifest, indent=4)
with open("../../data/manifest.json", "w+") as manifest_file:
    manifest_file.write(manifest_json)