In [None]:
import cv2
import csv
import os
import json
import random
import shutil
from collections import defaultdict

In [None]:
def load_image(path):
    """
    Reads the image file at the given path with OpenCV.

    :param path: String path of the image file to read.
    :return: The value returned by cv2.imread for the path. Callers in this
        file treat a None result as "the image could not be loaded".
    """
    image = cv2.imread(path)
    return image

In [None]:
def save_image(image, output_directory, filename):
    """
    Writes an image to disk with OpenCV.

    :param image: The image to save.
    :param output_directory: The directory to write the file to.
    :param filename: The filename of the output image; it is joined onto
        output_directory to form the destination path.
    """
    destination = os.path.join(output_directory, filename)
    cv2.imwrite(destination, image)

In [None]:
def crop_image(image, x, y, width, height):
    """
    Cuts a rectangular window out of an image.

    :param image: The image to crop; it is indexed as image[rows, columns].
    :param x: The x-coordinate (first column) of the bounding box to crop.
    :param y: The y-coordinate (first row) of the bounding box to crop.
    :param width: The width (number of columns) of the box to crop.
    :param height: The height (number of rows) of the box to crop.
    :return: The cropped image.
    """
    row_span = slice(y, y + height)
    column_span = slice(x, x + width)
    return image[row_span, column_span]

In [None]:
def load_json(json_file_path):
    """
    Loads a json file.

    :param json_file_path: The path to the json file.
    :return: The parsed contents of the json file.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() uses the
    # platform's locale codec, which can garble non-ASCII text in the file.
    with open(json_file_path, encoding="utf-8") as json_file:
        return json.load(json_file)

In [None]:
def get_filename_and_regions_from_via_json(via_json_file_path):
    """
    Reads a via json file and maps every image filename to its regions.

    Each entry under the "_via_img_metadata" key contributes one item: its
    "filename" value becomes the key and its "regions" value becomes the value.

    :param via_json_file_path: The path to the via json file.
    :return: A dictionary where the keys are the filenames and the values are the regions associated with the file.
    """
    via_project = load_json(via_json_file_path)
    regions_by_filename = {}
    for image_entry in via_project["_via_img_metadata"].values():
        regions_by_filename[image_entry["filename"]] = image_entry["regions"]
    return regions_by_filename

In [None]:
def get_shape(region):
    """
    Looks up the shape name stored on a region.

    :param region: The region to get the shape of; it must have a
        "shape_attributes" entry that contains a "name".
    :return: The shape name of the region.
    """
    shape_attributes = region["shape_attributes"]
    return shape_attributes["name"]
    

In [None]:
def get_rect_bbox(region):
    """
    Gets the bounding box of a rectangular region, clipped to non-negative coordinates.

    If the annotated box starts at a negative x or y, the part that lies
    before coordinate 0 is cut off: the origin moves to 0 and the width or
    height shrinks by the same amount, so the far edge of the box stays where
    it was annotated. A width or height that ends up negative is reported as 0.

    :param region: The region to get the bounding box of.
    :return: List in the form [x, y, width, height] defining a rectangle, or
        None when the region's shape is not "rect".
    """
    shape_attributes = region["shape_attributes"]
    if shape_attributes["name"] != "rect":
        return None

    x = shape_attributes["x"]
    y = shape_attributes["y"]
    width = shape_attributes["width"]
    height = shape_attributes["height"]

    # Moving a negative origin to 0 without shrinking the extent would shift
    # the whole box; subtract the overhang so only the in-range part remains.
    if x < 0:
        width += x
        x = 0
    if y < 0:
        height += y
        y = 0

    return [x, y, max(width, 0), max(height, 0)]

In [None]:
def get_region_type(region):
    """
    Gets the type of the region.

    The "type" entry of the region's "region_attributes" may be either a plain
    value or a dictionary of {type name: flag}. For a dictionary, the first
    key whose flag is truthy is the region's type.

    :param region: The region to get the type of.
    :return: The type of the region, or None when the region has no "type"
        attribute or when no key of a dictionary-valued "type" is truthy.
    """
    region_type = region["region_attributes"].get("type")
    # isinstance also accepts dict subclasses, which an exact type comparison
    # would pass through untouched as if they were a plain type name.
    if isinstance(region_type, dict):
        for key, value in region_type.items():
            if value:
                return key
        return None
    return region_type

In [None]:
def create_directory_name_from_filename(filename):
    """
    Creates a directory name from a filename by dropping its extension.

    Only the final extension is removed, so filenames that contain additional
    dots (for example "scan.v1.jpg" and "scan.v2.jpg") keep distinct
    directory names instead of both collapsing to the text before the first dot.

    :param filename: The name of the file to use to create a directory name from.
    :return: Directory name.
    """
    return os.path.splitext(filename)[0]

In [None]:
def create_directory(path, directory_name):
    """
    Creates a directory if it doesn't already exist.

    The directory name is passed through clean_directory_name before it is
    joined onto the parent path, so the directory on disk uses the cleaned name.

    :param path: The folder the directory will be made in.
    :param directory_name: The name of the directory to make.
    :return: The filepath to the directory.
    """
    full_path = os.path.join(path, clean_directory_name(directory_name))
    # exist_ok avoids the window between an "exists?" check and the creation
    # in which another process could create the same directory.
    os.makedirs(full_path, exist_ok=True)
    return full_path
    

In [None]:
def clean_directory_name(directory_name):
    """
    Normalizes a name for use as a directory name.

    Every "/" and every space becomes "-", and the result is lower-cased.

    :param directory_name: The directory output name.
    :return: The cleaned string.
    """
    dash_for_separators = str.maketrans("/ ", "--")
    return directory_name.translate(dash_for_separators).lower()

In [None]:
def main(via_json_path, image_directory, output_directory, file_name_regions_dict):
    """
    Main entry point for the program.

    For every image, crops each rectangular region out of the image and saves
    it as "<region index>.jpg" under
    <output_directory>/<directory for the image>/<region type>/.
    Images that cannot be loaded and regions that are not rectangles are
    reported with a printed message and skipped.

    :param via_json_path: The path to the json file that describes the regions in the images.
        Not read by this function; the regions come from file_name_regions_dict.
    :param image_directory: Directory path which contains the images the json file describes.
    :param output_directory: The path to the output directory to write all the data to.
    :param file_name_regions_dict: A dictionary where the keys are the filenames and the values are a list of regions.
    """
    for filename, regions in file_name_regions_dict.items():
        image = load_image(os.path.join(image_directory, filename))
        if image is None:
            print("file {} could not be found".format(filename))
            continue

        current_file_output_directory = create_directory(
            output_directory, create_directory_name_from_filename(filename))
        for i, region in enumerate(regions):
            bbox = get_rect_bbox(region)
            if bbox is None:
                # get_rect_bbox returns None for non-rectangular shapes;
                # unpacking None into crop_image would raise a TypeError.
                print("file {} had a non-rectangular region that was skipped".format(filename))
                continue

            region_type = get_region_type(region)
            if region_type is None:
                print("file {} had an uncategorized box".format(filename))
                region_type = "uncategorized"

            cropped_image = crop_image(image, *bbox)
            region_type_output_directory = create_directory(current_file_output_directory, region_type)
            # The region's index within the file names the crop, so skipped
            # regions leave gaps rather than renumbering later crops.
            save_image(cropped_image, region_type_output_directory, "{}.jpg".format(i))

In [None]:
def find_bad_files(via_json_path, image_directory, file_name_regions_dict, bad_categories_set=None):
    """
    Gathers the names of files whose regions were tagged problematically.

    A region is a problem when it has no type, or when its cleaned type name
    (see clean_directory_name) is in bad_categories_set. Files that do not
    exist in image_directory are only reported with a printed message; they
    are not added to the result.

    :param via_json_path: The path to the json file that describes the regions in the images.
        Not read by this function.
    :param image_directory: Directory path which contains the images the json file describes.
    :param file_name_regions_dict: A dictionary where the keys are the filenames and the values are a list of regions.
    :param bad_categories_set: Set of cleaned category names that mark a file as bad, or None to
        only look for uncategorized regions.
    :return: A 2D list of [filename, reason] rows, one row per problematic region.
    """
    problems = []
    for filename, regions in file_name_regions_dict.items():
        image_path = os.path.join(image_directory, filename)
        if not os.path.exists(image_path):
            print("file {} could not be found".format(filename))
            continue

        for region in regions:
            region_type = get_region_type(region)
            if region_type is None:
                problems.append([filename, "contains uncategorized region"])
                continue
            if bad_categories_set is None:
                continue
            if clean_directory_name(region_type) in bad_categories_set:
                problems.append([filename, f"contains region named {region_type}"])

    return problems

In [None]:
def _subdirectory_names(directory):
    """Returns the names of the directories directly inside the given directory."""
    return [entry.name for entry in os.scandir(directory) if entry.is_dir()]


def collect_files_by_category(base_directory):
    """
    Groups file paths by the name of the category folder that holds them.

    The expected layout is <base_directory>/<folder>/<category>/<file>:
    every entry found inside a category folder is recorded under that
    category's name, across all first-level folders.

    :param base_directory: Directory to collect from.
    :return: A defaultdict(list) mapping each category name to the list of paths found for it.
    """
    files_by_category = defaultdict(list)
    for folder_name in _subdirectory_names(base_directory):
        folder_path = os.path.join(base_directory, folder_name)
        for category in _subdirectory_names(folder_path):
            category_path = os.path.join(folder_path, category)
            for file_name in os.listdir(category_path):
                files_by_category[category].append(os.path.join(category_path, file_name))

    return files_by_category

In [None]:
def get_category_count(base_directory):
    """
    Counts how many files were collected for each category.

    :param base_directory: Path to the base directory to count the number of each class of files.
    :return: A defaultdict(int) mapping each category name to its number of files.
    """
    files_by_category = collect_files_by_category(base_directory)
    return defaultdict(
        int,
        ((category, len(paths)) for category, paths in files_by_category.items()),
    )

In [None]:
def random_sample(base_directory, no_samples):
    """
    Draws a random selection of files from every category.

    A category that holds fewer than no_samples files contributes all of its
    files (in random order).

    :param base_directory: Directory to collect from.
    :param no_samples: The number of samples to take from each category
    :return: A dictionary mapping each category name to the list of sampled file paths.
    """
    files_by_category = collect_files_by_category(base_directory)
    return {
        category: random.sample(paths, min(len(paths), no_samples))
        for category, paths in files_by_category.items()
    }

In [None]:
def collect_subset_of_files(output_dir, category_files_dict):
    """
    Copies the given files into one output folder per category.

    Inside each category folder the copies are named "0.jpg", "1.jpg", ...
    following the order of the category's list of paths.

    :param output_dir: The directory to write the output files to.
    :param category_files_dict: Dictionary where the keys are the classes of the files and the
    value is a list of filepaths.
    """
    for category, source_paths in category_files_dict.items():
        category_directory = create_directory(output_dir, category)
        for index, source_path in enumerate(source_paths):
            destination = os.path.join(category_directory, f"{index}.jpg")
            shutil.copy(source_path, destination)

In [None]:
# Machine-specific locations used by the cells below; edit these before running elsewhere.
# Folder holding the annotated source images.
IMAGE_DIRECTORY = r"C:\Users\Ethan\Desktop\repos\princeton-bitmoji-project-new\data\confirmed-yes-2-15-2022"
# VIA json file describing the regions in those images.
VIA_JSON_PATH = r"C:\Users\Ethan\Desktop\repos\princeton-bitmoji-project-new\data\All Annotations Feb 20 2022.json"
# Root folder for the cropped regions (passed to main and get_category_count in the last cell).
OUTPUT_DIRECTORY = r"C:\Users\Ethan\Desktop\repos\princeton-bitmoji-project-new\output\all-annotations-feb-20-2022"
# Folder for the per-category example files (passed to collect_subset_of_files in the last cell).
SAMPLE_OUTPUT_DIRECTORY = r"C:\Users\Ethan\Desktop\repos\princeton-bitmoji-project-new\output\examples"

In [None]:
# Parse the VIA json once; the filename -> regions mapping is reused by the cells below.
file_name_regions_dict = get_filename_and_regions_from_via_json(VIA_JSON_PATH)

In [None]:
# Category names (in the cleaned form produced by clean_directory_name) that flag a file as badly tagged.
# NOTE(review): "teacher-outfir" looks like a misspelled label that this check is meant to catch - confirm.
bad_categories = set(["teacher-outfir", "other", "poster--wall-hanging"])
bad_files = find_bad_files(VIA_JSON_PATH, IMAGE_DIRECTORY, file_name_regions_dict, bad_categories)
# Write one CSV row per problematic region; newline="" is what the csv module expects for files it writes.
with open("bad_files.csv", "w", newline="") as file:
    headers = ["file-name", "reason"]
    writer = csv.writer(file)
    writer.writerow(headers)
    writer.writerows(bad_files)

In [None]:
# Fix the seed of the random module so the sampling done by random_sample is repeatable.
random.seed(2)

In [None]:
# Pipeline steps, left commented out so they are only run on demand:
#   1. crop every region into OUTPUT_DIRECTORY,
#   2. sample up to 10 files per category,
#   3. copy the samples into SAMPLE_OUTPUT_DIRECTORY.
# main(VIA_JSON_PATH, IMAGE_DIRECTORY, OUTPUT_DIRECTORY, file_name_regions_dict)
# sample = random_sample(OUTPUT_DIRECTORY, 10)
# collect_subset_of_files(SAMPLE_OUTPUT_DIRECTORY, sample)
# Show the number of files per category under OUTPUT_DIRECTORY.
get_category_count(OUTPUT_DIRECTORY)