# Goal of this notebook

Date: **26-02-2022**

[iNaturalist Dataset 2021](https://github.com/visipedia/inat_comp/tree/master/2021) contains 10,000 species under 11 'super-categories'. The challenge requires a single model that classifies all of the categories; instead, we will create a separate model for each super-category, as we believe that individual super-category-level models will achieve higher accuracy. <P>
We will preprocess the 'mini dataset' first. This dataset consists of 500px images in jpeg format with 50 images per species, making up 500,000 images. The dataset also includes location information; however, we will not be making use of that today. There is also a `.json` file that accompanies the dataset, and we will use that to help organise our images.

In [None]:
import os
from tqdm.notebook import tqdm
import numpy as np
import json
import shutil
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Paths: the existing image directory and the output tree split by supercategory
DATA_DIR = './data/'
CURRENT_IMGS_DIR = os.path.join(DATA_DIR, 'train_mini')
NEW_IMGS_DIR = os.path.join(DATA_DIR, 'train_mini_supercategory')

# Make the new directory that stores images split by supercategory.
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(NEW_IMGS_DIR, exist_ok=True)

In [None]:
# Read the train_mini annotation file into a dict.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding (JSON files are exchanged as UTF-8).
with open(os.path.join(DATA_DIR, 'train_mini.json'), 'r', encoding='utf-8') as jf:
    data = json.load(jf)

In [None]:
# Group the category records by supercategory, then write one JSON file per
# supercategory into NEW_IMGS_DIR.

# supercategory name -> list of category records, in the order they appear in `data`
supercategory_dict = {}
for img_category in tqdm(data['categories']):
    supercategory_dict.setdefault(img_category['supercategory'], []).append(img_category)

# write "<supercategory>.json"; each file maps a running index to one category
# record (the json module serialises the integer keys as strings)
for supercategory, category_records in tqdm(supercategory_dict.items()):
    supercat_dict = dict(enumerate(category_records))
    with open(os.path.join(NEW_IMGS_DIR, f"{supercategory}.json"), 'w') as jf:
        json.dump(supercat_dict, jf)
        

In [None]:
# Plot the number of categories per supercategory.
# Build the two columns directly from the grouping dict; `keys` fixes the
# order so that `counts` lines up with it.
keys = list(supercategory_dict)
counts = np.array([len(supercategory_dict[key]) for key in keys])
counts_df = pd.DataFrame({'keys': keys, 'counts': counts})

# Draw on an explicit Axes and give both axes readable labels, so the figure
# stands on its own instead of showing the raw column names.
fig, ax = plt.subplots()
sns.barplot(y='keys', x='counts', data=counts_df, ax=ax)
ax.set(
    title='Number of species per supercategory',
    xlabel='Number of species',
    ylabel='Supercategory',
)
# annotate each bar with its count; the trailing ';' suppresses the cell's repr output
ax.bar_label(ax.containers[0]);

In [None]:
# Move the images of every category from CURRENT_IMGS_DIR/<image_dir_name>/
# into NEW_IMGS_DIR/<supercategory>/<name>/.
# NOTE: shutil.move relocates the files (it does not copy them), so the source
# directories are emptied as the loop runs.
for img_category in tqdm(data['categories']):

    supercategory_savepath = os.path.join(NEW_IMGS_DIR, img_category['supercategory'])
    img_dir = os.path.join(CURRENT_IMGS_DIR, img_category['image_dir_name'])
    new_img_dir = os.path.join(supercategory_savepath, img_category['name'])

    # Destination present and source directory gone: nothing left to move.
    if os.path.isdir(new_img_dir) and not os.path.isdir(img_dir):
        continue

    # Also creates the supercategory directory if needed. exist_ok=True means a
    # category whose move was interrupted part-way is resumed on re-run rather
    # than skipped just because its destination directory already exists.
    os.makedirs(new_img_dir, exist_ok=True)

    # move whatever is still left in the source directory
    for img_name in os.listdir(img_dir):
        shutil.move(os.path.join(img_dir, img_name), os.path.join(new_img_dir, img_name))
    