In [63]:
import numpy as np
import pandas as pd

import os
import shutil
import glob

from PIL import Image, ImageDraw

In [None]:
# Name of the dataset split to process. The cells below read
# ./dataset_v2/<dataset>.csv and the images in ./dataset_v2/<dataset>/
dataset = "train"

In [None]:
def cropImages(df, source_dir, target_dir, pad=5):
    """Cut every tagged quadrilateral out of its source image and save it as PNG.

    One file named ``<image_id>_<tag_id>.png`` is written to ``target_dir``
    for every distinct (image_id, tag_id) pair in ``df``. Each crop is the
    axis-aligned bounding box of the padded quadrilateral; inside that box,
    pixels outside the quadrilateral are taken from a 1-bit mask instead of
    from the image.

    Args:
        df: DataFrame with ``image_id`` and ``tag_id`` columns. Once those two
            are moved into the index, the first eight remaining columns are
            read as x1, y1, x2, y2, x3, y3, x4, y4 of the four corner points.
        source_dir: Directory containing the source images, each named
            ``<image_id>.<extension>``.
        target_dir: Directory that receives the crops; created when missing.
        pad: Margin added around the quadrilateral: the two smallest x (and y)
            values are decreased by ``pad`` and the two largest increased by
            it. Defaults to 5, the value previously hard-coded.

    Raises:
        FileNotFoundError: If ``source_dir`` holds no file matching
            ``<image_id>.*`` for an image id present in ``df``.
    """

    # Group by ids to loop through index levels
    # and make sure the first columns are x/y values
    df = df.groupby(['image_id', 'tag_id']).first()

    # Create missing directory (no-op when it already exists)
    os.makedirs(target_dir, exist_ok=True)

    # Loop through image ids
    for image_id, image_df in df.groupby(level=0):

        # Find the image without knowing the extension (jpg/tiff). The id is
        # escaped so it can never be interpreted as a glob pattern, and the
        # matches are sorted so the chosen file is deterministic.
        pattern = os.path.join(source_dir, f'{glob.escape(str(image_id))}.*')
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise FileNotFoundError(
                f'No image file found for image_id {image_id!r} '
                f'in {source_dir!r}')

        # The context manager releases the file handle once all tags of this
        # image have been cropped and saved
        with Image.open(matches[0]) as image:

            # Loop through tag ids
            for tag_id, tag_entry in image_df.groupby(level=1):

                # Filename for saving image; os.path.join works whether or
                # not target_dir ends with a path separator
                tag_file = os.path.join(target_dir, f'{image_id}_{tag_id}.png')

                # Extract x/y values into a private copy: the padding below is
                # applied in place and must not write into the array backing
                # the DataFrame (which may be a view or read-only)
                xyraw = np.array(tag_entry.values[0, :8])
                xvec, yvec = xyraw[0::2], xyraw[1::2]

                # Find x/y indices by size to adjust +/- padding
                xind, yind = np.argsort(xvec), np.argsort(yvec)

                # Push the two smallest coordinates outwards by -pad and the
                # two largest by +pad, on each axis
                xvec[xind[:2]] -= pad
                xvec[xind[2:]] += pad
                yvec[yind[:2]] -= pad
                yvec[yind[2:]] += pad

                # Find rectangle's (left, upper, right, lower)
                xmin, ymin = min(xvec), min(yvec)
                xmax, ymax = max(xvec), max(yvec)

                # Crop image as rectangle
                cropped = image.crop((xmin, ymin, xmax, ymax))

                # List of polygon points adjusted to cropped image
                xy = list(zip(xvec - xmin, yvec - ymin))

                # Mask is 1 everywhere except inside the polygon, so the
                # composite keeps the crop inside the polygon and the mask
                # outside of it
                mask = Image.new('1', cropped.size, 1)
                ImageDraw.Draw(mask).polygon(xy, outline=0, fill=0)
                Image.composite(mask, cropped, mask).save(tag_file)

In [None]:
%%time

# Crop every tagged object of the selected dataset; the crops are written
# to a 'cropped' sub-directory inside the dataset's image directory
source_dir = f'./dataset_v2/{dataset}/'
target_dir = f'./dataset_v2/{dataset}/cropped/'
df = pd.read_csv(f'./dataset_v2/{dataset}.csv')

cropImages(df=df, source_dir=source_dir, target_dir=target_dir)

In [71]:
def copyImages(df, source_dir, target_dir):
    """Copy the cropped tag images into one directory per category value.

    Every column of ``df`` from position 10 onward is treated as a category.
    For each category and each distinct (value, image_id, tag_id) row, the
    file ``<source_dir>/<image_id>_<tag_id>.png`` is copied to
    ``<target_dir>/<category>/<value>/``.

    Args:
        df: DataFrame with ``image_id`` and ``tag_id`` columns and the
            category columns starting at column position 10.
        source_dir: Directory holding the ``<image_id>_<tag_id>.png`` files.
        target_dir: Root directory for the category tree; created as needed.

    Raises:
        FileNotFoundError: If an expected ``<image_id>_<tag_id>.png`` file is
            missing from ``source_dir`` (raised by ``shutil.copy2``).
    """

    # Ignore impossible features
    df = df.replace(-1, 0)

    # Replace characters that are unwanted in directory names
    df = df.replace(' ', '_', regex=True)
    df = df.replace('/', '_', regex=True)

    # Loop through all categories starting at index 10
    for category in df.columns[10:]:

        # Per-category directory. This must NOT overwrite target_dir:
        # reassigning it would make every later category path start with
        # the previous category's name.
        category_dir = os.path.join(target_dir, str(category))
        os.makedirs(category_dir, exist_ok=True)

        # Distinct (value, image_id, tag_id) combinations; rows with a
        # missing key are skipped
        tags = df[[category, 'image_id', 'tag_id']].dropna().drop_duplicates()

        # Create one sub-directory per category value
        for sub_category in tags[category].unique():
            os.makedirs(os.path.join(category_dir, str(sub_category)),
                        exist_ok=True)

        # Copy each tag image into the directory of its category value
        for sub_category, image_id, tag_id in tags.itertuples(index=False,
                                                              name=None):
            sub_target_dir = os.path.join(category_dir, str(sub_category))
            tag_file = os.path.join(source_dir, f'{image_id}_{tag_id}.png')
            shutil.copy2(tag_file, sub_target_dir)

In [73]:
%%time

# Sort the cropped tag images into a 'divided' directory tree,
# one sub-directory per category and category value
source_dir = f'./dataset_v2/{dataset}/cropped/'
target_dir = f'./dataset_v2/{dataset}/divided/'
df = pd.read_csv(f'./dataset_v2/{dataset}.csv')

copyImages(df=df, source_dir=source_dir, target_dir=target_dir)

Wall time: 4min 26s


In [104]:
def moveSubclasses(df, source_dir):
    """Move each sub-class directory underneath the directory of its general class.

    For every distinct (general_class, sub_class) pair in ``df`` the directory
    ``<source_dir>/sub_class/<sub_class>/`` is moved to
    ``<source_dir>/<general_class>/<sub_class>/``. A pair whose target
    directory already exists is left untouched.

    Args:
        df: DataFrame with ``general_class``, ``sub_class``, ``image_id`` and
            ``tag_id`` columns.
        source_dir: Root directory that contains the ``sub_class`` directory
            and receives the per-general-class directories.

    Raises:
        FileNotFoundError: If a sub-class directory to be moved does not
            exist (raised by ``shutil.move``).
    """

    # Same value normalisation copyImages applies when it names the
    # directories: -1 becomes 0, spaces and slashes become underscores
    df = df.replace(-1, 0)
    df = df.replace(' ', '_', regex=True)
    df = df.replace('/', '_', regex=True)

    # Distinct (general_class, sub_class) pairs; rows with a missing key
    # column are skipped
    key_columns = ['general_class', 'sub_class', 'image_id', 'tag_id']
    pairs = (df[key_columns]
             .dropna()[['general_class', 'sub_class']]
             .drop_duplicates())

    for general_class, sub_class in pairs.itertuples(index=False, name=None):

        # Get source and target directories
        sub_source_dir = os.path.join(source_dir, 'sub_class', str(sub_class))
        general_dir = os.path.join(source_dir, str(general_class))
        sub_target_dir = os.path.join(general_dir, str(sub_class))

        # Move directory unless it is already in place
        if not os.path.exists(sub_target_dir):
            # With the parent present, shutil.move can rename the directory
            # instead of falling back to copying it and deleting the source
            os.makedirs(general_dir, exist_ok=True)
            shutil.move(sub_source_dir, sub_target_dir)

In [105]:
%%time

# Reuses df and target_dir from the copyImages cell above: target_dir is the
# 'divided' tree, whose sub_class directories are regrouped by general class
moveSubclasses(df, source_dir=target_dir)

Wall time: 411 ms
