In [72]:
import numpy as np
import pandas as pd

import math
import os
import shutil
import glob

from PIL import Image, ImageDraw

In [2]:
# Name of the dataset split to process. The cells below use it to build the
# CSV path (`./dataset_v2/<dataset>.csv`) and the image folder
# (`./dataset_v2/<dataset>/`).
dataset = 'train'

In [None]:
def cropImages(df, source_dir, target_dir, pad=5):
    """Cut every tagged polygon out of its source image and save it as a PNG.

    The table is collapsed to one row per (image_id, tag_id). For each row the
    first eight remaining columns are read as x1, y1, ..., x4, y4. The polygon
    is pushed outwards by `pad`, its bounding rectangle is cropped from the
    image, and the area outside the polygon is masked before the result is
    written to `<target_dir>/<image_id>_<tag_id>.png`.

    Parameters
    ----------
    df : pandas.DataFrame
        Tag table with `image_id` and `tag_id` columns; the first eight other
        columns must hold the polygon coordinates in x/y order.
    source_dir : str
        Folder holding the source images, named `<image_id>.<extension>`.
    target_dir : str
        Folder that receives the cropped PNGs; created if missing. A trailing
        path separator is optional.
    pad : int or float, optional
        Amount subtracted from the two smallest and added to the two largest
        x (and y) coordinates. Defaults to 5, the previously hard-coded value.

    Raises
    ------
    FileNotFoundError
        If no file matching `<image_id>.*` exists in `source_dir`.
    """
    # Group by ids to loop through index levels; the ids move into the index,
    # which leaves the x/y values as the first columns
    df = df.groupby(['image_id', 'tag_id']).first()

    # Create missing directory
    os.makedirs(target_dir, exist_ok=True)

    # Loop through image ids
    for image_id, image_df in df.groupby(level=0):

        # Find the image without knowing the extension (jpg/tiff). Sorting makes
        # the choice deterministic when several files share the same stem.
        candidates = sorted(glob.glob(os.path.join(source_dir, f'{image_id}.*')))
        if not candidates:
            raise FileNotFoundError(
                f'No image file found for image_id {image_id!r} in {source_dir!r}')

        # The context manager releases the file handle once all tags of this
        # image are written, instead of leaving one handle open per image.
        with Image.open(candidates[0]) as image:

            # Loop through tag ids
            for tag_id, tag_entry in image_df.groupby(level=1):

                # Private float copy of the coordinates, so the in-place
                # padding below can never write back into the DataFrame
                xyraw = np.array(tag_entry.values[0, :8], dtype=float)
                xvec, yvec = xyraw[0::2], xyraw[1::2]

                # Order the corners by coordinate: the two smallest move by
                # -pad, the two largest by +pad
                xind, yind = np.argsort(xvec), np.argsort(yvec)
                xvec[xind[:2]] -= pad
                xvec[xind[2:]] += pad
                yvec[yind[:2]] -= pad
                yvec[yind[2:]] += pad

                # Bounding rectangle (left, upper, right, lower)
                xmin, ymin = xvec.min(), yvec.min()
                xmax, ymax = xvec.max(), yvec.max()

                # Crop image as rectangle
                cropped = image.crop((xmin, ymin, xmax, ymax))

                # Polygon points expressed in the cropped image's coordinates
                xy = list(zip(xvec - xmin, yvec - ymin))

                # Mask is 1 everywhere except inside the polygon (0), so the
                # composite keeps the cropped pixels only inside the polygon
                mask = Image.new('1', cropped.size, 1)
                ImageDraw.Draw(mask).polygon(xy, outline=0, fill=0)

                # Filename for saving image
                tag_file = os.path.join(target_dir, f'{image_id}_{tag_id}.png')
                Image.composite(mask, cropped, mask).save(tag_file)

In [None]:
%%time

# Read the tag table of the selected split and crop every tagged polygon from
# the images in `source_dir` into the `cropped/` sub-folder.
df = pd.read_csv(f'./dataset_v2/{dataset}.csv')
source_dir = f'./dataset_v2/{dataset}/'
target_dir = f'./dataset_v2/{dataset}/cropped/'

cropImages(df, source_dir, target_dir)

In [160]:
def copyImagesClasses(df, source_dir, target_dir):
    """Copy each cropped tag image into one folder per class it belongs to.

    Text columns are one-hot encoded, so every distinct value becomes an
    indicator column named after the value. Every column from position 10
    onwards is treated as an indicator: for each row where it equals 1,
    `<source_dir>/<image_id>_<tag_id>.png` is copied into
    `<target_dir>/<column name>/`.

    Parameters
    ----------
    df : pandas.DataFrame
        Tag table with `image_id` and `tag_id` columns. NOTE(review): the first
        ten columns are presumably the ids plus eight coordinates; confirm
        against the CSV layout.
    source_dir : str
        Folder holding the cropped `<image_id>_<tag_id>.png` files.
    target_dir : str
        Root folder for the per-class folders; created on demand. A trailing
        path separator is optional for both folders.
    """
    # Replace characters that are unwanted in directory names
    df = df.replace(r'[ /]', '_', regex=True)

    # One-hot-encode; get_dummies already picks up every object/string/category
    # column, so no explicit cast to 'category' is needed beforehand
    df = pd.get_dummies(df, prefix='', prefix_sep='')

    # Loop through all indicator columns starting at index 10. Columns are
    # addressed by position so that two text columns sharing a value (and thus
    # a dummy name) do not break the lookup.
    for position in range(10, df.shape[1]):
        category = df.columns[position]

        # Rows flagged with 1 (True for boolean dummies). A column without any
        # positive row yields an empty selection instead of a KeyError.
        flagged = (df.iloc[:, position] == 1).to_numpy()
        tagged = df.loc[flagged, ['image_id', 'tag_id']].dropna().drop_duplicates()
        if tagged.empty:
            continue

        # Create category directory
        cat_target_dir = os.path.join(target_dir, str(category))
        os.makedirs(cat_target_dir, exist_ok=True)

        # Copy every unique (image, tag) crop into the category directory
        for image_id, tag_id in zip(tagged['image_id'], tagged['tag_id']):
            tag_file = os.path.join(source_dir, f'{image_id}_{tag_id}.png')
            shutil.copy2(tag_file, cat_target_dir)
        
    

In [162]:
%%time

# Re-read the tag table and copy the cropped tag images into one folder per
# class under `classes/`.
df = pd.read_csv(f'./dataset_v2/{dataset}.csv')
source_dir = f'./dataset_v2/{dataset}/cropped/'
target_dir = f'./dataset_v2/{dataset}/classes/'

copyImagesClasses(df, source_dir, target_dir)

Wall time: 38.9 ms


In [7]:
def copyImagesExclusive(df, source_dir, target_dir):
    """Copy cropped tag images into `<category>/<value>/` folders.

    Every column from position 10 onwards is treated as a category. For each
    distinct value of that column (except -1), the crops of all rows carrying
    the value are copied from `<source_dir>/<image_id>_<tag_id>.png` into
    `<target_dir>/<category>/<value>/`.

    Parameters
    ----------
    df : pandas.DataFrame
        Tag table with `image_id` and `tag_id` columns; category columns start
        at position 10.
    source_dir : str
        Folder holding the cropped `<image_id>_<tag_id>.png` files.
    target_dir : str
        Root folder for the category folders; created on demand. A trailing
        path separator is optional for both folders.
    """
    # Replace characters that are unwanted in directory names
    df = df.replace(r'[ /]', '_', regex=True)

    # Loop through all categories starting at index 10
    for category in df.columns[10:]:

        # Create category directory
        cat_target_dir = os.path.join(target_dir, str(category))
        os.makedirs(cat_target_dir, exist_ok=True)

        # Only the unique (value, image, tag) triples are needed, so select
        # those three columns instead of aggregating the whole table
        tags = df[[category, 'image_id', 'tag_id']].dropna().drop_duplicates()

        # Loop through all sub-categories
        for sub_category, sub_df in tags.groupby(category):

            # Skip impossible features (encoded as -1)
            if sub_category == -1:
                continue

            # Create sub-category directory
            sub_target_dir = os.path.join(cat_target_dir, f'{sub_category}')
            os.makedirs(sub_target_dir, exist_ok=True)

            # Copy every crop carrying this value
            for image_id, tag_id in zip(sub_df['image_id'], sub_df['tag_id']):
                tag_file = os.path.join(source_dir, f'{image_id}_{tag_id}.png')
                shutil.copy2(tag_file, sub_target_dir)

In [8]:
%%time

# Re-read the tag table and copy the cropped tag images into
# `divided/<category>/<value>/` folders.
df = pd.read_csv(f'./dataset_v2/{dataset}.csv')
source_dir = f'./dataset_v2/{dataset}/cropped/'
target_dir = f'./dataset_v2/{dataset}/divided/'

copyImagesExclusive(df, source_dir, target_dir)

Wall time: 2min 52s


In [9]:
def moveSubclasses(df, source_dir):
    """Move each sub-class folder underneath the general class it belongs to.

    For every (general_class, sub_class) pair in `df`, the folder
    `<source_dir>/sub_class/<sub_class>/` is moved to
    `<source_dir>/<general_class>/<sub_class>/`. Pairs whose target folder
    already exists are skipped, so re-running after a completed run is a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Tag table with `general_class`, `sub_class`, `image_id` and `tag_id`
        columns.
    source_dir : str
        Folder containing the `sub_class/` folder; the general-class folders
        are created next to it. A trailing path separator is optional.
    """
    # Replace characters that are unwanted in directory names
    df = df.replace(r'[ /]', '_', regex=True)

    # Only the unique (general_class, sub_class) pairs matter. Rows missing an
    # id are dropped first, matching the previous four-key grouping.
    pairs = (
        df.dropna(subset=['image_id', 'tag_id'])
          .groupby(['general_class', 'sub_class'])
          .size()
          .index
    )

    # Loop through general classes and their sub-classes
    for general_class, sub_class in pairs:

        # Get source and target directories
        sub_source_dir = os.path.join(source_dir, 'sub_class', f'{sub_class}')
        sub_target_dir = os.path.join(source_dir, f'{general_class}', f'{sub_class}')

        # Move directory
        if not os.path.exists(sub_target_dir):
            # Create the parent so the move can be a plain rename rather than
            # falling back to copy-and-delete
            os.makedirs(os.path.dirname(sub_target_dir), exist_ok=True)
            shutil.move(sub_source_dir, sub_target_dir)

In [10]:
%%time

moveSubclasses(df, target_dir)

Wall time: 4.47 s


In [84]:
from keras.preprocessing.image import ImageDataGenerator
        
def augmentImages(source_dir, target_dir):
    """Write augmented copies of the images of every `<category>/<sub_category>/`.

    `source_dir` is expected to contain one folder per category, each holding
    one folder per sub-category. For each category the largest sub-category
    size determines how many batches are drawn for every sub-category.
    Augmented images are written to `<target_dir>/<category>/<sub_category>/`.
    A sub-category whose target folder already exists is skipped.

    Parameters
    ----------
    source_dir : str
        Root folder of the `<category>/<sub_category>/` image tree.
    target_dir : str
        Root folder receiving the augmented images; created on demand. A
        trailing path separator is optional for both folders.
    """
    generator = ImageDataGenerator(
        rotation_range=45,
        width_shift_range=0.2,
        height_shift_range=0.2,
        horizontal_flip=True,
        vertical_flip=True,
        fill_mode='constant',
        cval=255
    )
    batch_size = 32

    for category in os.listdir(source_dir):

        # Ignore stray files next to the category folders
        cat_source_dir = os.path.join(source_dir, category)
        if not os.path.isdir(cat_source_dir):
            continue

        sub_categories = [
            entry for entry in os.listdir(cat_source_dir)
            if os.path.isdir(os.path.join(cat_source_dir, entry))
        ]

        # A category folder without sub-folders has nothing to augment;
        # max() over an empty sequence would raise ValueError here
        if not sub_categories:
            continue

        print(f'Augmenting {category}')

        # Size of the largest sub-category decides the number of batches
        max_samples = max(
            len(os.listdir(os.path.join(cat_source_dir, sub_category)))
            for sub_category in sub_categories
        )

        # NOTE(review): a batch holds at most `batch_size` images, and
        # presumably fewer for sub-categories smaller than one batch, so the
        # output is likely not balanced to `max_samples` -- confirm against
        # the Keras iterator behaviour before relying on it.
        steps = math.ceil(max_samples / batch_size)

        for sub_category in sub_categories:

            # An existing target folder is taken as "already augmented"
            sub_target_dir = os.path.join(target_dir, category, sub_category)
            if os.path.exists(sub_target_dir):
                continue
            os.makedirs(sub_target_dir)

            image_gen = generator.flow_from_directory(
                classes=[sub_category],
                target_size=(128, 128),
                batch_size=batch_size,
                directory=cat_source_dir,
                save_to_dir=sub_target_dir
            )

            # Drawing a batch writes its augmented images to `save_to_dir`
            for _ in range(steps):
                next(image_gen)
            
        
    
    

In [85]:
%%time

# Augment the images found under `divided/` and write the results to
# `augmented/`.
source_dir = f'./dataset_v2/{dataset}/divided/'
target_dir = f'./dataset_v2/{dataset}/augmented/'

augmentImages(source_dir, target_dir)

Found 368 images belonging to 1 classes.
Found 72 images belonging to 1 classes.
Found 1158 images belonging to 1 classes.
Found 742 images belonging to 1 classes.
Found 97 images belonging to 1 classes.
Found 626 images belonging to 1 classes.
Found 414 images belonging to 1 classes.
Found 3505 images belonging to 1 classes.
Found 4817 images belonging to 1 classes.
Found 258 images belonging to 1 classes.
Found 374 images belonging to 1 classes.
Found 133 images belonging to 1 classes.
Found 10939 images belonging to 1 classes.
Found 172 images belonging to 1 classes.
Found 429 images belonging to 1 classes.
Found 77 images belonging to 1 classes.
Found 503 images belonging to 1 classes.
Found 4 images belonging to 1 classes.
Found 505 images belonging to 1 classes.
Found 2 images belonging to 1 classes.
Found 53 images belonging to 1 classes.
Found 17 images belonging to 1 classes.
Found 16 images belonging to 1 classes.
Found 5 images belonging to 1 classes.
Found 164 images belong