This notebook allows to do over sampling with augmentation, so new instances are not the same. Also under-samling by capping the maximum number of images per category

## 1. Data augmentation & over-under sampling

In [19]:
import os
from keras.preprocessing.image import ImageDataGenerator

folder = './data/files/train_additional'
out_folder = './data/files/train_additional_aug'

categories_1 = []
categories_2 = []
categories_3 = []
categories_4 = []
categories_5 = []

categories_1_max = 15
categories_2_max = 40
categories_3_max = 150
categories_4_max = 5000

x = 3

for category in os.listdir(folder):
    files = os.listdir(os.path.join(folder, category))
    length = len(files)
    if length <= categories_1_max*x:
        categories_1.append(category)
    elif length <= categories_2_max*x: 
        categories_2.append(category)
    elif length <= categories_3_max*x: 
        categories_3.append(category)
    elif length <= categories_4_max*x: 
        categories_4.append(category)
    else: 
        categories_5.append(category)

print(len(categories_1))
print(len(categories_2))
print(len(categories_3))
print(len(categories_4))
print(len(categories_5))

1321
1140
1435
1310
64


In [20]:
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

datagen_aug = ImageDataGenerator(
        rotation_range=10,
        shear_range=0.2,
        zoom_range=(0.9, 1.05),
        horizontal_flip=True)

In [21]:
def augment_images(batch_size, categories, multiplier):
    for category in categories:
        shutil.copytree(os.path.join(folder, category), os.path.join(out_folder, category))
    i = 0
    class2indices = dict(zip(categories, range(len(categories))))
    indices2class = dict(zip(range(len(categories)), categories))
    length = 0
    for category in categories:
        length += len(os.listdir(os.path.join(folder,category)))
    for img_batch in datagen_aug.flow_from_directory(folder, class_mode = 'sparse', target_size=(180, 180),
            batch_size=batch_size,
            classes = categories):
        for img, ind in zip(img_batch[0], img_batch[1]):
            scipy.misc.imsave(os.path.join(out_folder, indices2class[ind], 'img' + str(i) +'.jpg'), img)
            i+=1
        if i > length*(multiplier-1):
            break

In [22]:
import scipy
import shutil
batch_size = 512

In [23]:
augment_images(batch_size, categories_1, 8)

Found 28688 images belonging to 1321 classes.


`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.
  


In [24]:
augment_images(batch_size, categories_2, 4)
augment_images(batch_size, categories_3, 2)

Found 88479 images belonging to 1140 classes.


`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.
  


Found 345842 images belonging to 1435 classes.


In [25]:
import shutil
for category in categories_4:
    shutil.copytree(os.path.join(folder, category), os.path.join(out_folder, category))

In [26]:
import random
for category in categories_5:
    os.mkdir(os.path.join(out_folder, category))
    files = os.listdir(os.path.join(folder, category))
    for i in range(int(categories_4_max*x)):
        file = files.pop(random.randrange(len(files)))
        shutil.copy(os.path.join(folder, category, file), os.path.join(out_folder, category))

## 2. Under and over sampling

Pure over under sampling without augmentation

In [3]:
import os
from keras.preprocessing.image import ImageDataGenerator

folder = './data/files/train_small_single'
out_folder = './data/files/train_small_single_UOS'

categories_1 = []
categories_2 = []
categories_3 = []
categories_4 = []
categories_5 = []

x = 1

for category in os.listdir(folder):
    files = os.listdir(os.path.join(folder, category))
    length = len(files)
    if length <= 15*x:
        categories_1.append(category)
    elif length <= 50*x: 
        categories_2.append(category)
    elif length <= 150*x: 
        categories_3.append(category)
    elif length <= 5000*x: 
        categories_4.append(category)
    else: 
        categories_5.append(category)

print(len(categories_1))
print(len(categories_2))
print(len(categories_3))
print(len(categories_4))
print(len(categories_5))

353
304
188
196
13


In [4]:
import shutil

def add_images(categories, multiplier):
    for category in categories:
        shutil.copytree(os.path.join(folder, category), os.path.join(out_folder, category))
        for i in range(multiplier - 1):
            for file in os.listdir(os.path.join(folder, category)):
                shutil.copyfile(os.path.join(folder, category, file), os.path.join(out_folder, category, str(i+1)+'_'+ file))

In [5]:
add_images(categories_1, 4)
add_images(categories_2, 3)
add_images(categories_3, 2)
add_images(categories_4, 1)

In [6]:
import random
for category in categories_5:
    os.mkdir(os.path.join(out_folder, category))
    files = os.listdir(os.path.join(folder, category))
    for i in range(5000*x):
        file = files.pop(random.randrange(len(files)))
        shutil.copy(os.path.join(folder, category, file), os.path.join(out_folder, category))