In [2]:
import os
import sys
import pandas as pd
import numpy as np
import random
import cv2
from skimage import io
from shutil import copyfile

### Load data from .csv file
* train-images-boxable.csv file contains the image name and image url
* train-annotations-bbox.csv file contains the bounding box info with the image id (name) and the image label name
* class-descriptions-boxable.csv file contains the image label name corresponding to its class name

In [3]:
# Input .csv files, expected in the current working directory.
images_boxable_fname = 'train-images-boxable.csv'
annotations_bbox_fname = 'train-annotations-bbox.csv'
# NOTE(review): the file name carries a '2' suffix; later cells read the
# columns 'class' and 'name' from it, so it is presumably a copy of
# class-descriptions-boxable.csv with that header row added -- confirm.
class_descriptions_fname = 'class-descriptions-boxable2.csv'

# Only the columns needed to locate and label a bounding box are loaded.
annotations_cols = ['ImageID', 'LabelName', 'XMin', 'XMax', 'YMin', 'YMax']
# The dtype keys must match the column names exactly (case-sensitive);
# otherwise the dtype is not applied to the coordinate columns.
annotations_types = {
    'ImageID': np.dtype(str),
    'LabelName': np.dtype(str),
    'XMin': np.dtype(float),
    'XMax': np.dtype(float),
    'YMin': np.dtype(float),
    'YMax': np.dtype(float),
}

images_boxable = pd.read_csv(images_boxable_fname)
annotations_bbox = pd.read_csv(annotations_bbox_fname, usecols=annotations_cols, dtype=annotations_types)
class_descriptions = pd.read_csv(class_descriptions_fname)

### Define preprocessing parameters
* **class_desc:** list of all unique class descriptions, as written on the .csv files
* **classes:** manually written short terms for each class
* **imgs_per_class:** (imgs_per_class) X (N_CLASSES) = total amount of images of dataset to download
* **n_train_imgs:** number of images *per class* reserved for training; the remaining images of each class are used for testing

The default parameters are the ones used for the BodyParts (BP) Dataset

In [5]:
# Class descriptions exactly as they are looked up in the class-description
# .csv, and the short name used for each class in file/directory names.
# Both lists are index-aligned: classes[i] is the short name of class_desc[i].
class_desc = ['Human arm', 'Human beard', 'Human ear', 'Human eye', 'Human face',
              'Human foot', 'Human hair', 'Human hand', 'Human head', 'Human leg',
              'Human mouth', 'Human nose', 'Skull']
classes = ['arm', 'beard', 'ear', 'eye', 'face', 'foot', 'hair', 'hand', 'head',
           'leg', 'mouth', 'nose', 'skull']
assert len(class_desc) == len(classes), 'class_desc and classes must be index-aligned'
N_CLASSES = len(classes)

# Number of images selected per class, and how many of those go to training.
imgs_per_class = 100
n_train_imgs = 80

# Output locations of the dataset.
base_path = 'BP'
csv_path = 'BP/classCSVs'
data_path = 'BP/data'

# exist_ok=True keeps this cell re-runnable: os.mkdir would raise
# FileExistsError as soon as the directories exist from a previous run.
for directory in (base_path, csv_path, data_path):
    os.makedirs(directory, exist_ok=True)

### Write subsets of OpenImagesDB to a .csv

For each class in the custom dataset, all annotations belonging to that class are collected and the unique image IDs are extracted. The IDs are shuffled with a fixed seed and only the first `imgs_per_class` of them are kept. One .csv per class is then written with the names and URLs of the selected images.

In [14]:
# label_names[i] is the label name (as used in the annotation table) of
# class_desc[i]; later cells use it to map annotations back to classes.
label_names = []
for i in range(N_CLASSES):
    # Progress output: one line per processed class.
    print(class_desc[i])

    # Look up the label name that belongs to this class description.
    class_pd = class_descriptions[class_descriptions['class'] == class_desc[i]]
    if class_pd.empty:
        raise ValueError('Class description not found in ' + class_descriptions_fname
                         + ': ' + class_desc[i])
    label_name_class = class_pd['name'].values[0]
    label_names.append(label_name_class)

    # Unique IDs of all images that have at least one box of this class.
    class_bbox = annotations_bbox[annotations_bbox['LabelName'] == label_name_class]
    class_img_id = np.unique(class_bbox['ImageID'])

    # Seeded shuffle so the same subset is selected on every run.
    copy_class_id = class_img_id.copy()
    random.seed(1)
    random.shuffle(copy_class_id)
    subclass_img_id = copy_class_id[:imgs_per_class]

    # One (usually single-row) frame per selected image, in shuffled order.
    subclass_img_url = [images_boxable[images_boxable['image_name'] == name + '.jpg']
                        for name in subclass_img_id]

    # Concatenate once instead of growing a frame row by row:
    # DataFrame.append was removed in pandas 2.0. pd.concat rejects an empty
    # list, so a class without images falls back to an empty frame.
    if subclass_img_url:
        subclass_pd = pd.concat(subclass_img_url, ignore_index=True)
    else:
        subclass_pd = pd.DataFrame()
    subclass_pd.to_csv(os.path.join(csv_path, classes[i] + '_img_url.csv'))

Human arm
Human beard
Human ear
Human eye
Human face
Human foot
Human hair
Human hand
Human head
Human leg
Human mouth
Human nose
Skull


### Download images

In [16]:
# Download every image listed in the per-class url .csv files into
# <data_path>/<class short name>/.
for i in range(N_CLASSES):
    # csv_path already contains base_path ('BP/classCSVs'); joining base_path
    # in front of it again would point at 'BP/BP/classCSVs', where nothing
    # was written.
    subclass_pd = pd.read_csv(os.path.join(csv_path, classes[i] + '_img_url.csv'))
    subclass_img_url = subclass_pd['image_url'].values

    saved_dir = os.path.join(data_path, classes[i])
    # exist_ok=True keeps the cell re-runnable after a partial download.
    os.makedirs(saved_dir, exist_ok=True)

    for url in subclass_img_url:
        # The last 20 characters of the url are used as the file name; later
        # cells take the first 16 characters of that name as the image ID.
        saved_path = os.path.join(saved_dir, url[-20:])
        if os.path.exists(saved_path):
            # Already downloaded in a previous run; avoid fetching it again.
            continue
        img = io.imread(url)
        io.imsave(saved_path, img)

### Prepare dataset format for network

Train and test directories are created, and the set of downloaded images is divided as defined by n_train_imgs.

In [35]:
# Create the train/test directories (re-runnable thanks to exist_ok=True).
train_path = os.path.join(base_path, 'train')
os.makedirs(train_path, exist_ok=True)
test_path = os.path.join(base_path, 'test')
os.makedirs(test_path, exist_ok=True)

# NOTE(review): an image that was downloaded for several classes is split
# independently for each of them, so the same file name can end up in both
# train and test -- confirm whether that overlap is acceptable.
for i in range(N_CLASSES):
    class_dir = os.path.join(data_path, classes[i])

    # os.listdir returns entries in arbitrary order; sorting first makes the
    # seeded shuffle (and therefore the split) reproducible across runs.
    all_imgs = sorted(f for f in os.listdir(class_dir) if not f.startswith('.'))
    random.seed(1)
    random.shuffle(all_imgs)

    # First n_train_imgs images of the class go to train, the rest to test.
    train_imgs = all_imgs[:n_train_imgs]
    test_imgs = all_imgs[n_train_imgs:]

    # Copy each class's images into the flat train / test directories.
    for split_dir, split_imgs in ((train_path, train_imgs), (test_path, test_imgs)):
        for img_name in split_imgs:
            copyfile(os.path.join(class_dir, img_name),
                     os.path.join(split_dir, img_name))

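Format of the dataframes: (FileName, XMin, XMax, YMin, YMax, ClassName). The box coordinates are kept normalized here; they are converted to pixels when annotations.txt is written.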

In [38]:
# Build train_df: one row per bounding box of a dataset class found in the
# images of the train directory.
box_columns = ['FileName', 'XMin', 'XMax', 'YMin', 'YMax', 'ClassName']
# Map each label name to its short class name (both lists are index-aligned).
label_to_class = dict(zip(label_names, classes))

train_imgs = os.listdir(train_path)
train_imgs = [name for name in train_imgs if not name.startswith('.')]

# Filter the annotation table once (train images and dataset classes only)
# and group it by image, instead of scanning the full table for every image.
train_ids = {name[0:16] for name in train_imgs}
train_boxes = annotations_bbox[
    annotations_bbox['ImageID'].isin(train_ids)
    & annotations_bbox['LabelName'].isin(label_names)
]
train_boxes_by_image = {image_id: boxes for image_id, boxes in train_boxes.groupby('ImageID')}

# Collect the rows in a list and build the frame once: DataFrame.append was
# removed in pandas 2.0 and made this loop quadratic.
train_rows = []
for img_idx, img_name in enumerate(train_imgs):
    sys.stdout.write('Parse train_imgs ' + str(img_idx) + '; Number of boxes: ' + str(len(train_rows)) + '\r')
    sys.stdout.flush()
    # The first 16 characters of the file name are the image ID.
    img_boxes = train_boxes_by_image.get(img_name[0:16])
    if img_boxes is None:
        # No box of any dataset class for this image.
        continue
    for _, row in img_boxes.iterrows():
        train_rows.append({'FileName': img_name,
                           'XMin': row['XMin'],
                           'XMax': row['XMax'],
                           'YMin': row['YMin'],
                           'YMax': row['YMax'],
                           'ClassName': label_to_class[row['LabelName']]})

train_df = pd.DataFrame(train_rows, columns=box_columns)

Parse train_imgs 1033; Number of boxes: 14059

In [39]:
# Build test_df: one row per bounding box of a dataset class found in the
# images of the test directory.
box_columns = ['FileName', 'XMin', 'XMax', 'YMin', 'YMax', 'ClassName']
# Map each label name to its short class name (both lists are index-aligned).
label_to_class = dict(zip(label_names, classes))

test_imgs = os.listdir(test_path)
test_imgs = [name for name in test_imgs if not name.startswith('.')]

# Filter the annotation table once (test images and dataset classes only)
# and group it by image, instead of scanning the full table for every image.
test_ids = {name[0:16] for name in test_imgs}
test_boxes = annotations_bbox[
    annotations_bbox['ImageID'].isin(test_ids)
    & annotations_bbox['LabelName'].isin(label_names)
]
test_boxes_by_image = {image_id: boxes for image_id, boxes in test_boxes.groupby('ImageID')}

# Collect the rows in a list and build the frame once: DataFrame.append was
# removed in pandas 2.0 and made this loop quadratic.
test_rows = []
for img_idx, img_name in enumerate(test_imgs):
    sys.stdout.write('Parse test_imgs ' + str(img_idx) + '; Number of boxes: ' + str(len(test_rows)) + '\r')
    sys.stdout.flush()
    # The first 16 characters of the file name are the image ID.
    img_boxes = test_boxes_by_image.get(img_name[0:16])
    if img_boxes is None:
        # No box of any dataset class for this image.
        continue
    for _, row in img_boxes.iterrows():
        test_rows.append({'FileName': img_name,
                          'XMin': row['XMin'],
                          'XMax': row['XMax'],
                          'YMin': row['YMin'],
                          'YMax': row['YMax'],
                          'ClassName': label_to_class[row['LabelName']]})

test_df = pd.DataFrame(test_rows, columns=box_columns)

Parse test_imgs 258; Number of boxes: 3441

Save each dataframe to .csv under the dataset's directory

In [40]:
# Persist both splits as <base_path>/train.csv and <base_path>/test.csv.
for split_fname, split_df in (('train.csv', train_df), ('test.csv', test_df)):
    split_csv_path = os.path.join(base_path, split_fname)
    split_df.to_csv(split_csv_path)

### Write train.csv to annotations.txt

Format of annotations.txt: (imgname, x1, y1, x2, y2, label), with the box corners given in pixels

In [41]:
# Convert the normalized boxes of train.csv to pixel coordinates and write
# them to annotations.txt, one line per box: imgname,x1,y1,x2,y2,label
train_df = pd.read_csv(os.path.join(base_path, 'train.csv'))

# (height, width) per file name, so an image with several boxes is decoded
# only once instead of once per box.
img_sizes = {}

# The with-block closes the file even if a row fails half-way through.
with open(os.path.join(base_path, 'annotations.txt'), 'w') as f:
    for idx, row in train_df.iterrows():
        fileName = row['FileName']
        className = row['ClassName']

        if fileName not in img_sizes:
            img_path = os.path.join(base_path, 'train', fileName)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals an unreadable/missing file by returning
                # None rather than raising; fail with a clear message.
                raise IOError('Could not read image: ' + img_path)
            img_sizes[fileName] = img.shape[:2]
        height, width = img_sizes[fileName]

        # Scale the normalized coordinates to pixels (truncating to int).
        x1 = int(row['XMin'] * width)
        x2 = int(row['XMax'] * width)
        y1 = int(row['YMin'] * height)
        y2 = int(row['YMax'] * height)

        f.write(fileName + ',' + str(x1) + ',' + str(y1) + ',' + str(x2) + ',' + str(y2) + ',' + className + '\n')