In [3]:
import json 
with open("data/annotations.json", "r") as f:
    annotations = json.load(f)

In [4]:
annotations.keys()

dict_keys(['info', 'images', 'annotations', 'scene_annotations', 'licenses', 'categories', 'scene_categories'])

In [5]:
images = annotations["images"]

In [6]:
from PIL import Image
import numpy as np 

In [7]:
path = images[0]["file_name"]

# open image from path
img = Image.open("data/" + path)
img_array = np.array(img)

In [8]:
img.show()

In [9]:
import os

In [10]:
batchFolderNames = []
for filename in os.listdir("data"):
    if not filename.startswith('.') and os.path.splitext(filename)[1] != '.csv' and os.path.splitext(filename)[1] != '.json':
        batchFolderNames.append(filename)
print(batchFolderNames)

['batch_14', 'batch_13', 'batch_1', 'batch_6', 'batch_8', 'batch_12', 'batch_15', 'batch_9', 'batch_7', 'batch_5', 'batch_2', 'batch_10', 'batch_3', 'batch_4', 'batch_11']


In [11]:
import shutil

In [12]:
def splitData(batchFolderName, trainTestValCumulativeSplit):
    os.makedirs("data/" + batchFolderName + "/train")
    os.makedirs("data/" + batchFolderName + "/test")
    os.makedirs("data/" + batchFolderName + "/val")
    fileNames = []
    for image in images:
        if image['file_name'].split('/')[0] == batchFolderName:
            fileNames.append(image['file_name'])
    fileNames.sort()
    print(fileNames)
    for i, file in enumerate(fileNames):
        file = list(file.split("/"))
        if (i / len(fileNames)) < trainTestValCumulativeSplit[0]:
            shutil.move("data/" + file[0] + '/' + file[1], "data/" + file[0] + '/train/' + file[1])
        if trainTestValCumulativeSplit[0] < (i / len(fileNames)) < trainTestValCumulativeSplit[1]:
            shutil.move("data/" + file[0] + '/' + file[1], "data/" + file[0] + '/test/' + file[1])
        if trainTestValCumulativeSplit[1] < (i / len(fileNames)):
            shutil.move("data/" + file[0] + '/' + file[1], "data/" + file[0] + '/val/' + file[1])

In [13]:
trainTestValCumulativeSplit = [0.8, 0.9, 1.0]
for batch in batchFolderNames:
    splitData(batch, trainTestValCumulativeSplit)

['batch_14/000000.jpg', 'batch_14/000001.jpg', 'batch_14/000002.jpg', 'batch_14/000003.jpg', 'batch_14/000004.jpg', 'batch_14/000005.jpg', 'batch_14/000006.jpg', 'batch_14/000007.jpg', 'batch_14/000008.jpg', 'batch_14/000009.jpg', 'batch_14/000010.jpg', 'batch_14/000011.jpg', 'batch_14/000012.jpg', 'batch_14/000013.jpg', 'batch_14/000014.jpg', 'batch_14/000015.jpg', 'batch_14/000016.jpg', 'batch_14/000017.jpg', 'batch_14/000018.jpg', 'batch_14/000019.jpg', 'batch_14/000020.jpg', 'batch_14/000021.jpg', 'batch_14/000022.jpg', 'batch_14/000023.jpg', 'batch_14/000024.jpg', 'batch_14/000025.jpg', 'batch_14/000026.jpg', 'batch_14/000027.jpg', 'batch_14/000028.jpg', 'batch_14/000029.jpg', 'batch_14/000030.jpg', 'batch_14/000031.jpg', 'batch_14/000032.jpg', 'batch_14/000033.jpg', 'batch_14/000034.jpg', 'batch_14/000035.jpg', 'batch_14/000036.jpg', 'batch_14/000037.jpg', 'batch_14/000038.jpg', 'batch_14/000039.jpg', 'batch_14/000040.jpg', 'batch_14/000041.jpg', 'batch_14/000042.jpg', 'batch_14/

# Error Explanation

The function is splitting the data into 3 sets. These threshold is computed from the input list and eventually the 3 if statements. What's going on is that the code is doing something like:

$${\rm if} \  x < a$$
$${\rm if} \  a < x < b$$
$${\rm if} \  x > b$$

However it misses the cases where $x=a$ and $x=b$. You can fix this by changing the if statements to:

$${\rm if} \  x \leq a$$
$${\rm if} \  a < x \leq b$$
$${\rm if} \  x > b$$

or even

$${\rm if} \  x < a$$
$${\rm if} \  a \leq x < b$$
$${\rm if} \  x \geq b$$

However, we can just clean up by moving all the extra data into the train folders.

In [14]:
#move all leftover files to train
for folder in batchFolderNames:
    for file in os.listdir("data/" + folder):
        if file[-3:] == 'jpg':
            shutil.move("data/" + folder + '/' + file, "data/" + folder + '/train/' + file)