# 02_split_uecfood100

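- Split the UECFOOD100 dataset into training, validation and testing sets. The intended ratio is 0.7 / 0.2 / 0.1, but the code below currently uses 1.0 / 0 / 0, so every image goes to the training list.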
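- Save one `img_path category_id x1 y1 x2 y2` row per bounding box into `val_uec100.txt` and `test_uec100.txt`; the training list is written to `trainval.txt` as image ids only (one per line), and the category names go to `classes.txt`. With the split currently set in the code (1.0 / 0 / 0), only `classes.txt` and `trainval.txt` are produced.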

In [1]:
import random
import itertools
import numpy as np

In [2]:
def split_dataset(dataset_dir='/home/weiyao/data/UECFOOD100/', split=(1., 0., 0.), seed=None):
    """Split the UECFOOD100 images into train/val/test lists and write them to disk.

    Expected input layout under ``dataset_dir``:

    * ``category.txt`` -- one header line, then ``<id>\\t<name>`` per line.
    * ``<id>/new_bb_info.txt`` for every category id -- one header line, then
      whitespace-separated ``<image> <x1> <y1> <x2> <y2>`` per bounding box.

    Files written into ``dataset_dir``:

    * ``classes.txt`` -- the category names, one per line.
    * ``trainval.txt`` -- the image id of every training image, one line per
      bounding-box row of that image (an image with several boxes or several
      categories is therefore listed several times).
    * ``val_uec100.txt`` / ``test_uec100.txt`` -- a header line followed by
      ``<img_path> <category_id> <x1> <y1> <x2> <y2>`` rows. Each file is only
      written when at least one image was assigned to that set, so the default
      split produces neither of them.

    The split is done per category. An image that ends up in more than one set
    (it can belong to several categories) is kept in only one of them, with
    priority val > test > train.

    Args:
        dataset_dir: Root folder of the dataset.
        split: ``(train, val, test)`` ratios. Only the first two are read; the
            test set receives whatever is left of each category.
        seed: Seed for the per-category shuffle. ``None`` uses the global
            ``random`` module state.

    Raises:
        ValueError: If ``split`` does not have three entries, has a negative
            train/val ratio, or train + val exceeds 1; or if a bounding-box
            row that has to be written has fewer than four coordinates.
    """
    import os  # function-scope import: `os` is not among the file's imports

    category_file = 'category.txt'
    bbox_file = 'new_bb_info.txt'
    files_generated = ['trainval.txt', 'val_uec100.txt', 'test_uec100.txt']
    imgs_format = 'jpg'
    row_header = 'img category_id x1 y1 x2 y2\n'

    if len(split) != 3:
        raise ValueError('split must have three entries (train, val, test)')
    train_ratio = float(split[0])
    val_ratio = float(split[1])
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1.0 + 1e-9:
        raise ValueError('train and val ratios must be >= 0 and sum to at most 1')

    # A private generator keeps a seeded run reproducible without touching
    # the global random state.
    rng = random if seed is None else random.Random(seed)

    # Read the category ids (first column) and names (second column).
    category_ids = []
    category_names = []
    with open(os.path.join(dataset_dir, category_file), 'r') as category_list:
        next(category_list, None)  # skip the header line
        for line in category_list:
            line = line.rstrip('\n')
            if not line.strip():
                continue  # tolerate blank lines
            fields = line.split('\t')
            category_ids.append(int(fields[0]))
            category_names.append(fields[1])

    # Read the bounding boxes of every category. `occurrences` maps an image
    # name to its (category_id, bbox) rows in category order, then file order,
    # so writing a set never has to rescan all categories.
    category_images = []
    occurrences = {}
    for category_id in category_ids:
        images = []
        bbox_path = os.path.join(dataset_dir, str(category_id), bbox_file)
        with open(bbox_path, 'r') as bbox_list:
            next(bbox_list, None)  # skip the header line
            for line in bbox_list:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                image = fields[0]
                bbox = [float(value) for value in fields[1:]]
                images.append(image)
                occurrences.setdefault(image, []).append((category_id, bbox))
        category_images.append(images)

    # Shuffle each category independently and cut it into train/val/test.
    train_images = set()
    val_images = set()
    test_images = set()
    for images in category_images:
        n_imgs = len(images)
        n_train = int(n_imgs * train_ratio)
        n_val = min(int(n_imgs * val_ratio), n_imgs - n_train)
        shuffled_imgs = rng.sample(images, n_imgs)
        train_images.update(shuffled_imgs[:n_train])
        val_images.update(shuffled_imgs[n_train:n_train + n_val])
        test_images.update(shuffled_imgs[n_train + n_val:])  # the remainder

    # An image may have landed in several sets: val wins over test, and both
    # win over train.
    test_images -= val_images
    train_images -= val_images | test_images

    with open(os.path.join(dataset_dir, 'classes.txt'), 'w') as classes_file:
        classes_file.writelines(name + '\n' for name in category_names)

    #### Training set: image ids only, one line per bounding-box row
    with open(os.path.join(dataset_dir, files_generated[0]), 'w') as train_file:
        for image in sorted(train_images):
            image_id = image.split('/')[-1].split('.')[0]
            for _ in occurrences[image]:
                train_file.write(image_id + '\n')

    #### Val and testing sets: full annotation rows, each from its own list
    for file_name, images in ((files_generated[1], val_images),
                              (files_generated[2], test_images)):
        if not images:
            continue
        with open(os.path.join(dataset_dir, file_name), 'w') as rows_file:
            rows_file.write(row_header)
            for image in sorted(images):
                for category_id, bbox in occurrences[image]:
                    if len(bbox) < 4:
                        raise ValueError(
                            'image %r in category %s has fewer than 4 bbox values'
                            % (image, category_id))
                    img_path = os.path.join(dataset_dir, str(category_id),
                                            image + '.' + imgs_format)
                    row = [img_path, str(category_id)] + [str(value) for value in bbox[:4]]
                    rows_file.write(' '.join(row) + '\n')

    print('Done!')

In [3]:
# Run the split; the result files are written inside the dataset folder.
split_dataset()

- **A generated annotation file (full-row format) should look like this:**   
/Volumes/JS/UECFOOD100_JS/1/10.jpg 1 81.0 20.0 546.0 421.0  
/Volumes/JS/UECFOOD100_JS/2/100.jpg 2 58.0 0.0 748.0 582.0  
/Volumes/JS/UECFOOD100_JS/11/1000.jpg 11 28.0 17.0 611.0 594.0  
/Volumes/JS/UECFOOD100_JS/11/1001.jpg 11 54.0 38.0 667.0 573.0  
/Volumes/JS/UECFOOD100_JS/11/1003.jpg 11 0.0 0.0 800.0 600.0    
...