In [32]:
# Mount Google Drive when running on Google Colab: run this block first and
# follow the instructions to authorize mounting.
# Outside Colab the `google.colab` import is expected to fail, so the mount is
# skipped with a message instead of aborting the run. In that case GDRIVE (set
# in the configuration cell below) still has to be pointed at a local folder.
try:
    from google.colab import drive
except ImportError:
    print('google.colab is not available; skipping the Google Drive mount.')
else:
    drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [1]:
import zipfile as zf
import os
import random
import shutil
import urllib.request

In [2]:
# Root folder under which the output data/ directory is created.
# The default is the 'My Drive' folder below the /content/gdrive mount point;
# if not running on Google Colab/Drive, change this to the current folder, ".".
GDRIVE = os.path.join(
    os.path.join('/', 'content', 'gdrive'),
    'My Drive',
)

In [4]:
# Seed Python's global random generator up front so the sampling done by
# split_indices is repeatable from run to run.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)

# Split proportions, expressed as fractions between 0 and 1.
TRAIN_SPLIT = 0.6     # fraction of the data reserved for training
VAL_TEST_SPLIT = 0.5  # fraction of the non-training remainder given to validation; the rest becomes test

# Where the raw dataset comes from and where it lands once unzipped.
DATASET_ZIP_FILE = 'dataset-resized.zip'
DATASET_URL = f'https://github.com/garythung/trashnet/raw/master/data/{DATASET_ZIP_FILE}'
INPUT_DATA_DIR = DATASET_ZIP_FILE.partition('.')[0]  # zip name up to its first '.', i.e. the folder holding the data downloaded from GitHub

# Output layout: <GDRIVE>/data/{train,valid,test}/
OUTPUT_DATA_DIR = os.path.join(GDRIVE, 'data')
TRAIN_DATA_DIR, VAL_DATA_DIR, TEST_DATA_DIR = (
    os.path.join(OUTPUT_DATA_DIR, subset) for subset in ('train', 'valid', 'test')
)

# One sub-folder per category, both in the source data and in each output subset.
WASTE_TYPES = ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']

In [15]:
# Running image counts per subset; the file-moving cell adds to these.
total_train, total_val, total_test = 0, 0, 0

In [6]:
# Rebuild-only step: run this block and the four after it only when the data/
# directory has to be regenerated.
# Downloads the dataset archive from DATASET_URL and stores it under the
# relative name DATASET_ZIP_FILE, i.e. in the current working directory.
urllib.request.urlretrieve(url=DATASET_URL, filename=DATASET_ZIP_FILE)

('dataset-resized.zip', <http.client.HTTPMessage at 0x1194dc610>)

In [16]:
# Unpack the downloaded archive into the current working directory.
# The context manager closes the archive even when extraction raises, so the
# file handle is never left open after a failed or interrupted unzip.
with zf.ZipFile(DATASET_ZIP_FILE, 'r') as files:
    files.extractall()

In [17]:
# Adapted from: https://github.com/collindching/Waste-Sorter
## helper functions ##

## splits indices for a folder into train, validation, and test indices with random sampling
    ## input: folder path
    ## output: train, valid, and test indices    
def split_indices(folder):
    """Randomly partition a folder's 1-based file indices into train/valid/test.

    Only the number of entries in `folder` is used: with n entries the indices
    1..n are split. Sampling uses the global `random` generator, so seed it
    beforehand for a repeatable split.

    Args:
        folder: path of the directory whose entries are counted.

    Returns:
        A (train, valid, test) tuple of disjoint lists that together cover 1..n:
        train -- int(TRAIN_SPLIT * n) indices, in sampled order;
        valid -- int(VAL_TEST_SPLIT * remaining) indices, in sampled order;
        test  -- every index left over, in ascending order.
    """
    n = len(os.listdir(folder))
    full_set = list(range(1, n + 1))

    ## train indices
    train = random.sample(full_set, int(TRAIN_SPLIT * n))

    ## remaining indices, kept in ascending order: the list handed to
    ## random.sample must not depend on set iteration order (an interpreter
    ## detail), otherwise the same seed could yield different valid/test splits
    in_train = set(train)
    remain = [index for index in full_set if index not in in_train]

    ## separate remaining into validation and test
    valid = random.sample(remain, int(VAL_TEST_SPLIT * len(remain)))
    in_valid = set(valid)
    test = [index for index in remain if index not in in_valid]

    return train, valid, test

## gets file names for a particular type of trash, given indices
    ## input: waste category and indices
    ## output: file names 
def get_names(waste_type, indices):
    """Build the image file names of one waste category for the given indices.

    Args:
        waste_type: category name, used as the file-name prefix.
        indices: iterable of index values appended to the prefix.

    Returns:
        List of '<waste_type><index>.jpg' strings, in the order of `indices`.
    """
    names = []
    for index in indices:
        names.append(f'{waste_type}{index}.jpg')
    return names

## moves group of source files to another folder
    ## input: list of source files and destination folder
    ## no output
def move_files(source_files, destination_folder):
    """Move every file in `source_files` into `destination_folder`.

    The destination directory (including missing parents) is created first.
    Without that, shutil.move treats a non-existent destination as the target
    *file name*: the first source would be renamed to it and later sources
    could overwrite it, losing files.

    Args:
        source_files: iterable of paths of the files to move.
        destination_folder: directory that receives the files.

    Returns:
        None.
    """
    os.makedirs(destination_folder, exist_ok=True)
    for source_path in source_files:
        shutil.move(source_path, destination_folder)

In [None]:
# Start from a clean slate: delete any previously generated data/ directory.
# The path comes from OUTPUT_DATA_DIR, so it follows GDRIVE and needs no
# separate edit when not running on Google Colab/Drive.
# ignore_errors=True keeps the `rm -rf` behaviour of not failing when the
# directory does not exist.
shutil.rmtree(OUTPUT_DATA_DIR, ignore_errors=True)

In [18]:
# Adapted from: https://github.com/collindching/Waste-Sorter
## resulting layout is <subset>/<waste type>: train/cardboard, train/glass, etc...

## make sure every <subset>/<waste type> destination folder exists
for subset_dir in (TRAIN_DATA_DIR, VAL_DATA_DIR, TEST_DATA_DIR):
    for waste_type in WASTE_TYPES:
        folder = os.path.join(subset_dir, waste_type)
        if os.path.exists(folder):
            continue
        os.makedirs(folder)


## moves the files of one waste type selected by `indices` into
## <subset_dir>/<waste_type>
    ## input: waste category, its source folder, indices, subset root folder
    ## output: number of files handed to move_files
def _move_subset(waste_type, source_folder, indices, subset_dir):
    names = get_names(waste_type, indices)
    source_files = [os.path.join(source_folder, name) for name in names]
    move_files(source_files, os.path.join(subset_dir, waste_type))
    return len(names)


## split each waste type's files and move them: train first, then valid, then test
for waste_type in WASTE_TYPES:
    source_folder = os.path.join(INPUT_DATA_DIR, waste_type)
    train_ind, valid_ind, test_ind = split_indices(source_folder)

    total_train += _move_subset(waste_type, source_folder, train_ind, TRAIN_DATA_DIR)
    total_val += _move_subset(waste_type, source_folder, valid_ind, VAL_DATA_DIR)
    total_test += _move_subset(waste_type, source_folder, test_ind, TEST_DATA_DIR)

## everything has been moved out, so drop the extracted source tree
shutil.rmtree(INPUT_DATA_DIR)

In [19]:
# Report how many images ended up in the train, validation and test subsets.
print(f'{total_train} {total_val} {total_test}')

1514 505 508
