## Retrieving the Dataset
1. Change runtime type to `Python 3 with GPU`
2. Authorize the notebook to mount your Google Drive
3. Decompress the Dataset onto the local disk (Ephemeral Storage of the provisioned Google Colab VM)

In [None]:
import csv
import os
import shutil
import time
from google.colab import drive
# Mount Google Drive at /content/gdrive so the archive parts and label CSVs
# stored under "My Drive/DashGC" can be read by the cells below.
drive.mount('/content/gdrive')

In [None]:
# Time the extraction so its duration can be reported in minutes.
start = time.time()
# Concatenate the split archive parts on Drive (files matching 20bn-jester-v1-??)
# and stream them into tar, which gunzips (-z) and extracts (-x) the result
# into the current working directory.
!cat /content/gdrive/My\ Drive/DashGC/20bn-jester-v1-?? | tar -zx
stop = time.time()
print('Decompression took', round(((stop - start) / 60), 2), 'mins')

In [None]:
# Sanity-check the extracted archive: show where we are, how many entries the
# dataset root holds, and the smallest/largest number of files per entry.
print(os.getcwd())
dirs = os.listdir('20bn-jester-v1')
print(len(dirs))

# Map each entry of the dataset root to the number of files it contains.
directories = {
    video: len(os.listdir(os.path.join('20bn-jester-v1', video)))
    for video in dirs
}
print(len(directories))
frame_counts = directories.values()
print(min(frame_counts), max(frame_counts))

In [None]:
def copier(filedir, lastfile, srcdir='20bn-jester-v1', target=70):
    '''
    Pad a frame directory by duplicating one of its frames.

    The bytes of `lastfile` are read once and written out under consecutive,
    zero-padded five-digit names (e.g. 00038.jpg, 00039.jpg, ...) that follow
    the number encoded in `lastfile`, until a file numbered `target` exists.
    Nothing is written when that number is already `target` or greater.

    filedir: name of the directory (inside srcdir) that holds the frames
    lastfile: file to duplicate within the filedir directory; its name
              without extension must be an integer, e.g. '00037.jpg'
    srcdir: root directory that contains filedir
    target: number of the last frame that should exist afterwards

    Raises FileNotFoundError if lastfile does not exist and ValueError if
    its name does not encode an integer.
    '''
    folder = os.path.join(srcdir, filedir)
    with open(os.path.join(folder, lastfile), 'rb') as mainfile:
        data = mainfile.read()

    # splitext instead of slicing off four characters, so the frame number
    # is parsed correctly whatever the length of the extension.
    stem, ext = os.path.splitext(lastfile)
    for number in range(int(stem) + 1, target + 1):
        fname = str(number).zfill(5) + ext
        with open(os.path.join(folder, fname), 'wb') as duplicate:
            duplicate.write(data)

In [None]:
# Pad every video directory by duplicating its last frame (see copier), and
# report how long the whole pass took.
# NOTE(review): the last frame's name is derived from the file count, which
# assumes frames are numbered 00001..N without gaps - confirm for this dataset.
tic = time.time()
for video in dirs:  # not named `dir`, which would rebind the builtin globally
    count = len(os.listdir('20bn-jester-v1/' + video))
    if count == 0:
        # No frame to duplicate; copier would fail trying to open 00000.jpg.
        print('Skipping empty directory', video)
        continue
    # zfill keeps the name five digits wide for any count, including >= 100.
    lastfile = str(count).zfill(5) + '.jpg'
    copier(video, lastfile)
toc = time.time()
print('Created duplicates in', round(((toc - tic) / 60), 2), 'mins')

In [None]:
def organise_dataset(csvfile, mode, srcdir='/content/20bn-jester-v1/',
                     destdir='/content/Dataset'):
    '''
    Copy sample folders into a <destdir>/<Train|Validation>/<label>/<folder> tree.

    The CSV is read with ';' as delimiter; the first column of each row is a
    folder name inside srcdir and the second column is its label. Every file
    of every listed folder is copied. Per-label statistics are printed before
    and after copying. Existing destination directories are reused, so the
    function can be re-run safely (files are overwritten).

    csvfile: path to the ';'-separated file of "folder;label" rows
    mode: 'train' / 'training' or 'validation' (case-insensitive); selects
          the 'Train' or 'Validation' subdirectory of destdir
    srcdir: directory that contains the sample folders
    destdir: root directory of the organised dataset

    Raises ValueError if mode is not recognised.
    '''
    # Validate the mode before doing any work; previously an unknown mode
    # surfaced only later as an unbound `path` variable.
    split = str(mode).lower()
    if split in ('train', 'training'):
        subdir = 'Train'
    elif split == 'validation':
        subdir = 'Validation'
    else:
        raise ValueError(
            "mode must be 'train'/'training' or 'validation', got %r" % (mode,))

    # label -> list of sample folder names, in CSV order.
    filetree = {}
    # newline='' is what the csv module expects from the file object.
    with open(csvfile, newline='') as cf:
        for row in csv.reader(cf, delimiter=';'):
            if len(row) < 2:
                # Blank or malformed line: no label to file it under.
                continue
            filetree.setdefault(row[1], []).append(row[0])

    print('Number of labels (subdirectories)', len(filetree))
    print('Number of samples for particular label')
    for label, folders in filetree.items():
        print(label, len(folders))
    if filetree:
        sizes = [len(folders) for folders in filetree.values()]
        print('minimum samples: ', min(sizes), 'maximum samples: ', max(sizes))

    path = os.path.join(destdir, subdir)
    os.makedirs(path, exist_ok=True)

    print('Number of copied samples for particular label')
    for label, folders in filetree.items():
        labeldir = os.path.join(path, label)
        os.makedirs(labeldir, exist_ok=True)
        for folder in folders:
            source = os.path.join(srcdir, folder)
            target = os.path.join(labeldir, folder)
            os.makedirs(target, exist_ok=True)
            for file in os.listdir(source):
                shutil.copy(os.path.join(source, file), target)
        print(label, len(os.listdir(labeldir)))

In [None]:
# Build the Train tree from the training-split CSV on Drive and report the
# elapsed wall-clock time in minutes.
tic = time.time()
organise_dataset(
    csvfile='/content/gdrive/My Drive/DashGC/jester-v1-train.csv',
    mode='Train',
)
toc = time.time()
elapsed_mins = round((toc - tic) / 60, 2)
print('Training Dataset organised in ', elapsed_mins, 'mins')

In [None]:
# Build the Validation tree from the validation-split CSV on Drive and report
# the elapsed wall-clock time in minutes.
tic = time.time()
organise_dataset(
    csvfile='/content/gdrive/My Drive/DashGC/jester-v1-validation.csv',
    mode='Validation',
)
toc = time.time()
elapsed_mins = round((toc - tic) / 60, 2)
print('Validation Dataset organised in ', elapsed_mins, 'mins')