## Retrieving the Dataset
1. Change runtime type to `Python 3 with GPU`
2. Authorize the notebook to mount your Google Drive
3. Decompress the dataset onto the local disk (the ephemeral storage of the provisioned Google Colab VM)

In [None]:
# Make necessary imports
import csv
import os
import shutil
import time

In [None]:
# Mount Google Drive
# Mounting at /content/gdrive makes the Drive contents reachable under
# "/content/gdrive/My Drive/...", which is the prefix the later cells read from and write to.

from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Decompress the compressed "Jester" dataset in Google Colab VM
#
# Every file on Drive matching "20bn-jester-v1-??" (a two-character suffix) is concatenated
# by `cat` in glob order and piped to `tar`, which gunzips (-z) and extracts (-x) the stream.
# No -C option is given, so the archive is unpacked into the current working directory.

start = time.time()
!cat /content/gdrive/My\ Drive/DashGC/Original/20bn-jester-v1-?? | tar -zx
stop = time.time()
# Report the elapsed wall-clock time in minutes, rounded to two decimals.
print('Decompression took', round(((stop - start) / 60), 2), 'mins')

In [None]:
# Show the working directory and how many entries the decompressed dataset folder holds.
dataset_root = '20bn-jester-v1'
print(os.getcwd())
dirs = os.listdir(dataset_root)
print(len(dirs))

# Map every sub-directory (one per video) to the number of entries (frames) it contains,
# then print how many videos were counted and the smallest / largest frame count.
directories = {}
for video in dirs:
    directories[video] = len(os.listdir(dataset_root + '/' + video))
print(len(directories))
frame_counts = directories.values()
print(min(frame_counts), max(frame_counts))

In [None]:
def copier(filedir, lastfile, target=70, srcdir='20bn-jester-v1'):
    '''
    Pad a frame directory by duplicating its last image.

    The bytes of `lastfile` are written to new files numbered from
    (number of lastfile + 1) up to and including `target`. Nothing is written
    when the number of `lastfile` is already >= `target`.

    filedir: name of the sub-directory (inside srcdir) to pad
    lastfile: file to duplicate within the filedir directory; its name must be
              an integer followed by an extension, like "00005.jpg"
    target: number of the last file that should exist afterwards (default 70)
    srcdir: dataset root that contains filedir (default '20bn-jester-v1')

    Raises FileNotFoundError if lastfile does not exist and ValueError if its
    stem is not an integer.
    '''
    folder = os.path.join(srcdir, filedir)

    # Read the source frame once; every duplicate is written from memory.
    with open(os.path.join(folder, lastfile), 'rb') as mainfile:
        data = mainfile.read()

    # Split "00005.jpg" into its numeric stem and its extension instead of
    # assuming the extension is exactly four characters long.
    stem, ext = os.path.splitext(lastfile)
    last = int(stem)

    # New files keep the dataset's 5-digit zero-padded naming ("00006.jpg", ...).
    for number in range(last + 1, target + 1):
        fname = str(number).zfill(5) + ext
        with open(os.path.join(folder, fname), 'wb') as duplicate:
            duplicate.write(data)

In [None]:
# The goal is to have 70 images in every sub-directory.
# If a sub-directory has less than 70 images then copy the last image (70 - n) times, where n is the
# number of images in that sub-directory.
# If the last file was "00005.jpg", then subsequent copies of "00005.jpg" will be "00006.jpg" up to
# "00070.jpg".
# The last file's name is derived from the entry count, so this relies on the frames of a video
# being named "00001.jpg" ... "<count>.jpg" without gaps.

tic = time.time()
for video in dirs:
    count = len(os.listdir('20bn-jester-v1/' + video))
    if count == 0:
        # There is no image to duplicate; report the directory instead of failing on "00000.jpg".
        print('Skipping empty sub-directory', video)
        continue
    # zfill pads to the 5-digit naming for any count, not only for one- or two-digit counts.
    lastfile = str(count).zfill(5) + '.jpg'
    copier(video, lastfile)
toc = time.time()
print('Created duplicates in', round(((toc - tic) / 60), 2), 'mins')

# After executing this verify that all sub-directories have 70 images by executing 4th code cell.

In [None]:
def _read_label_folders(csvfile):
    '''Return {label: [folder, ...]} parsed from a ';'-separated "folder;label" csv file.'''
    filetree = {}
    # newline='' is the mode the csv module documents for files handed to csv.reader.
    with open(csvfile, newline = '') as cf:
        for row in csv.reader(cf, delimiter=';'):
            if not row:
                # csv.reader yields an empty list for a blank line; there is nothing to record.
                continue
            filetree.setdefault(row[1], []).append(row[0])
    return filetree


def organise_dataset(csvfile, mode, srcdir = './20bn-jester-v1/'):
    '''
    Organize dataset for Training and/or Validation.

    Reads `csvfile` (one "folder;label" row per sample), prints per-label
    statistics and copies every sample folder from `srcdir` to
    ./Dataset/<Train|Validation>/<label>/<folder>.

    csvfile: path to the ';'-separated csv file mapping folders to labels
    mode: 'train' / 'training' or 'validation' (case-insensitive)
    srcdir: directory holding the sample folders; a trailing slash is optional

    Existing destination directories are reused, so the function can be re-run;
    files that were already copied are overwritten by shutil.copy2.

    Raises ValueError if `mode` is not recognised (before anything is read or created).
    '''
    # Validate the mode first so that a typo fails immediately instead of after the CSV scan.
    mode_key = str(mode).lower()
    if mode_key in ('train', 'training'):
        path = './Dataset/Train'
    elif mode_key == 'validation':
        path = './Dataset/Validation'
    else:
        raise ValueError(f"mode must be 'train', 'training' or 'validation', got {mode!r}")

    # Read the csv file and find folders associated with specific gestures.
    # Display the number of labels (gestures) and number of folders for each gesture.
    filetree = _read_label_folders(csvfile)
    print('Number of labels (subdirectories)', len(filetree))

    # Display the minimum and maximum sample size
    video_counts = []
    frame_counts = []
    print('Number of samples for particular label')
    for label, folders in filetree.items():
        frames = sum(len(os.listdir(os.path.join(srcdir, folder))) for folder in folders)
        print(f'{label}: Videos {len(folders)} Frames {frames}')
        video_counts.append(len(folders))
        frame_counts.append(frames)
    # default=0 keeps an empty csv file from raising in min()/max().
    print(f'minimum samples: {min(video_counts, default=0)}, maximum samples: {max(video_counts, default=0)}, total frames: {sum(frame_counts)}')

    # Create appropriate path (also creates ./Dataset when it is missing)
    os.makedirs(path, exist_ok=True)

    # Copy the folders containing samples under their respective labels
    print('Number of copied samples for particular label')

    # Iterate over every label
    for label, folders in filetree.items():
        label_dir = os.path.join(path, label)
        os.makedirs(label_dir, exist_ok=True)

        # Create a destination folder for every folder in that label
        for folder in folders:
            source = os.path.join(srcdir, folder)
            destination = os.path.join(label_dir, folder)
            os.makedirs(destination, exist_ok=True)

            # Copy the sample data into that folder
            for f in os.listdir(source):
                shutil.copy2(os.path.join(source, f), destination)

        # Show the number of samples for each label
        copied = os.listdir(label_dir)
        print(label, len(copied), sum(len(os.listdir(os.path.join(label_dir, i))) for i in copied))

In [None]:
# Build ./Dataset/Train from the training csv and report how long it took (in minutes).

train_csv = '/content/gdrive/My Drive/DashGC/Original/jester-v1-train.csv'
tic = time.time()
organise_dataset(train_csv, 'Train')
toc = time.time()
elapsed_mins = round((toc - tic) / 60, 2)
print('Training Dataset organised in ', elapsed_mins, 'mins')

In [None]:
# Build ./Dataset/Validation from the validation csv and report how long it took (in minutes).

validation_csv = '/content/gdrive/My Drive/DashGC/Original/jester-v1-validation.csv'
tic = time.time()
organise_dataset(validation_csv, 'Validation')
toc = time.time()
elapsed_mins = round((toc - tic) / 60, 2)
print('Validation Dataset organised in ', elapsed_mins, 'mins')

In [None]:
# Compress the reorganized Dataset
#
# `tar czf -` writes a gzip-compressed archive of ./Dataset/ to stdout. `split` reads that stream
# from stdin ("-") and cuts it into pieces of at most 2048MB each, written to the current
# directory with the name prefix "Dataset.tar.gz." followed by split's generated suffix.

start = time.time()
!tar czf - ./Dataset/ | split --bytes=2048MB - Dataset.tar.gz.
stop = time.time()
# Report the elapsed wall-clock time in minutes, rounded to two decimals.
print('Compression took', round(((stop - start) / 60), 2), 'mins')

In [None]:
# Copy the Compressed volumes of Dataset to Google drive
#
# Every file in the current directory whose name starts with "Dataset.tar.gz." is copied into
# the DashGC/Reorganized folder of the mounted Drive.
# NOTE(review): assumes that the "Reorganized" folder already exists on Drive - confirm before running.

start = time.time()
!cp ./Dataset.tar.gz.* "/content/gdrive/My Drive/DashGC/Reorganized/"
stop = time.time()
# Report the elapsed wall-clock time in minutes, rounded to two decimals.
print('Copying took', round(((stop - start) / 60), 2), 'mins')