# Preparing the Datasets for Image Classification using the Apache MXNet Vision Datasets Functions

<img align="left" width="130" src="https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Extra/cover-small-padded.png"/>

This notebook contains the code to help readers work through one of the recipes of the book [Machine Learning with Amazon SageMaker Cookbook: 80 proven recipes for data scientists and developers to perform ML experiments and deployments](https://www.amazon.com/Machine-Learning-Amazon-SageMaker-Cookbook/dp/1800567030).

### How to do it...

In [None]:
%%bash

# Create one sub-directory per digit class (0-9) under both the training and
# the validation image roots. -p also creates missing parents and does not
# fail when a directory already exists.
mkdir -p tmp/{train,validation}/{0..9}

In [None]:
%%bash

# Folders for the train/validation .lst files and for the test images
# (which are stored flat, without per-class sub-folders).
mkdir -p tmp/train_lst tmp/validation_lst tmp/test

In [None]:
%%bash

# List the entries of tmp/train, one per line; -F appends "/" to directories.
ls -1F tmp/train

In [None]:
%%bash

# List the entries of tmp/validation, one per line; -F appends "/" to directories.
ls -1F tmp/validation

In [None]:
import mxnet as mx
# Seed MXNet's random number generator.
# NOTE(review): this does not seed Python's built-in `random` module, which
# generate_random_string() uses further down, so the generated file names
# still differ from run to run.
mx.random.seed(21)

In [None]:
def transform_fxn(data, label):
    """Convert a sample to float32 and divide every value by 255.

    Parameters
    ----------
    data : array-like exposing ``astype``
        Raw sample values (presumably 8-bit pixel intensities, which this
        scaling maps onto the [0, 1] range).
    label : any
        Passed through untouched.

    Returns
    -------
    tuple
        ``(scaled_data, label)``.
    """
    scaled = data.astype('float32') / 255
    return scaled, label

# MNIST split requested with train=True; the cells below carve both the
# training and the validation subsets out of it.
training_and_validation_dataset = mx.gluon.data.vision.datasets.MNIST(
    train=True,
    transform=transform_fxn,
)

# MNIST split requested with train=False; used only for the test images.
test_dataset = mx.gluon.data.vision.datasets.MNIST(
    train=False,
    transform=transform_fxn,
)

In [None]:
# Report the number of samples in each split, one value per line.
print(
    len(training_and_validation_dataset),
    len(test_dataset),
    sep="\n",
)

In [None]:
def get_training_row_indexes(row_count,
                             percent=0.5,
                             ratio=0.8):
    """Return the dataset row indexes to use for training.

    The first ``ratio`` share of the rows is the training range; the first
    ``percent`` of that range is selected, starting at index 0.

    Parameters
    ----------
    row_count : int
        Total number of rows in the dataset.
    percent : float, default 0.5
        Fraction of the training range to select.
    ratio : float, default 0.8
        Fraction of the dataset that makes up the training range.

    Returns
    -------
    list of int
        Consecutive indexes ``0 .. int(row_count * ratio * percent) - 1``.
        Empty when that product truncates to zero.

    A short summary of the selection is printed as a side effect.
    """
    training_index_start = 0
    training_index_end = int(row_count * ratio * percent)

    print("Range Index Start:", training_index_start)
    print("Range Index End:", training_index_end)

    output = list(range(training_index_start, training_index_end))

    print("Output Length:", len(output))
    # An empty selection has no last element; report None instead of raising
    # IndexError so small datasets / tiny percentages do not crash the cell.
    print("Last Index:", output[-1] if output else None)

    return output

In [None]:
def get_validation_row_indexes(row_count,
                               percent=0.5,
                               ratio=0.8):
    """Return the dataset row indexes to use for validation.

    The rows from ``int(row_count * ratio)`` up to (but excluding)
    ``row_count`` form the validation pool; the first ``percent`` of that
    pool is selected.

    Parameters
    ----------
    row_count : int
        Total number of rows in the dataset.
    percent : float, default 0.5
        Fraction of the validation pool to select.
    ratio : float, default 0.8
        Fraction of the dataset reserved for training; the validation pool
        starts right after it.

    Returns
    -------
    list of int
        Consecutive indexes starting at ``int(row_count * ratio)``. Every
        index is strictly below ``row_count``; the list is empty when the
        pool or the requested share is empty.

    A short summary of the selection is printed as a side effect.
    """
    validation_index_start = int(row_count * ratio)

    # Size the pool with integer arithmetic. Evaluating (1 - ratio) * row_count
    # in floating point can land just below the intended whole number
    # (e.g. 11999.999... instead of 12000); truncating and adding 1 hides that
    # but overshoots by one whenever the product happens to be exact, which
    # can yield an index equal to row_count.
    pool_size = max(row_count - validation_index_start, 0)
    # Round to the nearest whole row and never select more than the pool holds.
    element_count = max(0, min(pool_size, round(pool_size * percent)))
    validation_index_end = validation_index_start + element_count

    print("Range Index Start:", validation_index_start)
    print("Element Count:", element_count)
    print("Range Index End:", validation_index_end)

    output = list(range(validation_index_start, validation_index_end))

    print("Output Length:", len(output))
    # An empty selection has no last element; report None instead of raising.
    print("Last Index:", output[-1] if output else None)

    return output

In [None]:
def get_test_row_indexes(row_count,
                         percent=0.5):
    """Return the dataset row indexes to use for testing.

    Parameters
    ----------
    row_count : int
        Total number of rows in the test dataset.
    percent : float, default 0.5
        Fraction of the rows to select, counted from index 0.

    Returns
    -------
    list of int
        Consecutive indexes ``0 .. int(row_count * percent) - 1``. Empty when
        that product truncates to zero.

    A short summary of the selection is printed as a side effect.
    """
    test_index_start = 0
    test_index_end = int(row_count * percent)

    print("Range Index Start:", test_index_start)
    print("Range Index End:", test_index_end)

    output = list(range(test_index_start, test_index_end))

    print("Output Length:", len(output))
    # An empty selection has no last element; report None instead of raising
    # IndexError so small datasets / tiny percentages do not crash the cell.
    print("Last Index:", output[-1] if output else None)

    return output

In [None]:
# Preview the training selection for a 60000-row dataset at percent=0.5.
get_training_row_indexes(row_count=60000, percent=0.5)

In [None]:
# Preview the validation selection for a 60000-row dataset at percent=0.5.
get_validation_row_indexes(row_count=60000, percent=0.5)

In [None]:
# Preview the test selection for a 10000-row dataset at percent=0.1.
get_test_row_indexes(row_count=10000, percent=0.1)

In [None]:
import string 
import random

def generate_random_string():
    """Return a 12-character string of distinct uppercase ASCII letters.

    The letters are drawn with ``random.sample``, i.e. without replacement,
    so no letter appears twice in the result.
    """
    letters = random.sample(string.ascii_uppercase, 12)
    return ''.join(letters)


In [None]:
# Quick check: display one generated name.
generate_random_string()

In [None]:
import matplotlib
import matplotlib.pyplot

def save_image(image_data, filename):
    """Write the first channel of ``image_data`` to ``tmp/<filename>``.

    Parameters
    ----------
    image_data : array supporting ``[:, :, 0]`` indexing and ``.asnumpy()``
        Presumably an MXNet NDArray laid out as (height, width, channel);
        confirm against the dataset in use.
    filename : str
        Path of the output file, relative to the ``tmp`` folder.
    """
    target_path = f"tmp/{filename}"
    first_channel = image_data[:, :, 0].asnumpy()
    matplotlib.pyplot.imsave(target_path, first_channel)

In [None]:
def generate_image_files_and_lst_dict(
    dataset,
    indexes,
    tag
):
    """Save the selected samples as PNG files and describe each one.

    Parameters
    ----------
    dataset : indexable
        ``dataset[i]`` must yield a sequence whose first item is the image
        data and whose second item is the label.
    indexes : iterable of int
        Positions in ``dataset`` to export.
    tag : str
        Sub-folder of ``tmp`` that receives the files. With ``"test"`` the
        files are stored directly in that folder; with any other value they
        are stored in a per-label sub-folder (``<tag>/<label>/``).

    Returns
    -------
    list of dict
        One ``{'relative_path': ..., 'class': ...}`` record per exported
        sample, where ``relative_path`` is relative to ``tmp/<tag>``.
    """
    records = []

    for row_index in indexes:
        sample = dataset[row_index]
        image_data, label = sample[0], sample[1]
        png_name = f"{generate_random_string()}.png"

        # Test images live flat in the tag folder; every other tag groups
        # its images by label.
        relative_path = png_name if tag == "test" else f"{label}/{png_name}"

        save_image(image_data, filename=f"{tag}/{relative_path}")

        records.append({
            'relative_path': relative_path,
            'class': label
        })

    return records

In [None]:
# Export the training images: percent=0.01 of the training range is written
# to tmp/train/<label>/ and one record per image is kept for the .lst file.
train_dataset_length = len(training_and_validation_dataset)
train_indexes = get_training_row_indexes(
    row_count=train_dataset_length,
    percent=0.01,
)

train_lst_dict = generate_image_files_and_lst_dict(
    dataset=training_and_validation_dataset,
    indexes=train_indexes,
    tag="train",
)

In [None]:
# Display the records generated for the training images.
train_lst_dict

In [None]:
# Export the validation images: percent=0.01 of the validation range is
# written to tmp/validation/<label>/ and one record per image is kept for
# the .lst file. Both ranges come from the same train=True split.
train_dataset_length = len(training_and_validation_dataset)
validation_indexes = get_validation_row_indexes(
    row_count=train_dataset_length,
    percent=0.01,
)

validation_lst_dict = generate_image_files_and_lst_dict(
    dataset=training_and_validation_dataset,
    indexes=validation_indexes,
    tag="validation",
)

In [None]:
# Display the records generated for the validation images.
validation_lst_dict

In [None]:
# Export the test images: percent=0.01 of the test split is written flat
# into tmp/test/ (no per-label sub-folders).
test_dataset_length = len(test_dataset)
test_indexes = get_test_row_indexes(
    row_count=test_dataset_length,
    percent=0.01,
)

test_lst_dict = generate_image_files_and_lst_dict(
    dataset=test_dataset,
    indexes=test_indexes,
    tag="test",
)

In [None]:
# Display the records generated for the test images.
test_lst_dict

In [None]:
def save_lsts_to_file(values, filename):
    """Write image records to ``filename``, one tab-separated line each.

    Every line holds three fields in this order: a running index that starts
    at 1, the record's ``'class'`` value and its ``'relative_path'`` value.

    Parameters
    ----------
    values : iterable of dict
        Records with the keys ``'relative_path'`` and ``'class'``.
    filename : str
        Path of the file to create; an existing file is overwritten.
    """
    with open(filename, 'w') as lst_file:
        lst_file.writelines(
            f"{position}\t{record['class']}\t{record['relative_path']}\n"
            for position, record in enumerate(values, start=1)
        )

In [None]:
# Write the .lst files for the training and validation records.
save_lsts_to_file(train_lst_dict, filename="tmp/train_lst/train.lst")
save_lsts_to_file(validation_lst_dict, filename="tmp/validation_lst/validation.lst")

In [None]:
%%bash

# Show the first lines of the generated training .lst file.
head tmp/train_lst/train.lst

In [None]:
# Destination of the generated files in Amazon S3.
# NOTE(review): the bucket name is hardcoded; replace it with a bucket you
# own and can write to before running this cell.
s3_bucket = "sagemaker-cookbook-bucket"
prefix = "image-experiments"
# Recursively copy the contents of tmp/ to s3://<s3_bucket>/<prefix>/ with the
# AWS CLI; {s3_bucket} and {prefix} are interpolated from the Python variables.
!aws s3 cp tmp/.  s3://{s3_bucket}/{prefix}/ --recursive

In [None]:
# Persist the bucket name and prefix with IPython's %store magic so that
# another notebook can restore them with `%store -r`.
%store s3_bucket
%store prefix