# Preprocess the ubyte files into images

In [22]:
import gzip
import csv
import numpy as np
import cv2 as cv
import os

In [23]:
# Every dataset location is resolved relative to the directory the notebook
# is run from.
LAB_PATH = os.path.abspath(os.curdir)
DATASET_PATH = os.path.join(LAB_PATH, "datasets", "byclass", "data")

# Gzip-compressed idx files of the EMNIST "byclass" split: images and labels
# for the training set, then for the test set.
train_data_filename = os.path.join(
    DATASET_PATH, "emnist-byclass-train-images-idx3-ubyte.gz"
)
train_labels_filename = os.path.join(
    DATASET_PATH, "emnist-byclass-train-labels-idx1-ubyte.gz"
)
test_data_filename = os.path.join(
    DATASET_PATH, "emnist-byclass-test-images-idx3-ubyte.gz"
)
test_labels_filename = os.path.join(
    DATASET_PATH, "emnist-byclass-test-labels-idx1-ubyte.gz"
)


In [24]:
# Output folder for the decoded training images. An already-existing folder is
# reused; any other failure (e.g. missing permissions) is allowed to surface
# instead of being silently swallowed.
os.makedirs("train", exist_ok=True)

In [25]:
# Output folder for the decoded test images. An already-existing folder is
# reused; any other failure (e.g. missing permissions) is allowed to surface
# instead of being silently swallowed.
os.makedirs("test", exist_ok=True)

In [26]:
def extract_data(filename, num_images):
    """Read images from a gzipped idx3-ubyte file into a 4D array.

    The first 16 bytes of the stream are skipped as header, and the following
    bytes are interpreted as consecutive 28x28 images with one unsigned byte
    per pixel.

    Args:
        filename: Path of the gzip-compressed image file.
        num_images: Number of images to read from the start of the file.

    Returns:
        A float32 array of shape ``(num_images, 28, 28, 1)`` laid out as
        [image index, y, x, channels]. Pixel values are the raw 0-255 bytes;
        no rescaling is applied.

    Raises:
        ValueError: If the file holds fewer than ``num_images`` images.
    """
    image_size = 28
    header_bytes = 16
    expected_bytes = image_size * image_size * num_images

    print('Extracting', filename)
    with gzip.open(filename, 'rb') as bytestream:
        bytestream.read(header_bytes)  # header is skipped, not validated
        buf = bytestream.read(expected_bytes)

    # Fail with a clear message on a truncated file instead of an opaque
    # reshape error. Negative counts never trigger this check, so they behave
    # exactly as they did before.
    if len(buf) < expected_bytes:
        raise ValueError(
            "{}: expected {} bytes for {} images but only {} were available".format(
                filename, expected_bytes, num_images, len(buf)
            )
        )

    # NOTE(review): EMNIST idx images are commonly reported to be stored
    # transposed relative to MNIST -- confirm whether consumers need a
    # transpose before using the pixels.
    data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
    return data.reshape(num_images, image_size, image_size, 1)

In [27]:
def extract_labels(filename, num_images):
    """Read labels from a gzipped idx1-ubyte file into a vector of int64 IDs.

    The first 8 bytes of the stream are skipped as header, and each following
    byte is interpreted as one unsigned label value.

    Args:
        filename: Path of the gzip-compressed label file.
        num_images: Number of labels to read from the start of the file.

    Returns:
        A 1D int64 array with one label ID per requested image.

    Raises:
        ValueError: If the file holds fewer than ``num_images`` labels.
    """
    header_bytes = 8

    print('Extracting', filename)
    with gzip.open(filename, 'rb') as bytestream:
        bytestream.read(header_bytes)  # header is skipped, not validated
        buf = bytestream.read(1 * num_images)

    # A short read would otherwise silently return fewer labels than images
    # were requested. Negative counts never trigger this check, so they behave
    # exactly as they did before.
    if len(buf) < num_images:
        raise ValueError(
            "{}: expected {} labels but only {} were available".format(
                filename, num_images, len(buf)
            )
        )

    return np.frombuffer(buf, dtype=np.uint8).astype(np.int64)

In [28]:
# Decode only the first 70000 samples of the training split; the same count is
# used for images and labels so both arrays stay aligned by index.
train_data = extract_data(filename=train_data_filename, num_images=70000)
train_labels = extract_labels(filename=train_labels_filename, num_images=70000)

Extracting C:\Users\Admin\Desktop\Parashara\PESU\7th semester\AML\Lab 2\datasets\byclass\data\emnist-byclass-train-images-idx3-ubyte.gz
Extracting C:\Users\Admin\Desktop\Parashara\PESU\7th semester\AML\Lab 2\datasets\byclass\data\emnist-byclass-train-labels-idx1-ubyte.gz


In [29]:
# Write every training image to train/<index>.jpg and record one
# "image path, label" row per image in train-labels.csv.
# newline='' hands line-ending control to the csv module; without it every row
# is followed by an empty line on Windows.
with open("train-labels.csv", 'w', newline='') as csvFile:
    writer = csv.writer(csvFile, delimiter=',', quotechar='"')
    for i in range(len(train_data)):
        image_path = "train/" + str(i) + ".jpg"
        # imwrite signals a failed write through its return value, so check it
        # rather than listing an image in the CSV that was never saved.
        if not cv.imwrite(image_path, train_data[i][:, :, 0]):
            raise IOError("Could not write image " + image_path)
        writer.writerow([image_path, train_labels[i]])


In [30]:
# Decode only the first 15000 samples of the test split; the same count is
# used for images and labels so both arrays stay aligned by index.
test_data = extract_data(filename=test_data_filename, num_images=15000)
test_labels = extract_labels(filename=test_labels_filename, num_images=15000)

Extracting C:\Users\Admin\Desktop\Parashara\PESU\7th semester\AML\Lab 2\datasets\byclass\data\emnist-byclass-test-images-idx3-ubyte.gz
Extracting C:\Users\Admin\Desktop\Parashara\PESU\7th semester\AML\Lab 2\datasets\byclass\data\emnist-byclass-test-labels-idx1-ubyte.gz


In [31]:
# Write every test image to test/<index>.jpg and record one
# "image path, label" row per image in test-labels.csv.
# newline='' hands line-ending control to the csv module; without it every row
# is followed by an empty line on Windows.
with open("test-labels.csv", 'w', newline='') as csvFile:
    writer = csv.writer(csvFile, delimiter=',', quotechar='"')
    for i in range(len(test_data)):
        image_path = "test/" + str(i) + ".jpg"
        # imwrite signals a failed write through its return value, so check it
        # rather than listing an image in the CSV that was never saved.
        if not cv.imwrite(image_path, test_data[i][:, :, 0]):
            raise IOError("Could not write image " + image_path)
        writer.writerow([image_path, test_labels[i]])