# Export MNIST

Read the images and labels, which are stored in ubyte (IDX) format.

In [102]:
import numpy as np
import struct

images_path = 'mnist/train-images-idx3-ubyte/train-images-idx3-ubyte'
labels_path = 'mnist/train-labels-idx1-ubyte/train-labels-idx1-ubyte'
out_path = 'mnist/train/'

# IDX parsing based on
# https://stackoverflow.com/questions/39969045/parsing-yann-lecuns-mnist-idx-file-format
# Header fields are big-endian unsigned 32-bit integers; the payload that
# follows the header is one unsigned byte per value.
IMAGES_MAGIC = 2051  # 0x00000803: unsigned-byte data, 3 dimensions
LABELS_MAGIC = 2049  # 0x00000801: unsigned-byte data, 1 dimension

with open(images_path, 'rb') as f:
    magic, size = struct.unpack(">II", f.read(8))
    # Fail early with a clear message when the path points at the wrong
    # (or still compressed) file instead of reshaping garbage.
    if magic != IMAGES_MAGIC:
        raise ValueError(
            f"{images_path}: unexpected magic number {magic}, expected {IMAGES_MAGIC}"
        )
    nrows, ncols = struct.unpack(">II", f.read(8))
    # Single bytes have no byte order, so a plain uint8 dtype is sufficient.
    images = np.fromfile(f, dtype=np.uint8)
    images = images.reshape((size, nrows, ncols))

with open(labels_path, 'rb') as f:
    magic, size = struct.unpack(">II", f.read(8))
    if magic != LABELS_MAGIC:
        raise ValueError(
            f"{labels_path}: unexpected magic number {magic}, expected {LABELS_MAGIC}"
        )
    labels = np.fromfile(f, dtype=np.uint8)
    labels = labels.reshape((size,))  # also verifies the declared label count

# Every image needs exactly one label.
assert len(images) == len(labels), (
    f"image/label count mismatch: {len(images)} images vs {len(labels)} labels"
)

Define a helper function that generates a polygon for each image.

In [104]:
import cv2

def image2poly(image, img_size = 28, max_size = 10):
    """Approximate the largest external contour of an image by a polygon.

    Args:
        image: Array-like image passed to ``cv2.findContours``
            (presumably a single-channel uint8 digit image; confirm with caller).
        img_size: Divisor applied to the pixel coordinates of the vertices.
        max_size: Maximum number of vertices kept; extra vertices are dropped.

    Returns:
        A float array of shape ``(k, 2)`` with ``k <= max_size`` holding the
        polygon vertices divided by ``img_size``. The array is empty
        (shape ``(0, 2)``) when the image contains no contour.
    """
    # OpenCV 3.x returns (image, contours, hierarchy) while 2.x/4.x return
    # (contours, hierarchy); the contour list is always the second-to-last item.
    contours = cv2.findContours(
        np.array(image), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
    )[-2]

    # A blank image has no contour; return an empty polygon instead of letting
    # max() fail on an empty sequence.
    if len(contours) == 0:
        return np.empty((0, 2), dtype=np.float64)

    largest = max(contours, key=cv2.contourArea)
    # Approximation tolerance: 2.5 % of the closed contour's perimeter.
    epsilon = 0.025 * cv2.arcLength(largest, True)
    poly = cv2.approxPolyDP(largest, epsilon, True) / img_size
    # approxPolyDP yields shape (k, 1, 2); flatten to (k, 2) and cap the length.
    return poly.reshape(-1, 2)[:max_size]

Export the images to PNG files.

In [105]:
from PIL import Image
from os.path import join

import os

# One record per exported image: relative file path, digit label and the
# flattened polygon coordinates.
dataset_entries = []

# Image.save does not create missing directories, so make sure the target
# folder exists before the first image is written.
os.makedirs(join(out_path, "images"), exist_ok=True)

for idx, (image, label) in enumerate(zip(images, labels)):
    # Path relative to out_path; the running index and the label are encoded
    # in the file name.
    file_path = join("images", f"img{idx:05}_{label}.png")
    poly = image2poly(image).flatten().tolist()

    dataset_entries.append({
        "file_path": file_path,
        "label": label,
        "polygon": poly
    })

    Image.fromarray(image).save(join(out_path, file_path))

Save the labels and polygons to a CSV file, each associated with the file path of the image it refers to.

In [106]:
import csv

# Write one CSV row per entry in dataset_entries, preceded by a header row.
csv_path = join(out_path, "polygon-mnist.csv")
columns = ["file_path", "label", "polygon"]

# newline='' lets the csv module control line endings itself.
with open(csv_path, 'w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=columns)
    writer.writeheader()
    for entry in dataset_entries:
        writer.writerow(entry)

Unnamed: 0,file_path,label,polygon
0,images/img00000_5.png,5,"[0.8214285714285714, 0.17857142857142858, 0.28..."
1,images/img00001_0.png,0,"[0.6785714285714286, 0.14285714285714285, 0.53..."
2,images/img00002_4.png,4,"[0.7857142857142857, 0.17857142857142858, 0.60..."
3,images/img00003_1.png,1,"[0.75, 0.17857142857142858, 0.6071428571428571..."
4,images/img00004_9.png,9,"[0.42857142857142855, 0.25, 0.2142857142857142..."
...,...,...,...
59995,images/img59995_8.png,8,"[0.8214285714285714, 0.25, 0.42857142857142855..."
59996,images/img59996_3.png,3,"[0.7142857142857143, 0.17857142857142858, 0.39..."
59997,images/img59997_5.png,5,"[0.8571428571428571, 0.17857142857142858, 0.35..."
59998,images/img59998_6.png,6,"[0.75, 0.07142857142857142, 0.3571428571428571..."
