In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [256]:
import os
import csv
import gzip
import numpy as np
import urllib.request
from scipy.misc import imsave

In [267]:
# Root directory (relative to the notebook's working directory) that holds the
# downloaded MNIST archives and receives the exported images/CSV files.
# The trailing slash is kept because the path is used as a plain string prefix.
path = 'datasets/MNIST/'

In [264]:
# loading MNIST data

In [269]:
def get_data_dict(output_path):

    """
    Download (when missing) and read the MNIST dataset.

    Every one of the four MNIST archives that is not yet present in
    ``output_path`` is downloaded individually, so an interrupted download is
    completed on the next call. Only these four archives are read back; any
    other entry in the directory (e.g. exported image folders) is ignored.

    :param output_path: target directory for the ``.gz`` archives; created
        (including missing parents) when it does not exist
    :return: dict ``{'train': [images, labels], 't10k': [images, labels]}``
        (``t10k`` is the validation split). ``images`` is a ``uint8`` array
        reshaped to ``(-1, 28, 28)`` and ``labels`` a flat ``uint8`` array.

    """

    files = ['train-images-idx3-ubyte.gz',
             'train-labels-idx1-ubyte.gz',
             't10k-images-idx3-ubyte.gz',
             't10k-labels-idx1-ubyte.gz']
    URL = 'http://yann.lecun.com/exdb/mnist/'

    if not os.path.isdir(output_path):
        print('creating MNIST directory')
        # makedirs also creates missing parent directories (e.g. 'datasets/').
        os.makedirs(output_path, exist_ok=True)

    # If not already there, downloading MNIST data (checked per file):
    for name in files:
        target = os.path.join(output_path, name)
        if not os.path.isfile(target):
            print(name)
            print(f"Downloading MNIST {name} dataset")
            urllib.request.urlretrieve(URL + name, target)
            print('Done')

    # Reading and loading data from directory
    data = {'train': [[], []], 't10k': [[], []]}

    # Iterate over the known archive names rather than the directory listing,
    # so unrelated files or sub-directories cannot break the loader.
    for name in files:
        split = name.split('-')[0]  # 'train' or 't10k'
        with gzip.open(os.path.join(output_path, name), 'rb') as archive:
            raw = archive.read()
        if 'labels' in name:
            # Label files: skip the 8-byte header, one byte per label.
            data[split][1] = np.frombuffer(raw, np.uint8, offset=8)
        else:
            # Image files: skip the 16-byte header, 28x28 bytes per image.
            data[split][0] = np.frombuffer(raw, np.uint8, offset=16).reshape(-1, 28, 28)

    print('data loaded')
    return data

In [270]:
data_dict = get_data_dict(path)

# Scale the training images by 1/255 once and reuse the result for both
# statistics; the float copy is large, so it is released right afterwards.
train_scaled = data_dict['train'][0] / 255
train_mean = train_scaled.mean()
train_stdv = train_scaled.std()
del train_scaled

print(data_dict['train'][0].shape)
print(train_mean, train_stdv)

train-images-idx3-ubyte.gz
Downloading MNIST train-images-idx3-ubyte.gz dataset
Done
train-labels-idx1-ubyte.gz
Downloading MNIST train-labels-idx1-ubyte.gz dataset
Done
t10k-images-idx3-ubyte.gz
Downloading MNIST t10k-images-idx3-ubyte.gz dataset
Done
t10k-labels-idx1-ubyte.gz
Downloading MNIST t10k-labels-idx1-ubyte.gz dataset
Done
data loaded
(60000, 28, 28)
0.1306604762738429 0.3081078038564622


In [None]:
# Saving the MNIST dataset as JPEG images plus one labels.csv file per split

In [278]:
def save_as_csv_and_jpgs(data_dict, out_path, image_writer=None):


    """
    Saving images as .jpg and labels in .csv file
    TODO: add tqdm to track progress

    For every key of ``data_dict`` a sub-directory ``<out_path>/<key>`` is
    created (if missing). Image ``i`` is written there as ``<i>.jpg`` and a
    ``labels.csv`` file receives one ``<i>.jpg,<label>`` row per image.

    :param data_dict: dict mapping a split name to ``[images, labels]``,
        where ``images[i]`` and ``labels[i]`` belong together
    :param out_path: target directory; a trailing slash is optional
    :param image_writer: optional callable ``(file_name, image)`` used to
        write a single image. Defaults to the ``imsave`` imported by this
        notebook; pass e.g. ``imageio.imwrite`` on SciPy versions where
        ``scipy.misc.imsave`` is no longer available.

    """
    if image_writer is None:
        # Resolved lazily so a custom writer works even without imsave.
        image_writer = imsave

    for key in data_dict.keys():
        # os.path.join instead of string concatenation: works with or
        # without a trailing separator on out_path.
        full_path = os.path.join(out_path, key)
        os.makedirs(full_path, exist_ok=True)

        images, labels = data_dict[key][0], data_dict[key][1]

        with open(os.path.join(full_path, "labels.csv"), 'w', newline='') as csvFile:
            writer = csv.writer(csvFile, delimiter=',', quotechar='"')
            for i, image in enumerate(images):
                file_name = str(i) + ".jpg"
                image_writer(os.path.join(full_path, file_name), image)
                writer.writerow([file_name, labels[i]])


In [279]:
# Export every split in data_dict ('train' and 't10k') below `path`:
# one sub-directory per split containing <i>.jpg files and a labels.csv.
save_as_csv_and_jpgs(data_dict, path)

`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.


0.1306604762738429 0.3081078038564622
