# Convert Train/Validation/Test Datasets to NumPy Arrays

In [1]:
from zipfile import ZipFile
from skimage.io import imread
import numpy as np

def loadData(filepath):
    """Load every JPG/PNG image stored in a zip archive along with its label.

    An image's label is the name of the directory that directly contains it
    inside the archive (e.g. ``train/cat/001.png`` -> ``'cat'``).

    Parameters
    ----------
    filepath
        Location of the zip archive; passed straight to ``ZipFile``.

    Returns
    -------
    tuple
        ``(X, Y)`` where ``X`` is the list of images as returned by
        ``imread`` and ``Y`` is the list of label strings, both in the
        order the entries appear in the archive.

    Raises
    ------
    IndexError
        If an image sits at the archive root and therefore has no parent
        directory to take a label from.
    """
    X = []   # decoded images
    Y = []   # label of each image, index-aligned with X
    image_suffixes = ('.jpg', '.png')

    with ZipFile(filepath, 'r') as zipdata:
        for file_path in zipdata.namelist():
            # Match on the real extension only: a substring test would also
            # accept entries such as 'notes.jpg.txt' or a directory 'x.jpg/'.
            if not file_path.lower().endswith(image_suffixes):
                continue

            # Parent directory name is the class label.
            label = file_path.split('/')[-2]

            # Close each member handle as soon as the image is decoded.
            with zipdata.open(file_path) as img_file:
                image = imread(img_file)

            X.append(image)
            Y.append(label)

    # The archive itself is closed by the ``with`` statement above.
    return (X, Y)

In [2]:
from tqdm import tqdm

# Load each dataset split from its zip archive, convert it to numpy arrays
# and save the arrays next to the archives.
def _convert_split(split):
    """Load ``data/<split>.zip``, convert it to arrays and save them as .npy.

    Returns ``(X, y)``: the images as a float32 array and the labels
    reshaped into a single column.
    """
    images, labels = loadData('data/{}.zip'.format(split))

    # convert lists to numpy arrays
    X = np.asarray(images, dtype='float32')
    y = np.asarray(labels).reshape(-1, 1)

    # save numpy arrays
    np.save('data/X_{}.npy'.format(split), X)
    np.save('data/y_{}.npy'.format(split), y)
    return X, y


# tqdm iterates over the splits so the bar reflects real progress; wrapping
# the tuple returned by loadData only ever counted its two elements.
_converted = {
    split: _convert_split(split)
    for split in tqdm(('test', 'valid', 'train'), desc='Converting splits')
}

# Expose the arrays under the names used by the rest of the notebook.
X_test, y_test = _converted['test']
X_valid, y_valid = _converted['valid']
X_train, y_train = _converted['train']

100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]


In [3]:
# Inspect the shape of the converted test image array.
X_test.shape

(7472, 28, 28)

In [4]:
# Inspect the shape of the converted training image array.
X_train.shape

(22542, 28, 28)

In [5]:
# Inspect the shape of the converted validation image array.
X_valid.shape

(7472, 28, 28)