In [2]:
# Necessary Libraries
import os
import cv2
import pickle
import numpy as np
from sklearn.utils import shuffle
from keras.utils import np_utils
from sklearn.model_selection import train_test_split


In [3]:
# Class deffinition to load dataset from directory

class ImageDataLoader:
        
    def __init__(self, data_path, imageSize, num_channel, num_classes=10):
        
        self.train_images = []
        self.train_labels = []

        self.test_images = []
        self.test_labels = []
    
        # Get the size of image and number of samples in each category
        self.data_path = data_path # List of data directories ['root/../train/', 'root/../test/']
                
        self.imageSize = imageSize
        self.num_channel = num_channel
        
    def call(self): # Function to hold all categories
         # Train images along labels and Test images along labels
        
        x_train, x_valid, y_train, y_valid, self.train_cls, self.valid_cls = self.load_train()
        
        self.x_test, self.y_test = self.load_test()
        
        # Split and normalized the training set into train and validation
        self.x_train = x_train / 255.0
        self.x_valid = x_valid / 255.0
        
        self.y_train = y_train
        self.y_valid = y_valid
        
        # Convert class-numbers to integer
        self.y_train = self.y_train.astype(np.int)
        self.y_test = self.y_test.astype(np.int)
        self.y_valid = self.y_valid.astype(np.int)

    # Function to read train image data from directory
    def read_train_data_from_dir(self):
        print("\n================================= Loading Train Images ===============================\n")
        for data in sorted(os.listdir(self.data_path[0])):

            for img in sorted(os.listdir(os.path.join(self.data_path[0], data))):
                
                image = cv2.imread(os.path.join(self.data_path[0], data, img), self.num_channel)
                image = cv2.resize(image, (self.imageSize, self.imageSize))

                self.train_images.append((np.array(image, dtype=np.uint8)))
                self.train_labels.append(int(data))  # name of the data_dir must be labeled in number (e.g 0, 1, 2 ..)

            print("Reading and Loading Train images from: %s" % os.path.join(self.data_path[0], data))

        return self.train_images, self.train_labels

    # Function to read test image data from directory
    def read_test_data_from_dir(self):
        print("\n================================= Loading Test Images ===============================\n")
        for data in sorted(os.listdir(self.data_path[1])):

            for img in sorted(os.listdir(os.path.join(self.data_path[1], data))):

                image = cv2.imread(os.path.join(self.data_path[1], data, img), self.num_channel)
                image = cv2.resize(image, (self.imageSize, self.imageSize))

                self.test_images.append((np.array(image, dtype=np.uint8)))
                self.test_labels.append(int(data))  # name of the data_dir must be labeled in number (e.g 0, 1, 2 ..)

            print("Reading and Loading Test images from: %s" % os.path.join(self.data_path[1], data))

        return self.test_images, self.test_labels
    
    # Split training images into train and validation    
    def load_train(self):
        
        tr_images, tr_labels = self.read_train_data_from_dir()
            
        # Convert list of images and labels into numpy array
        tr_images = np.array(tr_images, dtype=np.float32)
        tr_labels = np.array(tr_labels)

        # One-hot encoding 
        Y = np_utils.to_categorical(tr_labels)
        
        # Shuffling 
        X, y = shuffle(tr_images, Y, random_state=4)
        x_train_cls, y_valid_cls = shuffle(tr_images, tr_labels, random_state=4)
        
        x_train, x_valid, y_train, y_valid = train_test_split(X, y, stratify=None, test_size=0.2, random_state=4)
        train_img1, test_img1, train_cls, valid_cls = train_test_split(x_train_cls, y_valid_cls, stratify=None, test_size=0.3, random_state=2)

        # Here ignore the train_img1 and test_img1, just take the non-one_hot_encoded version of y_train and y_valid
        return x_train, x_valid, y_train, y_valid, train_cls, valid_cls
    
    # Normalizing Test images
    def load_test(self):
        
        tst_images, tst_labels = self.read_test_data_from_dir()
            
        # Convert list of images and labels into numpy array
        tst_images = np.array(tst_images, dtype=np.float64) / 255.0
        tst_labels = np.array(tst_labels)

        # One-hot encoding 
        Y = np_utils.to_categorical(tst_labels)
        
        # Shuffling 
        X, y = shuffle(tst_images, Y, random_state=4)

        return X, y
    
    def random_batch(self, images, labels, batch_size=32):
        
        # Create a random index into the training set
        indx = np.random.randint(low=0, high=int(len(images)), size=batch_size)
        
        # Use the index to lookup random training data
        x_batch = images[indx]
        y_batch = labels[indx]
        
        return x_batch, y_batch
    
    def pickle_data(self):
        
        print("[INFO] Saving image data to disk...\n")
        with open('x_train', 'wb') as f:
            pickle.dump(self.x_train, f)
#             del self.x_train
        with open('y_train', 'wb') as f:
            pickle.dump(self.y_train, f)
#             del self.y_train
        with open('x_valid', 'wb') as f:
            pickle.dump(self.x_valid, f)
#             del self.x_valid
        with open('y_valid', 'wb') as f:
            pickle.dump(self.y_valid, f)
#             del self.y_valid
        with open('x_test', 'wb') as f:
            pickle.dump(self.x_test, f)
#             del self.x_test
        with open('y_test', 'wb') as f:
            pickle.dump(self.y_test, f)
#             del self.y_test
        with open('array/x_train_cls', 'wb') as f:
            pickle.dump(self.train_cls, f)
#             del train_cls
        with open('array/y_test_cls', 'wb') as f:
            pickle.dump(self.valid_cls, f)
#             del test_cls
        print("Done.")


In [4]:
data_path = [os.path.join(os.getcwd(), 'data', 'train'), os.path.join(os.getcwd(), 'data', 'test')]
imageSize = 120
num_channel = 0

In [5]:
data = ImageDataLoader(data_path, imageSize, num_channel)

In [6]:
data.call()



Reading and Loading Train images from: C:\Users\Lord Sanmilee\Desktop\git\data\train\0
Reading and Loading Train images from: C:\Users\Lord Sanmilee\Desktop\git\data\train\1
Reading and Loading Train images from: C:\Users\Lord Sanmilee\Desktop\git\data\train\2
Reading and Loading Train images from: C:\Users\Lord Sanmilee\Desktop\git\data\train\3
Reading and Loading Train images from: C:\Users\Lord Sanmilee\Desktop\git\data\train\4
Reading and Loading Train images from: C:\Users\Lord Sanmilee\Desktop\git\data\train\5
Reading and Loading Train images from: C:\Users\Lord Sanmilee\Desktop\git\data\train\6
Reading and Loading Train images from: C:\Users\Lord Sanmilee\Desktop\git\data\train\7
Reading and Loading Train images from: C:\Users\Lord Sanmilee\Desktop\git\data\train\8
Reading and Loading Train images from: C:\Users\Lord Sanmilee\Desktop\git\data\train\9


Reading and Loading Test images from: C:\Users\Lord Sanmilee\Desktop\git\data\test\0
Reading and Loading Test images from: C:\

In [7]:
print(data.x_train.shape)
print(data.x_valid.shape)
print(data.x_test.shape, "\n")

print(data.y_train.shape)
print(data.y_valid.shape)
print(data.y_test.shape, "\n")

print(data.random_batch(data.x_train, data.y_train, 100)[0].shape)
print(data.random_batch(data.x_valid, data.y_valid, 100)[0].shape)
print(data.random_batch(data.x_test, data.y_test, 100)[0].shape, "\n")

print(data.train_cls.shape)
print(data.valid_cls.shape)

#print(data.train_cls)
#print(data.valid_cls)

#print(data.y_test)
#print(data.y_valid)

(40000, 120, 120)
(10000, 120, 120)
(12500, 120, 120) 

(40000, 10)
(10000, 10)
(12500, 10) 

(100, 120, 120)
(100, 120, 120)
(100, 120, 120) 

(35000,)
(15000,)


In [8]:
data.pickle_data()

[INFO] Saving image data to disk...

Done.


In [9]:
batch = data.random_batch(data.x_train, data.y_train, 100)[0].shape
# batch[1].shape
batch

(100, 120, 120)