In [1]:
import os
import cv2
import numpy as np
import imutils
import pickle
from skimage.feature import hog

In [2]:
airplane_datapath = "/Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/trainDataAirplane/"
cat_datapath = "/Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/trainDataCat/"

featureSTDPath = "/Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/Classification-1/STD/"
featureEDGPath = "/Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/Classification-1/EDG/"
featureHOGPath = "/Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/Classification-1/HOG/"

featureSTDbatch_dir = "/Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/Classification-1/STD/batchData/"
featureEDGbatch_dir = "/Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/Classification-1/EDG/batchData/"
featureHOGbatch_dir = "/Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/Classification-1/HOG/batchData/"

ImageDir = [airplane_datapath, cat_datapath]

DataDir = [featureSTDPath, featureEDGPath]#, featureHOGPath]

BatchDir = [featureSTDbatch_dir, featureEDGbatch_dir, featureHOGbatch_dir]

# parentPath = "/Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/"

In [3]:
# We choose 18 orientation for the object recognition task
class HOG:
    def __init__(self, orientations = 18, pixelsPerCell = (8, 8),cellsPerBlock = (1, 1), block_norm='L1', visualise=False, transform_sqrt = False): 
        self.orienations = orientations
        self.pixelsPerCell = pixelsPerCell
        self.cellsPerBlock = cellsPerBlock
        self.block_norm = block_norm
        self.visualise = visualise
        self.transform_sqrt = transform_sqrt

    def describe(self, image):
        # Use transform_sqrt for Power law Compression before processing the image to increase the accuracy
        # Use visualise to return the image of the histogram
        hist , hog_image= hog(image,
                            orientations = self.orienations,
                            pixels_per_cell = self.pixelsPerCell,
                            cells_per_block = self.cellsPerBlock,
                            visualise= self.visualise,
                            transform_sqrt = transform_sqrt)
#                             normalise = self.normalize)        # normalise is deprecated from the API

        return hist, hog_image


The Below Process successfully creates the data set for the features:

1. Standarized (Normalized) Image
2. Image with Edge activation pixel
3. Image with HOG Features (One Filter Type)
4. To do ..... (HOG with multple filters)
    
The below code is generalized to store the features as ndarrays where,

--> The number of rows are the number of images,
--> The number of columns are the flattened feature set.

The dataset is compressed into pickle files and stored in their respective directories


In [49]:
def featureStandarize(image_pxlvals):
    return(image_pxlvals - 255.0/2)/255.0


def featureExtraction(pathTo_images, filenameArr, imageSize=32, mimNumImage=None, numChannels=3, HOG=None):
    datasetSTD = np.ndarray(shape=(len(filenameArr), imageSize, imageSize), dtype=np.float32)
    datasetEDG = np.ndarray(shape=(len(filenameArr), imageSize, imageSize), dtype=np.float32)
    
    for numImage, image in enumerate(filenameArr):
        imagePath = os.path.join(pathTo_images, image)
#         print (numImage)
        try:
            # Get the Image
#             print ('The input image path is ', imagePath)
            img = cv2.imread(imagePath)
            print ('Count of Non Zero pixels entries in img are: ', len(np.where(np.reshape(img, 32*32*3)!=0)[0]))

            
            # Convert the Image into Gray Scale
            imgGS = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) 
            print ('Count of Non Zero pixels entries in imgGS are: ', len(np.where(np.reshape(imgGS, 32*32)!=0)[0]))
#             print (imgGS)
            
            # Get normalized image
            imgSTD = featureStandarize(imgGS)
            
            # Blurr the Gray Scale Image using a Gaussian Blurr
            imgBLR = cv2.GaussianBlur(imgGS, (3,3), 0)                # The filer size is chosen to be 3 and the standard deviation for the distribution is 0
            print ('Count of Non Zero pixels entries in imgBLR are: ', len(np.where(np.reshape(imgBLR, 32*32)!=0)[0]))            
            
            # Detect Edges using Canny Filter
            imgEDG = cv2.Canny(imgBLR, 30, 150)                        # The minimum threshold value chosen is 60 and the maximum threshold chosen is 150
            print ('Count of Non Zero pixels entries in imgEDG are: ', len(np.where(np.reshape(imgEDG, 32*32)!=0)[0])) 

            datasetSTD[numImage, :] = imgSTD
            datasetEDG[numImage, :] = imgEDG
            
            if numImage ==10:
                break
        except IOError as e:
            print('Could not read:', image, ':', e, '- hence skipping.')
            
    return datasetSTD.reshape((-1,imageSize*imageSize*numChannels)), datasetEDG.reshape((-1,imageSize*imageSize))
           
    
def main(forceDump=None):
    for image_dir in ImageDir:
        objectName = os.path.basename(os.path.normpath(image_dir))
        filenameArr =  os.listdir(image_dir)
        
        print ('The image directory is: ', image_dir)
        datasetSTD, datasetEDG = featureExtraction(image_dir, filenameArr)
        print ('Standarized Feature DataSet ', datasetSTD.shape)
        print ('Edge Feature DataSet ', datasetEDG.shape)
        
#         for num, i in enumerate(datasetEDG):
#             for j in i:
#                 print (j)
#             if num==10:
#                 break
                

        for data_dir in DataDir:        
            if not os.path.exists(data_dir):
                os.makedirs(data_dir)
                
            featureType = os.path.basename(os.path.normpath(data_dir))    
            fileName = data_dir+objectName+".pickle"

            # DUMP PICKLE FILES
            if os.path.exists(fileName) and not forceDump:
                print ('The path already exists, you should force the dump')
            else:
                try:
                    with open(fileName, 'wb') as f:
                        if featureType=='STD':
                            pickle.dump(datasetSTD, f, pickle.HIGHEST_PROTOCOL)
                        elif featureType=='EDG':
                            pickle.dump(datasetEDG, f, pickle.HIGHEST_PROTOCOL)
                except Exception as e:
                    print('Unable to save data to', fileName1, ':', e)
                    
#         break
    
main(forceDump=None)

The image directory is:  /Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/trainDataAirplane/
Count of Non Zero pixels entries in img are:  3072
Count of Non Zero pixels entries in imgGS are:  1024
Count of Non Zero pixels entries in imgBLR are:  1024
Count of Non Zero pixels entries in imgEDG are:  95
Count of Non Zero pixels entries in img are:  3072
Count of Non Zero pixels entries in imgGS are:  1024
Count of Non Zero pixels entries in imgBLR are:  1024
Count of Non Zero pixels entries in imgEDG are:  142
Count of Non Zero pixels entries in img are:  3072
Count of Non Zero pixels entries in imgGS are:  1024
Count of Non Zero pixels entries in imgBLR are:  1024
Count of Non Zero pixels entries in imgEDG are:  139
Count of Non Zero pixels entries in img are:  3072
Count of Non Zero pixels entries in imgGS are:  1024
Count of Non Zero pixels entries in imgBLR are:  1024
Count of Non Zero pixels entries in imgEDG are:  125
Count of Non Zero pixels entries in img are:  3072


ValueError: total size of new array must be unchanged

--> The Below code calls the class CreateBatches. Class CreateBatches 

1. Gets the pickled feature data from the directory, 
2. ranandomize the data
3. Divides the into train and test dataset. (In this case we dont code for test data as we have the test data as a different test file)
4. Finally, the training data is converted into 10 folds and stored into the respective directory.

In [3]:
import sys,os

# Get the current working directory
cwd = os.getcwd()
mydir = os.path.abspath(os.path.join(cwd, ".."))
sys.path.append(mydir)

from DataPreparation import CreateBatches

In [4]:
# Create 10 Batches and stores in into the provided Batch Directory for the Strandarized Feature set of Images
imageDim = 32*32*3
numBatches =10
maxNumImage = 5000
test_percntg = 0

root_dir = "/Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/Classification-1/STD/"
batch_dir = "/Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/Classification-1/STD/batchData/"

obj_STD = CreateBatches(dimensions=imageDim)
trainData, trainLabels, testLabels, _, _ = obj_STD.gen_TrainTestData(max_num_images=maxNumImage, dir_to_pickle_files=root_dir, test_percntg=test_percntg)

for batchNum, (trnBatchData, trnBatchLabel) in enumerate(obj_STD.generateBatches(dataset=trainData, labels=trainLabels, numBatches=numBatches)):
    obj_STD.dumpBatches(trnBatchData, trnBatchLabel, batch_dir, batchNum=batchNum)


The training Data set size is :  (10000, 3072)
The training Labels size is :  (10000,)
The test Data set size is :  (0, 3072)
The test Labels size is :  (0,)
Batch No:  0  : Training Batch Data Shape: (1000, 3072)
Batch No:  0  : Training Batch Labels Shape : (1000,)
Batch No:  1  : Training Batch Data Shape: (1000, 3072)
Batch No:  1  : Training Batch Labels Shape : (1000,)
Batch No:  2  : Training Batch Data Shape: (1000, 3072)
Batch No:  2  : Training Batch Labels Shape : (1000,)
Batch No:  3  : Training Batch Data Shape: (1000, 3072)
Batch No:  3  : Training Batch Labels Shape : (1000,)
Batch No:  4  : Training Batch Data Shape: (1000, 3072)
Batch No:  4  : Training Batch Labels Shape : (1000,)
Batch No:  5  : Training Batch Data Shape: (1000, 3072)
Batch No:  5  : Training Batch Labels Shape : (1000,)
Batch No:  6  : Training Batch Data Shape: (1000, 3072)
Batch No:  6  : Training Batch Labels Shape : (1000,)
Batch No:  7  : Training Batch Data Shape: (1000, 3072)
Batch No:  7  : 

In [5]:
# Create 10 Batches and stores in into the provided Batch Directory for the Edge Feature set of Images
imageDim=32*32
numBatches =10
maxNumImage = 5000
test_percntg=0

root_dir = "/Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/Classification-1/EDG/"
batch_dir = "/Users/sam/All-Program/App-DataSet/Kaggle-Challenges/CIFAR-10/Classification-1/EDG/batchData/"

obj_EDG = CreateBatches(dimensions=imageDim)
trainData, trainLabels, testLabels, _, _ = obj_EDG.gen_TrainTestData(max_num_images=maxNumImage, dir_to_pickle_files=root_dir, test_percntg=test_percntg)

for batchNum, (trnBatchData, trnBatchLabel) in enumerate(obj_EDG.generateBatches(dataset=trainData, labels=trainLabels, numBatches=numBatches)):
    obj_EDG.dumpBatches(trnBatchData, trnBatchLabel, batch_dir, batchNum=batchNum)

The training Data set size is :  (10000, 1024)
The training Labels size is :  (10000,)
The test Data set size is :  (0, 1024)
The test Labels size is :  (0,)
Batch No:  0  : Training Batch Data Shape: (1000, 1024)
Batch No:  0  : Training Batch Labels Shape : (1000,)
Batch No:  1  : Training Batch Data Shape: (1000, 1024)
Batch No:  1  : Training Batch Labels Shape : (1000,)
Batch No:  2  : Training Batch Data Shape: (1000, 1024)
Batch No:  2  : Training Batch Labels Shape : (1000,)
Batch No:  3  : Training Batch Data Shape: (1000, 1024)
Batch No:  3  : Training Batch Labels Shape : (1000,)
Batch No:  4  : Training Batch Data Shape: (1000, 1024)
Batch No:  4  : Training Batch Labels Shape : (1000,)
Batch No:  5  : Training Batch Data Shape: (1000, 1024)
Batch No:  5  : Training Batch Labels Shape : (1000,)
Batch No:  6  : Training Batch Data Shape: (1000, 1024)
Batch No:  6  : Training Batch Labels Shape : (1000,)
Batch No:  7  : Training Batch Data Shape: (1000, 1024)
Batch No:  7  : 