In [1]:
#import the libraries that will be used for the pre-processing module
import numpy as np #library that will be used to modify the arrays created
import random #library to shuffle the array with the data
import mahotas
import pickle #library that will be used to store the data created into binary files that will be used from the training module
import cv2 #library to convert images into numerical (binary files) data (arrays) that can be understood by the training module
import os #library that will be used to read folder names, paths

# https://gogul.dev/software/image-classification-python

In [2]:
# Directory where the data is stored: one sub-folder per category.
# Defaults to the original local path; set the DR_DATADIR environment variable
# to point at the dataset on another machine without editing this cell.
DATADIR = os.environ.get(
    "DR_DATADIR",
    "/Users/TarekNagati/Desktop/Final_Year/FYP/Project_to_GetHub/Dataset/smallStructuredData/",
)

# List of categories; each entry is also the name of a folder under DATADIR
# and is cast to int to become the label.
CATEGORIES = ["0","1","2","3","4"]

# Side length (in pixels) that every image is resized to before feature extraction.
IMG_SIZE = 224

# List that collects one [feature_vector, label] pair per image.
training_data = []

In [3]:
# Extract the Hu moments (shape descriptor) of an image.
def getHuMoments(img): # shape
    """Return the Hu moments of ``img`` as a flat array (shape descriptor).

    ``img`` is converted from BGR to greyscale, its image moments are computed
    with ``cv2.moments``, turned into Hu moments with ``cv2.HuMoments`` and the
    result is flattened to one dimension.
    """
    grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Hu moments are computed on a single channel
    raw_moments = cv2.moments(grey)
    hu = cv2.HuMoments(raw_moments)
    return hu.flatten()

In [4]:
# Extract the Haralick features (texture descriptor) of an image.
def getHaralick(img): # texture
    """Return the Haralick texture features of ``img``, averaged over axis 0.

    ``img`` is converted from BGR to greyscale and passed to
    ``mahotas.features.haralick``; the mean along the first axis of that
    result is returned (presumably one row per direction -- see the mahotas
    documentation).
    """
    grey_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    per_row_features = mahotas.features.haralick(grey_image)
    # collapse the rows into a single feature vector
    return per_row_features.mean(axis=0)


In [5]:
# Extract the normalized colour histogram of an image.
def getHistogram(img, mask=None): # colour
    """Return a normalized, flattened 256-bin histogram of channel 0 of ``img`` in HSV.

    Parameters:
        img: BGR image; it is converted to HSV before the histogram is computed.
        mask: optional mask forwarded to ``cv2.calcHist`` so that only the masked
            pixels are counted. ``None`` (the default) uses the whole image.
            Previously this argument was accepted but silently ignored.

    Returns:
        The histogram after ``cv2.normalize`` and ``flatten()`` (256 values).
    """
    img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) # convert image to HSV color scale
    # calcHist expects: images, channels, mask, histSize (bins) and the value range.
    # NOTE(review): per the OpenCV docs, hue of 8-bit images lies in [0, 180), so the
    # upper bins are presumably always empty; bins/range are kept as-is so the
    # feature-vector length does not change.
    hist = cv2.calcHist([img], [0], mask, [256], [0, 256])
    cv2.normalize(hist, hist) # normalize in place (hist is both source and destination)
    return hist.flatten() # flatten the histogram array to one dimension


In [6]:
# Function that fills the list (training_data) with the images (converted into numerical data) and their labels.
# For each folder under DATADIR that represents a category (level of diabetic retinopathy),
# go through each image and:
# 1. convert it to numerical data
# 2. resize it to IMG_SIZE x IMG_SIZE
# 3. add the extracted features to the list along with the label (that is, the folder name)

def create_training_data():
    """Append one ``[feature_vector, label]`` pair per image to ``training_data``.

    For every folder named in ``CATEGORIES`` under ``DATADIR``, each directory
    entry is read with ``cv2.imread``, resized to ``IMG_SIZE`` x ``IMG_SIZE`` and
    described by the concatenation of its Hu moments, Haralick features and
    histogram. The integer value of the folder name is used as the label.

    Entries that OpenCV cannot decode (``cv2.imread`` returns ``None`` for them,
    e.g. hidden files such as ``.DS_Store`` or sub-directories) are reported and
    skipped; previously they crashed the whole run inside ``cv2.resize``.

    Returns nothing. ``training_data`` is not cleared here, so calling this
    function more than once appends the same samples again.
    """
    for category in CATEGORIES:
        path = os.path.join(DATADIR, category)
        image_label = int(category) # the folder name doubles as the numeric label
        for img in os.listdir(path): # for each entry in the folder
            file_path = os.path.join(path, img)
            # 1) read the image as numerical data (a pixel array)
            image = cv2.imread(file_path)
            if image is None:
                # not a readable image: skip it instead of failing in cv2.resize
                print("Skipping unreadable file: %s" % file_path)
                continue
            # 2) resize so that every image has the same dimensions
            image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))

            moments = getHuMoments(image) # shape
            haralick = getHaralick(image) # texture
            histogram = getHistogram(image) # colour

            # an image feature consists of those three image properties
            image_feature = np.hstack([moments, haralick, histogram])
            # add the image feature and image label to the data list
            training_data.append([image_feature, image_label])
            
# Run the extraction now; this populates the module-level training_data list.
create_training_data()

In [7]:
# Shuffle the samples in place so that the order in which the category folders
# were read (all of one label, then the next) is not preserved in the saved data.
# NOTE(review): no seed is set in the visible cells, so the order differs between runs.
random.shuffle(training_data)

In [8]:
# Split training_data (a list of [feature_vector, label] pairs) into two
# parallel lists: x holds the feature vectors, y holds the matching labels.
x = [features for features, _ in training_data]
y = [label for _, label in training_data]

In [9]:
# When the pre-processing is finished, the data is saved as separate binary files (pickles):
# one holds the features (x) and the other the labels (y) of the dataset.
# "wb" = write binary: creates the file, or truncates and overwrites it if it already exists.
# The with-statement closes each file even if pickle.dump raises, so no handle is leaked.
with open("x_small.pickle", "wb") as pickle_out:
    pickle.dump(x, pickle_out) # save the feature vectors

with open("y_small.pickle", "wb") as pickle_out:
    pickle.dump(y, pickle_out) # save the labels