In [1]:
#import the necessary modules
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 5000)
from subprocess import check_output

In [2]:
#specify the directory and filetype
directory = '/home/sarvesh/ML_Github/flowers/'
filetype = '/*.jpg'

#declare a directory object
path = os.path.dirname(directory)

In [3]:
#use check_output method of the subprocess package to check the folders
folders = list(check_output(["ls", path]).split('\n'))

#remove any unnecessary files
folders.remove('flowers.ipynb')
folders.remove('')
folders

['daisy', 'dandelion', 'rose', 'sunflower', 'tulip']

In [4]:
#create two empty lists, one for images and the other for flower labels 
image_list = []
label_list = []
for folder in folders:
    images = [x for x in os.listdir(directory + folder + '/') if x.endswith(filetype[2:])]
    image_list.append(images)
    
    #assign corresponding flower name as label for images
    labels = [folder] * len(images)
    label_list.append(labels)
    
#flatten both the lists
image_list = sum(image_list, [])
label_list = sum(label_list, [])

In [5]:
#The sorting approach cannot be followed before because :
#When images are sorted lexicographically, image name with least character is placed first
#However when images are sorted by system, the image name with lesser number of characters goes first
#sort labels w.r.t lexicographic order of images
#label_list = [x for _, x in sorted(zip(image_list, label_list))]

#sort the image_list finally
#image_list = sorted(image_list)

In [6]:
#combine both resulting numpy arrays
df = pd.DataFrame(data = [image_list, label_list]).T
df.columns = ['Image', 'Flower']
df.head()

Unnamed: 0,Image,Flower
0,16819071290_471d99e166_m.jpg,daisy
1,3456403987_5bd5fa6ece_n.jpg,daisy
2,14350958832_29bdd3a254.jpg,daisy
3,8882282142_9be2524d38_m.jpg,daisy
4,34670512115_af22cce24d_n.jpg,daisy


In [7]:
#create a csv file containing the above data
#df.to_csv('flowers.csv')

In [8]:
#import the necessary modules for image processing
import cv2

In [14]:
#carry out preprocessing operations on a single image
img = cv2.imread(directory + folders[0] + '/' + image_list[0], 1)
print(img.shape)

#set dimensions for resizing
dim = (200, 200) #(width, height)

#resize image and grant it a new interpolation
resized = cv2.resize(img, dim, cv2.INTER_CUBIC)

#apply gaussian blur to perform first denoising operation
blur = cv2.GaussianBlur(resized, (15, 15), 0)            

#perform segmentation operation on the image by converting it into grayscale
#OpenCV reads colors in BGR rather than RGB format
#gray = cv2.cvtColor(blur, cv2.COLOR_BGR2GRAY)
#However, since color of flowers can become a major factor in prediction,
#we will not convert image to grayscale
            
#threshold the image, initially using only
#Inverse Binary Thresholding
ret, thresh = cv2.threshold(blur, 127, 255, cv2.THRESH_BINARY_INV)

#create a gaussian adaptive threshold with binary thresholding type
#gaus = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 115, 1)
#adaptive threshold cannot be applied either to the image 
#as image must be in grayscale as cv2.imread() function takes flag = 0
#for both grayscale and CV_8UC1

#Check otsu thresholding
#ret, otsu = cv2.threshold(blur, 125, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
#otsu threwsholding also requires grayscale images only

cv2.imshow('image', img)
cv2.imshow('resized', resized)
cv2.imshow('blur', blur)
cv2.imshow('threshold', thresh)
#cv2.imshow('gauss', gaus)
cv2.waitKey(0)
cv2.destroyAllWindows()

(240, 180, 3)


In [12]:
#define a common preprocessing function
def preprocess_images(folder):
    
    #process images in a specific folder wise order
    for i in os.listdir(directory + folder + '/'):
        if i.endsWith(filetype[2:]):
            
            #read in the image unchanged
            x = cv2.imread(i, -1)
            
            #set dimensions for resizing
            dim = (200, 200) #(width, height)
            
            #resize image and grant it a new interpolation
            resized_x = cv2.resize(x, dim, cv2.INTER_CUBIC)
            
            #apply gaussian blur to perform first denoising operation
            blur_x = cv2.GaussianBlur(resized_x, (15, 15), 0)
            
            #perform segmentation operation on the image by converting it into grayscale
            #OpenCV reads colors in BGR rather than RGB format
            gray_x = cv2.cvtColor(blur_x, cv2.COLOR_BGR2GRAY)
            
            #threshold the image, initially using only
            #Inverse Binary Thresholding
            ret, thresh = cv2.threshold(gray_x, 0, 255, cv2.THRESH_BINARY_INV)

In [13]:
image_data = []

#read in every image via a list comprehension
for folder in folders:
    image_data.append(preprocess_images(folder))
    
#flatten out the image data list
image_data = sum(image_data, [])

AttributeError: 'str' object has no attribute 'endsWith'