# Move images from source folders to training/validation/testing folders

This notebook prepares is designed to prepare a model with 3 classes for training. It moves images from their original folders as uploaded by the user to a training folder. The class labels are arranged alphabetically, so the Class 1 folder should house whichever class is first alphabetically. For the tomato plant use-case, Class 1 = Healthy, Class 2 = Mites, and Class 3 = Mold. The user must specify the class label to be attached to each file. The code below will attach HEALTHY to the filename:

random.choice(string.ascii_lowercase)+' '+'HEALTHY'+' '+f))


In [1]:
# calculate number of files in source
import os

##################################################
# specify source folder
source = r'E:\Data Prep\source\Plant Village - all classes\Orange___Haunglongbing_(Citrus_greening)'

##################################################

contents = []

for file in os.listdir(source):
    contents.append(file)

no_files = len(contents)

print('Number of files in source key: ',no_files)

Number of files in source key:  5507


In [2]:
# attach prefix and class name to files
import random, string

###########################################
# define category
category = 'Orange Citrus Greening'

###########################################

training = r'E:\\Data Prep\\training\\'+category
validation = r'E:\\Data Prep\\validation\\'+category
testing = r'E:\\Data Prep\\testing'
os.mkdir(training)
os.mkdir(validation)

for file in os.listdir(source):
    rando = random.choice(string.ascii_letters.lower())+random.choice(string.ascii_letters.lower())+random.choice(string.ascii_letters.lower())+random.choice(string.ascii_letters.lower())
    file_name = ' '+category+' '+file
    file_name = rando + file_name
    #print(file_name)
    os.rename(source+'\\'+file, source+'\\'+file_name)
    

print('Prefix attached to files!')

Prefix attached to files!


In [3]:
# determine how many for training, validation, testing for each class
print('Starting number of files: ',no_files)


no_train = int(no_files *.75)
no_valid = int(no_files *.15)
no_test = int(no_files *.10)


no_files -= no_train
no_files -= no_valid
no_files -= no_test

while no_files > 0:
    no_train += 1
    no_files -= 1
    if no_files == 0:
        break
    no_valid += 1
    no_files -= 1
    if no_files == 0:
        break
    no_test += 1
    no_files -= 1
    if no_files == 0:
        break
        
print('Training: ',no_train)
print('Validation: ',no_valid)
print('Testing: ',no_test)       
print('Number of files left: ',no_files)

Starting number of files:  5507
Training:  4131
Validation:  826
Testing:  550
Number of files left:  0


In [4]:
# copy files into training, validation, and testing folders
import random, string, shutil


for file in os.listdir(source):
    if no_train > 0:
        shutil.copy(source+'\\'+file,training+'\\'+file)
        no_train -= 1
    elif no_valid > 0:
        shutil.copy(source+'\\'+file,validation+'\\'+file)        
        no_valid -= 1
    elif no_test > 0:
        shutil.copy(source+'\\'+file,testing+'\\'+file)
        no_test -= 1
    #print(file)
    if no_test == 0:
            break


print('Images copied into training, validation, and testing folders:\n')
print(training+category)
print(validation+category)
print(testing)

Images copied into training, validation, and testing folders:

E:\Data Prep\training\Orange Citrus GreeningOrange Citrus Greening
E:\Data Prep\validation\Orange Citrus GreeningOrange Citrus Greening
E:\Data Prep\testing


# Convert to RecordIO format

Convert training and validation images to RecordIO format. Specify where you want the converted training and 
validation images will go with variable:

record_io_train = 'PlantVillage-data/training'
record_io_valid = 'PlantVillage-data/validation'

In [1]:
# pip install --user mxnet
# pip install opencv-python
import mxnet as mx
import cv2
a = mx.nd.ones((2, 3))
b = a * 2 + 1
b.asnumpy()

  from ._conv import register_converters as _register_converters


array([[3., 3., 3.],
       [3., 3., 3.]], dtype=float32)

In [9]:
# move images from s3 bucket to folders in notebook instance
import os, glob, shutil

io_convert = r'E:\Data Prep\incubator-mxnet-master\tools'

for f in glob.glob(training):
    shutil.move(f, io_convert+'\\Data')
for f in glob.glob(validation):
    shutil.move(f, io_convert+'\\Data_val')

print('Moved training and validation images from S3 bucket to:\n')
print('E:\Data Prep\incubator-mxnet-master\\tools\Data\\')
print('E:\Data Prep\incubator-mxnet-master\\tools\Data_val\\')

Moved training and validation images from S3 bucket to:

E:\Data Prep\incubator-mxnet-master\tools\Data\
E:\Data Prep\incubator-mxnet-master\tools\Data_val\


In [10]:
# convert to RecordIO format
import os

# change directory
os.chdir(r"E:")
os.chdir(r"\Data Prep\incubator-mxnet-master\tools")

# convert to recordIO
os.system('python im2rec.py Data_rec\Data_rec Data\ --recursive --list --num-thread 8')
os.system('python im2rec.py Data_val_rec\Data_val_rec Data_val\ --recursive --list --num-thread 8')
os.system('python im2rec.py Data_rec Data\ --recursive --pass-through --pack-label --no-shuffle --num-thread 8')
os.system('python im2rec.py Data_val_rec Data_val\ --recursive --pass-through --pack-label --no-shuffle --num-thread 8')


print('Converted to RecordIO format in:\n')
print(r'E:\Data Prep\incubator-mxnet-master\tools\Data_rec')
print('\nAND\n')
print(r'E:\Data Prep\incubator-mxnet-master\tools\Data_val_rec')

Converted to RecordIO format in:

E:\Data Prep\incubator-mxnet-master\tools\Data_rec

AND

E:\Data Prep\incubator-mxnet-master\tools\Data_val_rec


In [13]:
# move RecordIO files back to S3 bucket for training
import subprocess


###########################################################

# specify where .rec files will go for training and validation

record_io_train = 'data_prep/temp/training'
record_io_valid = 'data_prep/temp/validation'

# specify where testing data will go
testing_data = 'data_prep/temp/testing'

##########################################################

# training data
cmd = ['aws','s3','cp','E:/Data Prep/incubator-mxnet-master/tools/Data_rec/',
       's3://syscomanalytics/{}/'.format(record_io_train),'--recursive']
subprocess.call(cmd)

# validation data
cmd = ['aws','s3','cp','E:/Data Prep/incubator-mxnet-master/tools/Data_val_rec/',
       's3://syscomanalytics/{}/'.format(record_io_valid),'--recursive']
subprocess.call(cmd)

# testing data
cmd = ['aws','s3','cp','E:/Data Prep/testing/',
       's3://syscomanalytics/{}/'.format(testing_data),'--recursive']
subprocess.call(cmd)

print('Moved RecordIO files and test images to:\n')
print('syscomanalytics/{}'.format(record_io_train))
print('syscomanalytics/{}'.format(record_io_valid))
print('syscomanalytics/{}'.format(testing_data))

Moved RecordIO files and test images to:

syscomanalytics/data_prep/temp/training
syscomanalytics/data_prep/temp/validation
syscomanalytics/data_prep/temp/testing


# Clean-up

Run these cells to remove all images from S3 KEYS:


syscomanalytics/data_prep/source

syscomanalytics/data_prep/training

syscomanalytics/data_prep/validation

syscomanalytics/data_prep/testing



AND NOTEBOOK FOLDERS:



imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data

imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data_val

imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data_rec

imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data_val_rec

In [14]:
# clean-up by deleting files in Data Prep folders
import shutil, os

# change directory
os.chdir("E:")
os.chdir(r"\Data Prep")

# delete folders and their content
shutil.rmtree('source')
shutil.rmtree('training')
shutil.rmtree('validation')
shutil.rmtree('testing')

# recreate folders with same names
os.mkdir('source')
os.mkdir('training')
os.mkdir('validation')
os.mkdir('testing')


#print('All files in '+source_key+' deleted!')
print('All files in Data Prep\source deleted!')
print('All files in Data Prep\\training deleted!')
print('All files in Data Prep\\validation deleted!')
print('All files in Data Prep\\testing deleted!')

All files in Data Prep\source deleted!
All files in Data Prep\training deleted!
All files in Data Prep\validation deleted!
All files in Data Prep\testing deleted!


In [15]:
# delete all RecordIO files
import shutil, os

# change directory
os.chdir("E:")
os.chdir(r"\Data Prep\incubator-mxnet-master\tools")

# delete folders and their content
shutil.rmtree('Data\\')
shutil.rmtree('Data_val\\')
shutil.rmtree('Data_rec\\')
shutil.rmtree('Data_val_rec\\')

# recreate folders with same names
os.mkdir('Data')
os.mkdir('Data_rec')
os.mkdir('Data_val')
os.mkdir('Data_val_rec')


print('Deleted all files from the Data, Data_rec, Data_val, and Data_val_rec directories')

Deleted all files from the Data, Data_rec, Data_val, and Data_val_rec directories


# Crop from image

In [None]:
###################################
# crop from original
###################################

import sys
sys.path.append('/home/ec2-user/anaconda3/envs/JupyterSystemEnv/lib/python3.6/site-packages')
import numpy as np
import argparse
import imutils
import cv2

#https://www.tutorialkart.com/opencv/python/opencv-python-resize-image/
#http://tanbakuchi.com/posts/comparison-of-openv-interpolation-algorithms/

# load the image with imread()
imageSource = 'DSC00025.JPG'
img = cv2.imread(imageSource)



# **************** zoom/crop images ********************
cropfilename = 'crop_' + imageSource

#Cropped Image
scaleUpY = img.shape[0]
scaleUpX = img.shape[1]
beginY = int((scaleUpY/2) - (128))
beginX = int((scaleUpX/2) - (128))

print (scaleUpY,scaleUpX,beginY,beginX)
crop = img[beginY:beginY+256,beginX:beginX+256]
print(beginY+256,beginX+256)
cv2.imwrite(cropfilename,crop)


# Square an image

In [None]:
###################################
# square entire folder
###################################

import sys
sys.path.append('/home/ec2-user/anaconda3/envs/JupyterSystemEnv/lib/python3.6/site-packages')
import numpy as np
import argparse
import imutils
import cv2
import os, shutil


path = './'
dest = '/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/OpenCV_Data/Cropped/'


for imageSource in os.listdir(path):
    
    img = cv2.imread(imageSource)

    # **************** zoom/crop images ********************
    cropfilename = path + imageSource
    if cropfilename == './.ipynb_checkpoints':
        continue
    print(cropfilename)
    #Cropped Image
    scaleUpY = img.shape[0]
    scaleUpX = img.shape[1]
    diff = abs(scaleUpY-scaleUpX)
    offset = int(diff/2)

    if scaleUpY > scaleUpX:

        endY = int(scaleUpY-offset)
        crop = img[offset:endY,0:scaleUpX]
        print(offset,endY,0,scaleUpX)

    else:
        endX = int(scaleUpX-offset)
        crop = img[0:scaleUpY,offset:endX]
        print(0,scaleUpY,offset,endX)

    cv2.imwrite(cropfilename,crop)
    shutil.move(imageSource,dest)

# Resize an image (keep aspect ratio)

In [None]:
##########################
#Resize entire folder
##########################

import cv2, os, shutil

#############################################################################

width = 4000
width_prefix = ''
#############################################################################
path = './'
dest = '/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/OpenCV_Data/Resized/'

for f in os.listdir(path):
    print(f)
    if f == '.ipynb_checkpoints':
        continue
    image = cv2.imread("./{}".format(f))
    r = width / image.shape[1]
    dim = (width, int(image.shape[0] * r))
    resized = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
    cv2.imwrite('./{} - {}'.format(width_prefix,f),resized)
    shutil.move(f,dest)

In [None]:
# CLI commands:



# create .lst file for training data
# python im2rec.py ./Data_rec/Data_rec ./Data/ --recursive --list --num-thread 8

# create .lst file for validation data
# python im2rec.py ./Data_val_rec/Data_val_rec ./Data_val/ --recursive --list --num-thread 8

# create .idx and .rec files
# python im2rec.py ./Data_rec ./Data/ --recursive --pass-through --pack-label --no-shuffle --num-thread 8

# create .idx and .rec files
# python im2rec.py ./Data_val_rec ./Data_val/ --recursive --pass-through --pack-label --no-shuffle --num-thread 8



In [None]:
###################
# retired cells
###################




'''
# move images from s3 bucket to folders in notebook instance

import subprocess

cmd = ['aws','s3','cp','s3://syscomanalytics/data_prep/training/',
      '/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data/',
      '--recursive']

subprocess.call(cmd)

cmd = ['aws','s3','cp','s3://syscomanalytics/data_prep/validation/',
      '/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data_val/',
      '--recursive']

subprocess.call(cmd)

print('Moved training and validation images from S3 bucket to:\n')
print('/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data/')
print('/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data_val/')

'''

'''
# convert to RecordIO format

import subprocess, os


# training .lst file
cmd = ['python','/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/im2rec.py',
       '/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data_rec/Data_rec',
       '/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data/',
       '--recursive','--list','--num-thread','8']

subprocess.call(cmd)

# validation .lst file
cmd = ['python','/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/im2rec.py',
       '/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data_val_rec/Data_val_rec',
       '/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data_val/',
       '--recursive','--list','--num-thread','8']

subprocess.call(cmd)


# training .idx and .rec files
cmd = ['python','/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/im2rec.py',
       '/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data_rec',
       '/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data/',
       '--recursive','--pass-through','--pack-label','--no-shuffle','--num-thread','8']

subprocess.call(cmd)

# validation .idx and .rec files
cmd = ['python','/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/im2rec.py',
       '/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data_val_rec',
       '/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data_val/',
       '--recursive','--pass-through','--pack-label','--no-shuffle','--num-thread','8']

subprocess.call(cmd)

print('Converted to RecordIO format in:\n')
print('/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data_rec/')
print('\nAND\n')
print('/home/ec2-user/SageMaker/imageclassification_PlantVillage_Test1/incubator-mxnet-master/tools/Data_val_rec/')

'''