# Data Augmentation for Homegrown models
This notebook generates augmented data for use with the homegrown models.

In [None]:
import csv 
import time

# Config Params
Replace `{IMAGE PATH}` in the cell below with the root directory of the image dataset before running the notebook.

In [None]:
# Root directory of the image dataset: replace the {IMAGE PATH} placeholder
# here (without a trailing slash); every other path is derived from it.
_image_root = '{IMAGE PATH}'

# Each directory path ends with '/' because other cells append relative file
# names to it by plain string concatenation (e.g. image_basepath + filename).
image_basepath                 = _image_root + '/'
feature_basepath               = _image_root + '/resnet50_features_vectors/'
augmented_image_directory_name = _image_root + '/augmented/train/'
cropped_image_directory_name   = _image_root + '/cropped/'
csv_filename                   = _image_root + '/labels.csv'

In [None]:
# Wall-clock reference; other cells rebind stime before their own timed loops.
stime = time.time()

# Relative image paths and labels of each split, in CSV row order.
train_filenames = []
val_filenames   = []
test_filenames  = []
y_train = []
y_test  = []
y_val   = []

# The split name is the first path component of column 0. Rows whose first
# component is not one of these three keys are ignored.
_split_targets = {
    'train': (train_filenames, y_train),
    'val': (val_filenames, y_val),
    'test': (test_filenames, y_test),
}

# newline='' leaves line-ending handling to the csv module, as its
# documentation recommends for files passed to csv.reader.
with open(csv_filename, 'r', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    for line in reader:
        if not line:
            # A blank line yields an empty row; there is nothing to parse.
            continue

        filename = line[0]
        label = line[1]
        dataset = filename.split('/')[0]

        target = _split_targets.get(dataset)
        if target is not None:
            target[0].append(filename)
            target[1].append(label)


In [None]:
import csv
from PIL import Image
import numpy as np
import scipy.misc
import scipy.ndimage
from keras.preprocessing.image import array_to_img, img_to_array, load_img

# Side length (pixels) every image is resized to before stacking.
N = 224

# Per-split lookup: bare file name -> row index in the matching x_* array.
# NOTE: this name shadows the builtin id(); it is kept because it is a
# notebook-level variable that cells outside this one may rely on.
id = dict()
id['train'] = dict()
id['test'] = dict()
id['val'] = dict()


def _load_split(filenames, size):
    """Load one split as a list of (size, size, 3) uint8 RGB arrays.

    Each file's position is also recorded in ``id[<split>][<bare file name>]``,
    where <split> is the first component of the relative path.
    """
    images = []
    for position, relative_path in enumerate(filenames):
        # The context manager closes the file handle once the pixels are read.
        with Image.open(image_basepath + relative_path, 'r') as pil_im:
            # convert('RGB') guarantees three channels so every array stacks
            # to the same shape. Bilinear resampling matches the default of
            # scipy.misc.imresize, which was removed in SciPy 1.3.
            resized = pil_im.convert('RGB').resize((size, size), Image.BILINEAR)
        images.append(np.asarray(resized))
        parts = relative_path.split('/')
        id[parts[0]][parts[-1]] = position
    return images


x_train = np.array(_load_split(train_filenames, N))
x_val = np.array(_load_split(val_filenames, N))
x_test = np.array(_load_split(test_filenames, N))

print(x_train.shape)
print(x_test.shape)
print(x_val.shape)



In [None]:
from sklearn import preprocessing
from keras.utils import to_categorical
# Fit the encoder on the training labels only; le.transform is then applied
# to the validation and test labels with that same fitted encoder.
le = preprocessing.LabelEncoder()

le.fit(y_train)
print(list(le.classes_))

# Integer class ids per split.
y_train_num = np.array(le.transform(y_train))
y_test_num  = np.array(le.transform(y_test))
y_val_num   = np.array(le.transform(y_val))

# One-hot labels per split. num_classes is pinned to the number of fitted
# classes: without it to_categorical sizes the matrix from the largest id
# present, so a split lacking the last class would come out one column short.
_num_classes = len(le.classes_)
y_train_cat = np.array(to_categorical(y_train_num, num_classes=_num_classes))
y_test_cat  = np.array(to_categorical(y_test_num, num_classes=_num_classes))
y_val_cat   = np.array(to_categorical(y_val_num, num_classes=_num_classes))

print(y_train_cat.shape)
print(y_test_cat.shape)
print(y_val_cat.shape)

# make augmentations on uncropped data (training only)

In [None]:
from keras.preprocessing.image import ImageDataGenerator
import time

# Local import: this cell serialises its result with pickle, and no cell
# above it imports the module.
import pickle

# Random augmentations: rotation_range=15, 10 % width/height shifts and
# horizontal flips.
datagen = ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.10,
    height_shift_range=0.10,
    horizontal_flip=True)

datagen.fit(x_train)

# Number of augmentation passes over the training set.
# NOTE: this rebinds the notebook-level N that other cells use as image size.
N = 20
x_resnet50_augmented = []
y_resnet50_augmented = []

# NOTE(review): model_resnet50 is created in a later cell of this notebook;
# that cell has to be executed before this one.
stime = time.time()
for j in range(N):
    jtext = str(j)
    for i, filepath in enumerate(train_filenames):
        itext = str(i)

        # Bare file name without directory or extension, used in the prefix
        # of the augmented image written to disk.
        filename = filepath.split('/')[-1].split('.')[0]

        # Slicing keeps the leading batch axis the generator expects.
        img = x_train[i:i + 1]
        # -1 lets the label width follow the number of classes instead of
        # assuming exactly four.
        yval = y_train_cat[i, :].reshape(1, 1, -1)

        batches = datagen.flow(img, yval, batch_size=1, shuffle=False,
                               save_to_dir=augmented_image_directory_name+'full/',
                               save_prefix=jtext+"_"+itext+"_"+filename,
                               save_format='jpg')
        # Only the first augmented sample is needed from the generator.
        x_temp, y_temp = next(iter(batches))

        x_resnet50_temp = model_resnet50.predict(x_temp)
        x_resnet50_temp = x_resnet50_temp.reshape((2048,))
        x_resnet50_augmented.append(x_resnet50_temp)
        y_resnet50_augmented.append(y_train_cat[i, :])

    print(jtext+" of "+str(N)+" in "+str(time.time()-stime)+ " sec")

x_resnet50_augmented = np.array(x_resnet50_augmented)
y_resnet50_augmented = np.array(y_resnet50_augmented)

# The context manager flushes and closes the file even if dumping fails.
with open(feature_basepath + "resnet50_full_augmented.p", "wb") as _out:
    pickle.dump({'x_train': x_resnet50_augmented,
                 'classes': list(le.classes_),
                 'y_train_cat': y_resnet50_augmented
                 }, _out)

# make cropped versions of all images (train, test,val)

In [None]:
# store cropped images to:
#       ../cropped/train/
#       ../cropped/test/
#       ../cropped/val/

import csv
from PIL import Image
import numpy as np
import scipy.misc
import scipy.ndimage
from keras.preprocessing.image import array_to_img, img_to_array, load_img
import matplotlib.pyplot as plt

N  = 224  # side length of the stored images
cr = 75   # pixels trimmed from every border before resizing back to N x N

n = 10    # number of before/after previews shown for the training split


def _center_crop(image):
    """Trim cr pixels from each border of image and resize the rest to N x N.

    Assumes image is a uint8 (N, N, 3) array, as produced by the loading cell.
    Bilinear resampling matches the default of scipy.misc.imresize, which was
    removed in SciPy 1.3.
    """
    window = np.ascontiguousarray(image[cr:N - cr, cr:N - cr, :])
    return Image.fromarray(window).resize((N, N), Image.BILINEAR)


def _save_cropped(images, filenames):
    """Write the cropped version of every image under the cropped directory."""
    for i in range(images.shape[0]):
        if i % 500 == 0:
            print(i)
        _center_crop(images[i]).save(cropped_image_directory_name + filenames[i])


f, axarr = plt.subplots(2,n)
print(axarr.shape)
for i in range(x_train.shape[0]):
    if i % 500 == 0:
        print(i)
    filename = train_filenames[i]
    im = _center_crop(x_train[i])
    im.save(cropped_image_directory_name+filename)

    # Preview the first n images: original on the top row, crop below.
    # The bound is n (the number of subplot columns), not a separate literal.
    if i < n:
        axarr[0,i].imshow(np.asarray( x_train[i,:,:,:], dtype="uint8" ))
        axarr[0,i].get_xaxis().set_visible(False)
        axarr[0,i].get_yaxis().set_visible(False)

        axarr[1,i].imshow(np.asarray( im, dtype="uint8" ))
        axarr[1,i].get_xaxis().set_visible(False)
        axarr[1,i].get_yaxis().set_visible(False)
plt.show()

_save_cropped(x_test, test_filenames)
_save_cropped(x_val, val_filenames)

# make augmentations on cropped images (train only)

In [None]:
from keras.preprocessing.image import ImageDataGenerator
import time

# Random augmentations: rotation_range=15, 10 % width/height shifts and
# horizontal flips.
datagen = ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.10,
    height_shift_range=0.10,
    horizontal_flip=True)

datagen.fit(x_train)

N = 224
# Load every cropped training image, in train_filenames order.
x_train_cropped = []

for filename in train_filenames:
    # np.array copies the pixels, so the file handle can be closed right away
    # instead of staying open for the lifetime of the notebook.
    with Image.open(cropped_image_directory_name+filename,'r') as pil_im:
        x_train_cropped.append(np.array(pil_im))

x_train_cropped = np.array(x_train_cropped)
print(x_train_cropped.shape)
    
    
    
from keras.preprocessing.image import ImageDataGenerator
import time

# Local import: this cell serialises its result with pickle, and no cell
# above it imports the module.
import pickle

# Random augmentations: rotation_range=15, 10 % width/height shifts and
# horizontal flips.
datagen = ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.10,
    height_shift_range=0.10,
    horizontal_flip=True)

datagen.fit(x_train)

x_resnet50_augmented_cropped = []
y_resnet50_augmented_cropped = []

# Number of augmentation passes over the cropped training set.
# NOTE: this rebinds the notebook-level N that other cells use as image size.
N = 20
# NOTE(review): model_resnet50 is created in a later cell of this notebook;
# that cell has to be executed before this one.
stime = time.time()
for j in range(N):
    jtext = str(j)
    for i, filepath in enumerate(train_filenames):
        itext = str(i)

        # Bare file name without directory or extension, used in the prefix
        # of the augmented image written to disk.
        filename = filepath.split('/')[-1].split('.')[0]

        # Slicing keeps the leading batch axis the generator expects.
        img = x_train_cropped[i:i + 1]
        # -1 lets the label width follow the number of classes instead of
        # assuming exactly four.
        yval = y_train_cat[i, :].reshape(1, 1, -1)

        batches = datagen.flow(img, yval, batch_size=1, shuffle=False,
                               save_to_dir=augmented_image_directory_name+'/cropped/',
                               save_prefix=jtext+"_"+itext+"_"+filename,
                               save_format='jpg')
        # Only the first augmented sample is needed from the generator.
        x_temp, y_temp = next(iter(batches))

        x_resnet50_temp = model_resnet50.predict(x_temp)
        x_resnet50_temp = x_resnet50_temp.reshape((2048,))
        x_resnet50_augmented_cropped.append(x_resnet50_temp)
        y_resnet50_augmented_cropped.append(y_train_cat[i, :])

    print(jtext+" of "+str(N)+" in "+str(time.time()-stime)+ " sec")

x_resnet50_augmented_cropped = np.array(x_resnet50_augmented_cropped)
y_resnet50_augmented_cropped = np.array(y_resnet50_augmented_cropped)

# The context manager flushes and closes the file even if dumping fails.
with open(feature_basepath + "resnet50_full_augmented_cropped.p", "wb") as _out:
    pickle.dump({'x_train': x_resnet50_augmented_cropped,
                 'classes': list(le.classes_),
                 'y_train_cat': y_resnet50_augmented_cropped
                 }, _out)

# calculate and store resnet50 non_augmented features

In [None]:
# Load ResNet50 with pretrained ImageNet weights and without its top
# classification layers (include_top=False); the other cells call
# model_resnet50.predict(...) to turn images into feature vectors.
# NOTE(review): Model, Dense, Flatten and K are imported here but not
# referenced in the visible cells — confirm before removing.
from keras.models import Model
from keras.layers import Dense,Flatten
from keras.applications import resnet50
from keras import backend as K

model_resnet50 = resnet50.ResNet50(include_top=False, weights='imagenet')

In [None]:
# full (train, test, val)

# Run each uncropped split through the headless ResNet50.
x_train_resnet50 = model_resnet50.predict(x_train)
x_val_resnet50   = model_resnet50.predict(x_val)
x_test_resnet50  = model_resnet50.predict(x_test)

# Flatten each prediction to one 2048-value row per image; the reshape raises
# if the model output does not hold exactly 2048 values per image.
x_train_resnet50 = x_train_resnet50.reshape((x_train_resnet50.shape[0], 2048))
x_val_resnet50   = x_val_resnet50.reshape((x_val_resnet50.shape[0], 2048))
x_test_resnet50  = x_test_resnet50.reshape((x_test_resnet50.shape[0], 2048))

# dump resnet features to file
import pickle 

# Integer class ids per split.
y_train_num = np.array(le.transform(y_train))
y_test_num  = np.array(le.transform(y_test))
y_val_num   = np.array(le.transform(y_val))

# One-hot labels per split. num_classes is pinned to the number of fitted
# classes: without it to_categorical sizes the matrix from the largest id
# present, so a split lacking the last class would come out one column short.
_num_classes = len(le.classes_)
y_train_cat = np.array(to_categorical(y_train_num, num_classes=_num_classes))
y_test_cat  = np.array(to_categorical(y_test_num, num_classes=_num_classes))
y_val_cat   = np.array(to_categorical(y_val_num, num_classes=_num_classes))

print(y_train_cat.shape)
print(y_test_cat.shape)
print(y_val_cat.shape)

# The context manager flushes and closes the file even if dumping fails.
with open(feature_basepath + "resnet50_full.p", "wb") as _out:
    pickle.dump({'x_train': x_train_resnet50,
                 'x_val': x_val_resnet50,
                 'x_test': x_test_resnet50,
                 'classes': list(le.classes_),
                 'y_train_cat': y_train_cat,
                 'y_test_cat': y_test_cat,
                 'y_val_cat': y_val_cat,
                 'y_train_num': y_train_num,
                 'y_test_num': y_test_num,
                 'y_val_num': y_val_num,
                 'y_train': y_train,
                 'y_test': y_test,
                 'y_val': y_val
                 }, _out)

In [None]:
# cropped (train, test, val)

# note, order is the same as "full" dataset because both follow order in {train_filenames, val_filenames, test_filenames}

# Side length of the cropped images. The cropping cell writes them at
# 224 x 224, so the resize below is normally a no-op. It is a fixed constant
# rather than N because the augmentation cells rebind N to 20.
_cropped_size = 224


def _load_cropped(filenames):
    """Return the cropped images for filenames as one stacked array.

    Every file is read from cropped_image_directory_name, forced to RGB and to
    _cropped_size x _cropped_size, and appended in the order given.
    """
    images = []
    for relative_path in filenames:
        # The context manager closes the file handle once the pixels are read.
        with Image.open(cropped_image_directory_name + relative_path, 'r') as pil_im:
            resized = pil_im.convert('RGB').resize(
                (_cropped_size, _cropped_size), Image.BILINEAR)
        # Append the image that was just loaded. The previous code appended a
        # stale im3 for the training split, so every row was the same image.
        images.append(np.asarray(resized))
    return np.array(images)


# NOTE: these rebind x_train / x_val / x_test, replacing the uncropped arrays.
x_train = _load_cropped(train_filenames)
x_val   = _load_cropped(val_filenames)
x_test  = _load_cropped(test_filenames)

In [None]:
# Feature extraction for the cropped splits: first run all three through the
# headless ResNet50 ...
x_train_cropped_resnet50, x_val_cropped_resnet50, x_test_cropped_resnet50 = [
    model_resnet50.predict(images) for images in (x_train, x_val, x_test)
]

# ... then flatten each prediction to one 2048-value row per image.
x_train_cropped_resnet50, x_val_cropped_resnet50, x_test_cropped_resnet50 = [
    features.reshape((features.shape[0], 2048))
    for features in (x_train_cropped_resnet50,
                     x_val_cropped_resnet50,
                     x_test_cropped_resnet50)
]

In [None]:
# Persist the cropped-image features together with the labels in every
# encoding (one-hot, integer id, raw) and the class list.
# The context manager flushes and closes the file even if dumping fails.
with open(feature_basepath + "resnet50_cropped.p", "wb") as _out:
    pickle.dump({'x_train': x_train_cropped_resnet50,
                 'x_val': x_val_cropped_resnet50,
                 'x_test': x_test_cropped_resnet50,
                 'classes': list(le.classes_),
                 'y_train_cat': y_train_cat,
                 'y_test_cat': y_test_cat,
                 'y_val_cat': y_val_cat,
                 'y_train_num': y_train_num,
                 'y_test_num': y_test_num,
                 'y_val_num': y_val_num,
                 'y_train': y_train,
                 'y_test': y_test,
                 'y_val': y_val
                 }, _out)