# Tomato Dataset Bounding Box
[Github repository](https://github.com/Ersy/object_detection_with_reinforcement_learning)

# Image Augmentation
### Take in an image and its bounding box(es), flip the image horizontally, and determine the new bounding box location(s) based on this

In [None]:
# Image augmentation
# take in an image and the bounding box/es
# flip image horizontally
# determine new bounding box location based on this

# def flip_image(img, boundingbox):
# 	"""
# 	Takes an image and list of bounding boxes for the image 
# 	and flips everything horizontally
# 	returns the flipped image and boundingbox 
# 	(elements of the bb are changed inplace)
# 	"""
# 	flipped_image = np.fliplr(img)
# 	img_width = flipped_image.shape[1]
# 	for box_ix in range(len(boundingbox)):
# 		bb_topx = boundingbox[box_ix][0, 1]
# 		bb_bottomx = boundingbox[box_ix][1, 1]
# 		bb_width = bb_bottomx - bb_topx

# 		boundingbox[box_ix][0, 1] = img_width - bb_width - bb_topx	
# 		boundingbox[box_ix][1, 1] = img_width - bb_topx
# 	return flipped_image, boundingbox

# Image Actions
### Load the images; get the image names, their labels and their ground-truth bounding boxes; also preprocess images and view the results.


In [None]:
from PIL import Image, ImageFilter
from keras.preprocessing import image
import xml.etree.ElementTree as ET
import numpy as np
import cv2
from keras.applications.vgg16 import preprocess_input
import matplotlib.patches as patches
import matplotlib.pyplot as plt

In [None]:
### Reference values
# Maps each class label to the integer category code stored for it.
class_name_dict = {'tomato': 0}

In [None]:
def load_images(VOC_path, image_names):
    """
    Load every named JPEG of a VOC-style data set.

    VOC_path: root directory of the data set; files are read from
        VOC_path + '/JPEGImages/'.
    image_names: iterable of image names without the '.jpg' extension.

    Returns a list holding the result of keras' image.load_img for each
    name, in the same order as image_names.
    """
    # load_img already returns a colour image by default. The positional
    # `False` that used to be passed here bound to `grayscale` in old Keras
    # releases but lands on `color_mode` in releases without that parameter,
    # so it is deliberately omitted.
    return [
        image.load_img(VOC_path + '/JPEGImages/' + image_name + '.jpg')
        for image_name in image_names
    ]

In [None]:
def get_img_names(VOC_path, data_set_name):
    """
    Collect the image names listed in a data set index file.

    The file read is VOC_path + data_set_name + '.txt'; the first
    whitespace-separated token of every non-blank line is taken as the
    image name.

    NOTE(review): unlike get_img_labels, no '/ImageSets/Main/' is inserted
    between VOC_path and the file name - confirm this is intended.

    Returns the list of names in file order.
    """
    file_path = VOC_path + data_set_name + '.txt'
    # `with` guarantees the handle is closed even if reading fails
    with open(file_path) as f:
        split_lines = (line.split(None, 1) for line in f)
        # blank lines split into an empty list and are skipped instead of
        # raising IndexError
        return [parts[0] for parts in split_lines if parts]

In [None]:
def get_img_labels(VOC_path, data_set_name):
    """
    Collect the labels for the desired data set.

    Reads VOC_path + '/ImageSets/Main/' + data_set_name + '.txt' and, for
    every non-blank line, returns everything after the first
    whitespace-separated token (the image name) as that line's label.

    Returns the list of label strings in file order.
    Raises IndexError if a non-blank line carries a name but no label.
    """
    file_path = VOC_path + '/ImageSets/Main/' + data_set_name + '.txt'
    labels = []
    # `with` guarantees the handle is closed even if reading fails
    with open(file_path) as f:
        for line in f:
            line = line.strip('\n')
            if not line.strip():
                # tolerate blank lines (e.g. a trailing empty line)
                continue
            labels.append(line.split(None, 1)[1])
    return labels

In [None]:
# def get_bb_gt(VOC_path, image_name):
# 	"""
# 	get the ground truth bounding box values and class for an image
# 	"""
# 	file_path = VOC_path + '/Annotations/' + image_name + '.xml'
# 	tree = ET.parse(file_path)
# 	root = tree.getroot()
# 	names = []
# 	x_min = []
# 	x_max = []
# 	y_min = []
# 	y_max = []
# 	for child in root:
# 		if child.tag == 'object':
# 			for child2 in child:
# 				if child2.tag == 'name':
# 					names.append(child2.text)
# 				elif child2.tag == 'bndbox':
# 					for child3 in child2:
# 						if child3.tag == 'xmin':
# 							x_min.append(child3.text)
# 						elif child3.tag == 'xmax':
# 							x_max.append(child3.text)
# 						elif child3.tag == 'ymin':
# 							y_min.append(child3.text)
# 						elif child3.tag == 'ymax':
# 							y_max.append(child3.text)
# 	bb_list = []
# 	category = []
# 	for i in range(np.size(names)):
# 		category.append(class_name_dict[names[i]])
# 		bb_list.append(np.array([[y_min[i], x_min[i]],[y_max[i], x_max[i]]]))
# 	return np.array(category, dtype='uint16'), np.array(bb_list, dtype='uint16')

In [None]:
def view_image(t0):
    """
    Convert a mean-subtracted image back into a viewable PIL image and show it.

    t0: array indexed as [row, column, channel] with at least three channels.
        It is NOT modified; the offsets are applied to a copy, so calling
        this twice on the same array shows the same picture.

    The per-channel offsets 103 / 116 / 123 are added to channels 0 / 1 / 2,
    the result is clipped to 0-255 so the uint8 cast cannot wrap around, and
    the image is displayed with PIL's Image.show().

    NOTE(review): the offsets resemble the rounded VGG16 channel means; if
    the array is still in BGR order the displayed colours will be swapped -
    confirm against the caller.
    """
    restored = np.array(t0, dtype=np.float64)  # copy, never touch the caller's data
    for channel, offset in enumerate((103, 116, 123)):
        restored[:, :, channel] += offset
    restored = np.clip(restored, 0, 255)
    Image.fromarray(np.uint8(restored)).show()

In [None]:
def image_preprocessing(im):
    """
    Prepare a single image for VGG16.

    Steps, in order:
      1. reverse the order of the last (colour channel) axis
      2. resize to 224x224 and cast to float32
      3. prepend a batch axis
      4. apply keras' VGG16 preprocess_input

    Returns the preprocessed batch containing the one image.
    """
    # keep this in if the color channel order needs reversing
    channel_reversed = im[:, :, ::-1]
    resized = cv2.resize(channel_reversed, (224, 224)).astype(np.float32)
    single_image_batch = np.expand_dims(resized, axis=0)
    return preprocess_input(single_image_batch)

In [None]:
def view_results(im, groundtruth, proposals, all_IOU, ix):
    """
    Plot one image with its proposal and ground truth bounding boxes.

    im: collection of images; im[ix] is displayed
    groundtruth: per-image collections of ground truth boxes
    proposals: per-image collections of proposal boxes
    all_IOU: per-image IoU histories; the maximum of the last entry for
        image ix is shown in the legend
    ix: index of the image to display

    Every box is indexed as [[y_min, x_min], [y_max, x_max]]. Proposals are
    drawn in rainbow colours (one colour per proposal), ground truth boxes
    in white. The last proposal carries the 'Max IoU' legend entry.
    """
    im = im[ix]
    max_IOU = max(all_IOU[ix][-1])
    proposals = proposals[ix]

    _, ax = plt.subplots(1)
    ax.imshow(im)

    num_of_proposals = len(proposals)
    color = plt.cm.rainbow(np.linspace(0, 1, num_of_proposals))

    for proposal_ix, (proposal, c) in enumerate(zip(proposals, color)):
        top_left = (proposal[0, 1], proposal[0, 0])
        width = proposal[1, 1] - proposal[0, 1]
        height = proposal[1, 0] - proposal[0, 0]
        extra = {}
        if proposal_ix == num_of_proposals - 1:
            # Attach the legend label to the last proposal itself instead of
            # drawing it a second time after the loop; that second draw relied
            # on leftover loop variables and raised NameError when there were
            # no proposals at all.
            extra['label'] = 'Max IoU: ' + str(max_IOU)[:5]
        rect = patches.Rectangle(top_left, width, height, linewidth=2, edgecolor=c, facecolor='none', **extra) # change facecolor to add fill
        ax.add_patch(rect)

    for ground_truth_box in groundtruth[ix]:
        top_left = (ground_truth_box[0, 1], ground_truth_box[0, 0])
        width = ground_truth_box[1, 1] - ground_truth_box[0, 1]
        height = ground_truth_box[1, 0] - ground_truth_box[0, 0]
        rect = patches.Rectangle(top_left, width, height, linewidth=2, edgecolor='white', facecolor='none')
        ax.add_patch(rect)

    if num_of_proposals:
        # a legend only makes sense when a labelled proposal was drawn
        plt.legend()
    plt.show()

# Image Loader

In [None]:
def get_class_images(VOC_path, desired_class, img_name_list, img_list):
    """
    Keep only the images that contain at least one object of desired_class.

    VOC_path: data set root handed on to image_actions.get_bb_gt
    desired_class: class label, looked up in image_actions.class_name_dict
    img_name_list: image names, one per entry of img_list
    img_list: loaded images, index-aligned with img_name_list

    Returns (images, boxes, names) where boxes[i] is the list of ground
    truth boxes of the desired class found in images[i].

    NOTE(review): this relies on an `image_actions` module being in scope;
    its import is commented out in the visible cells - confirm before use.
    """
    # collect the code for desired object class
    class_code = image_actions.class_name_dict[desired_class]

    kept_images, kept_boxes, kept_names = [], [], []

    for position, img_name in enumerate(img_name_list):
        # annotation[0] is compared against the class code, annotation[1]
        # supplies the matching box at the same index
        annotation = image_actions.get_bb_gt(VOC_path, img_name)
        matching_boxes = [
            annotation[1][obj_ix]
            for obj_ix in range(len(annotation[0]))
            if annotation[0][obj_ix] == class_code
        ]

        # images without the desired object are dropped
        if not matching_boxes:
            continue

        kept_boxes.append(matching_boxes)
        kept_images.append(img_list[position])
        kept_names.append(img_name)

    return kept_images, kept_boxes, kept_names

# Action Functions

### Performing right, down, left and up actions and updating the bb information


In [None]:
# Q output index -> name of the action it selects
action_dict = dict(enumerate(('right', 'down', 'left', 'up')))

# fraction of the current box width/height a single action moves a corner by
update_step = 0.1

In [None]:
def TL_right(bb):
    """
    Move the top-left corner to the right.

    bb is indexed as [[y_min, x_min], [y_max, x_max]]. x_min grows by
    int(width * update_step); a new array is returned, bb is left untouched.
    """
    top, left = bb[0, 0], bb[0, 1]
    bottom, right = bb[1, 0], bb[1, 1]

    shift = int((right - left) * update_step)

    return np.array([[top, left + shift], [bottom, right]])

In [None]:
def TL_down(bb):
    """
    Move the top-left corner down.

    bb is indexed as [[y_min, x_min], [y_max, x_max]]. y_min grows by
    int(height * update_step); a new array is returned, bb is left untouched.
    """
    top, left = bb[0, 0], bb[0, 1]
    bottom, right = bb[1, 0], bb[1, 1]

    shift = int((bottom - top) * update_step)

    return np.array([[top + shift, left], [bottom, right]])

In [None]:
def BR_left(bb):
    """
    Move the bottom-right corner to the left.

    bb is indexed as [[y_min, x_min], [y_max, x_max]]. x_max shrinks by
    int(width * update_step); a new array is returned, bb is left untouched.
    """
    top, left = bb[0, 0], bb[0, 1]
    bottom, right = bb[1, 0], bb[1, 1]

    shift = int((right - left) * update_step)

    return np.array([[top, left], [bottom, right - shift]])

In [None]:
def BR_up(bb):
    """
    Move the bottom-right corner up.

    bb is indexed as [[y_min, x_min], [y_max, x_max]]. y_max shrinks by
    int(height * update_step); a new array is returned, bb is left untouched.
    """
    top, left = bb[0, 0], bb[0, 1]
    bottom, right = bb[1, 0], bb[1, 1]

    shift = int((bottom - top) * update_step)

    return np.array([[top, left], [bottom - shift, right]])

In [None]:
def crop_image(im, bb_in, region):
    """
    Apply one box-shrinking action and crop the image to the resulting box.

    im: raw image (numpy array indexed [row, column, channel])
    bb_in: current bounding box, [[y_min, x_min], [y_max, x_max]]
    region: key into action_dict (0: 'right', 1: 'down', 2: 'left', 3: 'up')
        selecting which corner move to apply

    Returns (cropped image, new bounding box).
    Raises KeyError when region is not a key of action_dict.
    """
    # action name -> function producing the updated box
    corner_moves = {
        'right': TL_right,
        'down': TL_down,
        'left': BR_left,
        'up': BR_up,
    }
    new_bb = corner_moves[action_dict[region]](bb_in)

    # crop image to new boundingbox extents
    row_span = slice(int(new_bb[0, 0]), int(new_bb[1, 0]))
    col_span = slice(int(new_bb[0, 1]), int(new_bb[1, 1]))
    return im[row_span, col_span, :], new_bb

# Gabor Maker

In [None]:
# from scipy.ndimage import zoom
# import numpy as np
# import random
# import cPickle as pickle

# from scipy.ndimage import zoom
# import numpy as np
# import random

In [None]:
# class gabor_gen():
    
#     def __init__(self, im_size):
#         self.im_size = im_size
        
#     def gen_image(self, num_of_gabors, gabor_size, lambda_, theta,sigma, phase, 
#                   noisy=False, beta=-2, random_scaling=False, odd_one_out=False, 
#                   overlap=False, random_angles=False, occluded=False):
#         """
#         Generates an image of a select size with a select number of gabors
#         """
#         im_len = (np.linspace(0, self.im_size, self.im_size+1))
#         x_mesh, y_mesh = np.meshgrid(im_len, im_len)
#         bb = []
        
#         if noisy:
#             # create spatial noise background and normalise
#             im = self.spatial_noise(beta)
#             im = (im - im.mean())/im.std()
#         else:
#             im = x_mesh*0 + y_mesh*0
        
        
#         # collection of all coordinates in grid
#         available_space = [(y,x) for y, x in zip(y_mesh.flatten(), x_mesh.flatten())]
#         # storage for collecting gabor locations
#         existing_gabor_loc = []
#         # storage for collecting gabor sizes
#         existing_gabor_size = []
        
#         # create gabor patches in the image
#         for gab in range(num_of_gabors):
            
#             # hack to make the last gabor angle perpendicular to the rest
#             if odd_one_out and gab == 1:
#                 theta = theta+90
            
#             # allow for random angle generation
#             if random_angles:
#                 theta = random.choice([0,45,90,135,180,225,270,315])
            
#             # flag for random scaling of patches for variability
#             scaling_factor = 1
#             if random_scaling:
#                 scaling_factor = random.randint(1,3)

            
#             # create gabor and normalise
#             gabor, gauss = self.gabor_patch(size=gabor_size, lambda_=lambda_,theta=theta,sigma=sigma, phase=phase)
#             gabor = zoom(gabor, scaling_factor)
#             gauss = zoom(gauss, scaling_factor)
#             gabor = (gabor - gabor.mean())/gabor.std()
    
    
            
    
    
#             # get the scaled gabor size
#             scaled_gabor_size = gabor_size*scaling_factor
            
#             available_space = [(y, x) for y, x in available_space if x < self.im_size-scaled_gabor_size and y < self.im_size-scaled_gabor_size]
#             # generate a random location to place the new gabor
#             #x, y = self.gen_random_location(im_len, scaled_gabor_size, existing_gabor_size, existing_gabor_loc, overlap)
#             if available_space:
#                 x, y = self.gen_random_location(available_space)
#                 x, y = int(x), int(y)
                
#                 if overlap == False:
#                     available_space = self.get_available_space(available_space, x, y, scaled_gabor_size, im_len)

                    
                
#                 x_min = x
#                 y_min = y
#                 x_max = x+scaled_gabor_size
#                 y_max = y+scaled_gabor_size
                
#                 if occluded:
#                     half_y = y+int(scaled_gabor_size/2)
#                     half_x = x+int(scaled_gabor_size/2)
                
#                     random_occlusion_x = random.randint(0,2)
#                     if random_occlusion_x == 0:
#                         x_min = half_x
#                     elif random_occlusion_x == 1:
#                         x_max = half_x

#                     # trick to prevent full patches from being created
#                     y_occ = (1 if random_occlusion_x == 2 else 2)

#                     random_occlusion_y = random.randint(0,y_occ)
#                     if random_occlusion_y == 0:
#                         y_min = half_y
#                     elif random_occlusion_y == 1:
#                         y_max = half_y                    
                    
                
#                     im[y_min:y_max,x_min:x_max] = im[y_min:y_max,x_min:x_max]*(1-gauss[0+y_min-y:y_max-y, 0+x_min-x:x_max-x])
#                     im[y_min:y_max,x_min:x_max] = im[y_min:y_max,x_min:x_max]+gabor[0+y_min-y:y_max-y, 0+x_min-x:x_max-x]
                
#                 else:
#                     # reduce noise in the gabor region by 1-gaussian then add gabor patch
#                     im[y:y+scaled_gabor_size,x:x+scaled_gabor_size] = im[y:y+scaled_gabor_size,x:x+scaled_gabor_size]*(1-gauss)
#                     im[y:y+scaled_gabor_size,x:x+scaled_gabor_size] = im[y:y+scaled_gabor_size,x:x+scaled_gabor_size]+gabor



#                 if occluded:
#                     bb.append(np.array([[y_min, x_min],[y_max, x_max]]))
#                 else:
#                     bb.append(np.array([[y, x],[y+scaled_gabor_size, x+scaled_gabor_size]]))
#             else:
#                 print("No more space available after "+ str(gab) + " patches")
#                 break
        
#         if odd_one_out:
#             bb = [bb[0]]
        
        
#         # 0-255 mapping
#         im = self._convert_to_im(im)
            
#         return im, bb
    
#     def _convert_to_im(self, im):
#         """
#         converts image array values from original range to 0-255
#         """
#         input_min = im.min()
#         input_max = im.max()
#         output_min = 0
#         output_max = 255
        
#         input_range = input_max - input_min
#         output_range = output_max - output_min

#         new_im = ((im - input_min) * output_range / input_range) + output_min
#         new_im = np.uint8(np.ceil(new_im))
#         new_im = self.to_rgb1a(new_im)
        
#         return new_im

#     def to_rgb1a(self, im):
#         """
#         converts image from single channel to 3 channels
#         code from: http://www.socouldanyone.com/2013/03/converting-grayscale-to-rgb-with-numpy.html (Matt Murfitt, 2013)
#         """
#         w, h = im.shape
#         ret = np.empty((w, h, 3), dtype=np.uint8)
#         ret[:, :, 2] =  ret[:, :, 1] =  ret[:, :, 0] =  im
#         return ret    
    
#     def gen_random_location(self, available_space):
#         """
#         Selects a random location within the bounds of the image
#         """
#         y, x = random.choice(available_space)
        
#         return x, y
    
    
#     def get_available_space(self, available_space, x, y, scaled_gabor_size, im_len):
#         """
#         update the available space list to remove the 
#         """
        
        
#         available_space = [(a,b) for a,b in available_space if ((a+scaled_gabor_size<y or b+scaled_gabor_size<x)
#                                                                 or 
#                                                                 (a>y+scaled_gabor_size or b>x+scaled_gabor_size))]
        
        
#         # get current available space to account for current gabor size hitting the edge
#         current_x = [x for x in im_len][:-scaled_gabor_size]
#         current_y = [y for y in im_len][:-scaled_gabor_size]
#         current_grid = np.meshgrid(current_y, current_x)
#         current_available_space = [(y,x) for y, x in zip(current_grid[0].flatten(), current_grid[1].flatten())]
        
#         available_space = list(set(available_space).intersection(current_available_space))
        
#         return available_space


#     def spatial_noise(self, beta):
#         """
#         generates a noisy background with a given power spectrum
#         adapted from http://uk.mathworks.com/matlabcentral/fileexchange/5091-generate-spatial-data (Jon Yearsley, 2016)
#         """
#         DIM = [self.im_size,self.im_size]
#         BETA = beta

#         u1 = np.array(range(0,int(DIM[0]/2)+1, 1))
#         u2 = -np.array(range(int(np.ceil(DIM[0]/2))-1, 0, -1))
#         u = (np.hstack((u1, u2))/DIM[0])
#         u = np.tile(u, (DIM[1],1)).T


#         v1 = np.array(range(0,int(DIM[1]/2.0)+1, 1))
#         v2 = -np.array(range(int(np.ceil(DIM[1]/2.0))-1, 0, -1))
#         v = (np.hstack((v1, v2))/DIM[1])
#         v = np.tile(v, (DIM[0],1))

#         Spatial_freq = np.power(np.power(u, 2) + np.power(v, 2), (BETA/2.0))

#         Spatial_freq[Spatial_freq == np.inf] =0

#         phi = np.random.rand(DIM[0], DIM[1])

#         a = np.power(Spatial_freq, 0.5)
#         b = (np.cos(2*np.pi*phi))+(1j*np.sin(2*np.pi*phi))

#         x = np.fft.ifft2(a*b)
#         im = np.real(x)
#         return im
        
        
#     def gabor_patch(self, size, lambda_, theta, sigma, phase, trim=.005):
#         """
#         Create a Gabor Patch
#         size : int
#             Image size (n x n)
#         lambda_ : int
#             Spatial frequency (px per cycle)
#         theta : int or float
#             Grating orientation in degrees
#         sigma : int or float
#             gaussian standard deviation (in pixels)
#         phase : float
#             0 to 1 inclusive
#         """
#         # make linear ramp
#         X0 = (np.linspace(1, size, size) / size) - .5

#         # Set wavelength and phase
#         freq = size / float(lambda_)
#         phaseRad = phase * 2 * np.pi

#         # Make 2D grating
#         Xm, Ym = np.meshgrid(X0, X0)

#         # Change orientation by adding Xm and Ym together in different proportions
#         thetaRad = (theta / 360.) * 2 * np.pi
#         Xt = Xm * np.cos(thetaRad)
#         Yt = Ym * np.sin(thetaRad)
#         grating = np.sin(((Xt + Yt) * freq * 2 * np.pi) + phaseRad)

#         # 2D Gaussian distribution
#         gauss =  np.exp(-((Xm ** 2) + (Ym ** 2)) / (2 * (sigma / float(size)) ** 2))
        
#         # Trim
#         cropped_gauss = gauss[gauss < trim] = 0

#         return grating * gauss, gauss

In [None]:
# import matplotlib.patches as patches
# import matplotlib.pyplot as plt

# gabor_size=30
# sigma=5
# num_of_pics = 5
# num_of_gabors = 1
# im_size = 224
# beta = -2
# noisy=True
# phase=0
# lambda_ = 6
# theta=0
# random_scaling = False
# odd_one_out = False
# overlap=False
# random_angles = False
# occluded = True

In [None]:
# def generate_x_images(num_of_pics, im_size, num_of_gabors, gabor_size, lambda_, theta, phase, sigma, 
#                       noisy, random_scaling, odd_one_out, overlap, random_angles, occluded):
#     """
#     Generates multiple images with the same gabor settings
#     """
#     image_container = []
#     bb_container = []
#     gabor_instance = gabor_gen(im_size=im_size)
#     for i in range(num_of_pics):
#         image, bb = gabor_instance.gen_image(num_of_gabors=num_of_gabors, 
#                                              gabor_size=gabor_size,
#                                              lambda_ = lambda_,
#                                              theta = theta,
#                                              phase=phase,
#                                              sigma=sigma, 
#                                              beta=beta, 
#                                              noisy=noisy,
#                                              random_scaling=random_scaling,
#                                              odd_one_out=odd_one_out,
#                                              overlap=overlap,
#                                              random_angles=random_angles,
#                                              occluded=occluded)
#         image_container.append(image)
#         bb_container.append(bb)
#     return image_container, bb_container

In [None]:
# train_images, train_bbs = generate_x_images(num_of_pics, im_size, num_of_gabors, gabor_size, lambda_, theta, phase, sigma, 
#                                             noisy, random_scaling, odd_one_out, overlap, random_angles, occluded)

# Reinforcement Helper

In [None]:
from keras.models import Sequential # part to build the mode
from keras.layers.core import Dense, Dropout, Activation, Flatten # types of layers and associated functions
from keras.optimizers import RMSprop, SGD, Nadam, Adam #optimising method (cost function and update method)
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
# from keras.initializers import normal, identity
from keras.initializers import RandomNormal, Identity

import numpy as np

In [None]:
# Visual descriptor size
# (number of elements the VGG output is reshaped to in get_state_as_vec)
visual_descriptor_size = 25088
# Different actions that the agent can do
# NOTE(review): action_dict only names indices 0-3; the fifth action is
# presumably a terminal/trigger action - confirm against the training loop
number_of_actions = 5

# Number of actions in the past to retain
# (get_state_as_vec reshapes the history to number_of_actions*past_action_val rows)
past_action_val = 8

# NOTE(review): presumably the reward magnitude for a non-terminal move - confirm
movement_reward = 1


# NOTE(review): presumably the terminal rewards paired with the IoU thresholds
# of the same suffix below - confirm against the reward code
terminal_reward_5 = 3
terminal_reward_7 = 5
terminal_reward_9 = 7

# NOTE(review): iou_threshold_5 has the same value as iou_threshold_7 (0.7);
# judging by the _5/_7/_9 naming, 0.5 may have been intended - confirm
iou_threshold_5 = 0.7
iou_threshold_7 = 0.7
iou_threshold_9 = 0.9

In [None]:
def conv_net_out(image, model_vgg):
    """Run image through model_vgg.predict and return the raw prediction."""
    prediction = model_vgg.predict(image)
    return prediction

In [None]:
### get the state by vgg_conv output, vectored, and stack on action history
def get_state_as_vec(image, history_vector, model_vgg):
    """
    Build the state row vector for one image.

    The output of conv_net_out(image, model_vgg) is reshaped to a
    (visual_descriptor_size, 1) column, history_vector to a
    (number_of_actions*past_action_val, 1) column; the two are stacked
    vertically and transposed, giving a single row.
    """
    visual_column = np.reshape(
        conv_net_out(image, model_vgg), (visual_descriptor_size, 1)
    )
    history_column = np.reshape(
        history_vector, (number_of_actions * past_action_val, 1)
    )
    return np.vstack((visual_column, history_column)).T

In [None]:
def get_q_network(shape_of_input, number_of_actions, weights_path='0'):
    """
    Build and compile the Q network.

    Architecture: two blocks of Dense(1024) -> ReLU -> Dropout(0.2), followed
    by Dense(number_of_actions) with a linear activation. All Dense layers
    use bias and the 'lecun_uniform' kernel initializer. The model is
    compiled with MSE loss and Adam at a learning rate of 1e-6.

    shape_of_input: input shape given to the first Dense layer
    number_of_actions: size of the output layer
    weights_path: weights file to load; the string "0" means "do not load"

    Returns the compiled (and optionally weight-loaded) model.

    NOTE(review): `lr` is the older spelling of Adam's learning-rate keyword;
    newer Keras releases expect `learning_rate` - confirm against the
    installed version.
    """
    q_net = Sequential()

    # two identical hidden blocks; only the first one declares the input shape
    for block_ix in range(2):
        dense_options = {'use_bias': True, 'kernel_initializer': 'lecun_uniform'}
        if block_ix == 0:
            dense_options['input_shape'] = shape_of_input
        q_net.add(Dense(1024, **dense_options))
        q_net.add(Activation('relu'))
        q_net.add(Dropout(0.2))

    # linear output: one Q value per action
    q_net.add(Dense(number_of_actions, use_bias=True, kernel_initializer='lecun_uniform'))
    q_net.add(Activation('linear'))

    optimiser = Adam(lr=1e-6)
    #nadam = Nadam()
    q_net.compile(loss='mse', optimizer=optimiser)

    if weights_path != "0":
        q_net.load_weights(weights_path)
    return q_net

In [None]:
def IOU(bb, bb_gt):
    """
    Calculate the intersection-over-union of two bounding boxes.

    bb, bb_gt: boxes indexed as [[y_min, x_min], [y_max, x_max]], using
        inclusive pixel coordinates (extents are computed as max - min + 1).

    Returns a float in [0, 1]; 0.0 when the boxes do not overlap or the
    result would be invalid.
    """
    # Work in float64: the ground truth boxes are stored as uint16, where a
    # subtraction of non-overlapping coordinates wraps around and the area
    # product can overflow 65535.
    bb = np.asarray(bb, dtype=np.float64)
    bb_gt = np.asarray(bb_gt, dtype=np.float64)

    # corners of the intersection rectangle
    x1 = max(bb[0, 1], bb_gt[0, 1])
    y1 = max(bb[0, 0], bb_gt[0, 0])
    x2 = min(bb[1, 1], bb_gt[1, 1])
    y2 = min(bb[1, 0], bb_gt[1, 0])

    w = x2 - x1 + 1
    h = y2 - y1 + 1

    # no intersection as soon as EITHER extent is empty
    if w <= 0 or h <= 0:
        return 0.0

    inter = w * h

    aarea = (bb[1, 1] - bb[0, 1] + 1) * (bb[1, 0] - bb[0, 0] + 1)
    barea = (bb_gt[1, 1] - bb_gt[0, 1] + 1) * (bb_gt[1, 0] - bb_gt[0, 0] + 1)

    union = aarea + barea - inter
    if union <= 0:
        # only reachable with malformed (inverted) boxes
        return 0.0

    # intersection over union overlap
    iou = inter / union
    # guard against malformed boxes producing an out-of-range value
    if iou < 0 or iou > 1:
        return 0.0
    return float(iou)

# Main Loop


In [None]:
import numpy as np
import argparse
import csv
import time
import random
# import cPickle as pickle
import pickle

from keras.applications import imagenet_utils
from keras.applications.vgg16 import preprocess_input, VGG16
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras import backend as K
# K.set_image_dim_ordering('tf')
# import keras
# keras.backend.image_data_format() == 'channels_last'
K.set_image_data_format('channels_last')

# Local helpers
# import image_actions
# import reinforcement_helper
# import action_functions
# import image_loader
# import image_augmentation

In [None]:
# Google Colab only: mount Google Drive at /gdrive; the '/gdrive/MyDrive/...'
# paths used by later cells depend on this mount point.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
# Flag to use either VOC dataset or patch dataset stored as pickle
# VOC = True

# Paths
# project_root = '/media/ersy/Other/Google Drive/QM Work/Queen Mary/Course/Final Project/'
# VOC2007_path = project_root+ 'Reinforcement learning/VOCdevkit/VOC2007'
# VOC2012_path = project_root+ 'Reinforcement learning/VOCdevkit/VOC2012'

In [None]:
# if VOC == True:
# 	# desired_class_set = 'aeroplane_trainval'
# 	# desired_class = 'person'
#     desired_class_set = 'tomato'
#     desired_class = 'tomato'

# 	### loading up VOC2007 images of a given class
# 	img_name_list_2007 = image_actions.get_img_names(VOC2007_path, desired_class_set)
# 	img_list_2007 = image_actions.load_images(VOC2007_path, img_name_list_2007) 
# 	img_list_2007, groundtruths_2007, img_name_list_2007 = image_loader.get_class_images(VOC2007_path, desired_class, img_name_list_2007, img_list_2007)

# 	desired_class_set = 'person_train'

# 	### loading up VOC2012 images of a given class
# 	img_name_list_2012 = image_actions.get_img_names(VOC2012_path, desired_class_set)
# 	img_list_2012 = image_actions.load_images(VOC2012_path, img_name_list_2012) 
# 	img_list_2012, groundtruths_2012, img_name_list_2012 = image_loader.get_class_images(VOC2012_path, desired_class, img_name_list_2012, img_list_2012)

# 	### combine 2007 and 2012 datasets
# 	img_list = img_list_2007+img_list_2012
# 	groundtruths = groundtruths_2007+groundtruths_2012
# 	img_name_list = img_name_list_2007+img_name_list_2012

# else:
# 	patches_file = 'Experiment_2_Train_images.pickle'
# 	patches_bb_file = 'Experiment_2_Train_boxes.pickle'
# 	img_list = pickle.load(open(project_root+'project_code/pickled_data/'+patches_file, 'rb'))
# 	groundtruths = pickle.load(open(project_root+'project_code/pickled_data/'+patches_bb_file, 'rb'))

# Custom Data input from drive

In [None]:
import os
import glob
import os.path
import sys
import xml.etree.ElementTree as ET

In [None]:
# Root of the tomato data set on the mounted drive
data_dir = '/gdrive/MyDrive/AOBD/Pascal_tomato'

# one sub-directory per split
train_dir, val_dir, test_dir = (
    os.path.join(data_dir, split_name) for split_name in ('train', 'valid', 'test')
)

In [None]:
# Annotation (.xml) and image (.jpg) paths of every split, sorted so the two
# lists of a split are in the same lexicographic order.
train_images = sorted(glob.glob(f"{train_dir}/*.jpg"))
train_xml = sorted(glob.glob(f"{train_dir}/*.xml"))

val_images = sorted(glob.glob(f"{val_dir}/*.jpg"))
val_xml = sorted(glob.glob(f"{val_dir}/*.xml"))

test_images = sorted(glob.glob(f"{test_dir}/*.jpg"))
test_xml = sorted(glob.glob(f"{test_dir}/*.xml"))

In [None]:
# bare image names: drop the 4-character '.jpg' suffix and everything up to the last '/'
img_name_list = [str(img_path)[:-4].split('/')[-1] for img_path in train_images]
#img_list = [image.load_img(i) for i in train_images]

In [None]:
# false = 0
# assert len(train_xml) == len(train_images)
# for i in range(len(train_xml)):
#     if sorted(train_images)[i][:-4] != sorted(train_xml)[i][:-4]:
#         false += 1
# print(false / len(train_xml))

In [None]:
def get_bb_gt2(xml_path):
    """
    Read the ground truth bounding boxes from a Pascal-VOC style XML file.

    Every <object> that is a direct child of the root and has a complete
    <bndbox> (xmin, ymin, xmax, ymax) contributes one box; objects without a
    complete box are skipped so categories and boxes always stay aligned.
    All objects are given category code 0.

    xml_path: path of the annotation file

    Returns (categories, boxes):
        categories: uint16 array with one 0 per box
        boxes: uint16 array of boxes, each [[y_min, x_min], [y_max, x_max]]
    """
    root = ET.parse(xml_path).getroot()

    bb_list = []
    for obj in root.findall('object'):
        bndbox = obj.find('bndbox')
        if bndbox is None:
            continue
        raw = {tag: bndbox.findtext(tag) for tag in ('xmin', 'ymin', 'xmax', 'ymax')}
        if any(value is None for value in raw.values()):
            continue
        # parse to numbers right away (via float so "12.0" is accepted too)
        # instead of relying on a string -> uint16 cast at the very end
        x_min, y_min, x_max, y_max = (
            int(float(raw[tag])) for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        )
        bb_list.append(np.array([[y_min, x_min], [y_max, x_max]]))

    # one category code (0) per collected box
    category = [0] * len(bb_list)

    return np.array(category, dtype='uint16'), np.array(bb_list, dtype='uint16')

In [None]:
def get_groundtruths(groundtruths, img_name_list, img_list):
    """
    Collect, per image, the ground truth boxes whose category code is 0.

    groundtruths: one entry per image; entry[0] holds the category codes and
        entry[1] the boxes at the same indices
    img_name_list, img_list: accepted but not used by this function

    Returns a list with one list of boxes for every image that has at least
    one category-0 object; images without such an object are left out.
    """
    boxes_per_image = []

    for entry in groundtruths:
        # boxes of this image that belong to category 0
        wanted = [
            entry[1][obj_ix]
            for obj_ix, code in enumerate(entry[0])
            if code == 0
        ]
        # only images that contain the desired object are kept
        if wanted:
            boxes_per_image.append(wanted)

    return boxes_per_image

In [None]:
#groundtruths = []

#for img_path in train_xml:
    #groundtruths.append(get_bb_gt2(img_path))

In [None]:
#groundtruths2 = get_groundtruths(groundtruths, img_name_list, img_list)

# Main Loop continues

In [None]:
# Drive folder that holds this project's pickled data and network weight files
project_root = '/gdrive/MyDrive/AOBD/Project#1'

In [None]:
# data.pkl is unpacked as a 2-item sequence: the image list and the per-image
# ground truth boxes.
# NOTE: pickle.load can execute arbitrary code from the file - only load a
# data.pkl that you created yourself or otherwise trust.
with open(os.path.join(project_root,'data.pkl'), 'rb') as fh:
    img_list, groundtruths2 = pickle.load(fh)

In [None]:
# Constants
number_of_actions = 5
history_length = 8
# 25128 = visual_descriptor_size (25088) + number_of_actions * history_length (5 * 8)
Q_net_input_size = (25128, )
visual_descriptor_size = 25088


# Models
### VGG16 model without top
vgg16_conv = VGG16(include_top=False, weights='imagenet')

# initialise Q network (randomly or with existing weights) 
#loaded_weights_name = 'combi_aeroplane_180717_02_appr_forcedIOU06_augoff.hdf5'
#loaded_weights = project_root+'project_code/network_weights/'+loaded_weights_name
# loaded_weights = '0'
# NOTE: get_q_network only skips loading when weights_path is the string "0";
# with the path below it always calls load_weights, so q_weights.hdf5 has to
# exist in project_root. Use loaded_weights = '0' to start from random weights.
loaded_weights_fname = 'q_weights.hdf5'
loaded_weights = os.path.join(project_root, loaded_weights_fname)

Q_net = get_q_network(shape_of_input=Q_net_input_size, number_of_actions=number_of_actions, weights_path=loaded_weights)

# Validation callback
saved_weights = 'saved_weights.hdf5'
filepath= os.path.join(project_root, saved_weights)
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# NOTE(review): `epsilon` is the older name of ReduceLROnPlateau's `min_delta`
# argument - confirm it is still accepted by the installed Keras version
Plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=20, verbose=1, mode='min', epsilon=0.0001, cooldown=0, min_lr=0)

# left empty on purpose: neither checkpoint nor Plateau is registered in this cell
callbacks_list = []#[checkpoint]

In [None]:
# Training Parameters
episodes = 1
epsilon = 1.1 # exploration rate; the training loop subtracts epsilon_decay per episode while it is above 0.11
epsilon_decay = 0.1
gamma = 0.9
T = 50
force_terminal = 0.6 # IoU to force terminal action
training_epochs = 30
guided_learning = True # Flag for guided learning on exploration
augmented = False
logging = False

# example/batch size handling (controls for RAM VRAM constraints)
conv_predict_batch_size = 10 # Decrease value if low on VRAM
Q_predict_batch_size = 100
Q_train_batch_size = 100



# Number of chunks the image list is processed in (about 128 images each).
# Clamped to at least 1: with fewer than 128 images the plain quotient is 0,
# which made the chunk_size division below raise ZeroDivisionError.
chunk_factor = max(1, len(img_list) // 128) #10 # Increase value if low on RAM
# images per chunk; any remainder images at the end of img_list are not visited
chunk_size = len(img_list) // chunk_factor

In [None]:
from numpy import matlib

# Metric collection during training process
action_counts = []
avg_reward = []
std_reward = []


episode_time = time.time()

for episode in range(episodes):
    print("Episode:", episode)

    print("Time taken = ", time.time() - episode_time)
    episode_time = time.time()

    # change the exploration-exploitation tradeoff as the episode count increases
    if epsilon > 0.11:
        epsilon = epsilon - epsilon_decay

    # initialise collections for per episode metrics
    action_count = [0] * number_of_actions
    episode_rewards = []

    for chunk in range(chunk_factor):
        # list to store experiences, a new one for each chunk
        experiences = []

        # determines the offset to use when iterating through the chunk
        chunk_offset = chunk*chunk_size

        # iteration through all images in the current chunk
        for image_ix in range(chunk_offset, chunk_offset + chunk_size):
            print("Image:", image_ix)

            # get initial parameters for each image
            original_image = np.array(img_list[image_ix])
            # deliberately not called `image`: that name is the keras.preprocessing module
            # imported at the top of the notebook and used by load_images
            cropped_image = np.array(img_list[image_ix])
            image_dimensions = cropped_image.shape[:-1]

            # collect bounding boxes for each image
            ground_image_bb_gt = groundtruths2[image_ix]

            # NOTE: horizontal-flip augmentation (the `augmented` flag) is currently not applied here

            # initial bounding box (whole image, raw size)
            boundingbox = np.array([[0,0],image_dimensions])

            # list to store IOU for each object in the image and current bounding box,
            # starting with the IOU of the initial whole-image box
            IOU_list = []
            image_IOU = [IOU(ground_truth, boundingbox) for ground_truth in ground_image_bb_gt]
            IOU_list.append(image_IOU)

            # create the history vector
            history_vec = np.zeros((number_of_actions, history_length))

            # preprocess the image
            preprocessed_image = image_preprocessing(original_image)

            # initialise experience subcontainer for each image
            image_experiences = []
            experiences.append(image_experiences)

            # collecting the preprocessed images in a separate list, the history, and an index of states already calculated
            preprocessed_list = []
            history_list = []
            exploitation_index = []
            exploitation_states = []
            image_rewards = []

            for t in range(T):
                # collect the preprocessed image and the flattened action history of the current state
                preprocessed_list.append(preprocessed_image)
                flat_history = np.array(np.reshape(history_vec, (number_of_actions*history_length)))
                history_list.append(flat_history)

                # experience layout: [history, action, reward, next history, state, next state]
                experience = [flat_history.copy()]
                image_experiences.append(experience)

                # exploration or exploitation
                if random.uniform(0,1) < epsilon:

                    # limit exploration actions to only positive actions
                    if guided_learning:

                        # collect the movement actions that do not lower the best IoU
                        good_actions = []
                        for act in range(number_of_actions-1):
                            _, potential_boundingbox = crop_image(original_image, boundingbox, act)
                            potential_image_IOU = [IOU(ground_truth, potential_boundingbox) for ground_truth in ground_image_bb_gt]

                            # store only positive actions
                            if max(potential_image_IOU) >= max(image_IOU):
                                good_actions.append(act)

                        # make a selection out of the positive actions if possible
                        # (the terminal action is always offered alongside them)
                        if len(good_actions) > 0:
                            good_actions.append(number_of_actions-1)
                            action = random.choice(good_actions)
                        else:
                            action = random.randint(0, number_of_actions-1)

                    else:
                        action = random.randint(0, number_of_actions-1)

                # if the IOU is greater than force_terminal force the action to be the terminal action
                # this is done to help speed up the training process
                elif max(image_IOU) > force_terminal:
                    action = number_of_actions-1

                # Exploitation
                else:
                    state_vec = get_state_as_vec(preprocessed_image, history_vec, vgg16_conv)
                    Q_vals = Q_net.predict(state_vec)
                    action = np.argmax(Q_vals)

                    # remember states already computed so they are not pushed through VGG16 again below
                    exploitation_states.append(state_vec)
                    exploitation_index.append(t)

                # if in training the termination action is used no need to get the subcrop again
                if action != number_of_actions-1:
                    cropped_image, boundingbox = crop_image(original_image, boundingbox, action)

                # measure IOU
                image_IOU = [IOU(ground_truth, boundingbox) for ground_truth in ground_image_bb_gt]
                IOU_list.append(image_IOU)

                # reward for the action just taken
                reward = get_reward(action, IOU_list, t)

                # update history vector: shift one step left and one-hot encode the newest action
                history_vec[:, :-1] = history_vec[:,1:]
                history_vec[:,-1] = 0
                history_vec[action, -1] = 1

                preprocessed_image = image_preprocessing(cropped_image)

                # add action, reward, history to experience list
                experience.append(action)
                experience.append(reward)
                experience.append(np.array(np.reshape(history_vec, (number_of_actions*history_length))))

                # collect episode metrics
                action_count[action] += 1
                image_rewards.append(reward)

            episode_rewards.append(sum(image_rewards))

            ### CONVERTING COLLECTED IMAGES TO CONV OUTPUTS
            # collect the last preprocessed image for this given image
            preprocessed_list.append(preprocessed_image)

            # collecting the final history state
            final_history = np.array(np.reshape(history_vec, (number_of_actions*history_length)))
            history_list.append(final_history)
            history_arr = np.vstack(history_list)

            # time steps whose state vector still has to be computed
            todo_states = [i for i in range(T+1) if i not in exploitation_index]

            # preprocessed image -> conv output for a single image
            conv_output = np.array(preprocessed_list).squeeze(1)
            conv_output = vgg16_conv.predict(conv_output[todo_states], conv_predict_batch_size, verbose=1)
            conv_output = np.reshape(conv_output, (conv_output.shape[0], visual_descriptor_size))

            # add the history to the conv_output and combine with the precalculated exploitation states (if any);
            # an explicit emptiness check replaces the former bare `except: pass`, so genuine shape errors surface
            conv_states = np.append(conv_output, history_arr[todo_states], axis=1)
            if exploitation_states:
                conv_states = np.append(conv_states, np.vstack(exploitation_states), axis=0)

            # add the exploited indexes and sort conv_states back into time-step order
            todo_states.extend(exploitation_index)
            conv_states = [x for (y, x) in sorted(zip(todo_states, conv_states), key=lambda pair: pair[0])]

            # attach the state and next-state vectors to every experience of this image
            for step, experience in enumerate(image_experiences):
                experience.append(conv_states[step])
                experience.append(conv_states[step+1])


        # Actual training per given chunk over the collected experiences
        # flatten the experiences list for learning
        flat_experiences = [x for l in experiences for x in l]
        num_of_experiences = len(flat_experiences)

        # pull each field into its own array; stacking whole experiences into one array would be ragged
        initial_state = np.array([experience[4] for experience in flat_experiences])
        next_state = np.array([experience[5] for experience in flat_experiences])
        random_reward = np.array([experience[2] for experience in flat_experiences], dtype=float)
        flat_actions = np.array([experience[1] for experience in flat_experiences], dtype=int)

        # delete variables to free up memory
        del flat_experiences

        # calculating the Q values for the initial state
        initial_Q = Q_net.predict(initial_state, Q_predict_batch_size, verbose=1)

        # calculating the Q values for the next state
        next_Q = Q_net.predict(next_state, Q_predict_batch_size, verbose=1)

        # calculating the maximum Q for the next state
        next_Q_max = next_Q.max(axis=1)

        # if the terminal action is selected the episode ends and there should be no additional reward
        next_Q_max[flat_actions == number_of_actions-1] = 0

        # target for the taken action: reward plus the discounted best Q value of the next state
        target = random_reward + gamma * next_Q_max

        # replace only the Q value of the action that was taken; all other outputs keep
        # their predicted value so their error is 0
        initial_Q[np.arange(len(initial_Q)), flat_actions] = target

        # nicer names
        training_input = initial_state
        training_target = initial_Q


        before = time.time()
        Q_net.fit(training_input, training_target, epochs=training_epochs, batch_size=Q_train_batch_size, shuffle=True, verbose=1)#, callbacks=callbacks_list, validation_split=0.2)
        after = time.time()
        print("Time taken =", after-before)
        print("Saving weights...")
        Q_net.save_weights(os.path.join(project_root, 'q_weights.hdf5'))

        # delete variables to free up memory
        del initial_state
        del next_state


    # collect the counts of actions taken per episode
    action_counts.append(action_count)
    avg_reward.append(np.mean(episode_rewards))
    std_reward.append(np.std(episode_rewards))

In [None]:
# t1 = cv2.imread('https://firebasestorage.googleapis.com/v0/b/tomatodetection-47395.appspot.com/o/images%2Ffoo.jpg?alt=media&token=f55832ff-3ece-4a86-83b3-051978eb6c0a')
# cv2_imshow(t1)

In [None]:
# Log of training parameters
logging = True
if logging:
    log_location = os.path.join(project_root, 'logs')
    # create the log folder on first use; open() below fails if it does not exist
    os.makedirs(log_location, exist_ok=True)

    log_names = ['loaded_weights','episodes', 'epsilon','gamma',
                 'Time_steps', 'movement_reward', 'terminal_reward_5', 'terminal_reward_7', 'terminal_reward_9',
                 'iou_threshold_5', 'iou_threshold_7','iou_threshold_9','update_step', 'force_terminal']

    # values in the same order as log_names
    log_vars = [loaded_weights, episodes, epsilon, gamma, T, movement_reward,
                terminal_reward_5, terminal_reward_7, terminal_reward_9,
                iou_threshold_5, iou_threshold_7, iou_threshold_9,
                update_step, force_terminal]

    # newline='' hands line-ending control to the csv module (prevents blank rows on some platforms)
    with open(os.path.join(log_location,'saved_weights.csv'), 'w', newline='') as csvfile:
        details = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        details.writerow(log_names)
        details.writerow(log_vars)

In [None]:
# Plot the mean per-image reward of every episode with a one-standard-deviation band
import matplotlib.pyplot as plt

# lower and upper edge of the band, one value per episode
minus_std = [mean - spread for mean, spread in zip(avg_reward, std_reward)]
plus_std = [mean + spread for mean, spread in zip(avg_reward, std_reward)]

plt.plot(avg_reward, label='Average Reward', color='black')
plt.plot(minus_std, label='-1 St. Dev', linestyle='--', color='red')
plt.plot(plus_std, label='+1 St. Dev', linestyle='--', color='blue')

plt.title('Changes in Average Reward for each Image through the Learning Process')
plt.xlabel('Episode')
plt.ylabel('Average Reward per Image')
plt.legend()
plt.show()

# Single Image Test

In [None]:
import numpy as np
import argparse
import matplotlib

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import random
import os
import csv
import collections
import pickle

from keras.applications import imagenet_utils
from keras.applications.vgg16 import preprocess_input, VGG16
from keras.preprocessing import image

from keras import backend as K
K.set_image_data_format('channels_last')

In [None]:
# Notebook inspection cell: display the first entry of val_images.
# NOTE(review): val_images is not defined in this part of the notebook — confirm an earlier cell creates it.
val_images[0]

In [None]:
# Notebook inspection cell: display the current value of `weights`.
# NOTE(review): in this part of the notebook `weights` is only assigned in a later cell,
# so a fresh top-to-bottom run may raise NameError here unless an earlier cell defines it.
weights

In [None]:
### Vars
# the first training image is used as the test input
image_path = train_images[0]
# rely on load_img's defaults (colour image): the former positional `False` meant
# grayscale=False on old Keras but lands on `color_mode` in newer releases
loaded_image = image.load_img(image_path)


number_of_actions = 5
history_length = 8
Q_net_input_size = (25128, )


### VGG16 model without top
vgg16_conv = VGG16(include_top=False, weights='imagenet')

# weights written by the training loop
saved_weights = 'q_weights.hdf5'
weights = os.path.join(project_root, saved_weights)

Q_net = get_q_network(shape_of_input=Q_net_input_size, number_of_actions=number_of_actions, weights_path=weights)

### Inference settings
epsilon = 0
# maximum number of actions the agent may take on the image
T = 60


# convert image to array
original_image = np.array(loaded_image)
image_copy = np.copy(original_image)
image_dimensions = image_copy.shape[:-1]

# create the history vector
history_vec = np.zeros((number_of_actions, history_length))

# preprocess the image
preprocessed_image = image_preprocessing(original_image)

# get initial state vector
state_vec = get_state_as_vec(preprocessed_image, history_vec, vgg16_conv)

# get initial bounding box (the whole image)
boundingbox = np.array([[0,0],image_dimensions])

# every bounding box the agent looks at is recorded here for plotting
all_proposals = []

In [None]:
# Let the agent refine the bounding box for at most T steps, or until it picks the terminal action
for t in range(T):
    print('Time Step: ', t)
    # record the box the agent is currently looking at
    all_proposals.append(boundingbox)

    # plug state into Q network and act greedily
    Q_vals = Q_net.predict(state_vec)
    action = np.argmax(Q_vals)

    # the last action index is the terminal ("object found") action
    if action == number_of_actions-1:
        print("This is your object!")
        break

    image_copy, boundingbox = crop_image(original_image, boundingbox, action)

    # update history vector: shift one step left, then one-hot encode the newest action
    # (zeroing the column works for any number_of_actions instead of a hard-coded 5-vector)
    history_vec[:, :-1] = history_vec[:,1:]
    history_vec[:,-1] = 0
    history_vec[action, -1] = 1

    preprocessed_image = image_preprocessing(image_copy)
    state_vec = get_state_as_vec(preprocessed_image, history_vec, vgg16_conv)

In [None]:
# Draw every recorded proposal on top of the original image, one rainbow colour per time step
fig, ax = plt.subplots(1)
ax.imshow(original_image)

num_of_proposals = len(all_proposals)
color = plt.cm.rainbow(np.linspace(0,1,num_of_proposals))

for proposal, c in zip(all_proposals, color):
    # a proposal is indexed [corner, axis]; matplotlib wants (x, y), so the two axes are swapped
    (first_min, second_min), (first_max, second_max) = proposal
    top_left = (second_min, first_min)
    width = second_max - second_min
    height = first_max - first_min
    rect = patches.Rectangle(top_left, width, height, linewidth=2, edgecolor=c, facecolor='none') # change facecolor to add fill
    ax.add_patch(rect)

# re-draw the last proposal in white so it stands out and carries the legend entry
rect = patches.Rectangle(top_left, width, height, linewidth=2, edgecolor='white', facecolor='none' , label='proposal')
ax.add_patch(rect)

plt.legend()
plt.show()

# Testing Code

In [None]:
import numpy as np
import argparse
import matplotlib
#matplotlib.use("webagg")
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import random
import os
import csv
import collections
# cPickle only exists on Python 2; fall back to the standard pickle module on Python 3
try:
    import cPickle as pickle
except ImportError:
    import pickle

from keras.applications import imagenet_utils
from keras.applications.vgg16 import preprocess_input, VGG16

### Local helpers
import image_actions
import reinforcement_helper
import action_functions
import image_loader

### 
from keras import backend as K
# same setting as the former set_image_dim_ordering('tf'), using the current backend API
# (matches the call used in the single image test section)
K.set_image_data_format('channels_last')

In [None]:
### Vars
project_root = '/media/ersy/Other/Google Drive/QM Work/Queen Mary/Course/Final Project/'
VOC_path = project_root+ 'Reinforcement learning/VOCdevkit/VOC2007'

# parser for the input, defining the number of training epochs and an image
parser = argparse.ArgumentParser(description = 'Epoch: ')
parser.add_argument('-n', metavar='N', type=int, default=0)
parser.add_argument("-i", "--image", help="path to the input image")
# parse_known_args ignores arguments this parser does not define (a notebook kernel puts
# its own options into sys.argv, which made parse_args exit with an error)
known_args, _unrecognised = parser.parse_known_args()
args = vars(known_args)
epochs_id = args['n']
image = args['image']

In [None]:
# Choose the test data: VOC images of one class, or a previously pickled experiment set
VOC = True
if VOC:
    ### loading up VOC images of a given class
    class_file = 'person_test'
    img_name_list = image_actions.get_img_names(VOC_path, class_file)
    img_list = image_actions.load_images(VOC_path, img_name_list)

    desired_class = 'person'

    img_list, groundtruths, img_name_list = image_loader.get_class_images(VOC_path, desired_class, img_name_list, img_list)
else:
    class_file = 'Experiment_1'
    # context managers close the pickle files as soon as they are read
    # NOTE(review): pickle.load runs code stored in the file — only load pickles created by this project
    with open(project_root+'project_code/pickled_data/Experiment_8_Test_images.pickle', 'rb') as images_file:
        img_list = pickle.load(images_file)
    with open(project_root+'project_code/pickled_data/Experiment_8_Test_boxes.pickle', 'rb') as boxes_file:
        groundtruths = pickle.load(boxes_file)

# DEBUG: Overfitting hack
#img_list = img_list[0:8]
#groundtruths = groundtruths[0:8]


In [None]:
# Agent configuration
number_of_actions, history_length = 5, 8
Q_net_input_size = (25128, )

# Evaluation settings: epsilon = 0 makes the loop below always take the greedy action,
# T is the maximum number of steps per image
epsilon = 0
T = 60

# path for non validated set
#weights_path = '/media/ersy/Other/Google Drive/QM Work/Queen Mary/Course/Final Project/project_code/network_weights/no_validation/'

weights_path = '/media/ersy/Other/Google Drive/QM Work/Queen Mary/Course/Final Project/project_code/network_weights/final_weights/'

# change the weights loaded for Q network testing
saved_weights = 'Person_TEST.hdf5'
weights = weights_path+saved_weights

### VGG16 model without top
vgg16_conv = VGG16(include_top=False, weights='imagenet')

### Q network definition
Q_net = reinforcement_helper.get_q_network(
    shape_of_input=Q_net_input_size,
    number_of_actions=number_of_actions,
    weights_path=weights,
)

# Per-image result containers filled by the evaluation loop:
# proposal regions and ground truth regions
all_proposals, all_ground_truth = [], []
# IoU after every step and the actions that produced them
all_IOU, all_actions = [], []
# relative size and centre of every ground-truth object
all_image_scale, all_image_centre = [], []

# IOU for terminal actions - for use in calculating evaluation stats
terminal_IOU, terminal_index = [], []


In [None]:
# loop through images
for image_ix, raw_image in enumerate(img_list):

    original_image = np.array(raw_image)

    print("new image: ", image_ix)
    # get initial parameters for each image

    image = np.copy(original_image)
    image_dimensions = image.shape[:-1]

    # collect bounding boxes for each image
    ground_image_bb_gt = groundtruths[image_ix]

    # METRICS: get the scale of the object relative to the image size, and the object centre
    image_scale = []
    image_centre = []
    for box in ground_image_bb_gt:

        width = box[1][1] - box[0][1]
        height = box[1][0] - box[0][0]
        area = width*height

        image_area = image_dimensions[0]*image_dimensions[1]
        image_scale.append(float(area)/image_area)
        image_centre.append([(box[1][0] + box[0][0])/2, (box[1][1] + box[0][1])/2])
    all_image_scale.append(image_scale)
    all_image_centre.append(image_centre)

    # add current image ground truth to all ground truths
    all_ground_truth.append(ground_image_bb_gt)

    # proposal bounding boxes of this image; the same list object is stored in all_proposals
    boundingboxes = []
    all_proposals.append(boundingboxes)

    # initial bounding box (whole image, raw size)
    boundingbox = np.array([[0,0],image_dimensions])

    # list to store IOU for each object in the image and current bounding box
    IOU_list = []

    # list to store actions taken for each image to associate with IOUs
    # the first IOU is associated with no action
    action_list = []

    # get the IOU for each object
    image_IOU = [reinforcement_helper.IOU(ground_truth, boundingbox) for ground_truth in ground_image_bb_gt]
    IOU_list.append(image_IOU)

    # create the history vector
    history_vec = np.zeros((number_of_actions, history_length))

    # preprocess the image
    preprocessed_image = image_actions.image_preprocessing(original_image)

    # get the state vector (conv output of VGG16 concatenated with the action history)
    state_vec = reinforcement_helper.get_state_as_vec(preprocessed_image, history_vec, vgg16_conv)

    for t in range(T):

        # record the box the agent is currently looking at
        boundingboxes.append(boundingbox)

        # plug state into Q network
        Q_vals = Q_net.predict(state_vec)

        best_action = np.argmax(Q_vals)

        # exploration or exploitation
        if random.uniform(0,1) < epsilon:
            action = random.randint(0, number_of_actions-1)
        else:
            action = best_action

        print('action:', action)

        # the last action index is the terminal ("object found") action
        if action == number_of_actions-1:
            print("This is your object!")

            current_image_IOU = [reinforcement_helper.IOU(ground_truth, boundingbox) for ground_truth in ground_image_bb_gt]
            print("IOU: ", max(current_image_IOU))

            terminal_IOU.append(max(current_image_IOU))
            terminal_index.append(image_ix)
            action_list.append(action)

            # TODO: mask the region covered by the boundingbox and rerun for the image
            #mask =  [103.939, 116.779, 123.68]
            #original_image[boundingbox[0,0]:boundingbox[1,0], boundingbox[0,1]:boundingbox[1,1]] = mask

            break

        image, boundingbox = action_functions.crop_image(original_image, boundingbox, action)

        # measure IOU
        image_IOU = [reinforcement_helper.IOU(ground_truth, boundingbox) for ground_truth in ground_image_bb_gt]
        IOU_list.append(image_IOU)

        action_list.append(action)

        # update history vector: shift one step left, then one-hot encode the newest action
        # (zeroing the column works for any number_of_actions instead of a hard-coded 5-vector)
        history_vec[:, :-1] = history_vec[:,1:]
        history_vec[:,-1] = 0
        history_vec[action, -1] = 1

        preprocessed_image = image_actions.image_preprocessing(image)
        state_vec = reinforcement_helper.get_state_as_vec(preprocessed_image, history_vec, vgg16_conv)

    # add the IOU calculated for each proposal for each image for evaluation purposes
    all_IOU.append(IOU_list)
    all_actions.append(action_list)


In [None]:
### EVALUATION AND METRICS

# lets the proposals and ground truth bounding boxes be visualised
ix = 0
image_actions.view_results(img_list, all_ground_truth, all_proposals, all_IOU, ix)


# simple evaluation metric: share of terminal actions that ended on an IoU of at least 0.5
detected = sum(i >= 0.5 for i in terminal_IOU)
termination_total = float(len(terminal_IOU))
# nan instead of ZeroDivisionError when the agent never used the terminal action
termination_accuracy = detected/termination_total if termination_total > 0 else np.nan
print("termination accuracy = ", termination_accuracy)

flat_objects = [x for l in groundtruths for x in l]
total_objects = float(len(flat_objects))
# nan instead of ZeroDivisionError when there are no ground-truth objects
total_accuracy = detected/total_objects if total_objects > 0 else np.nan
print('total accuracy = ', total_accuracy)

# obtain the accuracy for the final proposal bounding box (regardless of whether the terminal action is triggered)
# the cutoff is >= 0.5, the same comparison used for every other detection count in this section
final_proposal_IOU = [max(i[-1]) for i in all_IOU]
final_proposal_detected = sum(i >= 0.5 for i in final_proposal_IOU)
final_proposal_accuracy = final_proposal_detected/total_objects if total_objects > 0 else np.nan
print('final proposal accuracy = ', final_proposal_accuracy)


# turn list of IOUs for each image into separate object IOUs:
# t1 transposes each image's per-step lists into per-object traces, t2 flattens them over all images
t1 = [[list(j) for j in zip(*i)] for i in all_IOU]
t2 = [i for j in t1 for i in j]


fig, ax = plt.subplots(4, 1, sharex=True)

In [None]:
# code for investigating actions taken for different images - assessing the agent performance

# objects with the final IoU above 0.5
IOU_above_cutoff  = [i for i in t2 if i[-1]>=0.5]

# objects with a final IoU below 0.5 whose trace ended before all T steps were used
IOU_below_cutoff = [i for i in t2 if i[-1]<0.5 and len(i) < T+1]

# objects with no terminal action called (trace covers the initial box plus all T steps)
IOU_no_terminal = [i for i in t2 if i[-1]<0.5 and len(i) == T+1]

# one panel per group of IoU traces
trace_panels = (
    (IOU_above_cutoff, 'IOU above cutoff'),
    (IOU_below_cutoff, 'IOU below cutoff'),
    (IOU_no_terminal, 'IOU no terminal actions'),
)
for panel, (iou_traces, panel_title) in zip(ax, trace_panels):
    for img in iou_traces:
        panel.plot(img)
    panel.set_ylabel('IOU')
    panel.set_title(panel_title)
    panel.set_ylim(0,1)

# storing the number of actions taken before the terminal action
# (the terminal action is the last action index; empty action lists are skipped)
terminal_action = number_of_actions-1
action_count = [len(i) for i in all_actions if i and i[-1] == terminal_action]
counter = collections.Counter(action_count)

ax[3].bar(list(counter.keys()), list(counter.values()))
ax[3].set_xlabel("Actions taken")
ax[3].set_ylabel("Count")
ax[3].set_title('Actions per image (terminal action used)')
if action_count:
    action_count_mean = sum(action_count)/len(action_count)
    ax[3].axvline(action_count_mean, color='red', linewidth=2, label='MEAN: '+str(action_count_mean)[:5])
    ax[3].legend()
else:
    # no image ended with the terminal action, so there is no mean to mark
    action_count_mean = np.nan

plt.xlim(0,T)
plt.tight_layout()
plt.show()

In [None]:
# calculating precision, recall and F1
# true positive -> IOU over 0.5 + terminal action
# false positive -> IOU under 0.5 + terminal action
# false negative -> no terminal action taken when image contains an object
# true negative -> no terminal action taken when image does not contain an object

TP = sum(i >= 0.5 for i in terminal_IOU)
FP = sum(i < 0.5 for i in terminal_IOU)
FN = total_objects-(TP+FP)

# AP holds the precision; it is reported as 0 instead of dividing by zero
# when the agent never used the terminal action
if TP+FP > 0:
    AP = float(TP)/(TP+FP)
else:
    AP = 0.0

if TP+FN > 0:
    Recall = float(TP)/(TP+FN)
else:
    Recall = 0

if AP > 0:
    F1 = AP*Recall/(AP+Recall)*2
else:
    F1 = 0


print('precision = ', AP)
print('recall = ', Recall)
print('F1 = ', F1)

In [None]:
# Summary statistics of the IoU reached when the terminal action was used
if terminal_IOU:
    average_terminal_IOU = sum(terminal_IOU)/len(terminal_IOU)
    std_terminal_IOU = np.std(terminal_IOU)
else:
    # no terminal actions at all: report nan instead of raising ZeroDivisionError
    average_terminal_IOU = np.nan
    std_terminal_IOU = np.nan
print("average terminal IOU = ", average_terminal_IOU)
print("terminal IOU standard deviation = ", std_terminal_IOU)
average_TP_IOU = sum([i for i in terminal_IOU if i>=0.5])/TP if TP >0 else np.nan
print("average TP IOU = ", average_TP_IOU)
average_FP_IOU = sum([i for i in terminal_IOU if i<0.5])/FP if FP>0 else np.nan
print("average FP IOU = ", average_FP_IOU)

# Plot distributions of terminal IOUs
# 50 bins of width 0.02 whose last edge is 1.0, so IoUs above 0.98 are counted as well
bins = np.linspace(0, 1, 51)
plt.hist([i for i in terminal_IOU if i>=0.5], bins=bins, color='red')
plt.hist([i for i in terminal_IOU if i<0.5], bins=bins, color='blue')
plt.xlim(0,1)
plt.ylim(0,500)
plt.axvline(average_terminal_IOU, color='black', label='MEAN: '+ str(average_terminal_IOU)[:5])
plt.axvline(average_terminal_IOU-std_terminal_IOU, color='gray', linestyle='--', label='STDEV: '+ str(std_terminal_IOU)[:5])
plt.axvline(average_terminal_IOU+std_terminal_IOU, color='gray', linestyle='--')
plt.xlabel('IoU')
plt.ylabel('Count')
plt.legend()
plt.show()

In [None]:
###
# Get examples of images that did not have terminal actions
# Get examples of images that had a terminal IOU below 0.5
# (materialised as a list so the pairs can be read more than once)
terminal_IOU_index = list(zip(terminal_index, terminal_IOU))
false_pos_list = [i[0] for i in terminal_IOU_index if i[1] < 0.5]


# Assessing the quality of the agent
# look at cumulative reward as a function of steps
# calculate the reward in testing with different models
# calculate expected return


# step-to-step change in IoU for every object trace
IOU_difference = [[k-j for j,k in zip(i[:-1], i[1:])] for i in t2]


# Contour plot of the final IoU as a function of the object centre position
from scipy.interpolate import griddata
yx = np.vstack(all_image_centre).T
y = yx[0,:]
x = yx[1,:]
# one final IoU per ground-truth object, flattened in the same image/object order as
# all_image_centre so that x, y and z have equal length even with several objects per image
z = [object_iou for image_ious in all_IOU for object_iou in image_ious[-1]]
# np.linspace needs an integer number of samples; the centre coordinates may be floats
xi = np.linspace(x.min(), x.max(), int(x.max()-x.min())+1)
yi = np.linspace(y.min(), y.max(), int(y.max()-y.min())+1)
zi = griddata((x, y), z, (xi[None,:], yi[:,None]), method='cubic')

# cubic interpolation can overshoot; keep the values inside the valid IoU range
zmin = 0.0
zmax = 1.0
zi = np.clip(zi, zmin, zmax)

cs = plt.contourf(xi, yi, zi, 15, cmap=plt.cm.rainbow, vmax=zmax, vmin=zmin)
plt.colorbar()
plt.show()

In [None]:
# Log of parameters and testing scores
log_names = ['class_file', 'Time_steps', 'termination_accuracy',
             'total_accuracy', 'precision', 'recall', 'F1', 'average_terminal_IOU',
             'average_TP_IOU', 'average_FP_IOU']

# values in the same order as log_names
log_vars = [class_file, T, termination_accuracy, total_accuracy, AP, Recall, F1,
            average_terminal_IOU, average_TP_IOU, average_FP_IOU]

log_location = project_root + 'project_code/network_weights/logs/'
log_file = log_location+saved_weights + '.csv'
# the file is opened in append mode: write the header only when the file is new or empty,
# so repeated runs add data rows without repeating the header line
write_header = not os.path.exists(log_file) or os.path.getsize(log_file) == 0
# newline='' hands line-ending control to the csv module
with open(log_file, 'a', newline='') as csvfile:
    details = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    if write_header:
        details.writerow(log_names)
    details.writerow(log_vars)

# Video Test

In [None]:
import cv2
import numpy as np
import argparse

import random
import os
import csv
import collections


from keras.applications import imagenet_utils
from keras.applications.vgg16 import preprocess_input, VGG16
from keras.preprocessing import image

### Local helpers
import image_actions
import reinforcement_helper
import action_functions
import image_loader


### 
from keras import backend as K
# same setting as the former set_image_dim_ordering('tf'), using the current backend API
# (matches the call used in the single image test section)
K.set_image_data_format('channels_last')

In [None]:
### Vars
project_root = '/media/ersy/Other/Google Drive/QM Work/Queen Mary/Course/Final Project/'
VOC_path = project_root+ 'Reinforcement learning/VOCdevkit/VOC2007'


#image_path = "/home/ersy/Downloads/aeroplane_example7.jpg"
#loaded_image = image.load_img(image_path, False)


# agent configuration
number_of_actions, history_length = 5, 8
Q_net_input_size = (25128, )

# folder and file name of the trained Q-network weights
weights_path = '/media/ersy/Other/Google Drive/QM Work/Queen Mary/Course/Final Project/project_code/network_weights/final_weights/'

# change the weights loaded for Q network testing
saved_weights = 'Person_TEST.hdf5'
weights = weights_path+saved_weights

### VGG16 model without top
vgg16_conv = VGG16(include_top=False, weights='imagenet')

# Q network restored from the weights file selected above
Q_net = reinforcement_helper.get_q_network(
    shape_of_input=Q_net_input_size,
    number_of_actions=number_of_actions,
    weights_path=weights,
)

In [None]:
### Number of time steps (agent actions) per image.
# Note: detectObject has its own parameter named T, which shadows this module-level value inside the function.
T = 60

def detectObject(original_image, T):
    """
    Run the trained agent on a single image and return the bounding box it settles on.

    original_image: image as a numpy array; every axis except the last gives the box extent
    T: maximum number of time steps (actions) the agent may take

    Returns a 2x2 numpy array [[min, min], [max, max]] in the same axis order as the image.
    If the agent picks the terminal action the current box is returned immediately;
    otherwise the box examined at the last time step is returned. With T <= 0 the
    whole-image box is returned (this used to raise IndexError).

    Relies on the notebook-level Q_net, vgg16_conv, number_of_actions and history_length.
    """
    image_copy = np.copy(original_image)
    image_dimensions = image_copy.shape[:-1]

    # create the history vector
    history_vec = np.zeros((number_of_actions, history_length))

    # preprocess the image
    preprocessed_image = image_actions.image_preprocessing(original_image)

    # get initial state vector
    state_vec = reinforcement_helper.get_state_as_vec(preprocessed_image, history_vec, vgg16_conv)

    # get initial bounding box (the whole image)
    boundingbox = np.array([[0,0],image_dimensions])

    # box examined at the most recent time step
    last_proposal = boundingbox

    for t in range(T):
        last_proposal = boundingbox

        # plug state into Q network and act greedily
        Q_vals = Q_net.predict(state_vec)
        action = np.argmax(Q_vals)

        # the last action index is the terminal ("object found") action
        if action == number_of_actions-1:
            print("This is your object!")
            return boundingbox

        image_copy, boundingbox = action_functions.crop_image(original_image, boundingbox, action)

        # update history vector: shift one step left, then one-hot encode the newest action
        # (zeroing the column works for any number_of_actions instead of a hard-coded 5-vector)
        history_vec[:, :-1] = history_vec[:,1:]
        history_vec[:,-1] = 0
        history_vec[action, -1] = 1

        preprocessed_image = image_actions.image_preprocessing(image_copy)
        state_vec = reinforcement_helper.get_state_as_vec(preprocessed_image, history_vec, vgg16_conv)

    # NOTE(review): as before, the crop chosen at the final step is not part of the result — confirm this is intended
    return last_proposal

In [None]:
# Read the video, run the detector on every frame and write/show the annotated frames
cap = cv2.VideoCapture('/media/ersy/Other/Google Drive/QM Work/Queen Mary/Course/Final Project/project_code/videos/Golf_Swing.mp4')
frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
frameWidth = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frameHeight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# at most the first 205 frames are used, and never more than the video reports
max_frames = min(frameCount, 205)
buf = np.empty((max_frames, frameHeight, frameWidth, 3), np.dtype('uint8'))

fc = 0
ret = True

while fc < max_frames:
    ret, frame_pixels = cap.read()
    if not ret:
        # no more frames could be read: stop instead of storing an invalid frame
        break
    buf[fc] = frame_pixels
    fc += 1

cap.release()


# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'MJPG')
out = cv2.VideoWriter('Golf_Swing.avi',fourcc, 24.0, (frameWidth, frameHeight), isColor=True)

# the preview window only needs to be created once
cv2.namedWindow('frame',cv2.WINDOW_NORMAL)
cv2.resizeWindow('frame', 600,600)

try:
    # only the fc frames that were actually read are processed
    for frame in range(fc):
        print("Frame: ", frame)
        bb = detectObject(buf[frame], 60)
        # the box is indexed [corner, axis]; OpenCV wants (x, y) points made of plain ints
        top_left = (int(bb[0,1]), int(bb[0,0]))
        bottom_right = (int(bb[1,1]), int(bb[1,0]))
        cv2.rectangle(buf[frame], top_left, bottom_right, (0,0,255), 2)

        out.write(buf[frame])
        cv2.imshow('frame', buf[frame])
        cv2.waitKey(1)
finally:
    # always finalise the output file and close the window, even if detection fails midway
    out.release()
    cv2.destroyAllWindows()