Large diffs are not rendered by default.

@@ -0,0 +1,107 @@
#! /usr/bin/env python

import argparse
import os
import cv2
import numpy as np
from tqdm import tqdm
from preprocessing import parse_annotation
from utils import draw_boxes
from frontend import YOLO
import json

# Pin the GPU selection before any CUDA-aware library is initialized.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Command-line interface: config file, trained weights, and an input
# image or .mp4 video to run detection on.
argparser = argparse.ArgumentParser(
    description='Train and validate YOLO_v2 model on any dataset')

for _flags, _help in (
        (('-c', '--conf'), 'path to configuration file'),
        (('-w', '--weights'), 'path to pretrained weights'),
        (('-i', '--input'), 'path to an image or an video (mp4 format)')):
    argparser.add_argument(*_flags, help=_help)


def _main_(args):
    """Run YOLO detection on a single image or an .mp4 video.

    Builds the model from the json config, loads the given weights, then:
      * for a '.mp4' input: writes an annotated copy next to the input
        ('<name>_detected.mp4');
      * otherwise: treats the input as an image and writes
        '<name>_detected<ext>' next to it.

    Args:
        args: argparse namespace with `conf`, `weights` and `input`.
    """
    config_path = args.conf
    weights_path = args.weights
    image_path = args.input

    with open(config_path) as config_buffer:
        config = json.load(config_buffer)

    ###############################
    #   Make the model
    ###############################

    yolo = YOLO(architecture=config['model']['architecture'],
                input_size=config['model']['input_size'],
                labels=config['model']['labels'],
                max_box_per_image=config['model']['max_box_per_image'],
                anchors=config['model']['anchors'])

    ###############################
    #   Load trained weights
    ###############################

    # BUGFIX: the original used Python-2-only `print` statements, which are
    # syntax errors on Python 3; use the function form (valid on both).
    print(weights_path)
    yolo.load_weights(weights_path)

    ###############################
    #   Predict bounding boxes
    ###############################

    if image_path[-4:] == '.mp4':
        video_out = image_path[:-4] + '_detected' + image_path[-4:]

        video_reader = cv2.VideoCapture(image_path)

        nb_frames = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_h = int(video_reader.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frame_w = int(video_reader.get(cv2.CAP_PROP_FRAME_WIDTH))

        video_writer = cv2.VideoWriter(video_out,
                                       cv2.VideoWriter_fourcc(*'MPEG'),
                                       20.0,
                                       (frame_w, frame_h))

        for _ in tqdm(range(nb_frames)):
            # BUGFIX: the original ignored the success flag; a failed read
            # returns (False, None) and would crash yolo.predict(None).
            grabbed, image = video_reader.read()
            if not grabbed:
                break

            boxes = yolo.predict(image)
            image = draw_boxes(image, boxes, config['model']['labels'])

            video_writer.write(np.uint8(image))

        video_reader.release()
        video_writer.release()
    else:
        image = cv2.imread(image_path)
        boxes = yolo.predict(image)
        image = draw_boxes(image, boxes, config['model']['labels'])

        print('%d boxes are found' % len(boxes))

        cv2.imwrite(image_path[:-4] + '_detected' + image_path[-4:], image)


if __name__ == '__main__':
    # Script entry point: parse CLI flags and run detection.
    _main_(argparser.parse_args())
@@ -0,0 +1,318 @@
import os
import cv2
import copy
import numpy as np
import imgaug as ia
from imgaug import augmenters as iaa
from keras.utils import Sequence
import xml.etree.ElementTree as ET
from utils import BoundBox, normalize, bbox_iou


def parse_annotation(ann_dir, img_dir, labels=None):
    """Parse a directory of PASCAL-VOC-style XML annotation files.

    Args:
        ann_dir: directory holding the XML files.  Paths are built by plain
            string concatenation, so it must end with a path separator.
        img_dir: directory holding the images (same trailing-separator caveat).
        labels: optional list of label names to keep.  Objects with any other
            name are counted in `seen_labels` but dropped from the result.
            None (or an empty list) keeps every object.

    Returns:
        (all_imgs, seen_labels):
          * all_imgs: list of dicts with 'filename', 'width', 'height' and an
            'object' list of {'name', 'xmin', 'ymin', 'xmax', 'ymax'} dicts;
            images with no kept object are omitted.
          * seen_labels: dict mapping every encountered label name to its count.
    """
    # BUGFIX: the original signature used the mutable default `labels=[]`
    # (shared across calls); an explicit None sentinel is safe and
    # backward-compatible.
    labels = [] if labels is None else labels

    all_imgs = []
    seen_labels = {}

    for ann in sorted(os.listdir(ann_dir)):
        img = {'object': []}

        print(ann_dir + ann)
        tree = ET.parse(ann_dir + ann)

        for elem in tree.iter():
            if 'filename' in elem.tag:
                img['filename'] = img_dir + elem.text
            if 'width' in elem.tag:
                img['width'] = int(elem.text)
            if 'height' in elem.tag:
                img['height'] = int(elem.text)
            if 'object' in elem.tag or 'part' in elem.tag:
                obj = {}

                for attr in list(elem):
                    if 'name' in attr.tag:
                        obj['name'] = attr.text

                        # count every name, even ones filtered out below
                        if obj['name'] in seen_labels:
                            seen_labels[obj['name']] += 1
                        else:
                            seen_labels[obj['name']] = 1

                        if len(labels) > 0 and obj['name'] not in labels:
                            # unwanted label: abandon this object before its
                            # bndbox is parsed, so it is never appended
                            break
                        else:
                            img['object'] += [obj]

                    if 'bndbox' in attr.tag:
                        for dim in list(attr):
                            if 'xmin' in dim.tag:
                                obj['xmin'] = int(round(float(dim.text)))
                            if 'ymin' in dim.tag:
                                obj['ymin'] = int(round(float(dim.text)))
                            if 'xmax' in dim.tag:
                                obj['xmax'] = int(round(float(dim.text)))
                            if 'ymax' in dim.tag:
                                obj['ymax'] = int(round(float(dim.text)))

        if len(img['object']) > 0:
            all_imgs += [img]

    return all_imgs, seen_labels


class BatchGenerator(Sequence):
    """Keras Sequence that yields YOLO v2 training batches.

    Each item is ([x_batch, b_batch], y_batch):
      * x_batch: input images, shape (batch, IMAGE_H, IMAGE_W, 3);
      * b_batch: raw ground-truth boxes for the custom loss, shape
        (batch, 1, 1, 1, TRUE_BOX_BUFFER, 4);
      * y_batch: desired network output, shape
        (batch, GRID_H, GRID_W, BOX, 4+1+1) — x/y/w/h in grid-cell units,
        objectness, then the class INDEX (not a one-hot vector).
    """

    def __init__(self, images,
                 config,
                 shuffle=True,
                 jitter=True,
                 norm=None):
        """
        Args:
            images: list of image dicts as produced by parse_annotation.
            config: dict with IMAGE_H/W, GRID_H/W, BOX, LABELS, ANCHORS,
                BATCH_SIZE, TRUE_BOX_BUFFER keys.
            shuffle: reshuffle `images` now and after every epoch.
            jitter: apply random scale/translate/flip plus imgaug pipeline.
            norm: optional callable normalizing an image; when None the
                boxes are drawn onto the raw image for a sanity check.
        """
        self.generator = None

        self.images = images
        self.config = config

        self.shuffle = shuffle
        self.jitter = jitter
        self.norm = norm

        self.counter = 0
        # BUGFIX: use integer division — `len(...)/2` is a float on
        # Python 3 and range() would raise TypeError.
        self.anchors = [BoundBox(0, 0, config['ANCHORS'][2*i], config['ANCHORS'][2*i+1])
                        for i in range(len(config['ANCHORS']) // 2)]

        # augmentors by https://github.com/aleju/imgaug
        def sometimes(aug): return iaa.Sometimes(0.5, aug)

        # Augmentation pipeline applied to every training image.  All
        # augmenters with per_channel=0.5 sample one value per image in 50%
        # of all cases; in the other cases they sample per channel.
        self.aug_pipe = iaa.Sequential(
            [
                sometimes(iaa.Affine()),
                # execute 0 to 5 of the following (less important) augmenters
                # per image; running all of them would be far too strong
                iaa.SomeOf((0, 5),
                           [
                               iaa.OneOf([
                                   # blur with a sigma between 0 and 3.0
                                   iaa.GaussianBlur((0, 3.0)),
                                   # local-mean blur, kernel sizes 2-7
                                   iaa.AverageBlur(k=(2, 7)),
                                   # local-median blur, kernel sizes 3-11
                                   iaa.MedianBlur(k=(3, 11)),
                               ]),
                               # sharpen images
                               iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)),
                               # add gaussian noise
                               iaa.AdditiveGaussianNoise(loc=0,
                                                         scale=(0.0, 0.05*255),
                                                         per_channel=0.5),
                               iaa.OneOf([
                                   # randomly drop up to 10% of the pixels
                                   iaa.Dropout((0.01, 0.1), per_channel=0.5),
                               ]),
                               # brightness shift by -10..10
                               iaa.Add((-10, 10), per_channel=0.5),
                               # brightness scale 50-150%
                               iaa.Multiply((0.5, 1.5), per_channel=0.5),
                               # improve or worsen the contrast
                               iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5),
                           ],
                           random_order=True
                           )
            ],
            random_order=True
        )

        if shuffle:
            np.random.shuffle(self.images)

    def __len__(self):
        """Number of batches per epoch (the last batch wraps backwards)."""
        return int(np.ceil(float(len(self.images)) / self.config['BATCH_SIZE']))

    def __getitem__(self, idx):
        """Build and return batch `idx` as ([x_batch, b_batch], y_batch)."""
        l_bound = idx * self.config['BATCH_SIZE']
        r_bound = (idx + 1) * self.config['BATCH_SIZE']

        if r_bound > len(self.images):
            # keep a full batch by sliding the window back over the tail
            r_bound = len(self.images)
            l_bound = r_bound - self.config['BATCH_SIZE']

        instance_count = 0

        # input images
        x_batch = np.zeros(
            (r_bound - l_bound, self.config['IMAGE_H'], self.config['IMAGE_W'], 3))
        # ring buffer of ground-truth boxes consumed by the custom loss
        b_batch = np.zeros((r_bound - l_bound, 1, 1, 1,
                            self.config['TRUE_BOX_BUFFER'], 4))
        # desired network output
        y_batch = np.zeros(
            (r_bound - l_bound, self.config['GRID_H'], self.config['GRID_W'],
             self.config['BOX'], 4+1+1))

        for train_instance in self.images[l_bound:r_bound]:
            # augment input image and fix objects' position and size
            img, all_objs = self.aug_image(train_instance, jitter=self.jitter)

            true_box_index = 0

            for obj in all_objs:
                if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in self.config['LABELS']:
                    # box center, converted to grid-cell units
                    center_x = .5 * (obj['xmin'] + obj['xmax'])
                    center_x = center_x / (float(self.config['IMAGE_W']) / self.config['GRID_W'])
                    center_y = .5 * (obj['ymin'] + obj['ymax'])
                    center_y = center_y / (float(self.config['IMAGE_H']) / self.config['GRID_H'])

                    grid_x = int(np.floor(center_x))
                    grid_y = int(np.floor(center_y))

                    if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:
                        obj_indx = self.config['LABELS'].index(obj['name'])

                        # box size in grid-cell units
                        center_w = (obj['xmax'] - obj['xmin']) / (
                            float(self.config['IMAGE_W']) / self.config['GRID_W'])
                        # BUGFIX: the original normalized the height by the
                        # WIDTH scale (IMAGE_W/GRID_W); use the height scale.
                        # Identical for square image/grid configurations.
                        center_h = (obj['ymax'] - obj['ymin']) / (
                            float(self.config['IMAGE_H']) / self.config['GRID_H'])

                        box = [center_x, center_y, center_w, center_h]

                        # find the anchor that best predicts this box
                        best_anchor = -1
                        max_iou = -1

                        shifted_box = BoundBox(0, 0, center_w, center_h)

                        for i in range(len(self.anchors)):
                            anchor = self.anchors[i]
                            iou = bbox_iou(shifted_box, anchor)

                            if max_iou < iou:
                                best_anchor = i
                                max_iou = iou

                        # ground truth x, y, w, h, objectness and class index
                        y_batch[instance_count, grid_y,
                                grid_x, best_anchor, 0:4] = box
                        y_batch[instance_count, grid_y,
                                grid_x, best_anchor, 4] = 1.
                        y_batch[instance_count, grid_y,
                                grid_x, best_anchor, 5] = obj_indx

                        # assign the true box to the (wrapping) buffer
                        b_batch[instance_count, 0, 0, 0, true_box_index] = box

                        true_box_index += 1
                        true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']

            # assign input image to x_batch
            if self.norm is not None:
                x_batch[instance_count] = self.norm(img)
            else:
                # no normalizer: draw boxes on the image for a sanity check
                for obj in all_objs:
                    if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:
                        cv2.rectangle(img[:, :, ::-1], (obj['xmin'], obj['ymin']),
                                      (obj['xmax'], obj['ymax']), (255, 0, 0), 3)
                        cv2.putText(img[:, :, ::-1], obj['name'],
                                    (obj['xmin']+2, obj['ymin']+12),
                                    0, 1.2e-3 * img.shape[0],
                                    (0, 255, 0), 2)

                x_batch[instance_count] = img

            instance_count += 1

        self.counter += 1
        return [x_batch, b_batch], y_batch

    def on_epoch_end(self):
        """Reshuffle between epochs (when enabled) and reset the counter."""
        if self.shuffle:
            np.random.shuffle(self.images)
        self.counter = 0

    def aug_image(self, train_instance, jitter):
        """Load one image, optionally jitter/augment it, and remap objects.

        Returns the image converted to RGB at (IMAGE_H, IMAGE_W) plus a deep
        copy of the objects with coordinates rescaled (and clipped) to the
        network input size.
        """
        image_name = train_instance['filename']
        image = cv2.imread(image_name)
        # BUGFIX: cv2.imread returns None for missing/corrupt files; the
        # original crashed with an opaque AttributeError on .shape.
        if image is None:
            raise IOError('Cannot read image: ' + image_name)
        h, w, c = image.shape

        all_objs = copy.deepcopy(train_instance['object'])

        if jitter:
            # scale the image up by 0-10%
            scale = np.random.uniform() / 10. + 1.
            image = cv2.resize(image, (0, 0), fx=scale, fy=scale)

            # random crop back to the original size (translation)
            max_offx = (scale - 1.) * w
            max_offy = (scale - 1.) * h
            offx = int(np.random.uniform() * max_offx)
            offy = int(np.random.uniform() * max_offy)

            image = image[offy: (offy + h), offx: (offx + w)]

            # horizontal flip with probability 0.5
            flip = np.random.binomial(1, .5)
            if flip > 0.5:
                image = cv2.flip(image, 1)

            image = self.aug_pipe.augment_image(image)

        # resize the image to the network input size.
        # BUGFIX: cv2.resize expects dsize as (width, height); the original
        # passed (IMAGE_H, IMAGE_W), which only worked for square inputs.
        image = cv2.resize(
            image, (self.config['IMAGE_W'], self.config['IMAGE_H']))
        image = image[:, :, ::-1]  # BGR -> RGB

        # fix objects' position and size
        for obj in all_objs:
            for attr in ['xmin', 'xmax']:
                if jitter:
                    obj[attr] = int(obj[attr] * scale - offx)

                obj[attr] = int(obj[attr] * float(self.config['IMAGE_W']) / w)
                obj[attr] = max(min(obj[attr], self.config['IMAGE_W']), 0)

            for attr in ['ymin', 'ymax']:
                if jitter:
                    obj[attr] = int(obj[attr] * scale - offy)

                obj[attr] = int(obj[attr] * float(self.config['IMAGE_H']) / h)
                obj[attr] = max(min(obj[attr], self.config['IMAGE_H']), 0)

            if jitter and flip > 0.5:
                xmin = obj['xmin']
                obj['xmin'] = self.config['IMAGE_W'] - obj['xmax']
                obj['xmax'] = self.config['IMAGE_W'] - xmin

        return image, all_objs
137 train.py
@@ -0,0 +1,137 @@
#! /usr/bin/env python

"""
This script takes in a configuration file and produces the best model.
The configuration file is a json file and looks like this:
{
"model" : {
"architecture": "Full Yolo",
"input_size": 416,
"anchors": [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828],
"max_box_per_image": 10,
"labels": ["raccoon"]
},
"train": {
"train_image_folder": "/home/andy/data/raccoon_dataset/images/",
"train_annot_folder": "/home/andy/data/raccoon_dataset/anns/",
"train_times": 10,
"pretrained_weights": "",
"batch_size": 16,
"learning_rate": 1e-4,
"nb_epoch": 50,
"warmup_batches": 100,
"object_scale": 5.0 ,
"no_object_scale": 1.0,
"coord_scale": 1.0,
"class_scale": 1.0,
"debug": true
},
"valid": {
"valid_image_folder": "",
"valid_annot_folder": "",
"valid_times": 1
}
}
"""

import argparse
import os
import numpy as np
from preprocessing import parse_annotation
from frontend import YOLO
import json

# Select the GPU before any CUDA-aware library initializes.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Command-line interface: only the configuration file is required.
argparser = argparse.ArgumentParser(
    description='Train and validate YOLO_v2 model on any dataset')
argparser.add_argument('-c', '--conf',
                       help='path to configuration file')


def _main_(args):
    """Train a YOLO v2 model from the json configuration in `args.conf`.

    Parses training (and optional validation) annotations, splits 80/20 when
    no validation folder exists, builds the model, optionally loads
    pretrained weights, and starts training.
    """
    config_path = args.conf

    with open(config_path) as config_buffer:
        # consistency: predict.py uses json.load on the open file object
        config = json.load(config_buffer)

    ###############################
    #   Parse the annotations
    ###############################

    # parse annotations of the training set
    train_imgs, train_labels = parse_annotation(config['train']['train_annot_folder'],
                                                config['train']['train_image_folder'],
                                                config['model']['labels'])

    # parse annotations of the validation set if present; otherwise carve a
    # random 20% validation split out of the training set
    if os.path.exists(config['valid']['valid_annot_folder']):
        valid_imgs, valid_labels = parse_annotation(config['valid']['valid_annot_folder'],
                                                    config['valid']['valid_image_folder'],
                                                    config['model']['labels'])
    else:
        train_valid_split = int(0.8 * len(train_imgs))
        np.random.shuffle(train_imgs)

        valid_imgs = train_imgs[train_valid_split:]
        train_imgs = train_imgs[:train_valid_split]

    # BUGFIX: the original used Python-2-only `print` statements, which are
    # syntax errors on Python 3; use the function form (valid on both).
    print(train_labels)

    # abort early when none of the requested labels appear in the data
    if len(set(config['model']['labels']).intersection(set(train_labels.keys()))) == 0:
        print("Labels to be detected are not present in the dataset! Please revise the list of labels in the config.json file!")
        return

    ###############################
    #   Construct the model
    ###############################

    yolo = YOLO(architecture=config['model']['architecture'],
                input_size=config['model']['input_size'],
                labels=config['model']['labels'],
                max_box_per_image=config['model']['max_box_per_image'],
                anchors=config['model']['anchors'])

    ###############################
    #   Load the pretrained weights (if any)
    ###############################

    if os.path.exists(config['train']['pretrained_weights']):
        print("Loading pre-trained weights in " + config['train']['pretrained_weights'])
        yolo.load_weights(config['train']['pretrained_weights'])

    ###############################
    #   Start the training process
    ###############################

    yolo.train(train_imgs=train_imgs,
               valid_imgs=valid_imgs,
               train_times=config['train']['train_times'],
               valid_times=config['valid']['valid_times'],
               nb_epoch=config['train']['nb_epoch'],
               learning_rate=config['train']['learning_rate'],
               batch_size=config['train']['batch_size'],
               warmup_bs=config['train']['warmup_batches'],
               object_scale=config['train']['object_scale'],
               no_object_scale=config['train']['no_object_scale'],
               coord_scale=config['train']['coord_scale'],
               class_scale=config['train']['class_scale'],
               # NOTE(review): 'saved_weights_name' is read here but missing
               # from the example config in the module docstring — confirm
               saved_weights_name=config['train']['saved_weights_name'],
               debug=config['train']['debug'])


if __name__ == '__main__':
    # Script entry point: parse CLI flags and start training.
    _main_(argparser.parse_args())
180 utils.py
@@ -0,0 +1,180 @@
import numpy as np
import os
import xml.etree.ElementTree as ET
import tensorflow as tf
import copy
import cv2


class BoundBox:
    """One detection box in center-format (x, y, w, h).

    `c` is the objectness confidence and `classes` the per-class scores;
    the winning label and its score are computed lazily and cached using
    -1 as the "not yet computed" sentinel.
    """

    def __init__(self, x, y, w, h, c=None, classes=None):
        self.x, self.y = x, y
        self.w, self.h = w, h

        self.c = c
        self.classes = classes

        # -1 marks "not computed yet" for the lazy getters below
        self.label = -1
        self.score = -1

    def get_label(self):
        """Index of the highest-scoring class (cached after first call)."""
        if self.label == -1:
            self.label = np.argmax(self.classes)
        return self.label

    def get_score(self):
        """Score of the winning class (cached after first call)."""
        if self.score == -1:
            self.score = self.classes[self.get_label()]
        return self.score


class WeightReader:
    """Sequential reader over a darknet-style float32 weight file.

    The cursor starts at offset 4, skipping the first four float32 slots
    (the file header).
    """

    def __init__(self, weight_file):
        self.offset = 4
        self.all_weights = np.fromfile(weight_file, dtype='float32')

    def read_bytes(self, size):
        """Return the next `size` float32 values and advance the cursor."""
        start = self.offset
        self.offset = start + size
        return self.all_weights[start:self.offset]

    def reset(self):
        """Rewind the cursor to just past the header."""
        self.offset = 4


def normalize(image):
    """Scale pixel values from [0, 255] down to [0, 1]."""
    return image / 255.


def bbox_iou(box1, box2):
    """Intersection-over-union of two center-format boxes.

    Each argument only needs `.x`, `.y`, `.w`, `.h` attributes (center
    coordinates plus width/height).

    Returns:
        IoU in [0, 1].  Degenerate inputs whose union area is zero return
        0.0 (the original raised ZeroDivisionError).
    """
    x1_min = box1.x - box1.w/2
    x1_max = box1.x + box1.w/2
    y1_min = box1.y - box1.h/2
    y1_max = box1.y + box1.h/2

    x2_min = box2.x - box2.w/2
    x2_max = box2.x + box2.w/2
    y2_min = box2.y - box2.h/2
    y2_max = box2.y + box2.h/2

    intersect_w = interval_overlap([x1_min, x1_max], [x2_min, x2_max])
    intersect_h = interval_overlap([y1_min, y1_max], [y2_min, y2_max])

    intersect = intersect_w * intersect_h

    union = box1.w * box1.h + box2.w * box2.h - intersect

    # BUGFIX: guard zero-area boxes instead of dividing unconditionally
    if union <= 0:
        return 0.0
    return float(intersect) / union


def interval_overlap(interval_a, interval_b):
    """Length of the overlap between 1-D intervals [x1,x2] and [x3,x4]
    (0 when they are disjoint)."""
    x1, x2 = interval_a
    x3, x4 = interval_b

    if x3 < x1:
        if x4 < x1:
            return 0
        return min(x2, x4) - x1

    if x2 < x3:
        return 0
    return min(x2, x4) - x3


def draw_boxes(image, boxes, labels):
    """Draw each detection box and its "label score" caption onto `image`.

    Box coordinates are normalized to [0, 1]; they are scaled here by the
    image dimensions.  The image is modified in place and also returned.
    """
    height = image.shape[0]
    width = image.shape[1]

    for box in boxes:
        xmin = int((box.x - box.w/2) * width)
        xmax = int((box.x + box.w/2) * width)
        ymin = int((box.y - box.h/2) * height)
        ymax = int((box.y + box.h/2) * height)

        caption = labels[box.get_label()] + ' ' + str(box.get_score())

        cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 3)
        cv2.putText(image,
                    caption,
                    (xmin, ymin - 13),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1e-3 * height,
                    (0, 255, 0), 2)

    return image


def decode_netout(netout, obj_threshold, nms_threshold, anchors, nb_class):
    """Convert the raw (GRID_H, GRID_W, BOX, 4+1+classes) network output
    into a list of BoundBox detections.

    Applies sigmoid/softmax activations in place on `netout`, thresholds
    class scores, runs per-class non-max suppression, and finally drops
    boxes whose best score is below `obj_threshold`.

    Args:
        netout: numpy array of raw network activations (mutated in place).
        obj_threshold: minimum class score for a box to survive.
        nms_threshold: IoU above which the lower-scored box is suppressed.
        anchors: flat list of anchor w/h pairs.
        nb_class: number of classes.

    Returns:
        List of BoundBox with normalized (0-1) coordinates.
    """
    grid_h, grid_w, nb_box = netout.shape[:3]

    boxes = []

    # decode the output by the network
    netout[..., 4] = sigmoid(netout[..., 4])
    netout[..., 5:] = netout[..., 4][...,
                                     np.newaxis] * softmax(netout[..., 5:])
    netout[..., 5:] *= netout[..., 5:] > obj_threshold

    for row in range(grid_h):
        for col in range(grid_w):
            for b in range(nb_box):
                # from the 5th element onwards are the class scores
                classes = netout[row, col, b, 5:]

                if np.sum(classes) > 0:
                    # first 4 elements are x, y, w, and h
                    x, y, w, h = netout[row, col, b, :4]

                    # center position, unit: image width / height
                    x = (col + sigmoid(x)) / grid_w
                    y = (row + sigmoid(y)) / grid_h
                    # size via anchor priors, unit: image width / height
                    w = anchors[2 * b + 0] * np.exp(w) / grid_w
                    h = anchors[2 * b + 1] * np.exp(h) / grid_h
                    confidence = netout[row, col, b, 4]

                    boxes.append(BoundBox(x, y, w, h, confidence, classes))

    # suppress non-maximal boxes, one class at a time.
    # BUGFIX: the original used Python-2-only `xrange`, a NameError on
    # Python 3; `range` is the portable equivalent.
    for c in range(nb_class):
        sorted_indices = list(
            reversed(np.argsort([box.classes[c] for box in boxes])))

        for i in range(len(sorted_indices)):
            index_i = sorted_indices[i]

            if boxes[index_i].classes[c] == 0:
                continue

            for j in range(i + 1, len(sorted_indices)):
                index_j = sorted_indices[j]

                if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_threshold:
                    boxes[index_j].classes[c] = 0

    # remove the boxes which are less likely than obj_threshold
    boxes = [box for box in boxes if box.get_score() > obj_threshold]

    return boxes


def sigmoid(x):
    """Logistic function 1 / (1 + e^-x); works elementwise on arrays."""
    exp_neg = np.exp(-x)
    return 1. / (1. + exp_neg)


def softmax(x, axis=-1, t=-100.):
    """Softmax along `axis`.

    Subtracts the global maximum for numerical stability, and rescales the
    values when the minimum falls below `t` so that np.exp never underflows
    catastrophically.
    """
    shifted = x - np.max(x)

    lowest = np.min(shifted)
    if lowest < t:
        shifted = shifted / lowest * t

    exps = np.exp(shifted)
    return exps / exps.sum(axis, keepdims=True)