| @@ -0,0 +1,107 @@ | ||
| #! /usr/bin/env python | ||
|
|
||
| import argparse | ||
| import os | ||
| import cv2 | ||
| import numpy as np | ||
| from tqdm import tqdm | ||
| from preprocessing import parse_annotation | ||
| from utils import draw_boxes | ||
| from frontend import YOLO | ||
| import json | ||
|
|
||
# Pin CUDA device enumeration to PCI bus order and expose only GPU 0.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Command-line interface: configuration file, trained weights, and the
# input image / .mp4 video to run detection on.
argparser = argparse.ArgumentParser(
    description='Train and validate YOLO_v2 model on any dataset')

argparser.add_argument(
    '-c',
    '--conf',
    help='path to configuration file')

argparser.add_argument(
    '-w',
    '--weights',
    help='path to pretrained weights')

argparser.add_argument(
    '-i',
    '--input',
    help='path to an image or an video (mp4 format)')
|
|
||
|
|
||
def _main_(args):
    """Run a trained YOLOv2 model on an image or an .mp4 video.

    Reads the model description from ``args.conf`` (JSON), loads the
    weights from ``args.weights``, then runs detection on ``args.input``:

    * ``*.mp4``  -> writes ``<input>_detected.mp4`` next to the input;
    * anything else is treated as a single image and the annotated copy
      is written as ``<input>_detected.<ext>``.

    BUGFIX: the original used Python 2 ``print`` statements, which are a
    SyntaxError under Python 3 (other modules in this project already use
    the ``print()`` function).
    """
    config_path = args.conf
    weights_path = args.weights
    image_path = args.input

    with open(config_path) as config_buffer:
        config = json.load(config_buffer)

    ###############################
    #   Make the model
    ###############################

    yolo = YOLO(architecture=config['model']['architecture'],
                input_size=config['model']['input_size'],
                labels=config['model']['labels'],
                max_box_per_image=config['model']['max_box_per_image'],
                anchors=config['model']['anchors'])

    ###############################
    #   Load trained weights
    ###############################

    print(weights_path)
    yolo.load_weights(weights_path)

    ###############################
    #   Predict bounding boxes
    ###############################

    if image_path.endswith('.mp4'):
        # video: detect frame by frame and write an annotated copy
        video_out = image_path[:-4] + '_detected' + image_path[-4:]

        video_reader = cv2.VideoCapture(image_path)

        nb_frames = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_h = int(video_reader.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frame_w = int(video_reader.get(cv2.CAP_PROP_FRAME_WIDTH))

        video_writer = cv2.VideoWriter(video_out,
                                       cv2.VideoWriter_fourcc(*'MPEG'),
                                       20.0,
                                       (frame_w, frame_h))

        for _ in tqdm(range(nb_frames)):
            _, image = video_reader.read()

            boxes = yolo.predict(image)
            image = draw_boxes(image, boxes, config['model']['labels'])

            video_writer.write(np.uint8(image))

        video_reader.release()
        video_writer.release()
    else:
        # single image: detect, draw, save next to the input
        image = cv2.imread(image_path)
        boxes = yolo.predict(image)
        image = draw_boxes(image, boxes, config['model']['labels'])

        print('{} boxes are found'.format(len(boxes)))

        cv2.imwrite(image_path[:-4] + '_detected' + image_path[-4:], image)
|
|
||
|
|
||
if __name__ == '__main__':
    # Parse CLI arguments and run detection only when executed as a script.
    args = argparser.parse_args()
    _main_(args)
| @@ -0,0 +1,318 @@ | ||
| import os | ||
| import cv2 | ||
| import copy | ||
| import numpy as np | ||
| import imgaug as ia | ||
| from imgaug import augmenters as iaa | ||
| from keras.utils import Sequence | ||
| import xml.etree.ElementTree as ET | ||
| from utils import BoundBox, normalize, bbox_iou | ||
|
|
||
|
|
||
def parse_annotation(ann_dir, img_dir, labels=None):
    """Parse a folder of PASCAL-VOC style XML annotation files.

    Args:
        ann_dir: directory containing the ``.xml`` annotation files
            (with or without a trailing separator).
        img_dir: prefix prepended to each annotation's ``<filename>``
            to build the image path.
        labels: optional list of label names to keep; objects with other
            names are dropped. ``None``/empty keeps every object.

    Returns:
        ``(all_imgs, seen_labels)``: a list of per-image dicts
        (``filename``, ``width``, ``height`` and an ``object`` list of
        ``name``/``xmin``/``ymin``/``xmax``/``ymax`` dicts) and a dict of
        label -> occurrence count over ALL objects seen (counted before
        the ``labels`` filter is applied).
    """
    # avoid a mutable default argument; behavior is identical to `labels=[]`
    if labels is None:
        labels = []

    all_imgs = []
    seen_labels = {}

    for ann in sorted(os.listdir(ann_dir)):
        img = {'object': []}

        # os.path.join tolerates ann_dir both with and without a trailing
        # separator (the original `ann_dir + ann` required the trailing '/')
        ann_path = os.path.join(ann_dir, ann)
        print(ann_path)
        tree = ET.parse(ann_path)

        for elem in tree.iter():
            if 'filename' in elem.tag:
                img['filename'] = img_dir + elem.text
            if 'width' in elem.tag:
                img['width'] = int(elem.text)
            if 'height' in elem.tag:
                img['height'] = int(elem.text)
            if 'object' in elem.tag or 'part' in elem.tag:
                obj = {}

                for attr in list(elem):
                    if 'name' in attr.tag:
                        obj['name'] = attr.text

                        # count every label, even filtered-out ones
                        if obj['name'] in seen_labels:
                            seen_labels[obj['name']] += 1
                        else:
                            seen_labels[obj['name']] = 1

                        # skip the rest of this object's children when its
                        # label is not wanted (VOC puts <name> before
                        # <bndbox>, so the box has not been parsed yet)
                        if len(labels) > 0 and obj['name'] not in labels:
                            break
                        else:
                            img['object'] += [obj]

                    if 'bndbox' in attr.tag:
                        for dim in list(attr):
                            if 'xmin' in dim.tag:
                                obj['xmin'] = int(round(float(dim.text)))
                            if 'ymin' in dim.tag:
                                obj['ymin'] = int(round(float(dim.text)))
                            if 'xmax' in dim.tag:
                                obj['xmax'] = int(round(float(dim.text)))
                            if 'ymax' in dim.tag:
                                obj['ymax'] = int(round(float(dim.text)))

        if len(img['object']) > 0:
            all_imgs += [img]

    return all_imgs, seen_labels
|
|
||
|
|
||
class BatchGenerator(Sequence):
    """Keras ``Sequence`` yielding augmented YOLOv2 training batches.

    Each item is ``([x_batch, b_batch], y_batch)`` where

    * ``x_batch`` -- input images, shape (B, IMAGE_H, IMAGE_W, 3);
    * ``b_batch`` -- raw ground-truth boxes for the custom loss, shape
      (B, 1, 1, 1, TRUE_BOX_BUFFER, 4);
    * ``y_batch`` -- desired network output, shape
      (B, GRID_H, GRID_W, BOX, 4 + 1 + 1): box coords in grid units,
      objectness flag, and the class index (NOT one-hot).
    """

    def __init__(self, images,
                 config,
                 shuffle=True,
                 jitter=True,
                 norm=None):
        self.generator = None

        self.images = images
        self.config = config

        self.shuffle = shuffle
        self.jitter = jitter
        self.norm = norm

        self.counter = 0
        # Anchors come as a flat [w0, h0, w1, h1, ...] list.
        # BUGFIX: the original used `len(...)/2`, which is a float under
        # Python 3 and makes range() raise TypeError; use floor division.
        self.anchors = [BoundBox(0, 0, config['ANCHORS'][2*i], config['ANCHORS'][2*i+1])
                        for i in range(len(config['ANCHORS']) // 2)]

        # augmentors by https://github.com/aleju/imgaug
        def sometimes(aug): return iaa.Sometimes(0.5, aug)

        # Photometric augmentation pipeline applied to every jittered image.
        # Geometric jitter (scale/translate/flip) is done by hand in
        # aug_image so the bounding boxes can be remapped; the Affine here
        # is intentionally a no-op placeholder for geometric options.
        self.aug_pipe = iaa.Sequential(
            [
                sometimes(iaa.Affine()),
                # execute 0 to 5 of the following (less important)
                # augmenters per image; don't execute all of them, as that
                # would often be way too strong
                iaa.SomeOf((0, 5),
                           [
                               iaa.OneOf([
                                   # blur with a sigma between 0 and 3.0
                                   iaa.GaussianBlur((0, 3.0)),
                                   # local-mean blur, kernel size 2..7
                                   iaa.AverageBlur(k=(2, 7)),
                                   # local-median blur, kernel size 3..11
                                   iaa.MedianBlur(k=(3, 11)),
                               ]),
                               # sharpen images
                               iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)),
                               # add gaussian noise
                               iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5),
                               iaa.OneOf([
                                   # randomly remove up to 10% of the pixels
                                   iaa.Dropout((0.01, 0.1), per_channel=0.5),
                               ]),
                               # change brightness (add -10..10)
                               iaa.Add((-10, 10), per_channel=0.5),
                               # change brightness (50-150% of original)
                               iaa.Multiply((0.5, 1.5), per_channel=0.5),
                               # improve or worsen the contrast
                               iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5),
                           ],
                           random_order=True
                           )
            ],
            random_order=True
        )

        if shuffle:
            # NOTE: shuffles the caller's list in place (original behavior)
            np.random.shuffle(self.images)

    def __len__(self):
        """Number of batches per epoch (last partial batch rounded up)."""
        return int(np.ceil(float(len(self.images)) / self.config['BATCH_SIZE']))

    def __getitem__(self, idx):
        """Build batch ``idx``: augment images and encode ground truth."""
        l_bound = idx * self.config['BATCH_SIZE']
        r_bound = (idx + 1) * self.config['BATCH_SIZE']

        if r_bound > len(self.images):
            # last, short batch: slide the window back so it stays full
            # (assumes len(self.images) >= BATCH_SIZE -- TODO confirm)
            r_bound = len(self.images)
            l_bound = r_bound - self.config['BATCH_SIZE']

        instance_count = 0

        # input images
        x_batch = np.zeros(
            (r_bound - l_bound, self.config['IMAGE_H'], self.config['IMAGE_W'], 3))
        # up to TRUE_BOX_BUFFER ground-truth boxes per image
        b_batch = np.zeros((r_bound - l_bound, 1, 1, 1,
                            self.config['TRUE_BOX_BUFFER'], 4))
        # desired network output: 4 coords + objectness + class index
        y_batch = np.zeros(
            (r_bound - l_bound, self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 4+1+1))

        for train_instance in self.images[l_bound:r_bound]:
            # augment input image and fix object's position and size
            img, all_objs = self.aug_image(train_instance, jitter=self.jitter)

            # construct output from object's x, y, w, h
            true_box_index = 0

            for obj in all_objs:
                if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in self.config['LABELS']:
                    # box center in grid-cell units
                    center_x = .5 * (obj['xmin'] + obj['xmax'])
                    center_x = center_x / \
                        (float(self.config['IMAGE_W']) / self.config['GRID_W'])
                    center_y = .5 * (obj['ymin'] + obj['ymax'])
                    center_y = center_y / \
                        (float(self.config['IMAGE_H']) / self.config['GRID_H'])

                    grid_x = int(np.floor(center_x))
                    grid_y = int(np.floor(center_y))

                    if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:
                        obj_indx = self.config['LABELS'].index(obj['name'])

                        # box size in grid-cell units.
                        # BUGFIX: the height was previously scaled by
                        # IMAGE_W/GRID_W; it must use the vertical dims.
                        center_w = (obj['xmax'] - obj['xmin']) / (
                            float(self.config['IMAGE_W']) / self.config['GRID_W'])
                        center_h = (obj['ymax'] - obj['ymin']) / (
                            float(self.config['IMAGE_H']) / self.config['GRID_H'])

                        box = [center_x, center_y, center_w, center_h]

                        # find the anchor that best predicts this box
                        # (compare sizes only: both centered at origin)
                        best_anchor = -1
                        max_iou = -1

                        shifted_box = BoundBox(0,
                                               0,
                                               center_w,
                                               center_h)

                        for i in range(len(self.anchors)):
                            anchor = self.anchors[i]
                            iou = bbox_iou(shifted_box, anchor)

                            if max_iou < iou:
                                best_anchor = i
                                max_iou = iou

                        # assign ground truth x, y, w, h, confidence and
                        # class index to y_batch
                        y_batch[instance_count, grid_y,
                                grid_x, best_anchor, 0:4] = box
                        y_batch[instance_count, grid_y,
                                grid_x, best_anchor, 4] = 1.
                        y_batch[instance_count, grid_y,
                                grid_x, best_anchor, 5] = obj_indx

                        # assign the true box to b_batch (ring buffer of
                        # size TRUE_BOX_BUFFER)
                        b_batch[instance_count, 0, 0, 0, true_box_index] = box

                        true_box_index += 1
                        true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']

            # assign input image to x_batch
            if self.norm is not None:
                x_batch[instance_count] = self.norm(img)
            else:
                # no normalizer: draw boxes on the image for a sanity check.
                # NOTE(review): drawing targets the reversed-channel view;
                # cv2 may not write through a negative-stride view -- kept
                # exactly as the original did it.
                for obj in all_objs:
                    if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:
                        cv2.rectangle(img[:, :, ::-1], (obj['xmin'], obj['ymin']),
                                      (obj['xmax'], obj['ymax']), (255, 0, 0), 3)
                        cv2.putText(img[:, :, ::-1], obj['name'],
                                    (obj['xmin']+2, obj['ymin']+12),
                                    0, 1.2e-3 * img.shape[0],
                                    (0, 255, 0), 2)

                x_batch[instance_count] = img

            # increase instance counter in current batch
            instance_count += 1

        self.counter += 1

        return [x_batch, b_batch], y_batch

    def on_epoch_end(self):
        """Reshuffle the dataset between epochs."""
        if self.shuffle:
            np.random.shuffle(self.images)
        self.counter = 0

    def aug_image(self, train_instance, jitter):
        """Load one image and (optionally) jitter it.

        Applies a random up-scale, crop-translate and horizontal flip,
        runs the photometric pipeline, resizes to the network input size,
        converts BGR->RGB, and remaps the object coordinates to match.

        Returns:
            (image, all_objs): the augmented image and a deep copy of the
            instance's objects with corrected coordinates.
        """
        image_name = train_instance['filename']
        image = cv2.imread(image_name)
        h, w, c = image.shape

        all_objs = copy.deepcopy(train_instance['object'])

        if jitter:
            # scale the image up by 0-10%
            scale = np.random.uniform() / 10. + 1.
            image = cv2.resize(image, (0, 0), fx=scale, fy=scale)

            # translate by cropping back to the original size
            max_offx = (scale-1.) * w
            max_offy = (scale-1.) * h
            offx = int(np.random.uniform() * max_offx)
            offy = int(np.random.uniform() * max_offy)

            image = image[offy: (offy + h), offx: (offx + w)]

            # flip the image horizontally half the time
            flip = np.random.binomial(1, .5)
            if flip > 0.5:
                image = cv2.flip(image, 1)

            image = self.aug_pipe.augment_image(image)

        # resize the image to the network input size.
        # BUGFIX: cv2.resize takes dsize as (width, height); the original
        # passed (IMAGE_H, IMAGE_W), which transposed the target size for
        # non-square inputs (the coordinate remapping below already
        # assumes width=IMAGE_W, height=IMAGE_H).
        image = cv2.resize(
            image, (self.config['IMAGE_W'], self.config['IMAGE_H']))
        # BGR -> RGB
        image = image[:, :, ::-1]

        # fix object's position and size
        for obj in all_objs:
            for attr in ['xmin', 'xmax']:
                if jitter:
                    obj[attr] = int(obj[attr] * scale - offx)

                obj[attr] = int(obj[attr] * float(self.config['IMAGE_W']) / w)
                obj[attr] = max(min(obj[attr], self.config['IMAGE_W']), 0)

            for attr in ['ymin', 'ymax']:
                if jitter:
                    obj[attr] = int(obj[attr] * scale - offy)

                obj[attr] = int(obj[attr] * float(self.config['IMAGE_H']) / h)
                obj[attr] = max(min(obj[attr], self.config['IMAGE_H']), 0)

            if jitter and flip > 0.5:
                # mirror the x coordinates to match the flipped image
                xmin = obj['xmin']
                obj['xmin'] = self.config['IMAGE_W'] - obj['xmax']
                obj['xmax'] = self.config['IMAGE_W'] - xmin

        return image, all_objs
| @@ -0,0 +1,137 @@ | ||
| #! /usr/bin/env python | ||
|
|
||
| """ | ||
| This script takes in a configuration file and produces the best model. | ||
| The configuration file is a json file and looks like this: | ||
| { | ||
| "model" : { | ||
| "architecture": "Full Yolo", | ||
| "input_size": 416, | ||
| "anchors": [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828], | ||
| "max_box_per_image": 10, | ||
| "labels": ["raccoon"] | ||
| }, | ||
| "train": { | ||
| "train_image_folder": "/home/andy/data/raccoon_dataset/images/", | ||
| "train_annot_folder": "/home/andy/data/raccoon_dataset/anns/", | ||
| "train_times": 10, | ||
| "pretrained_weights": "", | ||
| "batch_size": 16, | ||
| "learning_rate": 1e-4, | ||
| "nb_epoch": 50, | ||
| "warmup_batches": 100, | ||
| "object_scale": 5.0 , | ||
| "no_object_scale": 1.0, | ||
| "coord_scale": 1.0, | ||
| "class_scale": 1.0, | ||
| "debug": true | ||
| }, | ||
| "valid": { | ||
| "valid_image_folder": "", | ||
| "valid_annot_folder": "", | ||
| "valid_times": 1 | ||
| } | ||
| } | ||
| """ | ||
|
|
||
| import argparse | ||
| import os | ||
| import numpy as np | ||
| from preprocessing import parse_annotation | ||
| from frontend import YOLO | ||
| import json | ||
|
|
||
# Pin CUDA device enumeration to PCI bus order and expose only GPU 0.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Command-line interface: only the path to the JSON configuration file.
argparser = argparse.ArgumentParser(
    description='Train and validate YOLO_v2 model on any dataset')

argparser.add_argument(
    '-c',
    '--conf',
    help='path to configuration file')
|
|
||
|
|
||
def _main_(args):
    """Train a YOLOv2 model from the JSON configuration in ``args.conf``.

    Parses training (and optionally validation) annotations, builds the
    model, loads pretrained weights when available, and starts training.

    BUGFIX: the original used Python 2 ``print`` statements, which are a
    SyntaxError under Python 3 (preprocessing.py already uses ``print()``).
    """
    config_path = args.conf

    with open(config_path) as config_buffer:
        # json.load reads straight from the file object -- equivalent to
        # json.loads(config_buffer.read()) without the intermediate string
        config = json.load(config_buffer)

    ###############################
    #   Parse the annotations
    ###############################

    # parse annotations of the training set
    train_imgs, train_labels = parse_annotation(config['train']['train_annot_folder'],
                                                config['train']['train_image_folder'],
                                                config['model']['labels'])

    # parse annotations of the validation set, if any, otherwise split the training set
    if os.path.exists(config['valid']['valid_annot_folder']):
        valid_imgs, valid_labels = parse_annotation(config['valid']['valid_annot_folder'],
                                                    config['valid']['valid_image_folder'],
                                                    config['model']['labels'])
    else:
        # 80/20 random split of the training images
        train_valid_split = int(0.8*len(train_imgs))
        np.random.shuffle(train_imgs)

        valid_imgs = train_imgs[train_valid_split:]
        train_imgs = train_imgs[:train_valid_split]

    print(train_labels)

    if len(set(config['model']['labels']).intersection(set(train_labels.keys()))) == 0:
        # none of the requested labels appear anywhere in the dataset
        print("Labels to be detected are not present in the dataset! "
              "Please revise the list of labels in the config.json file!")
        return

    ###############################
    #   Construct the model
    ###############################

    yolo = YOLO(architecture=config['model']['architecture'],
                input_size=config['model']['input_size'],
                labels=config['model']['labels'],
                max_box_per_image=config['model']['max_box_per_image'],
                anchors=config['model']['anchors'])

    ###############################
    #   Load the pretrained weights (if any)
    ###############################

    if os.path.exists(config['train']['pretrained_weights']):
        print("Loading pre-trained weights in", config['train']['pretrained_weights'])
        yolo.load_weights(config['train']['pretrained_weights'])

    ###############################
    #   Start the training process
    ###############################

    yolo.train(train_imgs=train_imgs,
               valid_imgs=valid_imgs,
               train_times=config['train']['train_times'],
               valid_times=config['valid']['valid_times'],
               nb_epoch=config['train']['nb_epoch'],
               learning_rate=config['train']['learning_rate'],
               batch_size=config['train']['batch_size'],
               warmup_bs=config['train']['warmup_batches'],
               object_scale=config['train']['object_scale'],
               no_object_scale=config['train']['no_object_scale'],
               coord_scale=config['train']['coord_scale'],
               class_scale=config['train']['class_scale'],
               saved_weights_name=config['train']['saved_weights_name'],
               debug=config['train']['debug'])
|
|
||
|
|
||
if __name__ == '__main__':
    # Parse CLI arguments and start training only when run as a script.
    args = argparser.parse_args()
    _main_(args)
| @@ -0,0 +1,180 @@ | ||
| import numpy as np | ||
| import os | ||
| import xml.etree.ElementTree as ET | ||
| import tensorflow as tf | ||
| import copy | ||
| import cv2 | ||
|
|
||
|
|
||
class BoundBox:
    """A detection box in center format: (x, y) center plus (w, h) size.

    ``c`` is the objectness confidence and ``classes`` the per-class
    scores; the winning label and its score are computed lazily and
    cached (``-1`` marks "not computed yet").
    """

    def __init__(self, x, y, w, h, c=None, classes=None):
        self.x, self.y = x, y
        self.w, self.h = w, h

        self.c = c
        self.classes = classes

        # lazy caches filled in by get_label / get_score
        self.label = -1
        self.score = -1

    def get_label(self):
        """Index of the highest-scoring class (cached after first call)."""
        if self.label != -1:
            return self.label
        self.label = np.argmax(self.classes)
        return self.label

    def get_score(self):
        """Score of the winning class (cached after first call)."""
        if self.score != -1:
            return self.score
        self.score = self.classes[self.get_label()]
        return self.score
|
|
||
|
|
||
class WeightReader:
    """Sequential cursor over a binary weight file of float32 values.

    The first 4 float32 slots are skipped (presumably a file header --
    TODO confirm against the weight-file format).
    """

    def __init__(self, weight_file):
        self.offset = 4
        self.all_weights = np.fromfile(weight_file, dtype='float32')

    def read_bytes(self, size):
        """Return the next ``size`` float32 values and advance the cursor."""
        start = self.offset
        self.offset = start + size
        return self.all_weights[start:self.offset]

    def reset(self):
        """Rewind the cursor back past the header."""
        self.offset = 4
|
|
||
|
|
||
def normalize(image):
    """Scale pixel intensities from [0, 255] down to [0, 1]."""
    return image / 255.
|
|
||
|
|
||
def bbox_iou(box1, box2):
    """Intersection-over-union of two boxes in center (x, y, w, h) format.

    Args:
        box1, box2: objects with ``x``, ``y``, ``w``, ``h`` attributes
            (e.g. BoundBox).

    Returns:
        IoU in [0, 1]. BUGFIX: returns 0.0 for a degenerate pair (both
        boxes with zero area) instead of raising ZeroDivisionError.
    """
    x1_min = box1.x - box1.w/2
    x1_max = box1.x + box1.w/2
    y1_min = box1.y - box1.h/2
    y1_max = box1.y + box1.h/2

    x2_min = box2.x - box2.w/2
    x2_max = box2.x + box2.w/2
    y2_min = box2.y - box2.h/2
    y2_max = box2.y + box2.h/2

    # 1-D overlaps, clamped at zero (equivalent to interval_overlap,
    # inlined so this function is self-contained)
    intersect_w = max(0.0, min(x1_max, x2_max) - max(x1_min, x2_min))
    intersect_h = max(0.0, min(y1_max, y2_max) - max(y1_min, y2_min))

    intersect = intersect_w * intersect_h

    union = box1.w * box1.h + box2.w * box2.h - intersect

    if union <= 0:
        # both boxes have zero area: define IoU as 0 rather than divide by 0
        return 0.0

    return float(intersect) / union
|
|
||
|
|
||
def interval_overlap(interval_a, interval_b):
    """Length of the overlap between two 1-D intervals [lo, hi].

    Returns 0 when the intervals are disjoint.
    """
    a_lo, a_hi = interval_a
    b_lo, b_hi = interval_b

    # disjoint: one interval ends before the other begins
    if b_hi < a_lo or a_hi < b_lo:
        return 0

    return min(a_hi, b_hi) - max(a_lo, b_lo)
|
|
||
|
|
||
def draw_boxes(image, boxes, labels):
    """Draw each detection as a green rectangle with a label caption.

    Box coordinates are normalized (0..1) center format and are scaled to
    the image's pixel size. The image is modified in place and returned.
    """
    img_h = image.shape[0]
    img_w = image.shape[1]

    for box in boxes:
        left = int((box.x - box.w/2) * img_w)
        right = int((box.x + box.w/2) * img_w)
        top = int((box.y - box.h/2) * img_h)
        bottom = int((box.y + box.h/2) * img_h)

        cv2.rectangle(image, (left, top), (right, bottom), (0, 255, 0), 3)
        caption = labels[box.get_label()] + ' ' + str(box.get_score())
        cv2.putText(image,
                    caption,
                    (left, top - 13),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1e-3 * img_h,
                    (0, 255, 0), 2)

    return image
|
|
||
|
|
||
def decode_netout(netout, obj_threshold, nms_threshold, anchors, nb_class):
    """Turn the raw network output grid into a list of BoundBox detections.

    Args:
        netout: array of shape (grid_h, grid_w, nb_box, 4 + 1 + nb_class);
            modified IN PLACE (objectness squashed, class scores scaled).
        obj_threshold: minimum class confidence for a box to survive.
        nms_threshold: IoU above which overlapping boxes are suppressed.
        anchors: flat [w0, h0, w1, h1, ...] anchor list.
        nb_class: number of classes.

    Returns:
        list of BoundBox with normalized (0..1) center coordinates.

    BUGFIX: the original used ``xrange``, which does not exist in
    Python 3 (NameError); replaced with ``range``.
    """
    grid_h, grid_w, nb_box = netout.shape[:3]

    boxes = []

    # decode the output by the network: objectness -> sigmoid, class
    # scores -> objectness-weighted softmax, thresholded to zero
    netout[..., 4] = sigmoid(netout[..., 4])
    netout[..., 5:] = netout[..., 4][...,
                                     np.newaxis] * softmax(netout[..., 5:])
    netout[..., 5:] *= netout[..., 5:] > obj_threshold

    for row in range(grid_h):
        for col in range(grid_w):
            for b in range(nb_box):
                # from the 5th element onwards are the class scores
                classes = netout[row, col, b, 5:]

                if np.sum(classes) > 0:
                    # first 4 elements are x, y, w, and h
                    x, y, w, h = netout[row, col, b, :4]

                    # center position, unit: image width
                    x = (col + sigmoid(x)) / grid_w
                    # center position, unit: image height
                    y = (row + sigmoid(y)) / grid_h
                    w = anchors[2 * b + 0] * \
                        np.exp(w) / grid_w  # unit: image width
                    h = anchors[2 * b + 1] * \
                        np.exp(h) / grid_h  # unit: image height
                    confidence = netout[row, col, b, 4]

                    box = BoundBox(x, y, w, h, confidence, classes)

                    boxes.append(box)

    # suppress non-maximal boxes, one class at a time
    for c in range(nb_class):
        sorted_indices = list(
            reversed(np.argsort([box.classes[c] for box in boxes])))

        for i in range(len(sorted_indices)):
            index_i = sorted_indices[i]

            if boxes[index_i].classes[c] == 0:
                continue
            else:
                for j in range(i+1, len(sorted_indices)):
                    index_j = sorted_indices[j]

                    if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_threshold:
                        boxes[index_j].classes[c] = 0

    # remove the boxes which are less likely than obj_threshold
    boxes = [box for box in boxes if box.get_score() > obj_threshold]

    return boxes
|
|
||
|
|
||
def sigmoid(x):
    """Elementwise logistic function: 1 / (1 + e^-x)."""
    z = np.exp(-x)
    return 1. / (1. + z)
|
|
||
|
|
||
def softmax(x, axis=-1, t=-100.):
    """Softmax along ``axis`` with a crude underflow guard.

    The input is shifted by its global max for numerical stability; if
    the most negative shifted entry falls below ``t``, the whole tensor
    is rescaled so that entry equals ``t`` before exponentiating.
    """
    shifted = x - np.max(x)

    lowest = np.min(shifted)
    if lowest < t:
        # rescale so the most negative entry is exactly t
        shifted = shifted / lowest * t

    exps = np.exp(shifted)

    return exps / exps.sum(axis, keepdims=True)