In [1]:
import pandas as pd
from average_precision import Box, evaluate_detector

# Load bounding boxes

In [2]:
# Ground-truth annotations. The CSV has no header row, so the column names
# are supplied here: image name, class label, then the box columns x, y, w, h.
# NOTE(review): the labels shown in later outputs carry a leading space
# (' person'), presumably because fields are separated by ', ' -- confirm
# before stripping it, since the prediction labels must stay identical.
G = pd.read_csv(
    filepath_or_buffer='ground_truth_boxes.csv',
    names=['img_name', 'label', 'x', 'y', 'w', 'h'],
    header=None,
)

In [3]:
# Detected (predicted) boxes. Same headerless layout as the ground-truth
# file, with an extra 'confidence' column between the label and x, y, w, h.
D = pd.read_csv(
    filepath_or_buffer='predictions.csv',
    names=['img_name', 'label', 'confidence', 'x', 'y', 'w', 'h'],
    header=None,
)

# Convert to dict

In [4]:
def pandas_to_dict(dataframe, sort_by_area=False):
    """Group the rows of a bounding-box table by label, then by image.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'label' and 'img_name'. Each row is passed
        unchanged to ``Box``.
    sort_by_area : bool, optional
        When true, every per-image list is ordered by the ``area`` attribute
        of its boxes, largest first. Otherwise row order is kept.

    Returns
    -------
    dict
        ``{label: {img_name: [Box, ...]}}``. Labels appear in order of first
        occurrence in ``dataframe``.
    """
    grouped = {}

    for current_label in dataframe['label'].unique():
        # only the rows that belong to this label
        rows_with_label = dataframe.loc[dataframe['label'] == current_label]

        # image name -> boxes of this label found on that image
        by_image = {}
        for _, record in rows_with_label.iterrows():
            by_image.setdefault(record['img_name'], []).append(Box(record))

        if sort_by_area:
            # biggest boxes first within each image
            for box_list in by_image.values():
                box_list.sort(key=lambda b: b.area, reverse=True)

        grouped[current_label] = by_image

    return grouped

In [5]:
%%time
# Turn both tables into nested dictionaries:
#   label -> (image name -> list of boxes with that label on that image)
# Only the ground-truth lists are ordered by area (largest first).
gt_boxes_by_label = pandas_to_dict(G, sort_by_area=True)
pred_boxes_by_label = pandas_to_dict(D, sort_by_area=False)

CPU times: user 3.03 s, sys: 8 ms, total: 3.04 s
Wall time: 3.14 s


# Compute average precision for each class

In [6]:
%%time
# Collect one (label, average precision) pair per ground-truth label.
# NOTE: the lookup in pred_boxes_by_label raises KeyError for a label that
# has ground truth but no predictions at all.
APs = []
for label, gt_boxes_by_img in gt_boxes_by_label.items():
    pred_boxes_by_img = pred_boxes_by_label[label]
    ap = evaluate_detector(gt_boxes_by_img, pred_boxes_by_img)
    APs.append((label, ap))

CPU times: user 60 ms, sys: 0 ns, total: 60 ms
Wall time: 61.9 ms


In [7]:
# display the collected (label, average precision) pairs
APs

[(' person', 0.5879933675211664),
 (' car', 0.6727775922888186),
 (' bus', 1.0),
 (' bicycle', 0.2683673469387755)]

# Alternative method

In [8]:
from average_precision import iou, compute_ap

In [9]:
def evaluate_detector_ver2(ground_truth_boxes_by_img, detected_boxes_by_img,
                           thresholds=None, iou_threshold=0.5):
    """Estimate average precision by sweeping a confidence threshold.

    For every confidence threshold the detections at or above it are matched
    against the ground truth image by image. One (precision, recall) point is
    recorded per threshold and printed as ``threshold precision recall``.
    The collected points are then handed to ``compute_ap``.

    Parameters
    ----------
    ground_truth_boxes_by_img : dict
        Image name -> list of ground-truth boxes (single label).
    detected_boxes_by_img : dict
        Image name -> list of detected boxes; each box must expose a
        ``confidence`` attribute. Images missing from this dict are treated
        as having no detections.
    thresholds : iterable of float, optional
        Confidence thresholds to evaluate, in the order given. Defaults to
        0.9, 0.8, ..., 0.0. Pass a denser sequence for a more accurate
        AP estimate.
    iou_threshold : float, optional
        A detection can match a ground-truth box only if ``iou`` between the
        two is strictly greater than this value. Defaults to 0.5.

    Returns
    -------
    Whatever ``compute_ap(precision, recall)`` returns for the collected
    points. Thresholds at which nothing is detected contribute no point.
    """
    if thresholds is None:
        # increase number of thresholds to get more accurate AP estimation
        thresholds = [0.1*i for i in reversed(range(0, 10))]

    precision, recall = [], []

    for confidence_threshold in thresholds:

        TP, FP, FN = 0, 0, 0

        for img in ground_truth_boxes_by_img:

            ground_truth_boxes = ground_truth_boxes_by_img[img]

            # Fresh list per image and threshold, so removals below never
            # touch the caller's data. An image with ground truth but no
            # detections yields an empty list: all of its boxes become FN.
            detected_boxes = [
                b for b in detected_boxes_by_img.get(img, [])
                if b.confidence >= confidence_threshold
            ]

            for box in ground_truth_boxes:
                matched = [
                    b for b in detected_boxes
                    if iou(box, b) > iou_threshold
                ]
                if matched:
                    # The most confident overlapping detection is consumed,
                    # so it cannot be matched to another ground-truth box.
                    # max() returns the first maximal element, the same one
                    # a stable descending sort would place first.
                    best = max(matched, key=lambda b: b.confidence)
                    detected_boxes.remove(best)
                    TP += 1
                else:
                    FN += 1

            # whatever was not matched on this image is a false positive
            FP += len(detected_boxes)

        # also consider images without gt boxes but with predictions
        FP += sum(
            1
            for img, boxes in detected_boxes_by_img.items()
            if img not in ground_truth_boxes_by_img
            for b in boxes
            if b.confidence >= confidence_threshold
        )

        # with no detections at this threshold precision is undefined: skip
        if (TP + FP) != 0:
            precision += [float(TP)/float(TP + FP)]
            recall += [float(TP)/float(TP + FN)]
            print('{0:.1f} {1:.3f} {2:.3f}'.format(
                confidence_threshold, precision[-1], recall[-1]
            ))

    return compute_ap(precision, recall)

In [10]:
%%time
# Run the threshold-sweep evaluation for every ground-truth label. The
# function prints one line per threshold; the AP follows each table.
for label, gt_boxes_by_img in gt_boxes_by_label.items():
    print('label =' + label)
    pred_boxes_by_img = pred_boxes_by_label[label]
    print(
        'AP = {:.3f}'.format(evaluate_detector_ver2(gt_boxes_by_img, pred_boxes_by_img)),
        '\n'
    )

label = person
0.9 1.000 0.009
0.8 1.000 0.094
0.7 1.000 0.197
0.6 0.972 0.299
0.5 0.939 0.393
0.4 0.883 0.453
0.3 0.756 0.530
0.2 0.578 0.573
0.1 0.405 0.641
0.0 0.009 0.735
AP = 0.549 

label = car
0.8 1.000 0.091
0.7 1.000 0.182
0.6 0.923 0.364
0.5 0.842 0.485
0.4 0.810 0.515
0.3 0.786 0.667
0.2 0.686 0.727
0.1 0.545 0.727
0.0 0.008 0.758
AP = 0.637 

label = bus
0.8 1.000 0.250
0.7 1.000 0.500
0.6 1.000 0.500
0.5 1.000 0.500
0.4 1.000 0.500
0.3 1.000 1.000
0.2 1.000 1.000
0.1 0.800 1.000
0.0 0.005 1.000
AP = 1.000 

label = bicycle
0.8 1.000 0.143
0.7 1.000 0.214
0.6 1.000 0.214
0.5 1.000 0.214
0.4 1.000 0.214
0.3 1.000 0.214
0.2 0.750 0.214
0.1 0.600 0.214
0.0 0.007 0.357
AP = 0.215 

CPU times: user 68 ms, sys: 4 ms, total: 72 ms
Wall time: 69.3 ms
