In [1]:
import numpy as np
import torch
import torch.nn as nn
from google.colab import drive
# Mount Google Drive under /content/gdrive/ so the checkpoint and the .npy dataset
# files referenced later in the notebook can be read; force_remount=True asks Colab
# to mount again even when a mount is already present.
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [2]:
def conv_batch(in_num, out_num, kernel_size=3, padding=1, stride=1):
    """Build a Conv2d -> BatchNorm2d -> LeakyReLU unit.

    The convolution has no bias term (``bias=False``); it is followed directly
    by batch normalisation and a LeakyReLU with PyTorch's default slope.

    :param in_num: number of input channels
    :param out_num: number of output channels
    :param kernel_size: convolution kernel size (default 3)
    :param padding: zero padding applied by the convolution (default 1)
    :param stride: convolution stride (default 1)
    :return: ``nn.Sequential`` holding the three layers at indices 0, 1 and 2
    """
    convolution = nn.Conv2d(
        in_num,
        out_num,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        bias=False,
    )
    normalisation = nn.BatchNorm2d(out_num)
    activation = nn.LeakyReLU()
    return nn.Sequential(convolution, normalisation, activation)

Residual block

In [3]:
class DarkResidualBlock(nn.Module):
    """Two-convolution bottleneck with an identity skip connection.

    ``layer1`` is a 1x1 convolution that halves the channel count and
    ``layer2`` is a ``conv_batch`` with default settings that restores it, so
    the block's output has the same number of channels as its input and can be
    added to it.
    """

    def __init__(self, in_channels):
        """
        :param in_channels: number of channels of the input (and of the output)
        """
        super().__init__()
        half_channels = int(in_channels / 2)
        # Squeeze: 1x1 convolution, no padding needed.
        self.layer1 = conv_batch(in_channels, half_channels, kernel_size=1, padding=0)
        # Expand back to the original width so the skip connection lines up.
        self.layer2 = conv_batch(half_channels, in_channels)

    def forward(self, x):
        """Return ``layer2(layer1(x)) + x``."""
        out = self.layer2(self.layer1(x))
        # In-place add of the unmodified input (the residual path).
        out += x
        return out

In [4]:
class Darknet53(nn.Module):
    """Darknet-53 style backbone followed by a linear head.

    The network starts with a 3 -> 32 channel convolution and then alternates
    stride-2 convolutions (which double the channel count) with groups of
    1, 2, 8, 8 and 4 residual blocks. The 1024-channel feature map is reduced
    with global average pooling and mapped to ``num_classes`` outputs by a
    fully connected layer.
    """

    def __init__(self, block, num_classes):
        """
        :param block: callable ``block(in_channels)`` returning the residual module
            to stack inside each group
        :param num_classes: size of the final linear layer's output
        """
        super().__init__()
        self.num_classes = num_classes

        # Stem.
        self.conv1 = conv_batch(3, 32)

        # Each stage: a stride-2 convolution, then a stack of residual blocks.
        self.conv2 = conv_batch(32, 64, stride=2)
        self.residual_block1 = self.make_layer(block, in_channels=64, num_blocks=1)

        self.conv3 = conv_batch(64, 128, stride=2)
        self.residual_block2 = self.make_layer(block, in_channels=128, num_blocks=2)

        self.conv4 = conv_batch(128, 256, stride=2)
        self.residual_block3 = self.make_layer(block, in_channels=256, num_blocks=8)

        self.conv5 = conv_batch(256, 512, stride=2)
        self.residual_block4 = self.make_layer(block, in_channels=512, num_blocks=8)

        self.conv6 = conv_batch(512, 1024, stride=2)
        self.residual_block5 = self.make_layer(block, in_channels=1024, num_blocks=4)

        # Head.
        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(1024, self.num_classes)

    def forward(self, x):
        """Run the backbone and head; returns a tensor with ``num_classes`` columns."""
        stages = (
            self.conv1,
            self.conv2, self.residual_block1,
            self.conv3, self.residual_block2,
            self.conv4, self.residual_block3,
            self.conv5, self.residual_block4,
            self.conv6, self.residual_block5,
        )
        features = x
        for stage in stages:
            features = stage(features)
        pooled = self.global_avg_pool(features)
        # Collapse the 1x1 spatial dimensions so each row is one 1024-vector.
        flat = pooled.view(-1, 1024)
        return self.fc(flat)

    def make_layer(self, block, in_channels, num_blocks):
        """Stack ``num_blocks`` instances of ``block(in_channels)`` into a Sequential."""
        return nn.Sequential(*[block(in_channels) for _ in range(num_blocks)])

In [5]:
def darknet53(num_classes):
    """Create a Darknet53 network built from DarkResidualBlock units.

    :param num_classes: number of outputs of the final linear layer
    :return: a freshly initialised ``Darknet53`` instance
    """
    network = Darknet53(block=DarkResidualBlock, num_classes=num_classes)
    return network

Modified predict(image) function for Darknet-53

In [6]:
class Darknet:
    """Inference wrapper around a Darknet-53 checkpoint with 28 outputs.

    The 28 outputs are interpreted as 10 scores for the first digit, 10 scores
    for the second digit and 8 bounding-box values (two boxes of four).
    """

    def __init__(self, pth):
        """Load the weights stored at ``pth`` onto the GPU when available, else the CPU.

        :param pth: path to a state-dict checkpoint compatible with ``darknet53(28)``
        """
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        print(self.device)
        network = darknet53(28)
        # NOTE(review): torch.load unpickles the file, so only trusted checkpoints
        # should be passed here.
        state = torch.load(pth, map_location=self.device)
        network.load_state_dict(state)
        self.Darknet53 = network.to(self.device)

    def predict(self, image):
        """Predict both digit labels and both boxes for one image.

        :param image: NumPy array that can be reshaped to (64, 64, 3); values are
            divided by 255 before being fed to the network
        :return: ``(pred_class, pred_box)`` where ``pred_class`` holds the two
            argmax digit labels and ``pred_box`` is a (2, 4) integer array
        """
        # Inference mode: fixed batch-norm statistics, and no gradients below.
        self.Darknet53.eval()
        pixels = image.reshape(64, 64, 3)
        with torch.no_grad():
            scaled = pixels.astype(np.float32) / 255.
            # HWC -> CHW, then add a batch dimension of one.
            batch = torch.from_numpy(scaled).permute(2, 0, 1).unsqueeze(0)
            batch = batch.to(self.device)
            output = self.Darknet53(batch)

            # First 20 values: one row of 10 scores per digit.
            digit_scores = output[:, :20].contiguous().view(-1, 10)
            # Remaining 8 values: the two boxes.
            box_values = output[:, 20:]

            # The label of each digit is the index of its highest score.
            pred_class = torch.argmax(digit_scores, dim=1).cpu().numpy()
            # .long() converts the regressed coordinates to integers by truncation.
            pred_box = box_values.long().cpu().numpy()[0].reshape(2, 4)
        return pred_class, pred_box

In [7]:
def classify_and_detect(images):
    """
    Predict the two digit labels and their bounding boxes for every image.

    :param np.ndarray images: N flattened images, one per row; each row is reshaped
        to 64 x 64 x 3 by Darknet.predict
    :return: np.ndarray, np.ndarray -- labels of shape [N, 2] (int32) and
        boxes of shape [N, 2, 4] (float64)
    """
    N = images.shape[0]

    # Output buffers, filled one image at a time below.
    pred_class = np.empty((N, 2), dtype=np.int32)
    pred_bboxes = np.empty((N, 2, 4), dtype=np.float64)

    model = Darknet("/content/gdrive/My Drive/visual_recognition_data/checkpoint_32.pth")
    for i in range(N):
        label, box = model.predict(images[i, :])
        # Keep only the predicted first corner of each box; the opposite corner is
        # forced to lie 28 pixels further along both axes.
        box[:, 2:] = box[:, :2] + 28
        pred_class[i, :] = label
        pred_bboxes[i, :] = box
        if i % 100 == 0:
            print('Evaluating: [{}/{} ({:.0f}%)]\n'.format(i, N, (i/N*100)))
    return pred_class, pred_bboxes

In [8]:
import timeit
import numpy as np
from skimage.draw import polygon

In [9]:
def resize_ar(src_img, width=0, height=0, return_factors=False,
              placement_type=0):
    """Resize an image to (width, height) while preserving its aspect ratio.

    The source is first pasted onto a zero-filled uint8 canvas that has the
    target aspect ratio, and that canvas is then resized with ``cv2.resize``.

    :param src_img: array of shape (height, width, channels)
    :param width: target width; when <= 0 it is derived from ``height``
    :param height: target height; when <= 0 it is derived from ``width``
    :param return_factors: also return the resize factor and paste offsets
    :param placement_type: where the source sits on the padded canvas along the
        padded axis: 0 = start (top/left), 2 = end (bottom/right), 1 or any other
        value = centred
    :return: the resized image, or ``(image, resize_factor, start_row, start_col)``
        when ``return_factors`` is true
    :raises AssertionError: when both ``width`` and ``height`` are <= 0
    """
    import cv2

    src_height, src_width, n_channels = src_img.shape
    src_aspect_ratio = float(src_width) / float(src_height)

    if width <= 0 and height <= 0:
        raise AssertionError('Both width and height cannot be zero')
    if height <= 0:
        height = int(width / src_aspect_ratio)
    elif width <= 0:
        width = int(height * src_aspect_ratio)

    aspect_ratio = float(width) / float(height)

    def _paste_offset(slack):
        # Offset of the source along the padded axis, given the spare pixels.
        if placement_type == 0:
            return 0
        if placement_type == 2:
            return int(slack)
        return int(slack / 2.0)

    # Start from "no padding" and grow exactly one axis if the ratios differ.
    dst_height, dst_width = src_height, src_width
    start_row = start_col = 0
    if src_aspect_ratio > aspect_ratio:
        # Source is relatively wider than the target: pad rows.
        dst_height = int(src_width / aspect_ratio)
        start_row = _paste_offset(dst_height - src_height)
    elif src_aspect_ratio != aspect_ratio:
        # Source is relatively taller than the target: pad columns.
        dst_width = int(src_height * aspect_ratio)
        start_col = _paste_offset(dst_width - src_width)

    canvas = np.zeros((dst_height, dst_width, n_channels), dtype=np.uint8)
    canvas[start_row:start_row + src_height, start_col:start_col + src_width, :] = src_img
    dst_img = cv2.resize(canvas, (width, height))

    if not return_factors:
        return dst_img
    resize_factor = float(height) / float(dst_height)
    return dst_img, resize_factor, start_row, start_col

In [10]:
def compute_classification_acc(pred, gt):
    """Return the fraction of entries of ``pred`` that equal the matching entry of ``gt``.

    :param pred: predicted labels (NumPy array)
    :param gt: ground-truth labels, same shape as ``pred``
    :return: number of element-wise matches divided by ``gt.size``
    :raises AssertionError: when the two shapes differ
    """
    assert pred.shape == gt.shape
    matches = pred == gt
    n_correct = matches.astype(int).sum()
    return n_correct / gt.size

In [11]:
def compute_iou(b_pred, b_gt, img_shape=(64, 64)):
    """
    Mean intersection-over-union between predicted and ground-truth boxes.

    Each box is rasterised with ``skimage.draw.polygon`` onto an image-sized
    mask (clipped to the image), and the IoU is computed on those pixel masks.
    Box values at indices 0 and 2 are used as row coordinates and those at
    indices 1 and 3 as column coordinates. Every one of the ``2 * n`` box
    pairs contributes with equal weight.

    A pair whose union is empty (both rasterised boxes cover no pixel)
    contributes 0 instead of producing a 0/0 NaN that would poison the mean.

    :param b_pred: predicted bounding boxes, shape=(n,2,4)
    :param b_gt: ground truth bounding boxes, shape=(n,2,4)
    :param img_shape: (rows, cols) of the image the boxes live in
    :return: mean IoU over all box pairs (0.0 when n is 0)
    """
    n = np.shape(b_gt)[0]
    img_shape = tuple(img_shape)
    # Scratch masks, cleared and reused for every pair.
    pred_mask = np.zeros(img_shape, dtype=bool)
    gt_mask = np.zeros(img_shape, dtype=bool)
    iou = 0.0
    for i in range(n):
        for b in range(2):
            for mask, boxes in ((pred_mask, b_pred), (gt_mask, b_gt)):
                box = boxes[i, b]
                mask[...] = False
                rr, cc = polygon([box[0], box[0], box[2], box[2]],
                                 [box[1], box[3], box[3], box[1]], img_shape)
                mask[rr, cc] = True
            union = np.sum(pred_mask | gt_mask)
            if union > 0:
                intersection = np.sum(pred_mask & gt_mask)
                iou += (1.0 / (2 * n)) * (intersection / union)
    return iou

In [12]:
class A7_Params:
    """Settings read by main() for one evaluation run."""

    def __init__(self):
        # Dataset split to evaluate; main() loads "<prefix>_X.npy", "<prefix>_Y.npy"
        # and "<prefix>_bboxes.npy". Alternatives used during development:
        # "test" and "train".
        self.prefix = "valid"

        # Visualisation: truthy `vis` shows every image after scoring, resized to
        # `vis_size`; truthy `show_pred` adds the predictions next to the ground truth.
        self.vis = 0
        self.vis_size = (300, 300)
        self.show_pred = 1

        # Scoring: below `speed_thresh` images/second the overall score is 0; the
        # (min, max) pairs are the thresholds handed to compute_score().
        self.speed_thresh = 10
        self.acc_thresh = (0.7, 0.98)
        self.iou_thresh = (0.7, 0.98)

In [13]:
def compute_score(res, thresh):
    """Convert a metric value into a score between 0 and 100.

    :param res: metric value to score
    :param thresh: ``(min, max)`` pair; values below ``min`` score 0.0, values
        above ``max`` score 100.0, and values in between are mapped linearly
    :return: the score as a float
    """
    lower, upper = thresh
    if res < lower:
        return 0.0
    if res > upper:
        return 100.0
    # Linear interpolation between the two thresholds.
    return float(res - lower) / (upper - lower) * 100

In [14]:
def draw_bboxes(img, bbox_1, bbox_2, y1, y2, vis_size):
    """Draw two labelled boxes on ``img`` and return it resized for display.

    The rectangles and labels are drawn directly onto ``img`` (it is modified
    in place); the returned array is the output of ``resize_ar``.

    :param img: image to annotate
    :param bbox_1: first box as ``(ymin, xmin, ymax, xmax)``, drawn with colour (0, 255, 0)
    :param bbox_2: second box as ``(ymin, xmin, ymax, xmax)``, drawn with colour (255, 0, 0)
    :param y1: integer label written at the first box's (xmin, ymin) corner
    :param y2: integer label written at the second box's (xmin, ymin) corner
    :param vis_size: ``(width, height)`` forwarded to ``resize_ar``
    :return: the annotated image resized to ``vis_size``
    """
    import cv2

    annotations = (
        (bbox_1, y1, (0, 255, 0)),
        (bbox_2, y2, (255, 0, 0)),
    )
    for bbox, label, color in annotations:
        # Cast once to plain Python ints and use them for BOTH calls: the box
        # values typically arrive as NumPy scalars, which some OpenCV builds
        # refuse as point coordinates.
        ymin, xmin, ymax, xmax = (int(v) for v in bbox)
        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, thickness=1)
        cv2.putText(img, '{:d}'.format(label), (xmin, ymin),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 0.4, color)
    img = resize_ar(img, *vis_size)
    return img

In [15]:
def main():
    """Evaluate classify_and_detect on one dataset split and print the scores.

    Loads the images, labels and boxes of the split named by ``A7_Params.prefix``,
    times the predictions, then reports accuracy, IoU, speed and the derived
    scores. When ``params.vis`` is set, every image is afterwards shown with its
    ground truth (and optionally the predictions) until escape is pressed.
    """
    params = A7_Params()
    try:
        import paramparse
    except ImportError:
        # paramparse is optional; without it the defaults above are used.
        pass
    else:
        paramparse.process(params)

    prefix = params.prefix
    data_root = "/content/gdrive/My Drive/visual_recognition_data/"
    images = np.load(data_root + prefix + "_X.npy")
    gt_classes = np.load(data_root + prefix + "_Y.npy")
    gt_bboxes = np.load(data_root + prefix + "_bboxes.npy")
    n_images = images.shape[0]
    print(f'running on {n_images} {prefix} images')

    # Time the whole prediction pass (model loading included).
    start_t = timeit.default_timer()
    pred_classes, pred_bboxes = classify_and_detect(images)
    end_t = timeit.default_timer()
    test_time = end_t - start_t
    assert test_time > 0, "test_time cannot be 0"
    test_speed = float(n_images) / test_time

    acc = compute_classification_acc(pred_classes, gt_classes)
    iou = compute_iou(pred_bboxes, gt_bboxes)
    acc_score = compute_score(acc, params.acc_thresh)
    iou_score = compute_score(iou, params.iou_thresh)
    # Too slow means no credit at all, regardless of quality.
    overall_score = 0 if test_speed < params.speed_thresh else (iou_score + acc_score) / 2

    print(f"Classification Accuracy: {acc:.3f}")
    print(f"Detection IOU: {iou:.3f}")
    print(f"Test time: {test_time:.3f} seconds")
    print(f"Test speed: {test_speed:.3f} images / second")
    print(f"Classification Score: {acc_score:.3f}")
    print(f"IOU Score: {iou_score:.3f}")
    print(f"Overall Score: {overall_score:.3f}")

    if not params.vis:
        return

    import cv2
    print('press space to taggle pause after each frame and escape to quit')
    pause_after_frame = 1
    for img_id in range(n_images):
        src_img = images[img_id, ...].squeeze().reshape((64, 64, 3)).astype(np.uint8)

        # Ground truth, drawn on its own copy of the frame.
        gt_box_1 = gt_bboxes[img_id, 0, :].squeeze().astype(np.int32)
        gt_box_2 = gt_bboxes[img_id, 1, :].squeeze().astype(np.int32)
        y1, y2 = gt_classes[img_id, ...].squeeze()
        vis_img = draw_bboxes(np.copy(src_img), gt_box_1, gt_box_2, y1, y2, params.vis_size)

        if params.show_pred:
            # Predictions, drawn on a second copy and placed to the right.
            pred_box_1 = pred_bboxes[img_id, 0, :].squeeze().astype(np.int32)
            pred_box_2 = pred_bboxes[img_id, 1, :].squeeze().astype(np.int32)
            y1, y2 = pred_classes[img_id, ...].squeeze()
            vis_img_det = draw_bboxes(np.copy(src_img), pred_box_1, pred_box_2, y1, y2,
                                      params.vis_size)
            vis_img = np.concatenate((vis_img, vis_img_det), axis=1)

        cv2.imshow('vis_img', vis_img)
        # waitKey(0) blocks until a key press; waitKey(1) keeps playing.
        key = cv2.waitKey(1 - pause_after_frame)
        if key == 27:
            # Escape quits.
            return
        elif key == 32:
            # Space toggles between paused and continuous playback.
            pause_after_frame = 1 - pause_after_frame
# Run the evaluation as soon as this cell is executed.
main()

running on 5000 valid images
cuda


















































Classification Accuracy: 0.981
Detection IOU: 0.898
Test time: 58.140 seconds
Test speed: 86.000 images / second
Classification Score: 100.000
IOU Score: 70.574
Overall Score: 85.287
