In [2]:
from __future__ import print_function, division
import torch
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import cv2
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import scipy.io as sio
import torch.nn.functional as F
from ctpn_model import CTPN_Model
from ctpn_utils import gen_anchor, bbox_transfor_inv, clip_box, filter_bbox,nms, TextProposalConnectorOriented
import config
from shapely.geometry import Polygon

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")


In [3]:
def rescale(image, label, new_w =None, new_h =None):
    old_h, old_w = image.shape[:2]
    #new_h, new_w = int(new_h), int(new_w)
    # if both the width and height are None, then return the
    # original image
    if new_w is None and new_h is None:
        new_w = old_w
        new_h = old_h
        
    # check to see if the width is None
    elif new_w is None and new_h is not None:
        # calculate the ratio of the height and construct the
        # dimensions
        r = new_h / float(old_h)
        new_w = int(old_w * r)
    
    # otherwise, the height is None
    elif new_w is not None and new_h is None:
        # calculate the ratio of the width and construct the
        # dimensions
        r = new_w / float(old_w)
        new_h = int(old_h * r)
    
    image = cv2.resize(image, (new_w, new_h))
    
    label[:,0] = label[:,0] * (new_w/old_w)
    label[:,1] = label[:,1] * (new_h/old_h)
    label[:,2] = label[:,2] * (new_w/old_w)
    label[:,3] = label[:,3] * (new_h/old_h)
    label[:,4] = label[:,4] * (new_w/old_w)
    label[:,5] = label[:,5] * (new_h/old_h)
    
    return image, label

    

In [4]:
class SceneTextDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, mat_paths, root_dir):
        """
        Args:
            mat_file (string): Path to the mat file.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.mat_paths = mat_paths
        self.root_dir = root_dir

    def __len__(self):
        return (1555)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
                   
        img_name = os.path.join(self.root_dir,'img'+ str(idx) + '.jpg')
        image = cv2.imread(img_name)
        label_name = os.path.join(self.mat_paths, 'rect_gt_img' + str(idx)+ '.mat')                                          
        label = sio.loadmat(label_name)
        label = list(label.items())
        label = np.array(label)
        label = label[3,1]
        label = label[:,0:6]
        label = label.astype(float)
        
        transformed_image, transformed_label = rescale(image, label, new_h=720)
        
        return transformed_image, transformed_label

In [5]:
# Splitting the data into training and test sets and loading the two sets into dataloaders

text_dataset =SceneTextDataset(mat_paths='E:/CIE/CIE 3/Spring 2020/Computer Vision/Final Project/Dataset/Groundtruth/Rectangular/All',
                                    root_dir='E:/CIE/CIE 3/Spring 2020/Computer Vision/Final Project/Dataset/Images/All')

train_set, test_set = torch.utils.data.random_split(text_dataset, [1455, 100])

dataloader_train = DataLoader(train_set,batch_size=1, shuffle=True,num_workers=0)

dataloader_test = DataLoader(test_set,batch_size=1, shuffle=True,num_workers=0)
    

In [6]:

# Utilites functions to make the detection on the test data

prob_thresh = 0.5
gpu = True
if not torch.cuda.is_available():
    gpu = False
device = torch.device('cuda:0' if gpu else 'cpu')
weights = os.path.join(config.checkpoints_dir, 'CTPN.pth')
model = CTPN_Model()
model.load_state_dict(torch.load(weights, map_location=device)['model_state_dict'])
model.to(device)
model.eval()



def dis(image):
    cv2.imshow('image', image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

def get_2_points(text):
    '''
     Objective: to drop the unused elements from the input text and return only the 2 points to compare them with the opposite 2 points in the label.
     Input: a numpy array that retrieved from the model and represent the coordinates of the points that used to draw the detection box.
     output: a numpy array that have only 2 points.
    '''

    text = np.delete(text, np.s_[2:6], axis=1)
    text = text[:, 0:4]

    return text


def get_4_points(label):
    '''
    Objective: to return the four points coordinates from the two points in the label array
    Input: the label with shape (1,4) has 2 points: (xmin, ymin) and (xmax, ymax)
    output: 4 points (xmin, ymin) (x1, y1) (x2, y2) (xmax, ymax)
    '''
    p1 = (label[:, 0], label[:, 1])
    p2 = (label[:, 2], label[:, 1])
    p3 = (label[:, 2], label[:, 3])
    p4 = (label[:, 0], label[:, 3])
   
    return p1, p2, p3, p4

def intersect(p1, p2, p3, p4, m1, m2, m3, m4):
    '''
     Objective: to calculate the area of the intersection between 2 rectangles.
     Inputs: 8 tuples: 4 tuples for each rectangle represent the 4 points to draw it.
     Output: the area of the intersection (float)
    '''
    box1 = Polygon([p1, p2, p3, p4])
    box2 = Polygon([m1, m2, m3, m4])
    intersection = box1.intersection(box2)
    return intersection.area


def get_iou(box_a, box_b):
    """Compute the jaccard overlap of two sets of boxes.  The jaccard overlap
    is simply the intersection over union of two boxes.  Here we operate on
    ground truth boxes and default boxes.
    E.g.:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
    Args:
        box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
        box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
    Return:
        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
    """
    a1, a2, a3, a4 = get_4_points(box_a)
    b1, b2, b3, b4 = get_4_points(box_b)
    inter = intersect(a1, a2, a3, a4, b1, b2, b3, b4)
    area_a = ((box_a[:, 2]-box_a[:, 0]) *
              (box_a[:, 3]-box_a[:, 1])).unsqueeze(1)  # [A,B]
    area_b = ((box_b[:, 2]-box_b[:, 0]) *
              (box_b[:, 3]-box_b[:, 1])).unsqueeze(0)  # [A,B]
    union = area_a + area_b - inter
    return inter / union  # [A,B]


def get_accuracy(label, target):
    '''
    Objective: get an image that contains multiple detection boxes and loop over them to get the iou accuracy of each box and then get the average accuracy
               of all boxes to represent the accuracy for this image.
    Input:
           label: tensor of the groundtruth labels of one image
           target: tensor of the label that returned by the model for one image
    Output: a float number represents the iou accuracy of one image
    '''
    target_rows = target.shape[0]
    label_rows = label.shape[0]
    iou_list = []
    for t in range(target_rows):
        for L in range(label_rows):
            iou_box = get_iou(label[L, :].unsqueeze(0), target[t, :].unsqueeze(0))
            iou_list.append(iou_box)
    iou_list = sorted(iou_list, reverse=True)
    iou_list = np.asarray(iou_list)
    final_list = iou_list[0:target_rows]
    acc_image = sum(final_list)/len(final_list)

    return acc_image


def get_det_boxes(image,display = True, expand = True):
    image_r = image.copy()
    image_c = image.copy()
    h, w = image.shape[:2]
    image = image.astype(np.float32) - config.IMAGE_MEAN
    image = torch.from_numpy(image.transpose(2, 0, 1)).unsqueeze(0).float()
    
    with torch.no_grad():
        image = image.to(device)
        cls, regr = model(image)
        cls_prob = F.softmax(cls, dim=-1).cpu().numpy()
        regr = regr.cpu().numpy()
        anchor = gen_anchor((int(h / 16), int(w / 16)), 16)
        bbox = bbox_transfor_inv(anchor, regr)
        bbox = clip_box(bbox, [h, w])
        # print(bbox.shape)

        fg = np.where(cls_prob[0, :, 1] > prob_thresh)[0]
        # print(np.max(cls_prob[0, :, 1]))
        select_anchor = bbox[fg, :]
        select_score = cls_prob[0, fg, 1]
        select_anchor = select_anchor.astype(np.int32)
        # print(select_anchor.shape)
        keep_index = filter_bbox(select_anchor, 16)

        # nms
        select_anchor = select_anchor[keep_index]
        select_score = select_score[keep_index]
        select_score = np.reshape(select_score, (select_score.shape[0], 1))
        nmsbox = np.hstack((select_anchor, select_score))
        keep = nms(nmsbox, 0.3)
        # print(keep)
        select_anchor = select_anchor[keep]
        select_score = select_score[keep]

        # text line-
        textConn = TextProposalConnectorOriented()
        text = textConn.get_text_lines(select_anchor, select_score, [h, w])

        # expand text

        if expand:
            for idx in range(len(text)):
                text[idx][0] = max(text[idx][0] - 10, 0)
                text[idx][2] = min(text[idx][2] + 10, w - 1)
                text[idx][4] = max(text[idx][4] - 10, 0)
                text[idx][6] = min(text[idx][6] + 10, w - 1)



        # print(text)
        if display:

            blank = np.zeros(image_c.shape,dtype=np.uint8)
            for box in select_anchor:
                pt1 = (box[0], box[1])
                pt2 = (box[2], box[3])
                blank = cv2.rectangle(blank, pt1, pt2, (50, 0, 0), -1)
            image_c = image_c+blank
            image_c[image_c>255] = 255


            for i in text:
                s = str(round(i[-1] * 100, 2)) + '%'
                i = [int(j) for j in i]
                cv2.line(image_c, (i[0], i[1]), (i[2], i[3]), (0, 0, 255), 2)
                cv2.line(image_c, (i[0], i[1]), (i[4], i[5]), (0, 0, 255), 2)
                cv2.line(image_c, (i[6], i[7]), (i[2], i[3]), (0, 0, 255), 2)
                cv2.line(image_c, (i[4], i[5]), (i[6], i[7]), (0, 0, 255), 2)
                cv2.putText(image_c, s, (i[0]+13, i[1]+13),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            1,
                            (255,0,0),
                            2,
                            cv2.LINE_AA)
            # dis(image_c)


        return text, image_c, image_r                   #text[:,1:5]



In [8]:
# The driver code to test the test data and get the iou accuracy
if __name__ == '__main__':
    all_acc_list = []
    c = 0
    for image, label in dataloader_test:
      
        image = np.asarray(image).squeeze(0)
        label = np.asarray(label).squeeze(0)
        label = label[:, 0:4]
        text, image_c, image_r = get_det_boxes(image)
        print(text.shape)
        if len(text) == 0:
            image, label = next(iter(dataloader_test))
        else:
            text = get_2_points(text)
            text = torch.from_numpy(text)
            label = torch.from_numpy(label)
            #dis(image_c)
            acc = get_accuracy(label, text)
                
        c += 1
        print(c)
    
    all_acc_list.append(acc)    
    test_acc = sum(all_acc_list)/len(all_acc_list)
    print(test_acc)

(8, 9)
1
(4, 9)
2
(5, 9)
3
(5, 9)
4
(2, 9)
5
(5, 9)
6
(5, 9)
7
(0, 9)
8
(0, 9)
9
(4, 9)
10
(3, 9)
11
(2, 9)
12
(2, 9)
13
(2, 9)
14
(3, 9)
15
(11, 9)
16
(9, 9)
17
(4, 9)
18
(4, 9)
19
(2, 9)
20
(4, 9)
21
(2, 9)
22
(3, 9)
23
(7, 9)
24
(0, 9)
25
(1, 9)
26
(2, 9)
27
(0, 9)
28
(1, 9)
29
(2, 9)
30
(7, 9)
31
(4, 9)
32
(2, 9)
33
(2, 9)
34
(2, 9)
35
(0, 9)
36
(2, 9)
37
(1, 9)
38
(2, 9)
39
(1, 9)
40
(5, 9)
41
(5, 9)
42
(1, 9)
43
(6, 9)
44
(1, 9)
45
(2, 9)
46
(1, 9)
47
(1, 9)
48
(3, 9)
49
(1, 9)
50
(2, 9)
51
(1, 9)
52
(0, 9)
53
(4, 9)
54
(1, 9)
55
(2, 9)
56
(2, 9)
57
(10, 9)
58
(2, 9)
59
(2, 9)
60
(2, 9)
61
(5, 9)
62
(3, 9)
63
(1, 9)
64
(2, 9)
65
(6, 9)
66
(6, 9)
67
(2, 9)
68
(4, 9)
69
(0, 9)
70
(6, 9)
71
(1, 9)
72
(5, 9)
73
(4, 9)
74
(3, 9)
75
(1, 9)
76
(5, 9)
77
(1, 9)
78
(3, 9)
79
(3, 9)
80
(1, 9)
81
(6, 9)
82
(3, 9)
83
(0, 9)
84
(2, 9)
85
(2, 9)
86
(5, 9)
87
(1, 9)
88
(1, 9)
89
(0, 9)
90
(4, 9)
91
(6, 9)
92
(6, 9)
93
(2, 9)
94
(3, 9)
95
(0, 9)
96
(10, 9)
97
(0, 9)
98
(6, 9)
99
(2, 9)
100
0.518