In [77]:
%%capture
import mrcnn
import mrcnn.config
import mrcnn.model
import mrcnn.visualize
import cv2
import os
import matplotlib.pyplot as plt
import numpy as np

In [78]:
%%capture
# Class labels; a detection's class id is used as the index into this list
# (index 0 is the 'BG' entry).
CLASS_NAMES = [
    'BG',
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
    'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
    'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
    'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli',
    'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet',
    'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]

class SimpleConfig(mrcnn.config.Config):
    """Configuration handed to the MaskRCNN model built in this notebook.

    Sets the configuration name, the class count and the GPU / batch
    settings; everything else is inherited from ``mrcnn.config.Config``.
    """

    NAME = "coco_inference"

    # One class per entry of CLASS_NAMES.
    NUM_CLASSES = len(CLASS_NAMES)

    # A single GPU handling a single image at a time.
    IMAGES_PER_GPU = 1
    GPU_COUNT = 1

# Build the network in inference mode, using the current working directory
# as model_dir.
inference_config = SimpleConfig()
model = mrcnn.model.MaskRCNN(
    mode="inference",
    config=inference_config,
    model_dir=os.getcwd(),
)

# Load the weights stored in the .h5 file (by_name=True).
model.load_weights(
    "/home/kabilan/Desktop/caption/dataset/mask_rcnn_coco.h5",
    by_name=True,
)

In [79]:
def get_objects_and_locations(image_path):
    """Run the notebook-level ``model`` on a single image file.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.

    Returns:
        The first element of ``model.detect([image], verbose=0)``. The rest
        of this notebook reads its ``'rois'``, ``'class_ids'`` and
        ``'scores'`` entries.

    Raises:
        FileNotFoundError: If OpenCV could not read ``image_path``.
    """
    bgr_image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if bgr_image is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # rather than raising; fail here with a clear message instead of
        # letting cvtColor raise an obscure error further down.
        raise FileNotFoundError(f"Could not read image: {image_path!r}")

    # The detector is fed an RGB image, so convert from OpenCV's BGR order.
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
    return model.detect([rgb_image], verbose=0)[0]

In [80]:
# Image used by the demo cells of this notebook.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
img_path = "/home/kabilan/Desktop/caption/dataset/images/10815824_2997e03d76.jpg"

In [83]:
class Node:
    """One detected object: two bounding-box corners plus class information.

    Attributes:
        start: First corner of the box, stored exactly as passed in.
        end: Second corner of the box, stored exactly as passed in.
        class_id: Numeric class identifier of the detection.
        class_name: Text label of the detection.
        reference_name: Name used for this node when building sentences.
    """

    def __init__(self, start, end, class_id, class_name, reference_name=None):
        """Store the box corners and class information.

        Args:
            start: First corner of the bounding box.
            end: Second corner of the bounding box.
            class_id: Numeric class identifier.
            class_name: Text label for the class.
            reference_name: Optional name to use in sentences. Defaults to
                ``class_name`` when omitted.
        """
        self.start = start
        self.end = end
        self.class_id = class_id
        self.class_name = class_name
        # Always define reference_name so that sentence building works even
        # on nodes that have not been numbered yet.
        self.reference_name = class_name if reference_name is None else reference_name

In [81]:
# Read the sample image as a colour image and convert it from BGR to RGB.
bgr_image = cv2.imread(img_path, cv2.IMREAD_COLOR)
image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

In [84]:
# Notebook-level list that collects the Node objects built in the next cell.
objects = list()

In [85]:
# Run detection in this cell so it does not rely on a `res` variable left
# over from an earlier kernel session.
res = get_objects_and_locations(img_path)

# Collect one Node per detection whose score is above 0.8.
for i, points in enumerate(res['rois']):
    class_id = res['class_ids'][i]
    class_name = CLASS_NAMES[class_id]
    score = res['scores'][i]

    # Each corner swaps a pair of roi values.
    # NOTE(review): presumably rois rows are (y1, x1, y2, x2), making these
    # (x, y) points — confirm against the Mask R-CNN documentation.
    start = (points[1], points[0])
    end = (points[3], points[2])

    if score > 0.8:
        objects.append(Node(start, end, class_id, class_name))

In [89]:
# Show the attribute dictionary of every collected node.
for node in objects:
    print(vars(node))

{'start': (12, 44), 'end': (260, 309), 'class_id': 18, 'class_name': 'horse', 'reference_name': 'horse 1'}
{'start': (235, 53), 'end': (319, 317), 'class_id': 1, 'class_name': 'person', 'reference_name': 'person 1'}
{'start': (377, 51), 'end': (416, 156), 'class_id': 1, 'class_name': 'person', 'reference_name': 'person 2'}
{'start': (409, 68), 'end': (500, 153), 'class_id': 18, 'class_name': 'horse', 'reference_name': 'horse 2'}
{'start': (399, 70), 'end': (405, 90), 'class_id': 28, 'class_name': 'tie', 'reference_name': 'tie 1'}


In [90]:
# Slope between the `end` corners of the fourth and third collected nodes.
# NOTE(review): get_angle is defined in a later cell, so on a fresh kernel this
# call fails until that cell has been run.
get_angle(objects[3],objects[2])

-0.03571428571428571

In [91]:
def get_angle(node_a, node_b):
    """Return the slope of the line through the two nodes' ``end`` corners.

    Despite the name, the result is a tangent (rise over run), not an angle.

    Args:
        node_a: Object whose ``end`` attribute unpacks to ``(x1, y1)``.
        node_b: Object whose ``end`` attribute unpacks to ``(x2, y2)``.

    Returns:
        ``(y2 - y1) / (x2 - x1)``. When both corners share the same x
        coordinate the slope is ``inf`` or ``-inf`` according to the sign of
        ``y2 - y1``, and ``0.0`` when the two corners coincide.
    """
    x1, y1 = node_a.end
    x2, y2 = node_b.end
    dx = x2 - x1
    dy = y2 - y1

    if dx == 0:
        # Vertically aligned corners: avoid dividing by zero.
        if dy == 0:
            return 0.0
        return float("inf") if dy > 0 else float("-inf")

    return dy / dx

In [92]:
def distance(node_a, node_b):
    """Return the Euclidean distance between the two nodes' ``end`` corners."""
    end_a = np.asarray(node_a.end)
    end_b = np.asarray(node_b.end)
    return np.linalg.norm(end_a - end_b)

In [104]:
# Slope cut-offs read by get_position_word: a slope below the first selects the
# "behind" wording, a slope above the second selects the "front of" wording.
behind_threshold, front_threshold = -0.2, 0.2

In [105]:
# Joining phrases placed between two reference names when a sentence is built.
BEHIND_WORD, FRONT_WORD, NEAR_WORD = "behind", "front of", "near"

In [106]:
def prepare_sentence(node_a, node_b, join_word):
    """Build ``"<a> is <join_word> <b>"`` from the nodes' ``reference_name`` values.

    Args:
        node_a: Node named first in the sentence.
        node_b: Node named last in the sentence.
        join_word: Phrase placed between "is" and the second name.

    Returns:
        The assembled sentence as a string.
    """
    subject = node_a.reference_name
    anchor = node_b.reference_name
    return f"{subject} is {join_word} {anchor}"

In [109]:
def get_position_word(node_a, node_b):
    """Describe where ``node_b`` sits relative to ``node_a`` as a sentence.

    The slope from ``get_angle(node_a, node_b)`` is compared with
    ``behind_threshold`` and ``front_threshold`` to choose the joining
    phrase; slopes between the two cut-offs use ``NEAR_WORD``.

    Returns:
        The sentence from ``prepare_sentence`` with ``node_b`` named first.
    """
    slope = get_angle(node_a, node_b)

    if slope < behind_threshold:
        relation = BEHIND_WORD
    elif slope > front_threshold:
        relation = FRONT_WORD
    else:
        relation = NEAR_WORD

    # node_b is the subject of the sentence, node_a the reference object.
    return prepare_sentence(node_b, node_a, relation)

In [112]:
def process_sentence_objects(objects):
    """Print one relation sentence for each pair of neighbouring nodes.

    Args:
        objects: Sequence of nodes; element ``i`` is paired with element
            ``i + 1``.
    """
    for current, following in zip(objects, objects[1:]):
        print(get_position_word(current, following))

In [116]:
def add_node_reference_name(objects):
    """Attach a numbered ``reference_name`` to every node, in place.

    Nodes are numbered per ``class_id`` in the order they appear, so the
    names take the form ``"<class_name> <n>"`` with ``n`` starting at 1.

    Args:
        objects: Iterable of nodes exposing ``class_id`` and ``class_name``.
    """
    seen_per_class = {}
    for node in objects:
        ordinal = seen_per_class.get(node.class_id, 0) + 1
        seen_per_class[node.class_id] = ordinal
        node.reference_name = node.class_name + " " + str(ordinal)

In [117]:
def create_nodes(res, min_score=0.8):
    """Turn a detection result into a list of named Node objects.

    Args:
        res: Mapping with ``'rois'``, ``'class_ids'`` and ``'scores'``
            entries, indexed in parallel (one position per detection).
        min_score: Detections are kept only when their score is strictly
            greater than this value. Defaults to 0.8.

    Returns:
        A list of Node objects for the kept detections, in detection order,
        each with ``reference_name`` set by ``add_node_reference_name``.
    """
    objects = []
    for i, points in enumerate(res['rois']):
        class_id = res['class_ids'][i]
        class_name = CLASS_NAMES[class_id]
        score = res['scores'][i]

        if score > min_score:
            # Each corner swaps a pair of roi values.
            # NOTE(review): presumably rois rows are (y1, x1, y2, x2), making
            # these (x, y) points — confirm against the Mask R-CNN docs.
            start = (points[1], points[0])
            end = (points[3], points[2])
            objects.append(Node(start, end, class_id, class_name))

    add_node_reference_name(objects)
    return objects

person 1 is near horse 1
person 2 is behind person 1
horse 2 is near person 2
tie 1 is front of horse 2


In [115]:
def process_objects(image_path):
    """Detect objects in an image file and print their relation sentences.

    Args:
        image_path: Path of the image passed to ``get_objects_and_locations``.
    """
    detections = get_objects_and_locations(image_path)
    process_sentence_objects(create_nodes(detections))
    

In [118]:
# Run detection on the sample image and print the relation sentences.
process_objects(img_path)

person 1 is near horse 1
person 2 is behind person 1
horse 2 is near person 2
tie 1 is front of horse 2
