# Using Mask R-CNN in Google Colab Notebooks

---

**Author:** Nicolás Metallo – http://www.nicolasmetallo.com

Source:
- https://arxiv.org/pdf/1703.06870.pdf
- https://github.com/matterport/Mask_RCNN
- https://github.com/facebookresearch/Detectron
- https://research.fb.com/publications/mask-r-cnn/

More info about Google Colab:
- https://towardsdatascience.com/fast-ai-lesson-1-on-google-colab-free-gpu-d2af89f53604
- https://medium.com/deep-learning-turkey/google-colab-free-gpu-tutorial-e113627b9f5d


# Setup environment

## Configure Notebook

In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then create the PyDrive client using the
# application-default Google credentials.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the PyDrive client handle for any later Google Drive access.
drive = GoogleDrive(gauth)

## Clone Git repository and setup coco-api

---

COCO is a large image dataset designed for object detection, segmentation, person keypoint detection, stuff segmentation, and caption generation. This package provides Matlab, Python, and Lua APIs that assist in loading, parsing, and visualizing the annotations in COCO.


In [0]:
import os

## Clone the Mask R-CNN repository into a local folder named "object-detection"
!git clone https://github.com/matterport/Mask_RCNN.git object-detection
  
## Set repo as default folder: later cells take os.getcwd() as the project root.
## NOTE(review): both steps use relative paths, so re-running this cell after
## the chdir acts relative to the new working directory.
os.chdir('object-detection')

In [0]:
!pip install -U scikit-image
!pip install -U cython
!pip install git+https://github.com/waleedka/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI
!git clone https://github.com/pdollar/coco.git
!cd coco/PythonAPI && make
!cd coco/PythonAPI && make install
!cd coco/PythonAPI && python3 setup.py install

# Import libraries, model and load weights

---

A quick intro to using the pre-trained model to detect and segment objects.

In [0]:
import os
import sys
import random
import math
import numpy as np
import skimage.io
import matplotlib
import matplotlib.pyplot as plt

import coco
import utils
import model as modellib

%matplotlib inline 

# Root directory of the project: the working directory at the time this
# cell runs.
ROOT_DIR = os.getcwd()

# Directory to save logs and trained model
MODEL_DIR = os.path.join(ROOT_DIR, "logs")

# Local path to trained weights file
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5")
# Download COCO trained weights from Releases if needed
# (skipped when a file already exists at COCO_MODEL_PATH).
if not os.path.exists(COCO_MODEL_PATH):
    utils.download_trained_weights(COCO_MODEL_PATH)

## Configurations
## We'll be using a model trained on the MS-COCO dataset. The configuration of this model is in the ```CocoConfig``` class in ```coco.py```.
## For inference, modify the configuration slightly to fit the task. To do so, subclass the ```CocoConfig``` class and override the attributes you need to change.

class InferenceConfig(coco.CocoConfig):
    # Inference is run on a single image per call, so both factors of the
    # batch size (GPU_COUNT * IMAGES_PER_GPU) are pinned to 1.
    IMAGES_PER_GPU = 1
    GPU_COUNT = 1

# Instantiate the inference configuration; display() presumably prints its
# settings — confirm against the Config class.
config = InferenceConfig()
config.display()

## Create Model and Load Trained Weights

# Create model object in inference mode, using the configuration above and
# MODEL_DIR as its model directory.
model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)

# Load weights trained on MS-COCO from the local weights file.
# NOTE(review): by_name=True presumably matches weights to layers by name —
# confirm against load_weights.
model.load_weights(COCO_MODEL_PATH, by_name=True)

# COCO Class names
# Index of the class in the list is its ID. For example, to get ID of
# the teddy bear class, use: class_names.index('teddy bear')
# The list is indexed with the class ids returned by the detector
# (see display_instances), so the order of the entries must not change.
# NOTE(review): index 0 ('BG') is presumably the background class — confirm.
class_names = ['BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
               'bus', 'train', 'truck', 'boat', 'traffic light',
               'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
               'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
               'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
               'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
               'kite', 'baseball bat', 'baseball glove', 'skateboard',
               'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
               'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
               'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
               'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
               'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
               'keyboard', 'cell phone', 'microwave', 'oven', 'toaster',
               'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
               'teddy bear', 'hair drier', 'toothbrush']

# Define functions

---

Source: https://www.youtube.com/watch?v=lLM8oAsi32g

In [0]:
import cv2
import os
from os import path
import time
import PIL
from PIL import Image
import scipy.misc
import matplotlib
import matplotlib.pyplot as plt
import collections

In [0]:
# Fix for bug in Google Colab
# !pip install --no-cache-dir -I pillow

def register_extension(id, extension):
    """Map a file extension (lower-cased) to a format id (upper-cased) in PIL.Image.EXTENSION."""
    format_key = id.upper()
    extension_key = extension.lower()
    PIL.Image.EXTENSION[extension_key] = format_key


def register_extensions(id, extensions):
    """Register every entry of ``extensions`` for the format ``id``."""
    for ext in extensions:
        register_extension(id, ext)


# Install both helpers on PIL.Image under their own names.
PIL.Image.register_extension = register_extension
PIL.Image.register_extensions = register_extensions

In [0]:
## Functions to visualize detection results on the image

def random_colors(N):
    """Return ``N`` reproducible colours as 3-tuples of floats in [0, 255).

    The colours come from a private ``RandomState`` seeded with 2500, so every
    call yields the same sequence (a shorter palette is a prefix of a longer
    one) without reseeding NumPy's global random generator as a side effect.
    """
    rng = np.random.RandomState(2500)
    # .tolist() yields plain Python floats rather than NumPy scalars.
    return [tuple((255 * rng.rand(3)).tolist()) for _ in range(N)]

def apply_mask(image, mask, color, alpha=0.5):
    """Blend ``color`` into ``image`` wherever ``mask`` equals 1.

    One channel is processed per entry of ``color``. The image is modified
    in place and also returned.
    """
    selected = mask == 1
    keep = 1 - alpha
    for channel, shade in enumerate(color):
        plane = image[:, :, channel]
        blended = plane * keep + alpha * shade
        image[:, :, channel] = np.where(selected, blended, plane)
    return image
  
def display_instances(image, boxes, masks, ids, names, scores, same = True):
    """
    Draw the mask, bounding box and label of every detection onto an image.

    Args:
        image: image array; it is drawn on in place (via apply_mask and the
            cv2 drawing calls) and returned.
        boxes: array with one row per instance, unpacked as (y1, x1, y2, x2).
        masks: array indexed as ``masks[:, :, i]`` for instance ``i``.
        ids: class id of each instance, used as an index into ``names``.
        names: sequence of class names indexed by class id.
        scores: per-instance scores, or None to leave them out of the labels.
        same: if truthy, instances of the same class share one colour;
            otherwise each instance gets its own colour.

    Returns:
        The annotated image, with the three most common detected class
        names and their counts written in the top-left corner.
    """
    n_instances = boxes.shape[0]

    if not n_instances:
        print('NO INSTANCES TO DISPLAY')
    else:
        assert boxes.shape[0] == masks.shape[-1] == ids.shape[0]

    # Per-class palettes are sized from the `names` argument (not a global
    # list) so the function works with any label set passed by the caller.
    colors = random_colors(len(names) if same else n_instances)

    for i in range(n_instances):
        if not np.any(boxes[i]):
            # An all-zero box carries no detection; skip it.
            continue
        # Plain ints keep the cv2 drawing calls independent of NumPy scalar types.
        y1, x1, y2, x2 = (int(v) for v in boxes[i])
        class_id = ids[i]
        label = names[class_id]
        color = colors[class_id] if same else colors[i]
        score = scores[i] if scores is not None else None
        # Compare against None so a legitimate score of 0.0 is still shown.
        caption = '{} {:.1%}'.format(label, score) if score is not None else label
        image = apply_mask(image, masks[:, :, i], color)
        image = cv2.rectangle(image, (x1, y1), (x2, y2), color, 1)
        image = cv2.putText(
            image, caption, (x1, y1), cv2.FONT_HERSHEY_COMPLEX, 0.5, color, 1
        )

    # Summary banner: the three most frequent class names with their counts,
    # on a black background sized from the text length.
    summary = str(
        collections.Counter(names[class_id] for class_id in ids).most_common(3)
    )
    image = cv2.rectangle(image, (0, 0), (len(summary) * 8, 40), (0, 0, 0), -1)
    image = cv2.putText(
        image, summary, (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
        (255, 255, 255), 1, cv2.LINE_AA
    )

    return image

In [0]:
## Function that reads a video and writes one annotated image per frame with its detections

def video_to_frames(input_vid, output_loc, max_fps = None):
    """Run detection on the frames of a video and save each annotated frame.

    Args:
        input_vid: path of the input video, opened with cv2.VideoCapture.
        output_loc: directory for the output images (created if missing);
            frames are written as ``frame0000.jpg``, ``frame0001.jpg``, ...
        max_fps: despite its name, the maximum number of frames to process;
            None processes the whole video.

    Errors raised while processing are printed rather than propagated, so
    the frames written so far are kept and the timing summary still prints.
    """
    # check if folder already exist otherwise create it
    if not os.path.exists(output_loc):
        os.makedirs(output_loc)
        print("%s was created" % output_loc)
    # log the time
    time_start = time.time()
    # capture frame
    cap = cv2.VideoCapture(input_vid)
    count = 0
    print('\nRunning Mask R-CNN on %s' % input_vid)

    try:
        # Stop exactly at the requested number of frames.
        while max_fps is None or count < max_fps:
            status, image = cap.read()
            if not status:
                # End of stream (or unreadable input): finish cleanly instead
                # of passing a missing frame to the detector.
                break
            # cv2 decodes frames as BGR, while single_frame_detection feeds
            # the model RGB images; convert so both paths agree. Drawing and
            # writing stay on the BGR frame, which is what cv2.imwrite expects.
            rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            # run detection
            results = model.detect([rgb_image], verbose = 0)
            # visualization
            r = results[0]
            result_image = display_instances(
                image, r['rois'], r['masks'], r['class_ids'], class_names, r['scores']
            )
            cv2.imwrite(os.path.join(output_loc, "frame%04d.jpg" % count), result_image)
            count += 1
            # print every 50 frames
            if count % 50 == 0:
                time_mid = time.time()
                print("%d frames converted. Time elapsed: %d seconds." % (count, (time_mid - time_start)))
    except Exception as e:
        # Best effort: report the problem and keep what was written so far.
        print("There was an error!")
        print(e)
    finally:
        # Always release the capture handle, even on unexpected errors.
        cap.release()

    # log the time again
    time_end = time.time()
    elapsed = time_end - time_start
    # Guard against a zero-length interval when nothing was processed.
    rate = count / elapsed if elapsed > 0 else 0
    print("%d frames converted at %d frames per second\n" % (count, rate))
    print("Conversion time: %d seconds." % elapsed)
  
def single_frame_detection(path, title="", figsize=(16, 16), ax=None):
    """Run detection on one image file and show the annotated result.

    Args:
        path: path of the image file to read.
        title: title set on the axes.
        figsize: size of the figure created when ``ax`` is not given.
        ax: optional matplotlib axes to draw on; a new figure is created
            when it is None.
    """
    # scipy.misc.imread is no longer available in current SciPy releases;
    # scikit-image (already used by this notebook) provides the reader.
    import skimage.io

    # np.array() makes a private, writable copy: the drawing helpers modify
    # the image in place.
    image = np.array(skimage.io.imread(path))
    if image.ndim == 2:
        # Grayscale input: replicate to three channels, since apply_mask
        # indexes image[:, :, n] for each colour component.
        image = np.stack([image] * 3, axis=-1)
    elif image.shape[-1] == 4:
        # Drop the alpha channel — assumes the model expects plain RGB.
        image = image[..., :3]

    if ax is None:
        _, ax = plt.subplots(1, figsize=figsize)

    # Show area outside image boundaries.
    height, width = image.shape[:2]
    ax.set_ylim(height + 10, -10)
    ax.set_xlim(-10, width + 10)
    ax.axis('off')
    ax.set_title(title)

    # Run detection
    results = model.detect([image], verbose=0)
    # Visualize results
    r = results[0]
    result_image = display_instances(image, r['rois'], r['masks'], r['class_ids'],
                                     class_names, r['scores'])

    # Draw on the axes configured above, not on whichever axes is current.
    ax.imshow(result_image)
    plt.show()

## Download mp4 video from YouTube
---
Source: https://github.com/rg3/youtube-dl and https://github.com/rg3/youtube-dl/issues/5192

In [0]:
from __future__ import unicode_literals
!pip install --upgrade youtube-dl # install if you don't have it
import youtube_dl

def YouTube_download(url):
    """Download the video at ``url`` with youtube-dl as ``yt-video.mp4``.

    The MP4 format is requested explicitly because the detection cell opens
    the fixed name "yt-video.mp4"; without a format selector youtube-dl may
    choose a different container and therefore a different file extension.
    """
    ydl_opts = {
        # Best available single-file MP4.
        'format': 'mp4',
        'outtmpl': 'yt-video.%(ext)s',
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

## Frames to video and upload/download function

---

Source: http://www.xavierdupre.fr/blog/2016-03-30_nojs.html 

In [0]:
## Function to combine frames into a video

def frames_to_video(input_folder, outvid=None, fps=30, size=None,
               is_color=True, format='MP4V'):
    """
    Create a video from the image files found in a folder.

    @param      input_folder  folder whose files, in sorted name order, become the frames
    @param      outvid        output video path; 'out.mp4' when None
    @param      fps           frames per second
    @param      size          (width, height) of each frame; defaults to the size of the first image
    @param      is_color      color
    @param      format        four-character codec code, see http://www.fourcc.org/codecs.php
    @return                   None

    The function relies on http://opencv-python-tutroals.readthedocs.org/en/latest/.
    By default, the video will have the size of the first image.
    Every image of a different size is resized before being added to the video.
    Files that cannot be decoded as images are skipped with a message, and no
    video file is opened when the folder contains no readable image.
    """
    from cv2 import VideoWriter, VideoWriter_fourcc, imread, resize

    if outvid is None:
        # Default name, matching what download_output() sends to the browser.
        outvid = 'out.mp4'
    fourcc = VideoWriter_fourcc(*format)
    vid = None
    try:
        for name in sorted(os.listdir(input_folder)):
            image = os.path.join(input_folder, name)
            img = imread(image)
            if img is None:
                # imread returns None for anything it cannot decode.
                print("Skipping %s: not a readable image" % image)
                continue
            if size is None:
                size = img.shape[1], img.shape[0]
            if vid is None:
                # Create the writer whether or not the caller supplied a size.
                size = tuple(size)
                vid = VideoWriter(outvid, fourcc, float(fps), size, is_color)
            # Resize when either dimension differs from the target size.
            if (img.shape[1], img.shape[0]) != size:
                img = resize(img, size)
            vid.write(img)
    finally:
        # Finalise the file even if a frame failed part-way through.
        if vid is not None:
            vid.release()

# Function to download generated output

def download_output(exportAs = "MP4"):
  """Send the generated output to the browser through Colab's file download.

  Args:
      exportAs: "MP4" downloads the file "out.mp4"; any other value zips the
          folder named by the global IMAGE_DIR into
          "youtube-object-detection.zip" and downloads that archive.
  """
  from google.colab import files
  import shutil

  # Compare by value: identity comparison (`is`) against a string literal
  # depends on interpreter string interning.
  if exportAs == "MP4":
    files.download("out.mp4")
  else:
    shutil.make_archive("youtube-object-detection", 'zip', IMAGE_DIR)
    files.download("youtube-object-detection.zip")
    
def upload():
  """Open Colab's upload dialog and print the name and byte length of each uploaded file."""
  from google.colab import files
  received = files.upload()
  for filename in received:
    size_in_bytes = len(received[filename])
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=filename, length=size_in_bytes))

# Run Mask R-CNN on video stream

---
Example videos:
- https://www.youtube.com/watch?v=bAhprdemJKE
- https://www.youtube.com/watch?v=nosl1lxFyng
- https://www.youtube.com/watch?v=vUpDPz3Z8NM

In [0]:
IMAGE_DIR = "output-dir" # dir to save images; also read by download_output()

# Download YT video (saved under the name "yt-video.<ext>")
YouTube_download("https://www.youtube.com/watch?v=vUpDPz3Z8NM")

# Run detection and output frames into IMAGE_DIR.
# max_fps caps the number of frames processed, not a frame rate
# (30*60 is presumably one minute of 30 fps video — confirm).
video_to_frames(input_vid = "yt-video.mp4", output_loc = IMAGE_DIR, max_fps=30*60)

In [0]:
# Combine frames into video
frames_to_video(IMAGE_DIR)

# Download result
download_output()

# Run Mask R-CNN on single picture

In [0]:
# upload your own photo
upload()

# Single frame detection
single_frame_detection("trafico-buenos-aires.jpg")