# Mask R-CNN - Train on Shapes Dataset


This notebook shows how to train Mask R-CNN on your own dataset. To keep things simple we use a synthetic dataset of shapes (squares, triangles, and circles) which enables fast training. You'd still need a GPU, though, because the network backbone is a Resnet101, which would be too slow to train on a CPU. On a GPU, you can start to get okay-ish results in a few minutes, and good results in less than an hour.

The code of the *Shapes* dataset is included below. It generates images on the fly, so it doesn't require downloading any data. And it can generate images of any size, so we pick a small image size to train faster. 

In [1]:
import os
import sys
import random
import math
import re
import time
import numpy as np
import cv2
import matplotlib
import matplotlib.pyplot as plt

# Root directory of the project
ROOT_DIR = os.path.abspath("../../")

# Import Mask RCNN
sys.path.append(ROOT_DIR)  # To find local version of the library
from mrcnn.config import Config
from mrcnn import utils
import mrcnn.model as modellib
from mrcnn import visualize
from mrcnn.model import log

from skimage.draw import rectangle

%matplotlib inline 

# Directory to save logs and trained model
MODEL_DIR = os.path.join(ROOT_DIR, "logs")
DATA_DIR = os.path.join(ROOT_DIR, "../data")

# Local path to trained weights file
COCO_MODEL_PATH = os.path.join(DATA_DIR, "mask_rcnn_coco.h5")

# Download COCO trained weights from Releases if needed
# if not os.path.exists(COCO_MODEL_PATH):
#     utils.download_trained_weights(COCO_MODEL_PATH)

Using TensorFlow backend.


## Pull in annotations from db

In [49]:
import pyodbc
import os
import pandas as pd

PATH_PREFIX = '/mnt/scratch/shared/kaggle/'

ANNOTATION_COLS = ['XMax','XMin','YMin','YMax','LabelDescription']

passwd = os.environ['dgs_sandbox_pwd']

conn = pyodbc.connect('DSN=BIdevDatabase;'
                      'Database=Sandbox;'
                      'UID=DGSuser;'
                      'PWD=' + passwd)


# annotations_query = """SELECT a.ImageID, a.XMax, a.XMin, a.YMin, a.YMax, b.LabelDescription FROM [Sandbox].[kaggle].[Combined_Set_Detection_BBox] as a
# join [kaggle].[Class_Description] as b on b.LabelName = a.LabelName"""

class_descriptions = pd.read_sql("SELECT LabelName, LabelDescription from [kaggle].[Class_Description]",conn)

class_descriptions['LabelID'] = class_descriptions.index

display(class_descriptions.head())

#This now holds the annotation information.
bboxes = pd.read_sql("SELECT ImageID, XMax, XMin, YMin, YMax, LabelName FROM [Sandbox].[kaggle].[Combined_Set_Detection_BBox]", conn)

annotations = pd.merge(bboxes,class_descriptions, on='LabelName',how='inner')

display(annotations.head())

# This now holds the list of images
image_paths = pd.read_sql("SELECT ImageID, RelativePath, Height, Width, Mode from [kaggle].[Image_Path]", conn)

display(image_paths.head())

# Inner join on the two dataframes, so we now have images combined with associated annotations
annotated_image_paths = pd.merge(image_paths,annotations, on='ImageID',how='inner')

display(annotated_image_paths.head())

Unnamed: 0,LabelName,LabelDescription,LabelID
0,/m/061hd_,Infant bed,0
1,/m/06m11,Rose,1
2,/m/03120,Flag,2
3,/m/01kb5b,Flashlight,3
4,/m/0120dh,Sea turtle,4


Unnamed: 0,ImageID,XMax,XMin,YMin,YMax,LabelName,LabelDescription,LabelID
0,5a27bbe9ab1ede93,0.706875,0.02625,0.553571,0.99906,/m/04p0qw,Billiard table,46
1,2fe71cd9dc2acfdd,0.999333,0.0,0.0,0.998565,/m/04p0qw,Billiard table,46
2,ea2422b5f4c97b42,0.868228,0.0,0.168945,0.431641,/m/04p0qw,Billiard table,46
3,ea2422b5f4c97b42,0.888726,0.0,0.567383,0.999023,/m/04p0qw,Billiard table,46
4,2fef02bb3916fec3,0.48125,0.045833,0.402778,0.687963,/m/04p0qw,Billiard table,46


Unnamed: 0,ImageID,RelativePath,Height,Width,Mode
0,95259d687d440013,train/95259d687d440013.jpg,1024,1024,RGB
1,ccce42a1f4ed2c96,train/ccce42a1f4ed2c96.jpg,1024,768,RGB
2,30723bdea123ccdf,validation/30723bdea123ccdf.jpg,683,1024,RGB
3,cba0bd46fcddd85d,train/cba0bd46fcddd85d.jpg,1024,1024,RGB
4,cc88539b5c74a0c0,train/cc88539b5c74a0c0.jpg,1024,1024,RGB


Unnamed: 0,ImageID,RelativePath,Height,Width,Mode,XMax,XMin,YMin,YMax,LabelName,LabelDescription,LabelID
0,2fef4dd2f83feb18,train/2fef4dd2f83feb18.jpg,768,1024,RGB,0.697656,0.089844,0.0,0.936458,/m/04yx4,Man,113
1,2fef4dd2f83feb18,train/2fef4dd2f83feb18.jpg,768,1024,RGB,0.388281,0.221875,0.0,0.309375,/m/0dzct,Human face,279
2,2fef4dd2f83feb18,train/2fef4dd2f83feb18.jpg,768,1024,RGB,0.960156,0.778906,0.0,0.3,/m/0dzct,Human face,279
3,2fef4dd2f83feb18,train/2fef4dd2f83feb18.jpg,768,1024,RGB,0.332812,0.0,0.864583,0.998958,/m/050gv4,Plate,472
4,2fef4dd2f83feb18,train/2fef4dd2f83feb18.jpg,768,1024,RGB,0.946875,0.605469,0.836458,0.998958,/m/050gv4,Plate,472


## Configurations

In [7]:
#TODO configure this

class KaggleConfig(Config):
    
    NAME = "kaggle"

    #TODO experiment with this
    GPU_COUNT = 2
    IMAGES_PER_GPU = 2

    # Number of classes (including background)
    NUM_CLASSES = 1 + len(class_descriptions)  # + 1 for background

    #TODO experiment with this
    # Use small images for faster training. Set the limits of the small side
    # the large side, and that determines the image shape.
    IMAGE_MIN_DIM = 512
    IMAGE_MAX_DIM = 512

    #TODO disabling based on samples/balloon/balloon.py
    # Use smaller anchors because our image and objects are small
#     RPN_ANCHOR_SCALES = (8, 16, 32, 64, 128, 256)  # anchor side in pixels

    #TODO what is this? Disabling based on samples/balloon/balloon.py
    # Reduce training ROIs per image because the images are small and have
    # few objects. Aim to allow ROI sampling to pick 33% positive ROIs.
#     TRAIN_ROIS_PER_IMAGE = 32

    #TODO
    # Use a small epoch since the data is simple
    STEPS_PER_EPOCH = 100

    #disabling based on samples/balloon/balloon.py
    # use small validation steps since the epoch is small
#     VALIDATION_STEPS = 5
    
    # Skip detections with < 90% confidence
    DETECTION_MIN_CONFIDENCE = 0.75
    
config = ShapesConfig()
config.display()


Configurations:
BACKBONE                       resnet101
BACKBONE_STRIDES               [4, 8, 16, 32, 64]
BATCH_SIZE                     4
BBOX_STD_DEV                   [0.1 0.1 0.2 0.2]
COMPUTE_BACKBONE_SHAPE         None
DETECTION_MAX_INSTANCES        100
DETECTION_MIN_CONFIDENCE       0.75
DETECTION_NMS_THRESHOLD        0.3
FPN_CLASSIF_FC_LAYERS_SIZE     1024
GPU_COUNT                      2
GRADIENT_CLIP_NORM             5.0
IMAGES_PER_GPU                 2
IMAGE_CHANNEL_COUNT            3
IMAGE_MAX_DIM                  512
IMAGE_META_SIZE                513
IMAGE_MIN_DIM                  512
IMAGE_MIN_SCALE                0
IMAGE_RESIZE_MODE              square
IMAGE_SHAPE                    [512 512   3]
LEARNING_MOMENTUM              0.9
LEARNING_RATE                  0.001
LOSS_WEIGHTS                   {'mrcnn_class_loss': 1.0, 'rpn_bbox_loss': 1.0, 'rpn_class_loss': 1.0, 'mrcnn_bbox_loss': 1.0, 'mrcnn_mask_loss': 1.0}
MASK_POOL_SIZE                 14
MASK_SHAPE           

## Notebook Preferences

In [None]:
def get_ax(rows=1, cols=1, size=8):
    """Return a Matplotlib Axes array to be used in
    all visualizations in the notebook. Provide a
    central point to control graph sizes.
    
    Change the default size attribute to control the size
    of rendered images
    """
    _, ax = plt.subplots(rows, cols, figsize=(size*cols, size*rows))
    return ax

## Dataset

Create a synthetic dataset

Extend the Dataset class and add a method to load the shapes dataset, `load_shapes()`, and override the following methods:

* load_image()
* load_mask()
* image_reference()

In [58]:
class KaggleImageDataset(utils.Dataset):

    #TODO better method documentation needed for this
    
    def load_kaggle_images(self, dataset_dir, subset, classes=None):
        """Load a subset of the COCO dataset.
        dataset_dir: The root directory of the COCO dataset.
        subset: What to load (train, validation, test)
        classes: Dataframe : If provided, only loads images that have the given classes.
        """

        #Verify parameters
        valid_subsets = {'validation','test','train'}
        if subset not in valid_subsets:
            raise ValueError("subset argument must be one of %r." % valid_subsets)
        

        image_dir = os.path.join(DATA_DIR, subset)

        # Load all classes or a subset?
        if classes is None:
            classes = class_descriptions  #load all classes

        image_ids_with_desired_classes = bboxes[bboxes['LabelName'].isin(classes['LabelName'])]['ImageID'].unique()
        images = image_paths[image_paths['ImageID'].isin(image_ids_with_desired_classes)]

        # Add classes
        for i,row in classes.iterrows():
            self.add_class("openImages", i, row['LabelDescription'])
        
        # Add images
        for i,row in images.iterrows():
            self.add_image(
                "openImages", image_id=i,
                path=os.path.join(DATA_DIR, row['RelativePath']),
                width=row["Width"],
                height=row["Height"],
                annotations=annotations[annotations['ImageID'] == row['ImageID']])
            

    def load_mask(self, image_id):
        """Generate instance masks for an image.
       Returns:
        masks: A bool array of shape [height, width, instance count] with
            one mask per instance.
        class_ids: a 1D array of class IDs of the instance masks.
        """
        
        # Create rectangular bounding box since we are doing object detection, not segmentation
        # desired dimension is [height, width, instance_count]
        img = self.image_info[image_id]
        
        mask = np.zeros([img["height"], img["width"], len(img["annotations"])],
                        dtype=np.uint8)
        
        for i, p in img["annotations"].iterrows():
            # Create rectangular bounding box since we are doing object detection, not segmentation
            # Get indexes of pixels inside the polygon and set them to 1
            
            #TODO can maybe do this once and store it ...
    
            xmax = img["width"]*p['XMax']
            xmin = img["width"]*p['XMin']
            ymin = img["height"]*p['YMin']
            ymax = img["height"]*p['YMax']
            
            start = (xmin, ymin)  #top left corner ... are coordinates reversed?
            extent = (ymax - ymin, xmax - xmin)  #height and width
            rr, cc = rectangle(start, extent=extent)
            
            mask[rr, cc, i] = 1

        # Return mask, and array of class IDs of each instance. Since we have
        # one class ID only, we return an array of 1s
        return mask.astype(np.bool), np.array(img['annotations']['LabelID'].values, dtype=np.int32)

    def image_reference(self, image_id):
        return self.image_info[image_id]['path']
    

# HERE!!!

In [59]:
# Training dataset
# dataset_dir, subset, classes=None

dataset_train = KaggleImageDataset()
dataset_train.load_kaggle_images(DATA_DIR,'train', class_descriptions.iloc[0:1])
# dataset_train.prepare()

In [60]:
dataset_train.image_info

[{'annotations':                    ImageID      XMax      XMin      YMin      YMax  \
  1825507   9f8e6d6fab76cfde  0.999167  0.480833  0.212500  0.643125   
  4372548   9f8e6d6fab76cfde  0.360833  0.249167  0.151250  0.278125   
  4372549   9f8e6d6fab76cfde  0.415833  0.324167  0.846875  0.911875   
  4372550   9f8e6d6fab76cfde  0.431667  0.340000  0.317500  0.395625   
  4372551   9f8e6d6fab76cfde  0.686667  0.584167  0.235000  0.336250   
  4372552   9f8e6d6fab76cfde  0.825833  0.714167  0.054375  0.199375   
  4372553   9f8e6d6fab76cfde  0.898333  0.773333  0.221875  0.337500   
  9930877   9f8e6d6fab76cfde  0.512500  0.269167  0.282500  0.603750   
  12392270  9f8e6d6fab76cfde  0.999167  0.047500  0.528750  0.999375   
  
             LabelName LabelDescription  LabelID  
  1825507   /m/03bt1vf            Woman      164  
  4372548     /m/0dzct       Human face      279  
  4372549     /m/0dzct       Human face      279  
  4372550     /m/0dzct       Human face      279  
  43725

## BELOW IS UNTOUCHED

In [None]:
# Training dataset
dataset_train = ShapesDataset()
dataset_train.load_shapes(500, config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1])
dataset_train.prepare()

# Validation dataset
dataset_val = ShapesDataset()
dataset_val.load_shapes(50, config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1])
dataset_val.prepare()

In [None]:
# Load and display random samples
image_ids = np.random.choice(dataset_train.image_ids, 4)
for image_id in image_ids:
    image = dataset_train.load_image(image_id)
    mask, class_ids = dataset_train.load_mask(image_id)
    visualize.display_top_masks(image, mask, class_ids, dataset_train.class_names)

## Create Model

In [None]:
# Create model in training mode
model = modellib.MaskRCNN(mode="training", config=config,
                          model_dir=MODEL_DIR)

In [None]:
# Which weights to start with?
init_with = "coco"  # imagenet, coco, or last

if init_with == "imagenet":
    model.load_weights(model.get_imagenet_weights(), by_name=True)
elif init_with == "coco":
    # Load weights trained on MS COCO, but skip layers that
    # are different due to the different number of classes
    # See README for instructions to download the COCO weights
    model.load_weights(COCO_MODEL_PATH, by_name=True,
                       exclude=["mrcnn_class_logits", "mrcnn_bbox_fc", 
                                "mrcnn_bbox", "mrcnn_mask"])
elif init_with == "last":
    # Load the last model you trained and continue training
    model.load_weights(model.find_last(), by_name=True)

## Training

Train in two stages:
1. Only the heads. Here we're freezing all the backbone layers and training only the randomly initialized layers (i.e. the ones that we didn't use pre-trained weights from MS COCO). To train only the head layers, pass `layers='heads'` to the `train()` function.

2. Fine-tune all layers. For this simple example it's not necessary, but we're including it to show the process. Simply pass `layers="all` to train all layers.

In [None]:
# Train the head branches
# Passing layers="heads" freezes all layers except the head
# layers. You can also pass a regular expression to select
# which layers to train by name pattern.
model.train(dataset_train, dataset_val, 
            learning_rate=config.LEARNING_RATE, 
            epochs=1, 
            layers='heads')

In [None]:
# Fine tune all layers
# Passing layers="all" trains all layers. You can also 
# pass a regular expression to select which layers to
# train by name pattern.
model.train(dataset_train, dataset_val, 
            learning_rate=config.LEARNING_RATE / 10,
            epochs=2, 
            layers="all")

In [None]:
# Save weights
# Typically not needed because callbacks save after every epoch
# Uncomment to save manually
# model_path = os.path.join(MODEL_DIR, "mask_rcnn_shapes.h5")
# model.keras_model.save_weights(model_path)

## Detection

In [None]:
class InferenceConfig(ShapesConfig):
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1

inference_config = InferenceConfig()

# Recreate the model in inference mode
model = modellib.MaskRCNN(mode="inference", 
                          config=inference_config,
                          model_dir=MODEL_DIR)

# Get path to saved weights
# Either set a specific path or find last trained weights
# model_path = os.path.join(ROOT_DIR, ".h5 file name here")
model_path = model.find_last()

# Load trained weights
print("Loading weights from ", model_path)
model.load_weights(model_path, by_name=True)

In [None]:
# Test on a random image
image_id = random.choice(dataset_val.image_ids)
original_image, image_meta, gt_class_id, gt_bbox, gt_mask =\
    modellib.load_image_gt(dataset_val, inference_config, 
                           image_id, use_mini_mask=False)

log("original_image", original_image)
log("image_meta", image_meta)
log("gt_class_id", gt_class_id)
log("gt_bbox", gt_bbox)
log("gt_mask", gt_mask)

visualize.display_instances(original_image, gt_bbox, gt_mask, gt_class_id, 
                            dataset_train.class_names, figsize=(8, 8))

In [None]:
results = model.detect([original_image], verbose=1)

r = results[0]
visualize.display_instances(original_image, r['rois'], r['masks'], r['class_ids'], 
                            dataset_val.class_names, r['scores'], ax=get_ax())

## Evaluation

In [None]:
# Compute VOC-Style mAP @ IoU=0.5
# Running on 10 images. Increase for better accuracy.
image_ids = np.random.choice(dataset_val.image_ids, 10)
APs = []
for image_id in image_ids:
    # Load image and ground truth data
    image, image_meta, gt_class_id, gt_bbox, gt_mask =\
        modellib.load_image_gt(dataset_val, inference_config,
                               image_id, use_mini_mask=False)
    molded_images = np.expand_dims(modellib.mold_image(image, inference_config), 0)
    # Run object detection
    results = model.detect([image], verbose=0)
    r = results[0]
    # Compute AP
    AP, precisions, recalls, overlaps =\
        utils.compute_ap(gt_bbox, gt_class_id, gt_mask,
                         r["rois"], r["class_ids"], r["scores"], r['masks'])
    APs.append(AP)
    
print("mAP: ", np.mean(APs))