In [1]:
import tensorflow as tf
print(tf.__version__)

# I/O Libraries
import os
from io import BytesIO
import tarfile
import tempfile
from six.moves import urllib

import matplotlib
from matplotlib import gridspec
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import cv2
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
from tabulate import tabulate

import warnings
warnings.simplefilter('ignore', DeprecationWarning)

### Build Model
`DeepLab` is a deep learning model for semantic image segmentation, where the goal is to assign a semantic label to every pixel in the input image.
<img src="https://camo.githubusercontent.com/557038ad2a63a106c4ca97b82cae48e345c2c017f8b46db291600c31e67569e0/68747470733a2f2f6769746875622e636f6d2f74656e736f72666c6f772f6d6f64656c732f626c6f622f6d61737465722f72657365617263682f646565706c61622f6733646f632f696d672f766973322e706e673f7261773d74727565"/>
<img src="https://camo.githubusercontent.com/1c93625310141758671743ffac614750d3d324dcd1beef50075cd02b12d6459d/68747470733a2f2f6769746875622e636f6d2f74656e736f72666c6f772f6d6f64656c732f626c6f622f6d61737465722f72657365617263682f646565706c61622f6733646f632f696d672f766973312e706e673f7261773d74727565"/>

In the driving context, we aim to obtain a semantic understanding of the front driving scene through the camera input. This is important for driving safety and an essential requirement for all levels of autonomous driving. The first step is to build the model and load the pre-trained weights. In this demo, we use the model checkpoint trained on the `Cityscapes` dataset.
<img src="https://camo.githubusercontent.com/fa58d4500df52b272cde52df3d146099350286f5eea9dcf5b43965ee02d487b4/68747470733a2f2f7777772e636974797363617065732d646174617365742e636f6d2f776f726470726573732f77702d636f6e74656e742f75706c6f6164732f323031352f30372f6d75656e7374657230302e706e67"/>
<img src="https://camo.githubusercontent.com/8b211a1ff514143ea596ab37290fc80f75658f2157d3641da8f38b46c031d402/68747470733a2f2f7777772e636974797363617065732d646174617365742e636f6d2f776f726470726573732f77702d636f6e74656e742f75706c6f6164732f323031352f30372f7a75657269636830302e706e67"/>

In [2]:
# tf.compat.v1.disable_eager_execution()
# tf.compat.v1.disable_v2_behavior()

class DeepLab(object):
    """ Class to load deeplab model and run inference """
    # Substring looked for in the base name of each archive member.
    FROZEN_GRAPH_NAME = 'frozen_inference_graph'
    
    def __init__(self, tarball_path):
        """
        Creates and loads pretrained deeplab model.

        Args:
            tarball_path: Path to a tar archive containing a member whose base
                name includes `FROZEN_GRAPH_NAME`.

        Raises:
            RuntimeError: If no such member is found in the archive.
        """
        self.graph = tf.compat.v1.Graph()
        graph_def = None
        
        # Extract frozen graph from tar archive. The context manager closes
        # the archive even when reading or parsing the graph raises.
        with tarfile.open(tarball_path) as tar_file:
            for tar_info in tar_file.getmembers():
                if self.FROZEN_GRAPH_NAME in os.path.basename(tar_info.name):
                    file_handle = tar_file.extractfile(tar_info)
                    graph_def = tf.compat.v1.GraphDef.FromString(file_handle.read())
                    break
        
        if graph_def is None:
            raise RuntimeError('Cannot find inference graph in tar archive.')

        with self.graph.as_default():
            tf.compat.v1.import_graph_def(graph_def, name='')
        self.sess = tf.compat.v1.Session(graph=self.graph)
        
    def run(self, image, INPUT_TENSOR_NAME='ImageTensor:0', OUTPUT_TENSOR_NAME='SemanticPredictions:0'):
        """
        Runs Inference on a single image
        
        Args:
            image: PIL.Image object, raw input image
            INPUT_TENSOR_NAME: Name of input tensor, default to ImageTensor.
            OUTPUT_TENSOR_NAME: Name of output tensor, default to SemanticPredictions.
            
        Returns:
            seg_map: Output of `OUTPUT_TENSOR_NAME` for the image, resized back
                to the input image's (width, height) with nearest-neighbour
                interpolation.
        """
        width, height = image.size
        target_size = (2049, 1025) # Size of Cityscapes images
        # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
        # filter and is available in old and new Pillow releases alike.
        resized_image = image.convert('RGB').resize(target_size, Image.LANCZOS)
        batch_seg_map = self.sess.run(OUTPUT_TENSOR_NAME, feed_dict={INPUT_TENSOR_NAME: [np.asarray(resized_image)]})
        seg_map = batch_seg_map[0] # Expected Batch Size = 1
        
        if len(seg_map.shape) == 2:
            seg_map = np.expand_dims(seg_map, -1) # Need an Extra Dimension for cv2.resize
        # Map the prediction back onto the original image resolution.
        seg_map = cv2.resize(seg_map, (width, height), interpolation=cv2.INTER_NEAREST)
        return seg_map

### Visualization
Create Helper functions for decoding and visualizing results

In [3]:
def create_label_colormap():
    """
    Builds the RGB palette used to render CityScapes segmentation labels.

    Returns:
        A uint8 array with one (R, G, B) row per label id; row `i` is the
        colour used for label `i`.
    """
    palette = (
        (128,  64, 128),
        (244,  35, 232),
        ( 70,  70,  70),
        (102, 102, 156),
        (190, 153, 153),
        (153, 153, 153),
        (250, 170,  30),
        (220, 220,   0),
        (107, 142,  35),
        (152, 251, 152),
        ( 70, 130, 180),
        (220,  20,  60),
        (255,   0,   0),
        (  0,   0, 142),
        (  0,   0,  70),
        (  0,  60, 100),
        (  0,  80, 100),
        (  0,   0, 230),
        (119,  11,  32),
        (  0,   0,   0),
    )
    return np.asarray(palette, dtype=np.uint8)


def label_to_color_image(label):
    """Adds color defined by the dataset colormap to the label.

    Args:
        label: A 2D array with integer type, storing the segmentation label.

    Returns:
        result: An array of shape `label.shape + (3,)` with the colormap's
            dtype. Each element of `label` is replaced by the colormap row
            (R, G, B) indexed by that label.

    Raises:
        ValueError: If label is not of rank 2, contains a negative value, or
            contains a value not smaller than the number of colormap entries.
    """
    if label.ndim != 2:
        raise ValueError('Expect 2-D input label')

    # Negative labels would not fail: NumPy indexing wraps them around to the
    # end of the colormap and silently paints the wrong colour. Reject them.
    if np.min(label) < 0:
        raise ValueError('label value must be non-negative.')

    colormap = create_label_colormap()

    if np.max(label) >= len(colormap):
        raise ValueError('label value too large.')

    # Fancy indexing looks up one colormap row per label element.
    return colormap[label]


def vis_segmentation(image, seg_map):
    """Visualizes input image, segmentation map and overlay view.

    Draws a single-row figure with four panels: the input image, the
    colourised segmentation map, the map blended over the image, and a legend
    listing only the labels that occur in `seg_map`.

    Args:
        image: The input image, in any form accepted by `plt.imshow`.
        seg_map: 2D integer label array; it is passed to
            `label_to_color_image` and used to index the module-level
            `FULL_COLOR_MAP` and `LABEL_NAMES`.
    """
    plt.figure(figsize=(20, 4))
    # Three equally wide image panels plus a narrow legend column.
    grid_spec = gridspec.GridSpec(1, 4, width_ratios=[6, 6, 6, 1])

    plt.subplot(grid_spec[0])
    plt.imshow(image)
    plt.axis('off')
    plt.title('input image')

    plt.subplot(grid_spec[1])
    seg_image = label_to_color_image(seg_map).astype(np.uint8)
    plt.imshow(seg_image)
    plt.axis('off')
    plt.title('segmentation map')

    plt.subplot(grid_spec[2])
    plt.imshow(image)
    plt.imshow(seg_image, alpha=0.7)
    plt.axis('off')
    plt.title('segmentation overlay')

    # Legend: one colour swatch per label present in the map.
    unique_labels = np.unique(seg_map)
    ax = plt.subplot(grid_spec[3])
    plt.imshow(FULL_COLOR_MAP[unique_labels].astype(np.uint8), interpolation='nearest')
    ax.yaxis.tick_right()
    plt.yticks(range(len(unique_labels)), LABEL_NAMES[unique_labels])
    plt.xticks([], [])
    ax.tick_params(width=0.0)
    # The string 'off' is truthy, so current Matplotlib would switch the grid
    # ON and draw lines over the legend; pass a real boolean instead.
    plt.grid(False)
    plt.show()


# Class names of the segmentation labels; position `i` names label id `i`.
LABEL_NAMES = np.asarray([
    'road',
    'sidewalk',
    'building',
    'wall',
    'fence',
    'pole',
    'traffic light',
    'traffic sign',
    'vegetation',
    'terrain',
    'sky',
    'person',
    'rider',
    'car',
    'truck',
    'bus',
    'train',
    'motorcycle',
    'bicycle',
    'void',
])

# Column vector of every label id, and the colour assigned to each of them.
FULL_LABEL_MAP = np.arange(len(LABEL_NAMES))[:, np.newaxis]
FULL_COLOR_MAP = label_to_color_image(FULL_LABEL_MAP)

### Load Model from a Frozen Graph
There are 2 model checkpoints pretrained on CityScapes with different network backbones: `MobileNetV2` and `Xception65`. We will use `MobileNetV2` for inference.

In [4]:
# Which pretrained checkpoint to fetch; must be a key of _MODEL_URLS.
MODEL_NAME = 'mobilenetv2_coco_cityscapes_trainfine'

_DOWNLOAD_URL_PREFIX = 'http://download.tensorflow.org/models/'
# Checkpoint name -> archive file name below _DOWNLOAD_URL_PREFIX.
_MODEL_URLS = dict(
    mobilenetv2_coco_cityscapes_trainfine='deeplabv3_mnv2_cityscapes_train_2018_02_05.tar.gz',
    xception65_cityscapes_trainfine='deeplabv3_cityscapes_train_2018_02_06.tar.gz',
)
_TARBALL_NAME = 'deeplab_model.tar.gz'

# The archive is stored in a freshly created temporary directory.
model_dir = tempfile.mkdtemp()
tf.compat.v1.gfile.MakeDirs(model_dir)
download_path = os.path.join(model_dir, _TARBALL_NAME)

print('Downloading model, this might take a while...')
_model_url = _DOWNLOAD_URL_PREFIX + _MODEL_URLS[MODEL_NAME]
urllib.request.urlretrieve(_model_url, download_path)
print('Download Completed!, loading DeepLab Model...')

# Build the inference wrapper from the downloaded archive.
MODEL = DeepLab(download_path)
print('Model loaded Successfully!!')

### Run on Sample Image
The sample image is frame#0 in Dataset `(mitdrivingsegmentation)`

In [5]:
# Path of the single frame used for the image demo.
SAMPLE_IMAGE = '../input/mitdrivingsegmentation/mit_driveseg_sample.png'


def visualize(SAMPLE_IMAGE):
    """Segments the image file at the given path with MODEL and plots the result.

    Args:
        SAMPLE_IMAGE: Path of the image file to open.
    """
    source_image = Image.open(SAMPLE_IMAGE)
    vis_segmentation(source_image, MODEL.run(source_image))

visualize(SAMPLE_IMAGE)

### Run on Sample Video

In [14]:
import IPython

def visualize_segmentation_stream(image, seg_map, index):
    """ Visualizes Segmentation overlay view and stream it with IPython display

    Args:
        image: The frame to draw, in any form accepted by `plt.imshow`.
        seg_map: 2D integer label array passed to `label_to_color_image`.
        index: Frame number shown in the figure title.
    """
    plt.figure(figsize=(12, 7))

    seg_image = label_to_color_image(seg_map).astype(np.uint8)
    plt.imshow(image)
    plt.imshow(seg_image, alpha=0.7)
    plt.axis('off')
    plt.title('segmentation overlay | frame #%d'%index)
    # The string 'off' is truthy and would turn the grid ON in current
    # Matplotlib; pass a real boolean to disable it.
    plt.grid(False)
    plt.tight_layout()

    # Show visualization in a streaming fashion: render the figure to an
    # in-memory JPEG and hand the bytes to IPython. The context manager
    # releases the buffer even if saving or displaying fails.
    with BytesIO() as f:
        plt.savefig(f, format='jpeg')
        IPython.display.display(IPython.display.Image(data=f.getvalue()))
    plt.close()
    
def run_visualization_video(frame, index):
    """ Inferences DeepLab Model on a Video File and Stream the Visualization

    Args:
        frame: Image array whose last axis holds the colour channels; the
            channel order is reversed before it is wrapped in a PIL image.
        index: Frame number forwarded to the visualisation.
    """
    # Reverse the channel axis (the frame comes from cv2.VideoCapture below).
    original_im = Image.fromarray(frame[..., ::-1])
    # These two statements had been fused into one invalid line by a stray
    # path fragment; they are restored as separate calls here.
    seg_map = MODEL.run(original_im)
    visualize_segmentation_stream(original_im, seg_map, index)
    
SAMPLE_VIDEO = '../input/mitdrivingsegmentation/mit_driveseg_sample.mp4'
# Capture Video Frames
video = cv2.VideoCapture(SAMPLE_VIDEO)
num_frames = 30  # number of frames to stream

try:
    for i in range(num_frames):
        # `ok` is False once no further frame can be read. Avoid the name `_`:
        # IPython uses it for the previous cell output.
        ok, frame = video.read()
        if not ok:
            break
        run_visualization_video(frame, i)
        # Replace the previous frame only once the next one is ready.
        IPython.display.clear_output(wait=True)
except KeyboardInterrupt:
    plt.close()
    print("Stream stopped.")
finally:
    # Always free the capture handle, also on interrupt or error.
    video.release()

### Evaluation
Let's evaluate the model's predictions against the ground-truth annotations.

In [25]:
class DriveSeg(object):
    """Class to load MIT DriveSeg Dataset."""

    def __init__(self, tarball_path):
        """Opens the ground-truth archive and lists its members.

        Args:
            tarball_path: Path to the tar archive holding the ground truth.
        """
        self.tar_file = tarfile.open(tarball_path)
        self.tar_info = self.tar_file.getmembers()

    def close(self):
        """Closes the underlying tar archive; `fetch` cannot be used afterwards."""
        self.tar_file.close()
    
    def fetch(self, index):
        """Get ground truth by index.

        Args:
            index: The frame number.

        Returns:
            gt: Ground truth segmentation map: the first channel of the
                decoded image, with the value 255 replaced by 19 (void).
        """
        tar_info = self.tar_info[index + 1]  # exclude index 0 which is the parent directory
        file_handle = self.tar_file.extractfile(tar_info)
        # np.fromstring is deprecated for binary input; np.frombuffer is the
        # documented replacement. The resulting read-only view is fine because
        # cv2.imdecode only reads it and returns a new array.
        gt = np.frombuffer(file_handle.read(), np.uint8)
        gt = cv2.imdecode(gt, cv2.IMREAD_COLOR)
        gt = gt[:, :, 0]  # select a single channel from the 3-channel image
        gt[gt==255] = 19  # void class, does not count for accuracy
        return gt


# Use a local copy of the ground-truth archive when present, else download it.
SAMPLE_GT = 'mit_driveseg_sample_gt.tar.gz'
if not os.path.isfile(SAMPLE_GT):
    print('downloading the sample ground truth...')
    # urlretrieve returns (local_filename, headers); only the file name is kept.
    SAMPLE_GT, _gt_headers = urllib.request.urlretrieve('https://github.com/lexfridman/mit-deep-learning/raw/master/tutorial_driving_scene_segmentation/mit_driveseg_sample_gt.tar.gz')

dataset = DriveSeg(SAMPLE_GT)
print('visualizing ground truth annotation on the sample image...')

# Plot the annotation of frame 0 next to the sample image it belongs to.
original_im = Image.open(SAMPLE_IMAGE)
gt = dataset.fetch(0)  # sample image is frame 0
vis_segmentation(original_im, gt)

### Evaluation on Sample Image
There are many ways to measure the performance of a segmentation model. The most straightforward one is pixel accuracy, which calculates how many pixels are correctly predicted. Another commonly used one is the standard `Jaccard Index` (intersection-over-union), defined as `IoU = TP ⁄ (TP+FP+FN)`, where TP, FP, and FN are the numbers of true positive, false positive, and false negative pixels, respectively.

In [26]:
def evaluate_single(seg_map, ground_truth):
    """Evaluate a single frame's prediction against its ground truth.

    Args:
        seg_map: Integer array of predicted labels. It is not modified; the
            label merging below is applied to a copy.
        ground_truth: Integer array of ground-truth labels, same shape as
            `seg_map`. Pixels equal to 19 (void) are excluded.

    Returns:
        acc: Fraction of non-void pixels whose (merged) prediction equals the
            ground truth.
        intersection: Diagonal of the confusion matrix over the labels
            [0,1,2,5,6,7,8,9,11,13], i.e. correctly predicted pixels per class.
        union: Per class, confusion-matrix row sum plus column sum minus the
            diagonal entry.
    """
    # Work on a copy so the caller's prediction array is left untouched.
    seg_map = np.array(seg_map)

    # merge label due to different annotation scheme
    seg_map[np.logical_or(seg_map==14,seg_map==15)] = 13  # truck, bus -> car
    seg_map[np.logical_or(seg_map==3,seg_map==4)] = 2     # wall, fence -> building
    seg_map[seg_map==12] = 11                             # rider -> person

    # Restrict everything to the valid (non-void) area, computed once.
    valid = ground_truth != 19
    pred_valid = seg_map[valid]
    gt_valid = ground_truth[valid]

    # calculate accuracy on valid area
    acc = np.sum(pred_valid == gt_valid) / np.sum(valid)
    
    # select valid labels for evaluation
    cm = confusion_matrix(gt_valid, pred_valid,
                          labels=np.array([0,1,2,5,6,7,8,9,11,13]))
    intersection = np.diag(cm)
    union = np.sum(cm, 0) + np.sum(cm, 1) - intersection
    return acc, intersection, union


print('evaluating on the sample image...')

original_im = Image.open(SAMPLE_IMAGE)
seg_map = MODEL.run(original_im)
gt = dataset.fetch(0)  # sample image is frame 0
acc, intersection, union = evaluate_single(seg_map, gt)

# A class that appears in neither the prediction nor the ground truth has
# union == 0, so its IoU is 0/0 = NaN. Silence the warning for that division
# and leave such classes out of the mean instead of letting them turn the
# mean itself into NaN.
with np.errstate(divide='ignore', invalid='ignore'):
    class_iou = np.round(intersection / union, 5)
print('pixel accuracy: %.5f'%acc)
print('mean class IoU:', np.nanmean(class_iou))
print('class IoU:')
# Header names are the evaluated label ids, in the order evaluate_single uses.
print(tabulate([class_iou], headers=LABEL_NAMES[[0,1,2,5,6,7,8,9,11,13]]))

### Evaluate on Sample Video

In [28]:
print('evaluating on the sample video...', flush=True)

video = cv2.VideoCapture(SAMPLE_VIDEO)
# num_frames = 598  # uncomment to use the full sample video
num_frames = 30

# Per-frame results, aggregated after the loop.
acc = []
intersection = []
union = []

try:
    for i in tqdm(range(num_frames)):
        ok, frame = video.read()
        if not ok:
            # No further frame could be read; evaluate what was collected
            # instead of crashing on a `None` frame.
            break
        original_im = Image.fromarray(frame[..., ::-1])
        seg_map = MODEL.run(original_im)
        gt = dataset.fetch(i)
        _acc, _intersection, _union = evaluate_single(seg_map, gt)
        intersection.append(_intersection)
        union.append(_union)
        acc.append(_acc)
finally:
    # Always free the capture handle, also on interrupt or error.
    video.release()

if not acc:
    raise RuntimeError('No frames could be read from %s' % SAMPLE_VIDEO)

# IoU is computed from the intersections and unions summed over all frames.
# Classes never seen in any frame have union == 0 (0/0 = NaN); they are
# excluded from the mean rather than turning it into NaN.
with np.errstate(divide='ignore', invalid='ignore'):
    class_iou = np.round(np.sum(intersection, 0) / np.sum(union, 0), 4)
print('pixel accuracy: %.4f'%np.mean(acc))
print('mean class IoU: %.4f'%np.nanmean(class_iou))
print('class IoU:')
print(tabulate([class_iou], headers=LABEL_NAMES[[0,1,2,5,6,7,8,9,11,13]]))

### Optional: leverage temporal information
One thing that makes video scene segmentation different from image segmentation is the availability of previous frames, which contain valuable `temporal information` that may help with perception. The open question is how to use such temporal information. Let's try combining the predictions of two consecutive frames instead of using only one frame, making the predictions smoother over time.

In [29]:
print('Evaluating on the sample video with temporal smoothing...', flush=True)

video = cv2.VideoCapture(SAMPLE_VIDEO)
num_frames = 598  # the full sample video
# num_frames = 30  # uncomment for a quicker run on the first frames only

# Per-frame results, aggregated after the loop.
acc = []
intersection = []
union = []
# Logits of the previous frame; 0 makes the first frame depend on itself only.
prev_seg_map_logits = 0

try:
    for i in tqdm(range(num_frames)):
        ok, frame = video.read()
        if not ok:
            # No further frame could be read; evaluate what was collected
            # instead of crashing on a `None` frame.
            break
        original_im = Image.fromarray(frame[..., ::-1])

        # Get the logits instead of label prediction
        seg_map_logits = MODEL.run(original_im, OUTPUT_TENSOR_NAME='ResizeBilinear_3:0')

        # Add previous frame's logits and get the results
        seg_map = np.argmax(seg_map_logits + prev_seg_map_logits, -1)
        prev_seg_map_logits = seg_map_logits

        gt = dataset.fetch(i)
        _acc, _intersection, _union = evaluate_single(seg_map, gt)
        intersection.append(_intersection)
        union.append(_union)
        acc.append(_acc)
finally:
    # Always free the capture handle, also on interrupt or error.
    video.release()

if not acc:
    raise RuntimeError('No frames could be read from %s' % SAMPLE_VIDEO)

# IoU is computed from the intersections and unions summed over all frames.
# Classes never seen in any frame have union == 0 (0/0 = NaN); they are
# excluded from the mean rather than turning it into NaN.
with np.errstate(divide='ignore', invalid='ignore'):
    class_iou = np.round(np.sum(intersection, 0) / np.sum(union, 0), 4)
print('pixel accuracy: %.4f'%np.mean(acc))
print('mean class IoU: %.4f'%np.nanmean(class_iou))
print('class IoU:')
print(tabulate([class_iou], headers=LABEL_NAMES[[0,1,2,5,6,7,8,9,11,13]]))