# Image Segmentation using Dense Prediction Transformers

This notebook is based on the GitHub repository for Dense Prediction Transformers (https://github.com/isl-org/DPT) and on a collective effort from course 4.570.


Install necessary libraries

In [None]:
!pip install timm

Connect notebook to Google Drive

In [None]:
import os
import sys

from google.colab import drive

# Mount Google Drive inside the Colab runtime at /content/drive
# (force_remount=True is passed on every run of this cell).
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive


In [None]:
# Working directory for the rest of the notebook: the relative paths used below
# (weights/..., Exercise1_segmentation/...) are resolved against it.
# Change this path to your DPT Segmentation folder.
project_dir = '/content/drive/MyDrive/0206Demo'
os.chdir(project_dir)

Import necessary libraries

In [None]:
import os
import glob #for file paths
import cv2
import argparse

import torch
import torch.nn.functional as F

import util.io

from torchvision.transforms import Compose
from dpt.models import DPTSegmentationModel
from dpt.transforms import Resize, NormalizeImage, PrepareForNet

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from os import listdir

List all images in the input folder (`Exercise1_segmentation/Japan St`)

In [None]:
# Collect the image files to process.
# listdir returns entries in arbitrary order and returns every entry of the
# folder, so keep only common image extensions (hidden files such as .DS_Store
# would otherwise reach the image reader) and sort for a reproducible row order.
image_dir = 'Exercise1_segmentation/Japan St'
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.webp')
images = sorted(
    name for name in listdir(image_dir)
    if name.lower().endswith(image_extensions)
)
print(f"{len(images)} image file(s) found in '{image_dir}'")
print(images)

['23.1980026 大.jpeg', '2007_11 大.jpeg', '469553383 大.jpeg', 'closeupviewofthefacade_02 大.jpeg', 'calin-stan-Jm0oI-zRn0s-unsplash 大.jpeg', '25987667747_77d6621c48_b 大.jpeg', '122970922-kyoto-japan-november-23-2018-old-japanese-house-in-a-residential-area-in-japan 大.jpeg', '374522630 大.jpeg', '11015 大.jpeg', '92691265-traditional-japanese-house-in-kanazawa-japan 大.jpeg', '231140012 大.jpeg', '402556044 大.jpeg', 'a0002031_main 大.jpeg', '143993278-the-small-town-s-ancient-japanese-houses-of-hida-furukawa-town-gifu-japan 大.jpeg', 'DSCF2059 大.jpeg', 'FFFA9A07-60CA-41DE-8A7B-2F40F821D36B_cx0_cy4_cw0_w1200_r1 大.jpeg', 'exterior-of-old-farmers-house-thatched-roof-jidayubori-park-kitami-setagaya-ku-tokyo-japan-east-asia-asia-2PK085G 大.jpeg', 'exterior-amber-house-southern-higashiyama-1 (Large)-XL 大.jpeg', 'DSCF2072 大.jpeg', 'virtual_house02_pic05 大.jpeg', 'urban-japanese-house-ai 大.jpeg', 'walkJapan-695x469 大.jpeg', 'IMG_9027 大.jpeg', 'urban-japanese-house 大.jpeg', 'istockphoto-598523722-612x612 

Create a dataframe with one row per image; the visual feature score columns are added in the following cells and start at zero

In [None]:
# One row per input image, with the file name in the 'filename' column.
# Building the frame from a dict names the column up front and also works for
# an empty image list (assigning .columns on a zero-column frame raises).
images_df = pd.DataFrame({'filename': images})
images_df.head()

Unnamed: 0,filename
0,23.1980026 大.jpeg
1,2007_11 大.jpeg
2,469553383 大.jpeg
3,closeupviewofthefacade_02 大.jpeg
4,calin-stan-Jm0oI-zRn0s-unsplash 大.jpeg


In [None]:
# Class names, one per model output class; position k (0-based) names class id k + 1.
# Every name must be unique because each one becomes a dataframe / CSV column.
# The list previously contained 'screen' twice (positions 59 and 131), which
# merged two classes into a single column. Position 59 is now 'screendoor'.
# NOTE(review): 'screendoor' follows the ADE20K 150-class list ("screen door");
# confirm against the class list shipped with the model weights.
labels = [
    'wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road', 'bed', 'window', 'grass',
    'cabinet', 'sidewalk', 'person', 'earth', 'door', 'table', 'mountain', 'plant', 'curtain', 'chair',
    'car', 'water', 'painting', 'sofa', 'shelf', 'house', 'sea', 'mirror', 'rug', 'field',
    'armchair', 'seat', 'fence', 'desk', 'rock', 'wardrobe', 'lamp', 'bathtub', 'railing', 'cushion',
    'base', 'box', 'column', 'signboard', 'dresser', 'counter', 'sand', 'sink', 'skyscraper', 'fireplace',
    'refrigerator', 'grandstand', 'path', 'stairs', 'runway', 'case', 'pooltable', 'pillow', 'screendoor', 'stairway',
    'river', 'bridge', 'bookcase', 'blind', 'coffeetable', 'toilet', 'flower', 'book', 'hill', 'bench',
    'countertop', 'stove', 'palmtree', 'kitchen', 'computer', 'swivelchair', 'boat', 'bar', 'arcade', 'hut',
    'bus', 'towel', 'light', 'truck', 'tower', 'chandelier', 'awning', 'streetlight', 'booth', 'television',
    'airplane', 'dirttrack', 'apparel', 'pole', 'land', 'balustrade', 'escalator', 'ottoman', 'bottle', 'sideboard',
    'poster', 'stage', 'van', 'ship', 'fountain', 'conveyerbelt', 'canopy', 'washer', 'toy', 'pool',
    'stool', 'barrel', 'basket', 'waterfall', 'tent', 'bag', 'motorbike', 'cradle', 'oven', 'ball',
    'food', 'step', 'tank', 'brandname', 'microwave', 'pot', 'animal', 'bicycle', 'lake', 'dishwasher',
    'screen', 'blanket', 'sculpture', 'hood', 'sconce', 'vase', 'trafficlight', 'tray', 'trashcan', 'fan',
    'pier', 'crtscreen', 'plate', 'monitor', 'bulletinboard', 'shower', 'radiator', 'glass', 'clock', 'flag',
]

# Guard against a duplicate name silently collapsing two classes into one column.
assert len(set(labels)) == len(labels), "duplicate class name in labels"

In [None]:
# Add one score column per class name, initialised to 0.0.
# The columns start as float because the values written by the inference loop
# are pixel fractions; iterating over `labels` avoids hard-coding the class count.
for label in labels:
    images_df[label] = 0.0

images_df.head()

Initialize the DPT model for prediction

In [None]:
# Model configuration read by the initialisation cell below.

# Selects the network variant; the initialisation cell accepts
# "dpt_large" or "dpt_hybrid".
model_type = "dpt_hybrid"

# Checkpoint file handed to the model constructor (relative path).
model_path = "weights/dpt_hybrid-ade20k-53898607.pt"

# When True and the selected device is CUDA, model and inputs are converted to
# channels_last memory format and half precision.
optimize = True

In [None]:
# INITIALIZE MODEL - This may take a while.
# Builds the DPT segmentation network selected by `model_type` from the
# checkpoint at `model_path`, defines the input `transform`, and moves the
# model to the selected device. Defines: device, net_w, net_h, model, transform.
print("initialize")

# select device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device: %s" % device)

# target size handed to the Resize transform
net_w = net_h = 480

# number of output classes requested from the model (one per entry of `labels`)
num_classes = 150

# backbone identifier for each supported model type
backbones = {
    "dpt_large": "vitl16_384",
    "dpt_hybrid": "vitb_rn50_384",
}
if model_type not in backbones:
    # Raise explicitly: an `assert False` disappears when Python runs with -O.
    raise ValueError(
        f"model_type '{model_type}' not implemented, use: dpt_large or dpt_hybrid"
    )

# load network
model = DPTSegmentationModel(
    num_classes,
    path=model_path,
    backbone=backbones[model_type],
)

transform = Compose(
    [
        Resize(
            net_w,
            net_h,
            resize_target=None,
            keep_aspect_ratio=True,
            ensure_multiple_of=32,
            resize_method="minimal",
            image_interpolation_method=cv2.INTER_CUBIC,
        ),
        NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        PrepareForNet(),
    ]
)

# inference mode for layers that behave differently during training
model.eval()

# Check device.type rather than comparing device objects, so the branch is also
# taken for an indexed device such as "cuda:0".
if optimize and device.type == "cuda":
    model = model.to(memory_format=torch.channels_last)
    model = model.half()

# Rebind instead of leaving the call as the cell's last expression, which would
# print the representation of the entire network as cell output.
model = model.to(device)

Loop through the dataframe and populate the feature scores by running the DPT model on each file.

In [None]:
# Folder the images are read from and folder the results are written to
# (both relative paths).
input_path = "Exercise1_segmentation/Japan St"
output_path = "Exercise1_segmentation/JapanOut"

# File name of the score table that is written inside output_path.
savename = "images_scores_Japan.csv"

In [None]:
# Run the model on every image listed in images_df. For each image, store the
# fraction of pixels assigned to each class in that image's row and write the
# segmentation image to output_path. The score table is written to CSV
# periodically and once more at the end.

# Row to start from; raise it to continue a run that was interrupted.
start_index = 0

# Create the output folder up front so neither the CSV checkpoint nor the
# segmentation images fail on a missing directory.
os.makedirs(output_path, exist_ok=True)
csv_path = output_path+'/'+savename

# Scores are fractions, so the score columns must be float before they are
# written (de-duplicate the names first; order is preserved).
score_columns = list(dict.fromkeys(labels))
images_df = images_df.astype({column: "float64" for column in score_columns})

# Half precision / channels_last is only used on CUDA, matching the model setup.
use_half = bool(optimize) and device.type == "cuda"

num_images = len(images_df)
for i in range(start_index, num_images):
    print("Processing file "+str(i+1)+"/"+str(num_images))

    row = images_df.index[i]
    filename = images_df.at[row, 'filename']
    img_name = input_path+'/'+filename
    print(img_name)

    # input
    img = util.io.read_image(img_name)
    img_input = transform({"image": img})["image"]

    # compute
    with torch.no_grad():
        sample = torch.from_numpy(img_input).to(device).unsqueeze(0)
        if use_half:
            sample = sample.to(memory_format=torch.channels_last)
            sample = sample.half()

        out = model(sample)

        # resize the class scores back to the original image height/width
        prediction = torch.nn.functional.interpolate(
            out, size=img.shape[:2], mode="bicubic", align_corners=False
        )
        # argmax gives 0-based class indices; +1 shifts them to ids 1..N
        prediction = torch.argmax(prediction, dim=1) + 1
        prediction = prediction.squeeze().cpu().numpy()

    # Pixel count per class id in a single pass; slot 0 is unused because the
    # ids start at 1.
    pixel_counts = np.bincount(prediction.ravel(), minlength=len(labels) + 1)
    scores = pixel_counts[1:len(labels) + 1] / prediction.size

    # Write through .at on the frame itself: chained indexing such as
    # images_df[col].iloc[i] = value may update a temporary copy instead.
    for label, score in zip(labels, scores):
        images_df.at[row, label] = score

    # save file after every 50 images
    if (i + 1) % 50 == 0:
        images_df.to_csv(csv_path)
        print("File_Saved")

    # save segmentation output image in output folder
    filepath = os.path.join(output_path, os.path.splitext(os.path.basename(img_name))[0])
    util.io.write_segm_img(filepath, img, prediction, alpha=1)

images_df.to_csv(csv_path)
print("File_Saved")
print("finished")