<a href="https://colab.research.google.com/github/PJ-cs/DistanceEstimationTracking/blob/main/DemoDistanceEstimationTracking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
install requirements:
repos: Main Repo, DINO, DPT (with additional weights)
"""
!git clone https://github.com/PJ-cs/DistanceEstimationTracking.git


!git clone https://github.com/intel-isl/DPT.git
!pip install timm
!wget https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt
!mv dpt_hybrid-midas-501f0c75.pt DPT/weights
!mv dpt_large-midas-2f21e586.pt DPT/weights

!git clone https://github.com/aim-uofa/AdelaiDepth.git
!pip install pytorch==1.6.0 torchvision=0.7.0 cudatoolkit=10.2 -c pytorch
!pip install -r AdelaiDepth/LeReS/requirements.txt
!apt-get install libsparsehash-dev
!pip install --upgrade  git+https://github.com/mit-han-lab/torchsparse.git@e268836e64513b9a31c091cd1d517778d4c1b9e6
    
    
!git clone https://github.com/facebookresearch/dino.git
!wget https://dl.fbaipublicfiles.com/dino/dino_deitsmall8_pretrain/dino_deitsmall8_pretrain.pth
!mv dino_deitsmall8_pretrain.pth dino

!pip install filterpy
!pip install lap

!mv DistanceEstimationTracking/dataset.py .
!mv DistanceEstimationTracking/models.py .
!mv DistanceEstimationTracking/sort_2_5D.py .
!mv -f DistanceEstimationTracking/run_monodepth.py DPT/

!mkdir S01_color
!unzip DistanceEstimationTracking/S01_color.zip -d S01_color

Cloning into 'DistanceEstimationTracking'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 33 (delta 11), reused 5 (delta 1), pack-reused 0[K
Unpacking objects: 100% (33/33), done.
Cloning into 'DPT'...
remote: Enumerating objects: 776, done.[K
remote: Counting objects: 100% (776/776), done.[K
remote: Compressing objects: 100% (395/395), done.[K
remote: Total 776 (delta 368), reused 720 (delta 330), pack-reused 0[K
Receiving objects: 100% (776/776), 455.51 KiB | 5.49 MiB/s, done.
Resolving deltas: 100% (368/368), done.
Collecting timm
  Downloading timm-0.5.4-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 5.3 MB/s 
Installing collected packages: timm
Successfully installed timm-0.5.4
--2022-04-08 13:06:17--  https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt
Resolving github.com (github.com)... 140.82.112.3
Conn

In [2]:
# imports
# NOTE: the original order is kept on purpose. `from dataset import *` can
# rebind names imported before it, so imports that follow it must stay after it.
import argparse

import cv2
import os 
import json
import numpy as np
from tqdm import tqdm
from PIL import Image, ImageFile, ImageFont, ImageDraw
import shutil
from sort_2_5D import Sort2_5D, KalmanBoxTracker
import glob
from models import SPVCNN_CLASSIFICATION
import torch
from dataset import *
from torchsparse.utils.helpers import sparse_collate_tensors
from collections import OrderedDict
import csv
import glob

import sys
sys.path.append("DPT")
import DPT.run_monodepth as run_dpt_depth

import matplotlib.pyplot as plt

"""HYPERPARAMETERS"""
# --- tracker settings (handed to Sort2_5D in the tracking step) ---
ALPHA_IOU = 0.427   # must be > 0 and within [0, 1]; presumably the weight of the IoU term
# BETA_DISTZ = 0.5  # must be > 0; equals 1 - ALPHA_IOU
MAX_DIST = 4.0962   # [m]
IOU_THRES = 0.0101
MAX_AGE = 111
MIN_HITS = 1

# --- detection filtering: detections with a lower confidence are skipped ---
DET_CONF_THRES = 0.9160973480326474  # 0.9

# --- distance extraction and DINO segmentation ---
PERCENTILE = 50    # percentile of the depth values taken as a detection's distance
DINO_THRESH = 26   # [0, 255]; attention values above it count as foreground
DINO_RES = 256     # value passed to DINO's --resize option; or 512

def dino_semseg(rgb_dir, output_dir, threshold = DINO_THRESH):
    """Write a binary DINO-attention mask for every .jpg/.png image in ``rgb_dir``.

    ``dino/video_generation.py`` is run as a subprocess on ``rgb_dir``. Its
    attention maps are read from ``<output_dir>/attention/attn-<image name>``,
    resized to the resolution of the matching input image, binarised and saved
    as ``<output_dir>/mask-<image stem>.png``. The attention directory is
    removed afterwards.

    Args:
        rgb_dir: directory containing the input images.
        output_dir: directory receiving the masks; must be empty beforehand.
        threshold: attention values greater than this become 255, all others 0.

    Raises:
        FileNotFoundError: if an input image or its attention map cannot be read.
    """
    import subprocess
    import sys

    # An argument list (no shell) keeps paths with spaces or quotes intact.
    command = [
        sys.executable or "python",
        "dino/video_generation.py",
        "--pretrained_weights", "dino/dino_deitsmall8_pretrain.pth",
        "--input_path", str(rgb_dir),
        "--output_path", str(output_dir),
        "--resize", str(DINO_RES),  # TODO reconsider resize
    ]
    completed = subprocess.run(command, stdout=subprocess.PIPE, universal_newlines=True)
    for line in completed.stdout.splitlines():
        print(line)
    if completed.returncode != 0:
        # The attention maps may still have been written, so only warn here;
        # a map that is really missing is reported below.
        print(f"WARNING: dino/video_generation.py exited with code {completed.returncode}")

    attn_dir = os.path.join(output_dir, "attention")
    # delete unnecessary video
    #os.remove(os.path.join(output_dir, "video.mp4"))

    # create binary masks of images, names: mask-original_file_name, at original resolution
    for rgb_img in os.scandir(rgb_dir):
        if not (rgb_img.is_file() and rgb_img.name.endswith((".jpg", ".png"))):
            continue

        rgb = cv2.imread(rgb_img.path)
        if rgb is None:
            raise FileNotFoundError(f"could not read input image {rgb_img.path}")
        height, width = rgb.shape[:2]

        att_img_path = os.path.join(attn_dir, "attn-" + rgb_img.name)
        att_img = cv2.imread(att_img_path, cv2.IMREAD_GRAYSCALE)
        if att_img is None:
            raise FileNotFoundError(f"DINO produced no attention map at {att_img_path}")

        # resize to the original dims first, then binarise (> threshold -> 255, else 0)
        att_img_res = cv2.resize(att_img, (width, height))
        _, mask = cv2.threshold(att_img_res, threshold, 255, cv2.THRESH_BINARY)

        stem = os.path.splitext(rgb_img.name)[0]
        cv2.imwrite(os.path.join(output_dir, "mask-" + stem + ".png"), mask)

    # delete attention dir (absent when the DINO run produced nothing)
    if os.path.isdir(attn_dir):
        shutil.rmtree(attn_dir)

"""inference notebook"""


# TODO add later: argparse for these arguments and change focal_length calculation
input_frames_dir = "S01_color/color"
# Focal length in pixels. The misspelled name is kept because later steps reference it.
input_focal_lenght_px = 424.7448425292969
algn_out_dir = "inference_test/algn_out"
tracks_out_dir = "inference_test"

# NOTE(review): nothing in the install cell downloads this file or creates a
# DeepChimpact directory; confirm where the MegaDetector ONNX weights come from.
mega_det_onnx_path = "DeepChimpact/weights/md_v4.1.0.onnx"
pvcnn_weights_path = "DistanceEstimationTracking/align_weights.pth"
dpt_weights_path = "DPT/weights/dpt_large-midas-2f21e586.pt"

# end argparse

crops_temp_folder = "temp/crops"
masks_temp_folder = "temp/masks"
dpt_temp_folder = "temp/dpt"
# detections_temp_folder = "temp/detections"
tracks_out_path = os.path.join(tracks_out_dir, os.path.basename(input_frames_dir)+".csv")
img_height = 0
img_width = 0

# Create every directory the pipeline writes to. cv2.imwrite only returns False
# when the target directory is missing, and open() raises, so the output
# directories are created here together with the temp folders.
for needed_dir in (crops_temp_folder, masks_temp_folder, dpt_temp_folder,
                   algn_out_dir, tracks_out_dir):
    os.makedirs(needed_dir, exist_ok=True)

# get img_height, img_width from the first readable frame
for rgb_img in os.scandir(input_frames_dir):
    if rgb_img.is_file() and rgb_img.name.endswith((".png", ".jpg")):
        test_img = cv2.imread(rgb_img.path)
        if test_img is None:
            # unreadable/corrupt file: try the next frame instead of crashing on None.shape
            continue
        img_height, img_width = test_img.shape[:2]
        break


# Step 1: run DPT on every input frame; the results land in dpt_temp_folder.
print(f"1: Calculating DPT images, saving to {dpt_temp_folder} ...")
dpt_run_args = (input_frames_dir, dpt_temp_folder, dpt_weights_path, "dpt_large")
run_dpt_depth.run(*dpt_run_args)

print(f"2: Converting Relative Depth images to absolute images via PVCNN, saving results to {algn_out_dir}...")

# cv2.imwrite does not create directories; it just returns False if one is missing.
os.makedirs(algn_out_dir, exist_ok=True)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
voxel_size=0.01
num_points=50000
spvcnn_model = SPVCNN_CLASSIFICATION(input_channel=3, num_classes=2, cr=1.0, pres=voxel_size, vres=voxel_size)
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load(pvcnn_weights_path, map_location=device)
spvcnn_model.load_state_dict(checkpoint['spvcnn_model_state_dict'])

# move model to device
spvcnn_model.to(device)
spvcnn_model.eval()

# transforms, datasets, dataloader
dpt_transforms = get_transforms_dpt(voxel_size, num_points)

img_paths = glob.glob(os.path.join(dpt_temp_folder, "*.pfm"))

with torch.no_grad():
    for dpt_img_file in tqdm(img_paths):

        dpt_img = cv2.imread(dpt_img_file, cv2.IMREAD_UNCHANGED)
        if dpt_img is None:
            raise FileNotFoundError(f"could not read DPT output {dpt_img_file}")
        dpt_img_name = os.path.basename(dpt_img_file)

        # transform dpt disparity to relative depth:
        # normalise to [0, 1], then invert with a fixed scale/offset
        dpt_pcd = dpt_img.copy()
        dpt_pcd -= dpt_pcd.min()
        disparity_range = dpt_pcd.max()
        if disparity_range > 0:
            # a constant disparity map has range 0; skipping the division avoids NaNs
            dpt_pcd /= disparity_range
        dpt_pcd = 1./(dpt_pcd*0.5+0.02)
        dpt_pcd_tensor = torch.from_numpy(dpt_pcd).unsqueeze(0)

        dpt_shape = tuple(dpt_pcd_tensor.shape[-2:])
        gt_shape = (img_height, img_width)

        # bring the depth map to the resolution of the input frames
        if dpt_shape != gt_shape:
            dpt_pcd_tensor = torch.nn.functional.interpolate(
                dpt_pcd_tensor.unsqueeze(0),
                size=gt_shape,
                mode="bicubic",
                align_corners=False,
            ).squeeze(0)

        # transform dpt img to pointcloud (second return value is not needed here)
        dpt_sparse, _ = dpt_transforms((dpt_pcd_tensor, input_focal_lenght_px))
        dpt_sparse_input = sparse_collate_tensors([dpt_sparse]).to(device)

        # inference: column 0 is used as scale, column 1 as shift
        model_out = spvcnn_model(dpt_sparse_input)
        scale_out = model_out[:,0]
        shift_out = model_out[:,1]

        # align depth image with output
        depth_relative = dpt_pcd_tensor.squeeze(0).squeeze(0).cpu().numpy()
        dpt_aligned = depth_relative * scale_out[0].cpu().numpy() + shift_out[0].cpu().numpy()

        # save output; OpenCV's PFM writer needs float32 data, and a failed
        # write must not go unnoticed because the distance step reads this file
        aligned_path = os.path.join(algn_out_dir, dpt_img_name)
        if not cv2.imwrite(aligned_path, dpt_aligned.astype(np.float32)):
            raise OSError(f"could not write aligned depth image {aligned_path}")




def _run_megadetector(net, image_bgr):
    """Return the detections for one frame (not implemented yet).

    The crop loop below expects an iterable of dicts with the keys
    ``"category"``, ``"conf"`` and ``"bbox"``. The pre-processing, the forward
    pass through ``net`` and the conversion of the raw network output into that
    structure are still missing from this notebook, so this placeholder fails
    loudly instead of leaving ``detections`` undefined.
    """
    raise NotImplementedError(
        "MegaDetector inference is not implemented yet: run `net` on the frame and "
        "return a list of {'category', 'conf', 'bbox'} dicts."
    )


def _doubled_span(start, length, limit):
    """Return ``(buffer_start, buffer_length)`` of a span twice as long as the box.

    The span is centred on the box where possible, shifted back inside
    ``[0, limit)`` when it would overrun, and clamped to the full axis when
    twice the box length does not fit at all.
    """
    buffer_length = 2 * length
    buffer_start = max(start - length // 2, 0)
    if buffer_start + buffer_length >= limit:
        # move the span back by the amount it exceeds the allowed extent
        buffer_start = limit - buffer_length
        if buffer_start < 0:
            buffer_start, buffer_length = 0, limit
    return buffer_start, buffer_length


print(f"3: Calculating Detections, saving crops to {crops_temp_folder}...")
mega_det_net = cv2.dnn.readNetFromONNX(mega_det_onnx_path)
rgb_img_paths = [rgb_img.path for rgb_img in os.scandir(input_frames_dir) if rgb_img.is_file() and rgb_img.name.endswith((".png", ".jpg"))]
# {frame_name: {det_ind: [(bbx, bby, bbwidth, bbheight), (bbx_crop, bby_crop)]}}
frame_det_dict = {}

# TODO skip first 30 frames
for rgb_img_path in tqdm(rgb_img_paths):
    frame_name = os.path.basename(rgb_img_path)
    frame_det_dict[frame_name] = {}

    input_cv = cv2.imread(rgb_img_path)
    if input_cv is None:
        print(f"WARNING: could not read {rgb_img_path}, frame is kept without detections")
        continue
    # take the size from the array the crops are cut from
    frame_height, frame_width = input_cv.shape[:2]

    # TODO insert preprocessing and saving code here
    detections = _run_megadetector(mega_det_net, input_cv)

    for det_ind, detection in enumerate(detections):
        if detection["category"] != "1" or detection["conf"] < DET_CONF_THRES:
            continue
        # NOTE(review): assumes each detection carries a "bbox" entry laid out as
        # [x_min, y_min, width, height] in relative (0..1) image coordinates, which
        # is how the values are scaled below. TODO confirm against the detector output.
        bb = detection["bbox"]
        bbx = int(bb[0] * frame_width)
        bby = int(bb[1] * frame_height)
        bbwidth = int(bb[2] * frame_width)
        bbheight = int(bb[3] * frame_height)
        if bbwidth <= 0 or bbheight <= 0:
            # a degenerate box gives an empty crop, which cannot be normalised or segmented
            continue

        # new bb
        # edge cases, want to guarantee new bb with double the old size
        bbx_buffer, bbwidth_buffer = _doubled_span(bbx, bbwidth, frame_width)
        bby_buffer, bbheight_buffer = _doubled_span(bby, bbheight, frame_height)

        img_det_part = np.copy(input_cv[bby_buffer: bby_buffer + bbheight_buffer, bbx_buffer: bbx_buffer + bbwidth_buffer])

        # position of the original box inside the enlarged crop
        bbx_crop = bbx - bbx_buffer
        bby_crop = bby - bby_buffer
        # reuse bbwidth, bbheight when extracting depth
        frame_det_dict[frame_name][det_ind] = [(bbx, bby, bbwidth, bbheight), (bbx_crop, bby_crop)]

        # sanity check: the box region of the crop equals the box region of the frame
        assert np.all(img_det_part[bby_crop: bby_crop + bbheight, bbx_crop: bbx_crop + bbwidth] == input_cv[bby: bby + bbheight, bbx: bbx + bbwidth])

        # shift to zero and scale by an integer factor
        img_det_part -= img_det_part.min()
        crop_max = int(img_det_part.max())
        if crop_max > 0:  # a uniform crop has max 0 after the shift
            img_det_part *= 255 // crop_max

        # save to crop folder
        cv2.imwrite(os.path.join(crops_temp_folder, frame_name[:-4]+f"_{det_ind:04d}.png"), img_det_part)


# Step 4: segment every detection crop with DINO; masks are written to masks_temp_folder.
print(f"4: Starting dino segmentation, saving masks to {masks_temp_folder}...")
# dino_semseg is the segmentation helper this notebook defines.
dino_semseg(crops_temp_folder, masks_temp_folder)

print(f"5: Extracting distances of detections to camera...")
for frame_name, dets_dict in tqdm(frame_det_dict.items()):
    if not dets_dict:
        # nothing to measure in this frame, no need to load its depth image
        continue

    depth_path = os.path.join(algn_out_dir, frame_name[:-3]+"pfm")
    depth_img = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
    if depth_img is None:
        raise FileNotFoundError(f"could not read aligned depth image {depth_path}")

    # items(): the key is the detection index used in the crop/mask file names,
    # the value is [(bbx, bby, bbwidth, bbheight), (bbx_crop, bby_crop)]
    for det_ind, det_info in dets_dict.items():
        bbx, bby, bbwidth, bbheight = det_info[0]
        bbx_crop, bby_crop = det_info[1]

        # open segmentation mask for detection (0 = background, 255 = foreground)
        mask_path = os.path.join(masks_temp_folder, "mask-"+frame_name[:-4]+f"_{det_ind:04d}.png")
        seg_det_full = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        if seg_det_full is None:
            raise FileNotFoundError(f"could not read segmentation mask {mask_path}")
        seg_det_crop = seg_det_full[bby_crop: bby_crop + bbheight, bbx_crop: bbx_crop + bbwidth]
        foreground = seg_det_crop == 255

        # get detection crop of depth img
        depth_det_crop = depth_img[bby: bby + bbheight, bbx: bbx + bbwidth]

        if not foreground.any():
            # no segmented pixel inside the box: fall back to the whole box
            print(frame_name, f"no sem seg pixel of deer in bb {bby},{bbx} dist = {PERCENTILE}th percentile")
            distance = float(np.percentile(depth_det_crop, PERCENTILE))
        else:
            distance = float(np.percentile(depth_det_crop[foreground], PERCENTILE))

        # store the distance as the third entry; slice assignment keeps the
        # entry unique when the cell is run more than once
        det_info[2:] = [distance]
            
print(f"6: Connecting positions of animals over video to coherent tracks...")
KalmanBoxTracker.count = 0
# init Sort
mot_tracker = Sort2_5D(max_age=MAX_AGE, min_hits=MIN_HITS, iou_threshold=IOU_THRES, alpha_iou=ALPHA_IOU, max_dist=MAX_DIST)

# process frames in numeric order of their file name (name without the 4-char extension)
frame_det_dict = OrderedDict(sorted(frame_det_dict.items(), key=lambda x: abs(int(x[0][:-4]))))

# principal point, taken as the image centre
cam_u0 = img_width / 2.0 #848 / 2.0 #frame_depth.shape[1] / 2.0
cam_v0 = img_height / 2.0 # 480 / 2.0

# open() does not create missing directories
os.makedirs(os.path.dirname(tracks_out_path) or ".", exist_ok=True)

with open(tracks_out_path, 'w', newline='') as csvfile:
    fieldnames = ['frame_name', 'track_num', 'bb_x', 'bb_y', 'bb_width', 'bb_height', 'distance', '3D_x', '3D_y', '3D_z']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()

    for frame_name, dets_dict in tqdm(frame_det_dict.items()):
        # one row per detection: [x1, y1, x2, y2, distance]
        frame_rows = []
        for det_info in dets_dict.values():
            bbx, bby, bbwidth, bbheight = det_info[0]
            distance = det_info[2]
            frame_rows.append([bbx, bby, bbx+bbwidth, bby+bbheight, distance])
        # always hand the tracker an (N, 5) array; N is 0 for frames without detections
        frame_bbxs = np.array(frame_rows, dtype=float).reshape(-1, 5)

        trackers = mot_tracker.update(frame_bbxs)

        for d in trackers:
            # NOTE(review): the tracker is fed corner coordinates (x1, y1, x2, y2).
            # Verify that Sort2_5D.update really returns width/height in
            # positions 2 and 3 and not x2/y2.
            x1,y1,w, h,distance, track_num = d

            # calculations to project position of animal to 3d
            # middle of lower bound of bounding box
            u = x1 + 0.5 * w
            v = y1 + h

            # pinhole back-projection: pixel offsets are measured from the
            # principal point before scaling by distance / focal length
            x3d = (u - cam_u0) / input_focal_lenght_px * distance
            y3d = (v - cam_v0) / input_focal_lenght_px * distance

            writer.writerow({'frame_name': frame_name, 'track_num': track_num, 'bb_x':x1, 'bb_y':y1, 'bb_width':w, 'bb_height':h, 'distance': distance, '3D_x':x3d, '3D_y':y3d, '3D_z':distance})

# TODO create 3d plot of everything...


1: Calculating DPT images, saving to temp/dpt ...
initialize
device: cuda
start processing
  processing S01_color/color/000043.jpg (1/451)


  "See the documentation of nn.Upsample for details.".format(mode)


  processing S01_color/color/000117.jpg (2/451)
  processing S01_color/color/000374.jpg (3/451)
  processing S01_color/color/000387.jpg (4/451)
  processing S01_color/color/000442.jpg (5/451)
  processing S01_color/color/000220.jpg (6/451)
  processing S01_color/color/000253.jpg (7/451)
  processing S01_color/color/000072.jpg (8/451)
  processing S01_color/color/000351.jpg (9/451)
  processing S01_color/color/000142.jpg (10/451)
  processing S01_color/color/000106.jpg (11/451)
  processing S01_color/color/000219.jpg (12/451)
  processing S01_color/color/000147.jpg (13/451)
  processing S01_color/color/000004.jpg (14/451)
  processing S01_color/color/000410.jpg (15/451)
  processing S01_color/color/000179.jpg (16/451)
  processing S01_color/color/000148.jpg (17/451)
  processing S01_color/color/000181.jpg (18/451)
  processing S01_color/color/000437.jpg (19/451)
  processing S01_color/color/000195.jpg (20/451)
  processing S01_color/color/000140.jpg (21/451)
  processing S01_color/color

100%|██████████| 451/451 [02:10<00:00,  3.47it/s]

3: Calculating Detections, saving crops to temp/crops...





error: ignored