In [1]:
'''
Author       : Aditya Jain
Date Started : 18th August, 2021
About        : This file does DL-based localization by directly reading a video and outputting the final annotated video
'''
import torch
import torchvision.models as torchmodels
import torchvision
import os
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision import transforms
from PIL import Image
import cv2
import numpy as np

In [2]:
from azureml.core import Workspace
from azureml.core.model import Model

# Handle to the Azure ML workspace; it is passed to Model.get_model_path below
# to resolve the registered localization model.
ws = Workspace.get(
    name='MothAI',
    subscription_id='1e5f7432-8004-48f7-a32d-668eee0f349e',
    resource_group='MothProject',
)

In [3]:
# User Input
# Directory holding the source clips; the annotated video is written here too.
home_path = '/home/azureuser/cloudfiles/code/Users/adijain0707/data/maxim_videos/'
# Base name of the clip to process (extension is appended below).
vid_name = 'maxim_video2'
# Rate (frames per second) at which the source video is sampled.
des_fps = 5
# Frame size handed to the output video writer.
frame_width = 960
frame_height = 540

# Derived locations.
final_vid_name = f"{vid_name}.MOV"
video_path = f"{home_path}{final_vid_name}"
# raw_img_list   = frames_from_video(vid_path, fps)

#### Model Loading

In [4]:
# Pick the device used for inference: GPU when torch can see one, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Build a Faster R-CNN (ResNet-50 FPN) pre-trained on COCO, then replace its box
# predictor head so that it outputs `num_classes` classes.
model       = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
num_classes = 2  # one foreground class + background (original note said "person" -- TODO confirm the actual class)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)


# Resolve the path of the registered 'DL_Localization_Model' in the workspace and
# load its checkpoint onto `device`; the weights live under 'model_state_dict'.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
MODEL_PATH  = Model.get_model_path('DL_Localization_Model', _workspace=ws)
checkpoint  = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

cpu


<All keys matched successfully>

#### Model Prediction and Annotation

In [7]:
def annotate_image(model, img):
    """Run the detector on one image and draw a box for each confident detection.

    Args:
        model: detection model. It is called with a one-image batch and must
            return a sequence whose first element is a dict holding 'boxes'
            and 'scores' tensors.
        img: image accepted by both ``transforms.ToTensor`` and
            ``cv2.rectangle`` (the main loop passes frames read by
            ``cv2.VideoCapture``). It is drawn on in place.

    Returns:
        The same ``img`` object, with a 2-px rectangle in colour (0, 0, 255)
        drawn for every detection whose score exceeds ``SCORE_THR``.
    """
    SCORE_THR  = 0.98

    transform  = transforms.Compose([transforms.ToTensor()])

    # NOTE(review): frames coming from OpenCV are in BGR channel order and are
    # fed to the model as-is -- confirm this matches how the model was trained.
    image      = transform(img)
    image_pred = torch.unsqueeze(image, 0).to(device)

    # Pure inference: skip autograd bookkeeping so no graph is kept per frame.
    with torch.no_grad():
        output = model(image_pred)

    detections = output[0]
    bboxes     = detections['boxes'][detections['scores'] > SCORE_THR]
    image_cv   = img

    for box in bboxes:
        # cv2.rectangle needs integer pixel coordinates; the model emits floats.
        x1, y1, x2, y2 = (int(round(v)) for v in box.detach().cpu().tolist())
        cv2.rectangle(image_cv, (x1, y1), (x2, y2), (0, 0, 255), 2)

    return image_cv

#### Main Loop

In [8]:
# Open the source video; fail early with a clear message if it cannot be read.
vidcap      = cv2.VideoCapture(video_path)
out         = None

try:
    if not vidcap.isOpened():
        raise IOError('Could not open video: ' + video_path)

    fps         = vidcap.get(cv2.CAP_PROP_FPS)           #  FPS of the video
    frame_count = vidcap.get(cv2.CAP_PROP_FRAME_COUNT)   #  total frame count
    if fps <= 0 or frame_count <= 0 or des_fps <= 0:
        # Guards the divisions below against zero/invalid metadata.
        raise ValueError('Invalid sampling parameters: fps=%s, frame_count=%s, des_fps=%s'
                         % (fps, frame_count, des_fps))

    total_sec   = frame_count/fps
    sec         = 0
    n_frames    = total_sec*des_fps
    time_sec    = total_sec/n_frames                     # the video will be sampled after every time_sec

    model       = model.to(device)
    model.eval()

    # initialising video writer; the output plays back at the sampling rate
    out         = cv2.VideoWriter(home_path + vid_name + '_localiz.avi',
                                  cv2.VideoWriter_fourcc('M','J','P','G'),
                                  des_fps, (frame_width, frame_height))
    if not out.isOpened():
        raise IOError('Could not open video writer for: ' + home_path + vid_name + '_localiz.avi')

    frame_idx   = 0
    while sec < total_sec:
        vidcap.set(cv2.CAP_PROP_POS_MSEC, sec * 1000)    # setting which frame to get
        success, image = vidcap.read()
        if success:
            image_annot = annotate_image(model, image)
            # The writer only accepts frames of the size it was opened with,
            # so bring the annotated frame to (frame_width, frame_height).
            if (image_annot.shape[1], image_annot.shape[0]) != (frame_width, frame_height):
                image_annot = cv2.resize(image_annot, (frame_width, frame_height))
            out.write(image_annot)

        # Derive the timestamp from the sample index instead of accumulating
        # time_sec, so floating-point error does not build up over long videos.
        frame_idx += 1
        sec = frame_idx * time_sec
finally:
    # Always free the capture and flush/close the output file, even on error.
    vidcap.release()
    if out is not None:
        out.release()