In [None]:
!pip3 install opencv-python
!pip3 install matplotlib

# apt-get update && apt-get install ffmpeg libsm6 libxext6 -y

In [1]:
import torch
import numpy as np
import argparse
from models.retinanet import build_retinanet
from modules import utils
from torchvision import transforms
import data.transforms as vtf
from data import VideoDataset


In [2]:
def str2bool(v):
    return v.lower() in ("yes", "true", "t", "1")


In [3]:
parser = argparse.ArgumentParser(
    description="Training single stage FPN with OHEM, resnet as backbone"
)

parser.add_argument("--DATA_ROOT", type=str, default="/workspace/")
parser.add_argument("--SAVE_ROOT", type=str, default='/workspace/road/cache/resnet50I3D512-Pkinetics-b4s8x1x1-roadt3-h3x3x3/')
parser.add_argument("--MODEL_PATH", type=str, default='/workspace/kinetics-pt/')

parser.add_argument("--ANNO_ROOT", type=str, default="")
parser.add_argument(
    "--MODE",
    default="gen_dets",
    help="MODE can be train, gen_dets, eval_frames, eval_tubes define SUBSETS accordingly, build tubes",
)
# Name of backbone network, e.g. resnet18, resnet34, resnet50, resnet101 resnet152 are supported
parser.add_argument("--ARCH", default="resnet50", type=str, help=" base arch")
parser.add_argument("--MODEL_TYPE", default="I3D", type=str, help=" base model")
parser.add_argument("--model_subtype", default="I3D", type=str, help=" sub model")
parser.add_argument(
    "--ANCHOR_TYPE",
    default="RETINA",
    type=str,
    help="type of anchors to be used in model",
)

parser.add_argument("--SEQ_LEN", default=8, type=int, help="NUmber of input frames")
parser.add_argument(
    "--TEST_SEQ_LEN", default=8, type=int, help="NUmber of input frames"
)
parser.add_argument(
    "--MIN_SEQ_STEP",
    default=1,
    type=int,
    help="DIFFERENCE of gap between the frames of sequence",
)
parser.add_argument(
    "--MAX_SEQ_STEP",
    default=1,
    type=int,
    help="DIFFERENCE of gap between the frames of sequence",
)
# if output heads are have shared features or not: 0 is no-shareing else sharining enabled
# parser.add_argument('--MULIT_SCALE', default=False, type=str2bool,help='perfrom multiscale training')
parser.add_argument(
    "--HEAD_LAYERS",
    default=3,
    type=int,
    help="0 mean no shareding more than 0 means shareing",
)
parser.add_argument(
    "--NUM_FEATURE_MAPS",
    default=5,
    type=int,
    help="0 mean no shareding more than 0 means shareing",
)
parser.add_argument(
    "--CLS_HEAD_TIME_SIZE",
    default=3,
    type=int,
    help="Temporal kernel size of classification head",
)
parser.add_argument(
    "--REG_HEAD_TIME_SIZE",
    default=3,
    type=int,
    help="Temporal kernel size of regression head",
)

#  Name of the dataset only voc or coco are supported
parser.add_argument(
    "--DATASET", default="road", type=str, help="dataset being used"
)
parser.add_argument(
    "--TRAIN_SUBSETS",
    default="train_3,",
    type=str,
    help="Training SUBSETS seprated by ,",
)
parser.add_argument(
    "--VAL_SUBSETS", default="", type=str, help="Validation SUBSETS seprated by ,"
)
parser.add_argument(
    "--TEST_SUBSETS", default="", type=str, help="Testing SUBSETS seprated by ,"
)
# Input size of image only 600 is supprted at the moment
parser.add_argument("--MIN_SIZE", default=512, type=int, help="Input Size for FPN")

#  data loading argumnets
parser.add_argument(
    "-b", "--BATCH_SIZE", default=4, type=int, help="Batch size for training"
)
parser.add_argument(
    "--TEST_BATCH_SIZE", default=1, type=int, help="Batch size for testing"
)
# Number of worker to load data in parllel
parser.add_argument(
    "--NUM_WORKERS",
    "-j",
    default=8,
    type=int,
    help="Number of workers used in dataloading",
)
# optimiser hyperparameters
parser.add_argument("--OPTIM", default="SGD", type=str, help="Optimiser type")
parser.add_argument("--RESUME", default=0, type=int, help="Resume from given epoch")
parser.add_argument(
    "--MAX_EPOCHS", default=30, type=int, help="Number of training epoc"
)
parser.add_argument(
    "-l",
    "--LR",
    "--learning-rate",
    default=0.004225,
    type=float,
    help="initial learning rate",
)
parser.add_argument("--MOMENTUM", default=0.9, type=float, help="momentum")
parser.add_argument(
    "--MILESTONES", default="20,25", type=str, help="Chnage the lr @"
)
parser.add_argument("--GAMMA", default=0.1, type=float, help="Gamma update for SGD")
parser.add_argument(
    "--WEIGHT_DECAY", default=1e-4, type=float, help="Weight decay for SGD"
)

# Freeze layers or not
parser.add_argument(
    "--FBN",
    "--FREEZE_BN",
    default=True,
    type=str2bool,
    help="freeze bn layers if true or else keep updating bn layers",
)
parser.add_argument(
    "--FREEZE_UPTO",
    default=1,
    type=int,
    help="layer group number in ResNet up to which needs to be frozen",
)

# Loss function matching threshold
parser.add_argument(
    "--POSTIVE_THRESHOLD",
    default=0.5,
    type=float,
    help="Min threshold for Jaccard index for matching",
)
parser.add_argument(
    "--NEGTIVE_THRESHOLD",
    default=0.4,
    type=float,
    help="Max threshold Jaccard index for matching",
)
# Evaluation hyperparameters
parser.add_argument(
    "--EVAL_EPOCHS",
    default="30",
    type=str,
    help="eval epochs to test network on these epoch checkpoints usually the last epoch is used",
)
parser.add_argument(
    "--VAL_STEP",
    default=1,
    type=int,
    help="Number of training epoch before evaluation",
)
parser.add_argument(
    "--IOU_THRESH",
    default=0.5,
    type=float,
    help="Evaluation threshold for validation and for frame-wise mAP",
)
parser.add_argument(
    "--CONF_THRESH",
    default=0.025,
    type=float,
    help="Confidence threshold for to remove detection below given number",
)
parser.add_argument(
    "--NMS_THRESH",
    default=0.5,
    type=float,
    help="NMS threshold to apply nms at the time of validation",
)
parser.add_argument(
    "--TOPK", default=10, type=int, help="topk detection to keep for evaluation"
)
parser.add_argument(
    "--GEN_CONF_THRESH",
    default=0.05,
    type=float,
    help="Confidence threshold at the time of generation and dumping",
)
parser.add_argument(
    "--GEN_TOPK", default=100, type=int, help="topk at the time of generation"
)
parser.add_argument(
    "--GEN_NMS", default=0.5, type=float, help="NMS at the time of generation"
)
parser.add_argument(
    "--CLASSWISE_NMS",
    default=False,
    type=str2bool,
    help="apply classwise NMS/no tested properly",
)
parser.add_argument(
    "--JOINT_4M_MARGINALS",
    default=False,
    type=str2bool,
    help="generate score of joints i.e. duplexes or triplet by marginals like agents and actions scores",
)

## paths hyper parameters
parser.add_argument(
    "--COMPUTE_PATHS",
    default=False,
    type=str2bool,
    help=" COMPUTE_PATHS if set true then it overwrite existing ones",
)
parser.add_argument(
    "--PATHS_IOUTH",
    default=0.5,
    type=float,
    help="Iou threshold for building paths to limit neighborhood search",
)
parser.add_argument(
    "--PATHS_COST_TYPE",
    default="score",
    type=str,
    help="cost function type to use for matching, other options are scoreiou, iou",
)
parser.add_argument(
    "--PATHS_JUMP_GAP",
    default=4,
    type=int,
    help="GAP allowed for a tube to be kept alive after no matching detection found",
)
parser.add_argument(
    "--PATHS_MIN_LEN", default=6, type=int, help="minimum length of generated path"
)
parser.add_argument(
    "--PATHS_MINSCORE",
    default=0.1,
    type=float,
    help="minimum score a path should have over its length",
)

## paths hyper parameters
parser.add_argument(
    "--COMPUTE_TUBES",
    default=False,
    type=str2bool,
    help="if set true then it overwrite existing tubes",
)
parser.add_argument(
    "--TUBES_ALPHA",
    default=0,
    type=float,
    help="alpha cost for changeing the label",
)
parser.add_argument(
    "--TRIM_METHOD",
    default="none",
    type=str,
    help="other one is indiv which works for UCF24",
)
parser.add_argument(
    "--TUBES_TOPK",
    default=10,
    type=int,
    help="Number of labels to assign for a tube",
)
parser.add_argument(
    "--TUBES_MINLEN", default=5, type=int, help="minimum length of a tube"
)
parser.add_argument(
    "--TUBES_EVAL_THRESHS",
    default="0.2,0.5",
    type=str,
    help="evaluation threshold for checking tube overlap at evaluation time, one can provide as many as one wants",
)

###
parser.add_argument(
    "--LOG_START",
    default=10,
    type=int,
    help="start loging after k steps for text/tensorboard",
)
parser.add_argument(
    "--LOG_STEP",
    default=10,
    type=int,
    help="Log every k steps for text/tensorboard",
)
parser.add_argument(
    "--TENSORBOARD",
    default=1,
    type=str2bool,
    help="Use tensorboard for loss/evalaution visualization",
)

# Program arguments
parser.add_argument(
    "--MAN_SEED", default=123, type=int, help="manualseed for reproduction"
)
parser.add_argument(
    "--MULTI_GPUS",
    default=True,
    type=str2bool,
    help="If  more than 0 then use all visible GPUs by default only one GPU used ",
)

_StoreAction(option_strings=['--MULTI_GPUS'], dest='MULTI_GPUS', nargs=None, const=None, default=True, type=<function str2bool at 0x75cdaea7ac10>, choices=None, help='If  more than 0 then use all visible GPUs by default only one GPU used ', metavar=None)

In [4]:
# Use CUDA_VISIBLE_DEVICES=0,1,4,6 to select GPUs to use

## Parse arguments
# args = utils.parse_my_args(parser_module=parser)
args, unknown = parser.parse_known_args()
args = utils.set_args(args)  # set directories and SUBSETS fo datasets
args.MULTI_GPUS = False if args.BATCH_SIZE == 1 else args.MULTI_GPUS

Your working directories are::
LOAD::>  /workspace/ 
SAVE::>  /workspace/road/cache/resnet50I3D512-Pkinetics-b4s8x1x1-roadt3-h3x3x3/
Your model will be initialized using /workspace/kinetics-pt/resnet50I3D.pth


In [6]:
if args.MODE == "train":
    args.TEST_SEQ_LEN = args.SEQ_LEN
else:
    args.SEQ_LEN = args.TEST_SEQ_LEN

args.SEQ_LEN = args.TEST_SEQ_LEN
args.MAX_SEQ_STEP = 1
args.SUBSETS = args.TEST_SUBSETS
full_test = True  # args.MODE != 'train'
args.skip_beggning = 0
args.skip_ending = 0
if args.MODEL_TYPE == "I3D" or "SlowFast":
    args.skip_beggning = 2
    args.skip_ending = 2
elif args.MODEL_TYPE != "C2D":
    args.skip_beggning = 2

skip_step = args.SEQ_LEN - args.skip_beggning

val_transform = transforms.Compose(
    [
        vtf.ResizeClip(args.MIN_SIZE, args.MAX_SIZE),
        vtf.ToTensorStack(),
        vtf.Normalize(mean=args.MEANS, std=args.STDS),
    ]
)

val_dataset = VideoDataset(
    args,
    train=False,
    transform=val_transform,
    skip_step=skip_step,
    full_test=full_test,
)

args.num_classes = val_dataset.num_classes
# one for objectness
args.label_types = val_dataset.label_types
args.num_label_types = val_dataset.num_label_types
args.all_classes = val_dataset.all_classes
args.num_classes_list = val_dataset.num_classes_list
args.num_ego_classes = val_dataset.num_ego_classes
args.ego_classes = val_dataset.ego_classes
args.head_size = 256





database.keys() dict_keys(['2014-06-25-16-45-34_stereo_centre_02', '2014-06-26-09-53-12_stereo_centre_02', '2014-07-14-14-49-50_stereo_centre_01', '2014-07-14-15-42-55_stereo_centre_03', '2014-08-08-13-15-11_stereo_centre_01', '2014-08-11-10-59-18_stereo_centre_02', '2014-11-14-16-34-33_stereo_centre_06', '2014-11-18-13-20-12_stereo_centre_05', '2014-11-21-16-07-03_stereo_centre_01', '2014-11-25-09-18-32_stereo_centre_04', '2014-12-09-13-21-02_stereo_centre_01', '2015-02-03-08-45-10_stereo_centre_02', '2015-02-03-19-43-11_stereo_centre_04', '2015-02-06-13-57-16_stereo_centre_02', '2015-02-13-09-16-26_stereo_centre_02', '2015-02-13-09-16-26_stereo_centre_05', '2015-02-24-12-32-19_stereo_centre_04', '2015-03-03-11-31-36_stereo_centre_01'])






In [7]:
net = build_retinanet(args).cuda()
if args.MULTI_GPUS:
    # logger.info("\nLets do dataparallel\n")
    net = torch.nn.DataParallel(net)

In [8]:
torch.cuda.is_available()

True

In [9]:
for epoch in args.EVAL_EPOCHS:
    args.MODEL_PATH = args.SAVE_ROOT + "model_{:06d}.pth".format(epoch)
    net.eval()
    net.load_state_dict(torch.load(args.MODEL_PATH))
    net.eval()  # switch net to evaluation mode

  net.load_state_dict(torch.load(args.MODEL_PATH))


In [10]:
import cv2
import os
base_path = "/workspace/road/rgb-images"
event = "2014-12-09-13-21-02_stereo_centre_01"
im_list = ["00000.jpg", "00001.jpg", "00002.jpg", "00003.jpg", "00004.jpg", "00005.jpg", "00006.jpg", "00007.jpg"]
imgs_paths = [os.path.join(base_path, event, im) for im in im_list]
print(imgs_paths)
images = [cv2.resize(cv2.imread(img_path).astype('float32'), (704, 512)) for img_path in imgs_paths]
images = torch.tensor([images]).permute(0, 4, 1, 2, 3).cuda(0, non_blocking=True)

['/workspace/road/rgb-images/2014-12-09-13-21-02_stereo_centre_01/00000.jpg', '/workspace/road/rgb-images/2014-12-09-13-21-02_stereo_centre_01/00001.jpg', '/workspace/road/rgb-images/2014-12-09-13-21-02_stereo_centre_01/00002.jpg', '/workspace/road/rgb-images/2014-12-09-13-21-02_stereo_centre_01/00003.jpg', '/workspace/road/rgb-images/2014-12-09-13-21-02_stereo_centre_01/00004.jpg', '/workspace/road/rgb-images/2014-12-09-13-21-02_stereo_centre_01/00005.jpg', '/workspace/road/rgb-images/2014-12-09-13-21-02_stereo_centre_01/00006.jpg', '/workspace/road/rgb-images/2014-12-09-13-21-02_stereo_centre_01/00007.jpg']


  images = torch.tensor([images]).permute(0, 4, 1, 2, 3).cuda(0, non_blocking=True)


In [11]:
# # torch.cuda.synchronize()
# # Create a tensor with the given size
# images = torch.randn(1, 3, 8, 512, 704).cuda(0, non_blocking=True)

# # Use permute to change the dimensions
# tensor_permuted = images.permute(0, 2, 1, 3, 4)

# # Verify the shape of the resulting tensor
# print(images.shape)


In [12]:
torch.cuda.synchronize()
activation = torch.nn.Sigmoid().cuda()

In [13]:
decoded_boxes, confidences, ego_preds = net(images)
confidence = activation(confidences)
seq_len = ego_preds.shape[1]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [14]:
decoded_boxes.shape

torch.Size([1, 8, 67536, 4])

In [15]:
confidences.shape

torch.Size([1, 8, 67536, 149])

In [16]:
confidence.shape

torch.Size([1, 8, 67536, 149])

In [17]:
confidence_batch = confidence[0, 0]
scores = confidence_batch[:, 0].squeeze().clone()

In [18]:
decoded_boxes_batch = decoded_boxes[0,0]
confidence_batch = confidence[0, 0]

In [19]:
print("confidence_batch shape", confidence_batch.shape)
print("decoded_boxes_batch shape", decoded_boxes_batch.shape)
print("scores shape", scores.shape)


confidence_batch shape torch.Size([67536, 149])
decoded_boxes_batch shape torch.Size([67536, 4])
scores shape torch.Size([67536])


In [20]:
cls_dets, save_data = utils.filter_detections_for_dumping(
    args, scores, decoded_boxes_batch, confidence_batch)

In [21]:
save_data.shape

(4, 153)