# **MODEL B: YOLOv3 + DeepSORT + ST-DenseNet** 
## A unified framework for pedestrian intention prediction.
1. **YOLOv3** -> Object detector: Responsible for identifying and detecting objects of interest in a given frame or image.

2. **DeepSORT** -> Object Tracker: Responsible for extracting appearance features from each detected pedestrian so that the same pedestrian can be re-identified and tracked across frames, even through occlusions.

3. **Spatio-Temporal DenseNet** -> Classifier: Responsible for classifying the intention of every identified and tracked pedestrian by using the last 16 frames of that pedestrian.

## **INSTRUCTIONS TO RUN THE MODEL ON GOOGLE COLAB**

This project was completely developed on Google Colab.

1. Connect runtime to GPU for better/faster results.

2. Clone the repository to Colab.

3. Next, open this link to add the shared folder (it contains the weight files) to your Google Drive: https://drive.google.com/drive/folders/1QzWZlR4FXJzR6nBT_ZL_k7ruzCep_CjK?usp=sharing

4. To run the remaining cells below, read the comments in each cell and run the cells in order. Some cells may print warnings; these can be safely ignored.

5. After calling the run_model() function, expect processing to take around 7 minutes on a GPU and around 20 minutes on a CPU.


In [None]:
# run this to clone the pedestrian_intension repository
!git clone https://github.com/NishilPatel99/pedestrian_intension.git

In [None]:
# Run this to mount Google Drive at /content/drive so the weight files from the
# shared Drive folder can be read (the default 'weights' flag points under
# '/content/drive/My Drive/').
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# run this
try:
  %tensorflow_version 2.x
except Exception:
  pass
import glob

%cd pedestrian_intension
 
import sys #Run this
from absl import app, logging, flags
from absl.flags import FLAGS
import time
import cv2
import numpy as np
import tensorflow as tf
from yolov3_tf2.models import (
    YoloV3, YoloV3Tiny
)
from yolov3_tf2.dataset import transform_images, load_tfrecord_dataset
from yolov3_tf2.utils import draw_outputs

%cd /content/Volvo-DataX/deep_sort
from ds_tools.generate_detections import create_box_encoder
from ds_application_util import preprocessing
from ds_deep_sort import nn_matching
from ds_deep_sort.detection import Detection
from ds_deep_sort.tracker import Tracker

%cd /content/Volvo-DataX
tf.compat.v1.disable_eager_execution()
physical_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)

flags.DEFINE_string('classes', 'data/coco.names', 'path to classes file')
flags.DEFINE_string('weights', '/content/drive/My Drive/datax_volvo_additional_files/yolov3_train_5.tf','path to weights file')
flags.DEFINE_boolean('tiny', False, 'yolov3 or yolov3-tiny')
flags.DEFINE_integer('size', 416, 'resize images to')
flags.DEFINE_string('tfrecord', None, 'tfrecord instead of image')
flags.DEFINE_integer('num_classes', 1, 'number of classes in the model')
flags.DEFINE_string('video', 'data/JAAD_test_video_0339.mp4','path to video file or number for webcam)')
flags.DEFINE_string('output','Result_model_B.mp4', 'path to output video')
flags.DEFINE_string('output_format', 'mp4v', 'codec used in VideoWriter when saving video to file')

app._run_init(['yolov3'], app.parse_flags_with_usage)

with open('densenet_model.json', 'r') as json_file:
    json_savedModel= json_file.read()

model_j = tf.keras.models.model_from_json(json_savedModel)
model_j.load_weights('densenet_1.hdf5')

def pred_func(X_test):
    """Return the predicted class index for the first sample of *X_test*.

    Only ``X_test[0:1]`` is passed to the module-level ``model_j``; the
    result is the argmax over that sample's prediction vector.
    """
    first_sample = X_test[0:1]
    class_scores = model_j.predict(first_sample, verbose=0)[0]
    return np.argmax(class_scores, axis=0)

In [None]:
# parameters for DeepSORT the object tracker
nms_max_overlap = 1.0
max_cosine_distance = 0.2  # threshold handed to the cosine distance metric
nn_budget = None  # budget handed to the metric; None is the value used here

# DeepSORT initialization
encoder = create_box_encoder('mars-small128.pb', batch_size=32)
metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
tracker = Tracker(metric)

# Yolo initialization
FLAGS.yolo_iou_threshold = 0.5
FLAGS.yolo_score_threshold = 0.5

# Honour the 'tiny' flag; with its default (False) the full YoloV3 model is
# built, exactly as before.
yolo_builder = YoloV3Tiny if FLAGS.tiny else YoloV3
yolo = yolo_builder(classes=FLAGS.num_classes)
yolo.load_weights(FLAGS.weights).expect_partial()

# Read the class names with a context manager so the file handle is closed.
with open(FLAGS.classes) as names_file:
    class_names = [line.strip() for line in names_file]

resize_out_ratio = 4.0
fps_time = 0

def run_model():
    """Run detection, tracking and intention classification on FLAGS.video.

    For every frame the function:
      1. detects objects with the module-level ``yolo`` model and keeps class 0,
      2. updates the module-level DeepSORT ``tracker`` with those detections,
      3. classifies each confirmed track with ``pred_func`` using the crops
         stored for that track on previous frames (up to the last 16),
      4. draws a red (intent == 1) or green box plus the track id, and
      5. writes the annotated frame to FLAGS.output when that flag is set.

    Relies on names defined at module level: FLAGS, yolo, encoder, tracker,
    transform_images, Detection and pred_func.

    Returns:
        None. Progress messages are printed; the video is written to disk.
    """
    import os

    print('Processing started.......')

    # FLAGS.video is either a webcam index ("0") or a file path.
    try:
        source = int(FLAGS.video)
    except ValueError:
        source = FLAGS.video
    vid = cv2.VideoCapture(source)
    out = None

    try:
        if not vid.isOpened():
            logging.error('Could not open video source: %s', FLAGS.video)
            return

        if FLAGS.output:
            # VideoCapture reports properties as floats; the frame size must
            # be integral, while the frame rate is passed through unrounded
            # so that e.g. 29.97 fps is not truncated to 29.
            width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fps = vid.get(cv2.CAP_PROP_FPS)
            codec = cv2.VideoWriter_fourcc(*FLAGS.output_format)
            out = cv2.VideoWriter(FLAGS.output, codec, fps, (width, height))

        frame = 0
        rolling_data = {}  # track id -> list of the most recent crops (max 16)

        while True:
            _, img = vid.read()
            if img is None:  # end of stream
                break
            frame += 1

            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img_orig = np.copy(img_rgb)  # unannotated RGB frame used for crops

            bbtlwh, scores = _detect_pedestrians(img, img_rgb)

            # DeepSORT: appearance features -> detections -> tracker update.
            features = encoder(img, bbtlwh)
            detections = [
                Detection(box, conf, feat)
                for box, conf, feat in zip(bbtlwh, scores, features)
            ]
            tracker.predict()
            tracker.update(detections)

            # Forget the history of tracks the tracker no longer holds, so
            # rolling_data cannot keep growing for the whole video.
            live_ids = {int(track.track_id) for track in tracker.tracks}
            for stale_id in set(rolling_data) - live_ids:
                del rolling_data[stale_id]

            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 1:
                    continue

                x, y, w, h = (int(v) for v in track.to_tlwh())
                track_id = int(track.track_id)

                # Classify from the crops stored on previous frames; a track
                # with no history yet defaults to 0 (not crossing).
                intent = _predict_intent(rolling_data.get(track_id))

                # Red -> crossing, green -> not crossing.
                color = (0, 0, 255) if intent == 1 else (0, 255, 0)
                img = cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
                img = cv2.putText(img, 'TrackID ' + str(track.track_id), (x, y - 5), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(255, 0, 0), thickness=2)

                # Store the current crop, keeping only the last 16 per track.
                crop = _crop_pedestrian(img_orig, x, y, w, h)
                if crop is not None:
                    history = rolling_data.setdefault(track_id, [])
                    history.append(np.asarray(crop))
                    if len(history) > 16:
                        del history[0]

            # Stamp the frame number once per frame, including frames where
            # no pedestrian is tracked.
            img = cv2.putText(img,"Frame No: %d" % (frame),(10, 10),  cv2.FONT_HERSHEY_SIMPLEX, 0.5,(0, 0, 255), 2)

            if out is not None:
                out.write(img)

            if cv2.waitKey(1) == ord('q'):
                break
    finally:
        # Release the capture and, importantly, the writer so the output
        # file is finalised even if processing fails part-way.
        vid.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()

    print('\nProcessing completed.......!!!')
    if FLAGS.output:
        print('Check video file: ' + os.path.abspath(FLAGS.output))
    return


def _detect_pedestrians(img_bgr, img_rgb):
    """Run YOLO on one frame; return ([x, y, w, h] pixel boxes, scores) for class 0."""
    batch = tf.expand_dims(img_rgb, 0)
    batch = transform_images(batch, FLAGS.size)

    boxes, scores, classes, nums = yolo.predict(batch, steps=1)
    count = nums[0]
    is_class_zero = classes[0][:count] == 0
    boxes = boxes[:, :count, :].reshape(count, 4)[is_class_zero]
    scores = scores[0][:count][is_class_zero]

    # Boxes are scaled by (width, height) and converted from
    # [x1, y1, x2, y2] to [x1, y1, w, h].
    wh = np.flip(img_bgr.shape[0:2])
    bbtlwh = []
    for box in boxes:
        x1, y1 = (np.array(box[0:2]) * wh).astype(np.int32)
        x2, y2 = (np.array(box[2:4]) * wh).astype(np.int32)
        bbtlwh.append([x1, y1, x2 - x1, y2 - y1])
    return bbtlwh, scores


def _predict_intent(history, window=16):
    """Return pred_func's class for a track's stored crops, or 0 without history."""
    if not history:
        return 0
    # Until a full window is available, repeat the latest crop to fill it.
    frames = history if len(history) == window else [history[-1]] * window
    seq = np.stack(np.array(frames), axis=2)
    seq = np.expand_dims(seq, axis=0)
    return pred_func(seq)


def _crop_pedestrian(image, x, y, w, h, size=(100, 100)):
    """Return the box region of *image* resized to *size*, or None if it is empty.

    The box is clamped to the image first: negative coordinates would
    otherwise be interpreted as from-the-end indices by the slice and select
    the wrong (or an empty) region.
    """
    img_h, img_w = image.shape[:2]
    left, top = max(x, 0), max(y, 0)
    right, bottom = min(x + w, img_w), min(y + h, img_h)
    if right <= left or bottom <= top:
        return None
    return cv2.resize(image[top:bottom, left:right], size)

### Run this to obtain the Model-B output as a video file named **'Result_model_B.mp4'** in the pedestrian_intension folder.

In [None]:
# Process the video selected by FLAGS.video and write the annotated result to FLAGS.output.
run_model()