# **MODEL A: YOLOv3 + SORT + ST-DenseNet** 
## A unified framework for pedestrian intention prediction.
1. **YOLOv3** -> Object detector: Responsible for identifying and detecting objects of interest in a given frame or image.
2. **SORT** -> Object Tracker: Responsible for tracking the detected pedestrians across the sequence of frames and maintaining a unique ID for each pedestrian.
3. **Spatio-Temporal DenseNet** -> Classifier: Responsible for classifying the intention of every detected and tracked pedestrian using that pedestrian's last 16 frames.

## **INSTRUCTIONS TO RUN THE MODEL ON GOOGLE COLAB**

This project was completely developed on Google Colab.

1. Connect runtime to GPU for better/faster results.

2. Clone the repository to Colab.

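3. Next, open this link and add the folder as a shortcut in your Google Drive ("My Drive"); the code loads the YOLOv3 weights from `My Drive/datax_volvo_additional_files`: https://drive.google.com/drive/folders/1QzWZlR4FXJzR6nBT_ZL_k7ruzCep_CjK?usp=sharing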

4. To run the remaining cells below, observe the comments and run them appropriately. 

5. After calling the `run_model()` function, expect processing to take around 5 minutes on a GPU and around 15 minutes on a CPU.





In [None]:
# Run this to clone the pedestrian_intension repository into the current Colab working directory.
!git clone https://github.com/NishilPatel99/pedestrian_intension.git

In [None]:
# Run this to mount Google Drive at /content/drive so the weight files in the
# shared Drive folder can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Run this to initialize the required components [YOLO, SORT and DenseNet]
# Ask for TensorFlow 2.x through the notebook magic; any failure of the magic
# is ignored so the rest of the cell still runs.
try:
  %tensorflow_version 2.x
except Exception:
  pass

# Installed before `sortn` is imported below (presumably a dependency of the
# SORT tracker -- confirm against sortn.py).
!pip install filterpy
# Work from inside the cloned repository so the relative paths used below
# (data/..., densenet_model.json, densenet_1.hdf5) resolve.
%cd pedestrian_intension

import sys
from absl import app, logging, flags
from absl.flags import FLAGS
import time
import cv2
import numpy as np
import tensorflow as tf
from yolov3_tf2.models import (
    YoloV3, YoloV3Tiny
)
from yolov3_tf2.dataset import transform_images, load_tfrecord_dataset
from yolov3_tf2.utils import draw_outputs

from sortn import *

# Command-line style configuration for the pipeline, registered with absl.
# Input/model settings.
flags.DEFINE_string(
    name='classes',
    default='data/coco.names',
    help='path to classes file',
)
flags.DEFINE_string(
    name='weights',
    default='/content/drive/My Drive/datax_volvo_additional_files/yolov3_train_5.tf',
    help='path to weights file',
)
flags.DEFINE_boolean(name='tiny', default=False, help='yolov3 or yolov3-tiny')
flags.DEFINE_integer(name='size', default=416, help='resize images to')
flags.DEFINE_string(name='tfrecord', default=None, help='tfrecord instead of image')
flags.DEFINE_integer(
    name='num_classes',
    default=1,
    help='number of classes in the model',
)

# Video input/output settings.
flags.DEFINE_string(
    name='video',
    default='data/JAAD_test_video_0339.mp4',
    help='path to video file or number for webcam)',
)
flags.DEFINE_string(
    name='output',
    default='Result_model_A.mp4',
    help='path to output video',
)
flags.DEFINE_string(
    name='output_format',
    default='mp4v',
    help='codec used in VideoWriter when saving video to file',
)

# Parse the flags with a fixed argv so FLAGS is usable inside the notebook
# without going through app.run().
app._run_init(['yolov3'], app.parse_flags_with_usage)

# Enable on-demand GPU memory allocation for the first GPU, if there is one.
# The list is empty on a CPU-only runtime, so indexing it unconditionally
# would raise IndexError and stop the cell before the models are loaded.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Rebuild the DenseNet classifier: the architecture comes from its JSON
# description, the trained parameters from the HDF5 weights file.
with open('densenet_model.json') as json_file:
    json_savedModel = json_file.read()

model_j = tf.keras.models.model_from_json(json_savedModel)
model_j.load_weights('densenet_1.hdf5')

def pred_func(X_test):
    """Classify the first sample of X_test with the loaded DenseNet model.

    Only ``X_test[0:1]`` is passed to ``model_j.predict``; any further
    samples in the batch are ignored.

    Returns:
        The index of the largest entry in that sample's prediction vector.
    """
    first_sample = X_test[:1]
    class_scores = model_j.predict(first_sample, verbose=0)[0]
    return np.argmax(class_scores, axis=0)

In [None]:
# Run this
# Detection thresholds read by the YOLO model (these flags are not defined in
# this notebook; they are expected to come from the yolov3_tf2 imports).
FLAGS.yolo_iou_threshold = 0.5
FLAGS.yolo_score_threshold = 0.5

# Module-level drawing defaults; run_model() reads `thickness` for boxes and
# labels and picks its own per-pedestrian box color.
color = (255, 0, 0)
thickness = 2

yolo = YoloV3(classes=FLAGS.num_classes)

yolo.load_weights(FLAGS.weights).expect_partial()
logging.info('weights loaded')

# Read the class names inside a context manager so the file handle is closed
# as soon as the list is built.
with open(FLAGS.classes) as classes_file:
    class_names = [line.strip() for line in classes_file]
logging.info('classes loaded')



def _open_video_source(source):
    """Open *source* as a camera index when it is numeric, else as a path."""
    try:
        return cv2.VideoCapture(int(source))
    except (TypeError, ValueError):
        # Not an integer-like value: treat it as a file path / URL.
        return cv2.VideoCapture(source)


def _crop_pedestrian(image, top_left, bottom_right, size=(100, 100)):
    """Return the box region of *image* resized to *size*.

    Returns None when OpenCV cannot resize the region (for example when the
    box yields an empty crop), so the caller can skip that frame for the
    track instead of aborting the whole run.
    """
    patch = image[top_left[1]:bottom_right[1], top_left[0]:bottom_right[0]]
    try:
        return cv2.resize(patch, size)
    except cv2.error:
        return None


def run_model():
    """Run detection, tracking and intention classification on FLAGS.video.

    For every frame the YOLO model proposes boxes, the class-0 boxes are fed
    to a SORT tracker, and each tracked ID is classified with ``pred_func``
    from the crops stored for that ID in previous frames (the latest stored
    crop is repeated 16 times until 16 crops are available). A track
    classified as 1 is drawn in red, every other track in green, and the
    annotated frame is appended to FLAGS.output when that flag is set.

    Returns:
        None. Progress messages are printed; the video source and writer are
        always released, even if processing fails part-way.
    """
    # Local import keeps this cell self-contained.
    from collections import deque

    seq_len = 16  # number of crops the classifier consumes per pedestrian

    print('Processing started.......')
    frame = 0

    vid = _open_video_source(FLAGS.video)
    if not vid.isOpened():
        # Without this check an unreadable source silently produced an empty
        # result and a misleading "completed" message.
        vid.release()
        print('Could not open video source: {}'.format(FLAGS.video))
        return

    out = None

    try:
        if FLAGS.output:
            # by default VideoCapture returns float instead of int
            width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fps = int(vid.get(cv2.CAP_PROP_FPS))
            codec = cv2.VideoWriter_fourcc(*FLAGS.output_format)
            out = cv2.VideoWriter(FLAGS.output, codec, fps, (width, height))

        # create instance of SORT
        mot_tracker = Sort()
        # track ID -> the most recent crops of that pedestrian (at most seq_len)
        rolling_data = {}

        while True:
            _, img = vid.read()

            if img is None:
                break

            frame += 1

            img_in = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            # Untouched copy: crops must not contain the boxes drawn on `img`.
            img_orig = np.copy(img)
            img_in = tf.expand_dims(img_in, 0)
            img_in = transform_images(img_in, FLAGS.size)

            boxes, _scores, classes, nums = yolo.predict(img_in)  # yolo prediction
            dets = boxes[:, :nums[0], :].reshape(nums[0], 4)
            # keep class 0 (pedestrians) only and track them
            trackers = mot_tracker.update(dets[classes[0][:nums[0]] == 0])

            # (width, height) of the frame; boxes are scaled by it to pixels.
            wh = np.flip(img.shape[0:2])

            for d in trackers:
                track_id = int(d[4])
                # Plain Python ints keep the OpenCV drawing calls happy.
                x1y1 = tuple((np.array(d[0:2]) * wh).astype(np.int32).tolist())
                x2y2 = tuple((np.array(d[2:4]) * wh).astype(np.int32).tolist())

                history = rolling_data.get(track_id)

                y = 0
                if history is not None:
                    if len(history) == seq_len:
                        clip = list(history)
                    else:
                        # Not enough history yet: repeat the latest crop.
                        clip = [history[-1]] * seq_len
                    seq = np.stack(clip, axis=2)  # (100*100*16*3)
                    seq = np.expand_dims(seq, axis=0)
                    y = pred_func(seq)  # classification output

                # risky pedestrian identification thru box color
                color = (0, 0, 255) if y == 1 else (0, 255, 0)

                cv2.rectangle(img, x1y1, x2y2, color, thickness)
                cv2.putText(img, str(track_id), org=(x1y1[0], x1y1[1] - 5),
                            fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1,
                            color=color, thickness=thickness)

                # storing the data for last 16 frames
                crop = _crop_pedestrian(img_orig, x1y1, x2y2)
                if crop is None:
                    continue
                if history is None:
                    # maxlen drops the oldest crop automatically once full.
                    history = rolling_data[track_id] = deque(maxlen=seq_len)
                history.append(crop)

            # Drawn once per frame so the counter also appears on frames
            # without any tracked pedestrian.
            cv2.putText(img, "Frame No: {}".format(frame), (0, 30),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (255, 0, 0), 2)

            if out is not None:
                out.write(img)
            #cv2.imshow('output', img)
            if cv2.waitKey(1) == ord('q'):
                break
    finally:
        # Releasing the writer finalizes the output file; releasing the
        # capture frees the source.
        vid.release()
        if out is not None:
            out.release()

    cv2.destroyAllWindows()
    print('\nProcessing completed.......!!!')
    print('Check video file in pedestrian_intension folder!')

    return

### Run this to obtain the Model-A output as a video file named **'Result_model_A.mp4'** in the pedestrian_intension folder.

In [None]:
# Process FLAGS.video end to end; the annotated video is written to
# FLAGS.output when that flag is set.
run_model()

To download the generated result video, locate it inside the pedestrian_intension folder in the file browser (left pane) of the Colab notebook and download it from there.