In [5]:
import matplotlib.pyplot as plt
import matplotlib.image as imgg
import os
import pandas as pd
import numpy as np

from PIL import Image
from PIL import ImageDraw

In [6]:
# Category ids (the values of the annotation 'class' column) mapped to names.
# Each name's id is its position in the sequence below.
class_ = dict(enumerate((
    'ignored regions',   # 0
    'pedestrian',        # 1
    'people',            # 2
    'bicycle',           # 3
    'car',               # 4
    'van',               # 5
    'truck',             # 6
    'tricycle',          # 7
    'awning-tricycle',   # 8
    'bus',               # 9
    'motor',             # 10
    'others',            # 11
)))

In [7]:
def read_annot(path='VisDrone2019-DET-train/annotations/0000002_00005_d_0000014.txt'):
    """Load one annotation file into a pandas DataFrame.

    The file is read as header-less, comma-separated text. Its first eight
    fields are named ``x, y, w, h, conf, class, truncation, occlusion``.
    Any field after the eighth (for example the empty field produced by a
    trailing comma at the end of a line) is ignored rather than causing a
    column-count mismatch.

    Parameters
    ----------
    path : str, path-like or file-like
        Annotation file to read; handed directly to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per annotation line with the eight columns above plus
        ``x_`` (``x + w``) and ``y_`` (``y + h``), i.e. the corner opposite
        to ``(x, y)``.
    """
    columns = ['x', 'y', 'w', 'h', 'conf', 'class', 'truncation', 'occlusion']

    # Restricting the parser to the first len(columns) fields keeps the
    # names aligned with the data even when a line carries extra fields.
    annot = pd.read_csv(
        path,
        header=None,
        names=columns,
        usecols=list(range(len(columns))),
    )

    annot['x_'] = annot['x'] + annot['w']
    annot['y_'] = annot['y'] + annot['h']

    return annot

In [8]:
def draw_boxes(image, boxes):
    """Return an RGBA image showing ``image`` with every box outlined in yellow.

    The rectangles are drawn on a separate, fully transparent layer of the
    same size as ``image``; that layer is then alpha-composited over an RGBA
    conversion of ``image``. ``image`` itself is never drawn on.

    Parameters
    ----------
    image : PIL image
        Picture to annotate.
    boxes : iterable of (x, y, x_, y_) tuples
        Corner coordinates of each rectangle to outline.

    Returns
    -------
    PIL image
        The composited RGBA result.
    """
    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
    pen = ImageDraw.Draw(overlay)
    yellow = (255, 255, 0)

    for corners in boxes:
        pen.rectangle(corners, outline=yellow)

    base = image.convert('RGBA')
    return Image.alpha_composite(base, overlay)

In [9]:
def resized_box(image, box, new_size=(128, 128), as_numpy=True):
    """Cut ``box`` out of ``image`` and resize the cut-out to ``new_size``.

    Parameters
    ----------
    image : PIL image
        Picture to crop from.
    box : tuple
        (x, y, x_, y_) corners of the region, passed to ``image.crop``.
    new_size : tuple, default (128, 128)
        Target size, passed to ``resize``.
    as_numpy : bool, default True
        When true the result is converted with ``np.array``; otherwise the
        resized image object is returned as is.
    """
    patch = image.crop(box).resize(new_size)
    return np.array(patch) if as_numpy else patch
    

#### Demo using the VisDrone training set

Download the dataset from https://github.com/VisDrone/VisDrone-Dataset

In [10]:
# Open one training image and show it with PIL's viewer.
im = Image.open('VisDrone2019-DET-train/images/0000002_00005_d_0000014.jpg')
im.show()

In [11]:
# Parse the annotation file with the same file stem as the demo image,
# then preview the first rows.
annot = read_annot('VisDrone2019-DET-train/annotations/0000002_00005_d_0000014.txt')
annot.head()

Unnamed: 0,x,y,w,h,conf,class,truncation,occlusion,x_,y_
0,684,8,273,116,0,0,0,0,957,124
1,406,119,265,70,0,0,0,0,671,189
2,255,22,119,128,0,0,0,0,374,150
3,1,3,209,78,0,0,0,0,210,81
4,708,471,74,33,1,4,0,1,782,504


In [12]:
# Collect every annotation as an (x, y, x_, y_) corner tuple, the format
# draw_boxes takes. itertuples walks the frame once, instead of re-slicing
# the four columns and calling .iloc for every single row.
l = list(annot[['x', 'y', 'x_', 'y_']].itertuples(index=False, name=None))

In [13]:
# Outline every annotated box on the demo image.
boxed = draw_boxes(im, l)

In [14]:
# Show the image with all boxes drawn.
boxed.show()

In [15]:
# Keep only the rows whose category id is 4 ('car' in class_) and collect
# their (x, y, x_, y_) corner tuples in a single pass over the frame.
crs = annot[annot['class'] == 4]
cars = list(crs[['x', 'y', 'x_', 'y_']].itertuples(index=False, name=None))
    

In [16]:
# Outline only the car boxes and show the result.
boxed_cars = draw_boxes(im, cars)
boxed_cars.show()

In [17]:
# Crop the first car and resize it to 224x224, kept as an image object
# (as_numpy=False) so it can be shown.
car_sample = resized_box(im, cars[0], new_size=(224, 224), as_numpy=False)
car_sample.show()

In [18]:
# Convert the cropped image to a NumPy array and check its shape.
car_sample_numpy = np.asarray(car_sample)
car_sample_numpy.shape

(224, 224, 3)

In [19]:
# Add a leading batch axis: (224, 224, 3) -> (1, 224, 224, 3).
c_test = car_sample_numpy.reshape((-1, 224, 224, 3))

### How Keras application models are used (example: VGG16)

In [1]:
import tensorflow as tf

In [2]:
# Build VGG16 from tf.keras.applications with its default arguments.
model = tf.keras.applications.VGG16()

In [3]:
# Print the layer-by-layer architecture of the model.
model.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0     

In [4]:
# Look up the documentation of the VGG16 input preprocessing function.
help(tf.keras.applications.vgg16.preprocess_input)

Help on function preprocess_input in module tensorflow.python.keras.applications.vgg16:

preprocess_input(x, data_format=None)
    Preprocesses a tensor or Numpy array encoding a batch of images.
    
    Usage example with `applications.MobileNet`:
    
    ```python
    i = tf.keras.layers.Input([None, None, 3], dtype = tf.uint8)
    x = tf.cast(i, tf.float32)
    x = tf.keras.applications.mobilenet.preprocess_input(x)
    core = tf.keras.applications.MobileNet()
    x = core(x)
    model = tf.keras.Model(inputs=[i], outputs=[x])
    
    image = tf.image.decode_png(tf.io.read_file('file.png'))
    result = model(image)
    ```
    
    Arguments:
      x: A floating point `numpy.array` or a `tf.Tensor`, 3D or 4D with 3 color
        channels, with values in the range [0, 255].
        The preprocessed data are written over the input data
        if the data types are compatible. To avoid this
        behaviour, `numpy.copy(x)` can be used.
      data_format: Optional data format of 

In [20]:
# preprocess_input accepts NumPy arrays directly.
# c_test is the (-1, 224, 224, 3) batch produced by the image-cropping demo.
c_test_processed = tf.keras.applications.vgg16.preprocess_input(c_test)

In [22]:
# Run the model on the preprocessed batch; the output is one row of scores per input.
model.predict(c_test_processed)

array([[7.98721612e-06, 1.97758472e-05, 2.67540884e-07, 2.82008216e-07,
        2.65288804e-07, 5.16542173e-07, 1.43303156e-08, 2.31118725e-06,
        2.59092218e-07, 1.56043313e-07, 2.28366116e-05, 8.69477299e-06,
        3.04087820e-07, 1.49266938e-07, 4.53398570e-07, 3.23752289e-07,
        9.44367400e-07, 5.52594088e-07, 1.01379817e-06, 1.10875108e-05,
        6.00302201e-07, 4.42663168e-06, 4.80145616e-07, 1.90776382e-06,
        5.40702160e-07, 1.20011677e-07, 1.33437311e-07, 4.01227169e-07,
        5.41248426e-07, 1.84590897e-06, 6.89191353e-08, 7.04502668e-07,
        7.84828501e-07, 3.33934338e-07, 1.35272705e-06, 1.86143808e-07,
        2.09085511e-06, 1.65741241e-07, 7.81737469e-07, 2.09734665e-07,
        6.69801068e-07, 1.98512978e-07, 3.06694432e-07, 2.63321090e-06,
        1.02304874e-07, 2.42160127e-07, 4.00560026e-07, 1.63145432e-06,
        8.21997119e-08, 1.98001597e-08, 1.03055893e-08, 1.76509911e-05,
        8.85989380e-07, 8.79798222e-07, 1.47184608e-07, 2.094608

In [25]:
# Look up the documentation of the helper that turns score rows into class labels.
help(tf.keras.applications.vgg16.decode_predictions)

Help on function decode_predictions in module tensorflow.python.keras.applications.vgg16:

decode_predictions(preds, top=5)
    Decodes the prediction of an ImageNet model.
    
    Arguments:
      preds: Numpy array encoding a batch of predictions.
      top: Integer, how many top-guesses to return. Defaults to 5.
    
    Returns:
      A list of lists of top class prediction tuples
      `(class_name, class_description, score)`.
      One list of tuples per sample in batch input.
    
    Raises:
      ValueError: In case of invalid shape of the `pred` array
        (must be 2D).



In [26]:
# NOTE(review): this call is fed c_test rather than c_test_processed, i.e. the
# batch is not routed through preprocess_input here — presumably a deliberate
# comparison with the preprocessed prediction; confirm.
model(c_test)

<tf.Tensor: shape=(1, 1000), dtype=float32, numpy=
array([[4.99585610e-07, 6.66371443e-06, 5.50861401e-08, 6.93907865e-08,
        4.07197867e-08, 4.91934998e-07, 1.40191281e-09, 1.79410890e-07,
        6.66405997e-09, 1.10179634e-08, 4.51446027e-07, 2.98018001e-08,
        4.25635971e-09, 7.98650313e-10, 1.65187046e-08, 1.07168061e-08,
        3.32182317e-08, 9.15751830e-09, 4.13508738e-09, 9.53321191e-08,
        6.33649222e-09, 2.40366717e-07, 1.38586174e-07, 7.07530532e-08,
        1.66155871e-08, 2.23489973e-08, 4.16140473e-08, 7.64560923e-07,
        2.24515716e-07, 4.78141146e-07, 1.86807236e-09, 1.44192470e-07,
        3.09392021e-07, 8.67645511e-08, 1.67819508e-07, 1.03496296e-08,
        1.63773976e-07, 8.43626946e-09, 9.14025179e-07, 2.09174758e-08,
        2.75679582e-07, 4.70874006e-08, 4.89864043e-08, 1.34228776e-06,
        4.05340828e-08, 5.68643124e-08, 1.49365405e-07, 4.55376551e-07,
        2.42940934e-09, 1.65682046e-09, 4.86891194e-10, 2.48741844e-06,
        2.729

In [24]:
# Call the model on the preprocessed batch, convert the result to a NumPy
# array, and decode it into (class_name, class_description, score) tuples.
tf.keras.applications.vgg16.decode_predictions(
    np.array(model(c_test_processed))
)

[[('n04254680', 'soccer_ball', 0.2916611),
  ('n04254120', 'soap_dispenser', 0.15025946),
  ('n03062245', 'cocktail_shaker', 0.12952496),
  ('n04442312', 'toaster', 0.060182735),
  ('n04286575', 'spotlight', 0.060095843)]]

#### It seems this network won't be of much help

TODO: test this network and others with more images, and with multiple classes and boxes from each image.