In [27]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Conv2D, LocallyConnected2D, LeakyReLU, MaxPool2D, BatchNormalization

In [26]:
class YOLOv1:
    """Graph builder for the convolutional stack of a YOLOv1 detector.

    The constructor only validates and stores hyper-parameters. The TensorFlow
    graph itself is created by `forward_propagation`, which relies on the
    TensorFlow 1.x graph-mode APIs `tf.placeholder` and `tf.variable_scope`.
    """

    def __init__(self, img_h=448, img_w=448, img_ch=3, data_format="channels_last", alpha=0.1, momentum=0.9, classes=20, sides=7):
        """
        img_h: Height of input image
        img_w: Width of input image
        img_ch: Channels of input image (3: RGB, 1: Gray-scale)
        data_format: "channels_last" >> (batch, height, width, channels)
                     "channels_first" >> (batch, channels, height, width)
        alpha: Negative slope coefficient for leaky relu
        momentum: Momentum for the moving average for batch normalization
        classes: Numbers of predicted classes
        sides: Numbers of grids NxN on input image

        Raises AssertionError when an argument has the wrong type or when
        data_format is not one of the two supported strings.
        """
        assert isinstance(img_h, int), "Expect img_h's type to be integer: {}".format(img_h)
        assert isinstance(img_w, int), "Expect img_w's type to be integer: {}".format(img_w)
        assert isinstance(img_ch, int), "Expect img_ch's type to be integer: {}".format(img_ch)
        assert isinstance(classes, int), "Expect classes's type to be integer: {}".format(classes)
        assert isinstance(sides, int), "Expect sides's type to be integer: {}".format(sides)
        assert isinstance(alpha, float), "Expect alpha's type to be float: {}".format(alpha)
        assert isinstance(momentum, float), "Expect momentum's type to be float: {}".format(momentum)
        assert data_format in ("channels_last", "channels_first"), \
            "Expect data_format to be 'channels_last' or 'channels_first': {}".format(data_format)

        self.img_h = img_h
        self.img_w = img_w
        self.img_ch = img_ch
        self.data_format = data_format
        self.alpha = alpha
        self.momentum = momentum
        self.classes = classes
        self.sides = sides

    def _Conv2D(self, inputs, filters, kernel_size, strides=(1, 1)):
        """
        Convolution with leaky relu activation followed by batch normalization,
        the combination every convolutional layer of this model uses.

        inputs: 4-D tensor laid out according to self.data_format
        filters: Number of output channels
        kernel_size: Size of the convolution window
        strides: Strides of the convolution

        Returns the batch-normalized output tensor.
        """
        # "same" padding keeps the spatial size (divided by the stride). With
        # the Keras default "valid" padding a 448x448 input shrinks to 1x1
        # before the last stride-2 3x3 convolution and the graph cannot be built.
        outputs = Conv2D(filters=filters,
                         kernel_size=kernel_size,
                         strides=strides,
                         padding="same",
                         data_format=self.data_format,
                         activation=LeakyReLU(alpha=self.alpha))(inputs)

        # Normalize over the channel axis, which is axis 1 for "channels_first"
        # (the Keras default of -1 is only the channel axis for "channels_last").
        channel_axis = 1 if self.data_format == "channels_first" else -1
        return BatchNormalization(axis=channel_axis, momentum=self.momentum)(outputs)

    def forward_propagation(self, inputs):
        """
        Build the YOLOv1 graph up to the first locally connected layer.

        inputs: Numpy array of images of 4 dimensions, (batch, height, width, channels)
                or (batch, channels, height, width) depending on data_format.
                It is only validated here; feed it through the returned
                placeholder when running the graph.

        Returns a tuple (images, outputs): the float32 input placeholder and
        the output tensor of the last layer built.

        Raises AssertionError when inputs is not a 4-D numpy array.
        """
        assert isinstance(inputs, np.ndarray), "Expect inputs's type to be numpy array: {}".format(inputs)
        assert len(inputs.shape) == 4, "Expect inputs's shape to be 4 dimensions (batch, height, width, channels) or (batch, channels, height, width): {}".format(inputs.shape)

        # Create placeholder for input image. It gets its own name so that the
        # validated `inputs` array is not shadowed and the caller can feed it.
        images = tf.placeholder(dtype=tf.float32)

        # Reshape image according to data_format
        if self.data_format == "channels_last":
            outputs = tf.reshape(images, [-1, self.img_h, self.img_w, self.img_ch])
        else:
            outputs = tf.reshape(images, [-1, self.img_ch, self.img_h, self.img_w])

        # Construct YOLOv1 model
        with tf.variable_scope("YOLOv1"):
            # Convolutional layer1
            outputs = self._Conv2D(inputs=outputs,
                                   filters=64,
                                   kernel_size=7,
                                   strides=2)

            outputs = MaxPool2D(pool_size=2,
                                strides=2,
                                data_format=self.data_format)(outputs)

            # Convolutional layer2
            outputs = self._Conv2D(inputs=outputs,
                                   filters=192,
                                   kernel_size=3)

            outputs = MaxPool2D(pool_size=2,
                                strides=2,
                                data_format=self.data_format)(outputs)

            # Convolutional layer3
            outputs = self._Conv2D(inputs=outputs,
                                   filters=128,
                                   kernel_size=1)

            outputs = self._Conv2D(inputs=outputs,
                                   filters=256,
                                   kernel_size=3)

            outputs = self._Conv2D(inputs=outputs,
                                   filters=256,
                                   kernel_size=1)

            outputs = self._Conv2D(inputs=outputs,
                                   filters=512,
                                   kernel_size=3)

            outputs = MaxPool2D(pool_size=2,
                                strides=2,
                                data_format=self.data_format)(outputs)

            # Convolutional layer4
            for _ in range(4):
                outputs = self._Conv2D(inputs=outputs,
                                       filters=256,
                                       kernel_size=1)

                outputs = self._Conv2D(inputs=outputs,
                                       filters=512,
                                       kernel_size=3)

            outputs = self._Conv2D(inputs=outputs,
                                   filters=512,
                                   kernel_size=1)

            outputs = self._Conv2D(inputs=outputs,
                                   filters=1024,
                                   kernel_size=3)

            outputs = MaxPool2D(pool_size=2,
                                strides=2,
                                data_format=self.data_format)(outputs)

            # Convolutional layer5
            for _ in range(2):
                outputs = self._Conv2D(inputs=outputs,
                                       filters=512,
                                       kernel_size=1)

                outputs = self._Conv2D(inputs=outputs,
                                       filters=1024,
                                       kernel_size=3)

            outputs = self._Conv2D(inputs=outputs,
                                   filters=1024,
                                   kernel_size=3)

            outputs = self._Conv2D(inputs=outputs,
                                   filters=1024,
                                   kernel_size=3,
                                   strides=2)

            # Convolutional layer6
            outputs = self._Conv2D(inputs=outputs,
                                   filters=1024,
                                   kernel_size=3)

            outputs = self._Conv2D(inputs=outputs,
                                   filters=1024,
                                   kernel_size=3)

            # Fully connected layer1
            # The layer object must be called on the tensor; constructing it
            # alone would leave `outputs` bound to a Layer, not a tensor.
            outputs = LocallyConnected2D(filters=256,
                                         kernel_size=3,
                                         data_format=self.data_format,
                                         activation=LeakyReLU(alpha=self.alpha))(outputs)

        return images, outputs

In [16]:
# Scratch check: a random image batch is a 4-D numpy array, which is what
# YOLOv1.forward_propagation asserts on. Note that `img_h` holds the whole
# batch here, not an image height.
batch_shape = (1, 448, 448, 3)
img_h = np.random.rand(*batch_shape)
for check in (type(img_h) is np.ndarray, isinstance(img_h, np.ndarray), img_h.ndim):
    print(check)

True
True
4
