In [1]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from tensorflow.keras import backend as K
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

from rl_with_videos.preprocessors.convnet import convnet_preprocessor

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
IMG_SIZE = 48
BATCH_SIZE = 64
EPOCHS = 10

MAX_SEQ_LENGTH = 20
NUM_FEATURES = 2048

TRAINING_FILE = "C:/nyu/DRL/final_project/dataset/UCF101/train.csv"
TESTING_FILE = "C:/nyu/DRL/final_project/dataset/UCF101/test.csv"

LABELS_CLASS = ['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']
NUM_CLASSES = 5

In [3]:
train_df = pd.read_csv(TRAINING_FILE)
test_df = pd.read_csv(TESTING_FILE)

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

train_df.sample(10)

Total videos for training: 594
Total videos for testing: 224


Unnamed: 0,video_name,tag
277,v_Punch_g13_c06.avi,Punch
17,v_CricketShot_g10_c04.avi,CricketShot
105,v_CricketShot_g24_c02.avi,CricketShot
10,v_CricketShot_g09_c04.avi,CricketShot
51,v_CricketShot_g15_c03.avi,CricketShot
532,v_TennisSwing_g16_c01.avi,TennisSwing
503,v_TennisSwing_g11_c07.avi,TennisSwing
64,v_CricketShot_g17_c02.avi,CricketShot
462,v_ShavingBeard_g23_c06.avi,ShavingBeard
181,v_PlayingCello_g17_c05.avi,PlayingCello


In [4]:
def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

In [5]:
feature_extractor = convnet_preprocessor([(6912,)], (48,48,3), 256)
feature_extractor.trainable = True

kwargs: {}
Instructions for updating:
Colocations handled automatically by placer.
kwargs: {}
name: feedforward_model
inputs: [<tf.Tensor 'input_2:0' shape=(?, 4608) dtype=float32>]


In [6]:
feature_extractor.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 6912)         0                                            
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 6912)         0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               [(None, 6912), (None 0           lambda[0][0]                     
__________________________________________________________________________________________________
reshape (Reshape)               (None, 48, 48, 3)    0           lambda_1[0][0]                   
__________________________________________________________________________________________________
conv2d (Co

In [7]:
class PositionalEmbedding(keras.layers.Layer):
    def __init__(self, sequence_length, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.position_embeddings = keras.layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.output_dim = output_dim

    def call(self, inputs):
        # The inputs are of shape: `(batch_size, frames, num_features)`
        length = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embeddings(positions)
        return inputs + embedded_positions

#     def compute_mask(self, inputs, mask=None):
#         mask = tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)
#         return mask


In [8]:
class MultiHeadAttention(keras.layers.Layer):
    
    def __init__(self,multiheads, head_dim,mask_right=False,**kwargs):
        self.multiheads = multiheads
        self.head_dim = head_dim
        self.output_dim = multiheads * head_dim
        self.mask_right = mask_right
        super(MultiHeadAttention, self).__init__(**kwargs)
        
    def compute_output_shape(self,input_shape):
        return (input_shape[0][0],input_shape[0][1],self.output_dim) #shape=[batch_size,Q_sequence_length,self.multiheads*self.head_dim]

    def build(self,input_shape):
        self.WQ = self.add_weight(name='WQ', 
                                  shape=(input_shape[0][-1].value, self.output_dim),#input_shape[0] -> Q_seq
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WK = self.add_weight(name='WK', 
                                  shape=(input_shape[1][-1].value, self.output_dim),#input_shape[1] -> K_seq
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WV = self.add_weight(name='WV', 
                                  shape=(input_shape[2][-1].value, self.output_dim),#input_shape[2] -> V_seq
                                  initializer='glorot_uniform',
                                  trainable=True)
        super(MultiHeadAttention, self).build(input_shape)
    
    def Mask(self,inputs,seq_len,mode='add'):
        if seq_len == None:
            return inputs
        else:
            mask = K.one_hot(indices=seq_len[:,0],num_classes=K.shape(inputs)[1])#mask.shape=[batch_size,short_sequence_length],mask=[[0,0,0,0,1,0,0,..],[0,1,0,0,0,0,0...]...]
            mask = 1 - K.cumsum(mask,axis=1)#mask.shape=[batch_size,short_sequence_length],mask=[[1,1,1,1,0,0,0,...],[1,0,0,0,0,0,0,...]...]
            for _ in range(len(inputs.shape)-2):
                mask = K.expand_dims(mask, 2)
            if mode == 'mul':
                return inputs * mask
            elif mode == 'add':
                return inputs - (1 - mask) * 1e12
    
    def call(self,QKVs):

        if len(QKVs) == 3:
            Q_seq,K_seq,V_seq = QKVs
            Q_len,V_len = None,None
        elif len(QKVs) == 5:
            Q_seq,K_seq,V_seq,Q_len,V_len = QKVs
 
        Q_seq = K.dot(Q_seq,self.WQ)#Q_seq.shape=[batch_size,Q_sequence_length,self.output_dim]=[batch_size,Q_sequence_length,self.multiheads*self.head_dim] 
        Q_seq = K.reshape(Q_seq,shape=(-1,K.shape(Q_seq)[1],self.multiheads,self.head_dim))#Q_seq.shape=[batch_size,Q_sequence_length,self.multiheads,self.head_dim]
        Q_seq = K.permute_dimensions(Q_seq,pattern=(0,2,1,3))#Q_seq.shape=[batch_size,self.multiheads,Q_sequence_length,self.head_dim]

        K_seq = K.dot(K_seq,self.WK)
        K_seq = K.reshape(K_seq,shape=(-1,K.shape(K_seq)[1],self.multiheads,self.head_dim))
        K_seq = K.permute_dimensions(K_seq,pattern=(0,2,1,3))

        V_seq = K.dot(V_seq,self.WV)
        V_seq = K.reshape(V_seq,shape=(-1,K.shape(V_seq)[1],self.multiheads,self.head_dim))
        V_seq = K.permute_dimensions(V_seq,pattern=(0,2,1,3))

        A = K.batch_dot(Q_seq,K_seq,axes=[3,3])/K.sqrt(K.cast(self.head_dim,dtype='float32'))#A.shape=[batch_size,self.multiheads,Q_sequence_length,K_sequence_length]
        A = K.permute_dimensions(A,pattern=(0,3,2,1))#A.shape=[batch_size,K_sequence_length,Q_sequence_length,self.multiheads]

        A = self.Mask(A,V_len,'add')
        A = K.permute_dimensions(A,pattern=(0,3,2,1))#A.shape=[batch_size,self.multiheads,Q_sequence_length,K_sequence_length]
        
        if self.mask_right:
            ones = K.ones_like(A[:1,:1])
            lower_triangular = K.tf.matrix_band_part(ones,num_lower=-1,num_upper=0) 
            mask = (ones - lower_triangular) * 1e12 
            A = A - mask #Element-wise subtract，A.shape=[batch_size,self.multiheads,Q_sequence_length,K_sequence_length]
        A = K.softmax(A) #A.shape=[batch_size,self.multiheads,Q_sequence_length,K_sequence_length]
        #V_seq.shape=[batch_size,V_sequence_length,V_embedding_dim]
        O_seq = K.batch_dot(A,V_seq,axes=[3,2])#O_seq.shape=[batch_size,self.multiheads,Q_sequence_length,V_sequence_length]
        O_seq = K.permute_dimensions(O_seq,pattern=(0,2,1,3))#O_seq.shape=[batch_size,Q_sequence_length,self.multiheads,V_sequence_length]
        O_seq = K.reshape(O_seq,shape=(-1,K.shape(O_seq)[1],self.output_dim))#O_seq.shape=[,Q_sequence_length,self.multiheads*self.head_dim]
        O_seq = self.Mask(O_seq,Q_len,'mul')
        return O_seq

In [9]:
class TransformerEncoder(keras.layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = MultiHeadAttention(
            num_heads, embed_dim
        )
        self.dense_proj = keras.Sequential(
            [keras.layers.Dense(dense_dim, activation='relu'), keras.layers.Dense(embed_dim),]
        )
#         self.layernorm_1 = keras.layers.LayerNormalization()
#         self.layernorm_2 = keras.layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]

        attention_output = self.attention([inputs, inputs, inputs])
        proj_input = keras.layers.BatchNormalization()(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return keras.layers.BatchNormalization()(proj_input + proj_output)

In [10]:
def Transformer(
        input_shapes,
        output_size,
        feature_extractor,
        hidden_state_num = 2,
        hidden_state_size = (16, 8),
        *args,
        **kwargs):
    sequence_length = MAX_SEQ_LENGTH
    embed_dim = 256
    dense_dim = 4
    num_heads = 1
    video = keras.layers.Input(shape=input_shapes,name='video_input')
    encoded_frame = keras.layers.TimeDistributed(keras.layers.Lambda(lambda x: feature_extractor(x)))(video)
    
    x = PositionalEmbedding(
        sequence_length, embed_dim, name="frame_position_embedding"
    )(encoded_frame)
    x = TransformerEncoder(embed_dim, dense_dim, num_heads, name="transformer_layer")(x)
    x = keras.layers.GlobalMaxPooling1D()(x)
    x = keras.layers.Dropout(0.5)(x)
        
    # encoded_vid = keras.layers.Dense(8, activation='relu')(encoded_vid)
    outputs = keras.layers.Dense(output_size, activation='softmax')(x)
    
    model = keras.models.Model(inputs=[video],outputs=outputs)
    
    return model

In [11]:
model = Transformer((None, 6912), 5, feature_extractor)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [12]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
video_input (InputLayer)     (None, None, 6912)        0         
_________________________________________________________________
time_distributed (TimeDistri (None, None, 256)         0         
_________________________________________________________________
frame_position_embedding (Po (None, None, 256)         5120      
_________________________________________________________________
transformer_layer (Transform (None, None, 256)         198916    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 1285      
Total para

In [13]:
def label_processor(labels, labels_class):
    new_labels = np.zeros(labels.shape)
    for i in range(labels.shape[0]):
        index = labels_class.index(labels[i])
        new_labels[i] = index
        
    return new_labels

def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels, LABELS_CLASS)
    labels = keras.utils.to_categorical(labels, NUM_CLASSES)
    
    video_batch = np.zeros((num_samples, MAX_SEQ_LENGTH, 6912), dtype="float32")

    # For each video.
    
    for idx, path in enumerate(video_paths):
        # Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir, path))
        frames = frames[None, ...]
        
        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            select_frame = np.linspace(0, video_length-1, MAX_SEQ_LENGTH,endpoint=True,retstep=True,dtype=int)[0]
            # length = min(MAX_SEQ_LENGTH, video_length)
            
            video_batch[idx] = batch[select_frame].reshape(20, 6912).astype('float32') / 255

    return video_batch, labels

In [14]:
train_data, train_labels = prepare_all_videos(train_df, "C:/nyu/DRL/final_project/dataset/UCF101/train")
print(f"Frame features in train set: {train_data[0].shape}")

Frame features in train set: (20, 6912)


In [15]:
model.compile(
    loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"]
)

In [16]:
model.fit(train_data, train_labels, shuffle=True,
      batch_size=50, epochs=30, validation_split=0.1,
      verbose=1)

Train on 534 samples, validate on 60 samples
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1d63f691308>