In [1]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

from rl_with_videos.preprocessors.convnet import convnet_preprocessor

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
IMG_SIZE = 48
BATCH_SIZE = 64
EPOCHS = 10

MAX_SEQ_LENGTH = 20
NUM_FEATURES = 2048

TRAINING_FILE = "C:/nyu/DRL/final_project/dataset/UCF101/train.csv"
TESTING_FILE = "C:/nyu/DRL/final_project/dataset/UCF101/test.csv"

LABELS_CLASS = ['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']
NUM_CLASSES = 5

In [3]:
train_df = pd.read_csv(TRAINING_FILE)
test_df = pd.read_csv(TESTING_FILE)

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

train_df.sample(10)

Total videos for training: 594
Total videos for testing: 224


Unnamed: 0,video_name,tag
294,v_Punch_g16_c03.avi,Punch
343,v_Punch_g23_c06.avi,Punch
275,v_Punch_g13_c04.avi,Punch
82,v_CricketShot_g20_c04.avi,CricketShot
257,v_Punch_g10_c06.avi,Punch
282,v_Punch_g14_c04.avi,Punch
208,v_PlayingCello_g21_c04.avi,PlayingCello
550,v_TennisSwing_g18_c05.avi,TennisSwing
540,v_TennisSwing_g17_c02.avi,TennisSwing
476,v_ShavingBeard_g25_c06.avi,ShavingBeard


In [4]:
def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

In [5]:
feature_extractor = convnet_preprocessor([(6912,)], (48,48,3), 256)
feature_extractor.trainable = True

kwargs: {}
Instructions for updating:
Colocations handled automatically by placer.
kwargs: {}
name: feedforward_model
inputs: [<tf.Tensor 'input_2:0' shape=(?, 4608) dtype=float32>]


In [6]:
feature_extractor.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 6912)         0                                            
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 6912)         0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               [(None, 6912), (None 0           lambda[0][0]                     
__________________________________________________________________________________________________
reshape (Reshape)               (None, 48, 48, 3)    0           lambda_1[0][0]                   
__________________________________________________________________________________________________
conv2d (Co

In [9]:
def LRCNs(
        input_shapes,
        output_size,
        feature_extractor,
        hidden_state_num = 2,
        hidden_state_size = (16, 8),
        *args,
        **kwargs):
    video = keras.layers.Input(shape=input_shapes,name='video_input')
    encoded_frame = keras.layers.TimeDistributed(keras.layers.Lambda(lambda x: feature_extractor(x)))(video)
    
    for i in range(0, hidden_state_num - 1):
        encoded_frame = keras.layers.LSTM(hidden_state_size[i], return_sequences=True)(encoded_frame)
        
    encoded_vid = keras.layers.LSTM(hidden_state_size[hidden_state_num-1], return_sequences=False)(encoded_frame)
        
    # encoded_vid = keras.layers.Dense(8, activation='relu')(encoded_vid)
    outputs = keras.layers.Dense(output_size, activation='softmax')(encoded_vid)
    
    model = keras.models.Model(inputs=[video],outputs=outputs)
    
    return model

In [10]:
model = LRCNs((None, 6912), 5, feature_extractor)

In [11]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
video_input (InputLayer)     (None, None, 6912)        0         
_________________________________________________________________
time_distributed (TimeDistri (None, None, 256)         0         
_________________________________________________________________
lstm (LSTM)                  (None, None, 16)          17472     
_________________________________________________________________
lstm_1 (LSTM)                (None, 8)                 800       
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 45        
Total params: 18,317
Trainable params: 18,317
Non-trainable params: 0
_________________________________________________________________


In [18]:
def label_processor(labels, labels_class):
    new_labels = np.zeros(labels.shape)
    for i in range(labels.shape[0]):
        index = labels_class.index(labels[i])
        new_labels[i] = index
        
    return new_labels

def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels, LABELS_CLASS)
    labels = keras.utils.to_categorical(labels, NUM_CLASSES)
    
    video_batch = np.zeros((num_samples, MAX_SEQ_LENGTH, 6912), dtype="float32")

    # For each video.
    
    for idx, path in enumerate(video_paths):
        # Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir, path))
        frames = frames[None, ...]
        
        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            select_frame = np.linspace(0, video_length-1, MAX_SEQ_LENGTH,endpoint=True,retstep=True,dtype=int)[0]
            # length = min(MAX_SEQ_LENGTH, video_length)
            
            video_batch[idx] = batch[select_frame].reshape(20, 6912).astype('float32') / 255

    return video_batch, labels

In [19]:
train_data, train_labels = prepare_all_videos(train_df, "C:/nyu/DRL/final_project/dataset/UCF101/train")
print(f"Frame features in train set: {train_data[0].shape}")

Frame features in train set: (20, 6912)


In [20]:
model.compile(
    loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"]
)

In [21]:
model.fit(train_data, train_labels, shuffle=True,
      batch_size=10, epochs=20,
      verbose=1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x25b793da988>