In [1]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

from rl_with_videos.preprocessors.convnet import convnet_preprocessor

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Use UCF101 dataset as examples

In [2]:
IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 10

MAX_SEQ_LENGTH = 20
NUM_FEATURES = 2048

TRAINING_FILE = "C:/nyu/DRL/final_project/dataset/UCF101/train.csv"
TESTING_FILE = "C:/nyu/DRL/final_project/dataset/UCF101/test.csv"

LABELS_CLASS = ['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']
NUM_CLASSES = 5

In [3]:
train_df = pd.read_csv(TRAINING_FILE)
test_df = pd.read_csv(TESTING_FILE)

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

train_df.sample(10)

Total videos for training: 594
Total videos for testing: 224


Unnamed: 0,video_name,tag
295,v_Punch_g16_c04.avi,Punch
180,v_PlayingCello_g17_c04.avi,PlayingCello
383,v_ShavingBeard_g11_c04.avi,ShavingBeard
203,v_PlayingCello_g20_c06.avi,PlayingCello
441,v_ShavingBeard_g20_c06.avi,ShavingBeard
573,v_TennisSwing_g22_c04.avi,TennisSwing
4,v_CricketShot_g08_c05.avi,CricketShot
221,v_PlayingCello_g23_c05.avi,PlayingCello
235,v_PlayingCello_g25_c05.avi,PlayingCello
85,v_CricketShot_g20_c07.avi,CricketShot


In [4]:
def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

In [5]:
# def build_feature_extractor():
#     feature_extractor = keras.applications.InceptionV3(
#         weights="imagenet",
#         include_top=False,
#         pooling="avg",
#         input_shape=(IMG_SIZE, IMG_SIZE, 3),
#     )
#     preprocess_input = keras.applications.inception_v3.preprocess_input

#     inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
#     preprocessed = tf.keras.layers.Lambda(
#         lambda x: preprocess_input(x))(inputs)

#     outputs = feature_extractor(preprocessed)
#     return keras.Model(inputs, outputs, name="feature_extractor")

In [6]:
feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
)
feature_extractor.trainable = True

Instructions for updating:
Colocations handled automatically by placer.


In [7]:
# feature_extractor = build_feature_extractor()
# feature_extractor.trainable = True

In [8]:
video = keras.layers.Input(shape=(None,224,224,3),name='video_input')
encoded_frame = keras.layers.TimeDistributed(keras.layers.Lambda(lambda x: feature_extractor(x)))(video)
encoded_vid = keras.layers.LSTM(16, return_sequences=True)(encoded_frame)
encoded_vid = keras.layers.LSTM(8, return_sequences=False)(encoded_vid)
encoded_vid = keras.layers.Dense(8, activation='relu')(encoded_vid)
outputs = keras.layers.Dense(5, activation='softmax')(encoded_vid)
model = keras.models.Model(inputs=[video],outputs=outputs)

In [9]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
video_input (InputLayer)     (None, None, 224, 224, 3) 0         
_________________________________________________________________
time_distributed (TimeDistri (None, None, 2048)        0         
_________________________________________________________________
lstm (LSTM)                  (None, None, 16)          132160    
_________________________________________________________________
lstm_1 (LSTM)                (None, 8)                 800       
_________________________________________________________________
dense (Dense)                (None, 8)                 72        
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 45        
Total params: 133,077
Trainable params: 133,077
Non-trainable params: 0
_________________________________________________________________


In [10]:
def label_processor(labels, labels_class):
    new_labels = np.zeros(labels.shape)
    for i in range(labels.shape[0]):
        index = labels_class.index(labels[i])
        new_labels[i] = index
        
    return new_labels

In [11]:
def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels, LABELS_CLASS)
    labels = keras.utils.to_categorical(labels, NUM_CLASSES)
    
    video_batch = np.zeros((num_samples, MAX_SEQ_LENGTH, 224, 224, 3), dtype="float32")

    # For each video.
    
    for idx, path in enumerate(video_paths):
        # Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir, path))
        frames = frames[None, ...]
        
        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            select_frame = np.linspace(0, video_length-1, MAX_SEQ_LENGTH,endpoint=True,retstep=True,dtype=int)[0]
            # length = min(MAX_SEQ_LENGTH, video_length)
            
            video_batch[idx] = batch[select_frame].astype('float32') / 255

    return video_batch, labels

In [12]:
train_data, train_labels = prepare_all_videos(train_df, "C:/nyu/DRL/final_project/dataset/UCF101/train")
print(f"Frame features in train set: {train_data[0].shape}")

Frame features in train set: (20, 224, 224, 3)


In [13]:
# test_data, test_labels = prepare_all_videos(test_df, "C:/nyu/DRL/final_project/dataset/UCF101/test")

In [14]:
model.compile(
    loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"]
)

In [None]:
model.fit(train_data, train_labels, shuffle=True,
      batch_size=10, epochs=20, validation_split=0.15,
      verbose=1)

Train on 504 samples, validate on 90 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20