In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install loguru >> stdout.txt

In [68]:
from rich.traceback import install; install()
from pathlib import Path 
import shutil
import pandas as pd 
import numpy as np 
import tensorflow as tf 
import tensorflow.keras.layers as layers
from sklearn.model_selection import train_test_split
import cv2
import random
from loguru import logger

In [6]:
from pathlib import Path 

# Root of the mounted Drive and the UCF-101 directory below it.
drive_dir = Path('/content/drive/MyDrive')
ucf_dir = Path(drive_dir, "UCF101/UCF-101")

In [42]:
# Table of training videos; later cells read its `action`, `path` and
# `class_idx` columns.
train_csv = Path(drive_dir, "UCF101/train_paths.csv")
train_df = pd.read_csv(train_csv)

In [51]:
# Route loguru output to a log file on Drive.
log_path = drive_dir / "UCF101/logs"
# parents=True so the cell also works when the parent folder does not exist yet.
log_path.mkdir(parents=True, exist_ok=True)
# Drop the handlers registered so far, so re-running this cell does not
# attach the same file sink twice.
logger.remove()
logger.add(str(log_path / "first_frame_capture.log"), level='DEBUG')

1

In [24]:
def format_frame(frame, output_shape):
    """Scale a frame to [0, 1] and letterbox it onto a black canvas.

    The frame is resized with its aspect ratio preserved so that it fits
    inside ``output_shape`` and is then centered on a zero-filled canvas.

    Args:
        frame: Image array of shape (height, width, 3); its values are
            divided by 255.
        output_shape: ``(target_height, target_width)`` of the result.

    Returns:
        ``np.float32`` array of shape ``(target_height, target_width, 3)``.
    """
    target_h, target_w = output_shape
    frame = frame.astype(np.float32) / 255.0

    h, w = frame.shape[:2]
    # Largest scale at which the resized frame still fits inside the target.
    scale = min(target_h / h, target_w / w)

    # Round instead of truncating so floating-point error cannot shave a pixel
    # off the side that is meant to fill the canvas, and clamp to [1, target]
    # so extreme aspect ratios never ask cv2.resize for a zero-sized image.
    new_h = min(target_h, max(1, int(round(h * scale))))
    new_w = min(target_w, max(1, int(round(w * scale))))

    # cv2.resize takes the size as (width, height)
    resized = cv2.resize(frame, (new_w, new_h))

    # Split the leftover space evenly; an odd remainder goes to the bottom/right.
    pad_h = (target_h - new_h) // 2
    pad_w = (target_w - new_w) // 2

    # float32 canvas: np.zeros defaults to float64, which would silently
    # double the memory of every clip built from these frames.
    new_frame = np.zeros((target_h, target_w, 3), dtype=np.float32)
    new_frame[pad_h:pad_h + new_h, pad_w:pad_w + new_w] = resized

    return new_frame

def capture_frames(video_path, frame_step=4, n_frames=32, output_size=(224, 224), mode='train'):
    """Sample ``n_frames`` frames, ``frame_step`` apart, from a video file.

    Args:
        video_path: Path of the video; converted with ``str`` before opening.
        frame_step: Number of frames read between two kept frames.
        n_frames: Number of frames in the returned clip.
        output_size: ``(height, width)`` passed to ``format_frame``.
        mode: With ``'train'`` the start frame is drawn at random from the
            positions that leave room for a full clip; any other value
            starts at frame 0.

    Returns:
        ``np.float32`` array of shape ``(n_frames, height, width, 3)`` in RGB
        channel order, or ``None`` when the video cannot be opened or its
        first frame cannot be read. Positions past the end of the video are
        filled with all-zero frames.
    """
    video_path = str(video_path)
    video = cv2.VideoCapture(video_path)  # open the video

    # try/finally releases the capture handle on every exit path, including
    # the early returns and any exception raised while formatting a frame.
    try:
        if not video.isOpened():
            logger.warning(f'Failed to open video for {video_path}')
            return None

        # cv2 reports the frame count as a float
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        required_frames = 1 + (n_frames - 1) * frame_step

        # A random start is only possible when the video is long enough for a
        # full clip; validation/inference always starts at the first frame.
        max_start = total_frames - required_frames
        if max_start >= 0 and mode == 'train':
            start = random.randint(0, max_start)  # both ends inclusive
        else:
            start = 0

        # move the read pointer to the start frame
        video.set(cv2.CAP_PROP_POS_FRAMES, start)

        # ret says whether a frame was read; frame is a (height, width, 3)
        # BGR array.
        ret, frame = video.read()
        if not ret:
            logger.warning(f'Failed to capture first frame from {video_path}')
            return None

        result = [format_frame(frame, output_size)]
        logger.info(f'First frame captured from {video_path}')

        for _ in range(n_frames - 1):
            # read frame_step frames and keep only the last one
            for _ in range(frame_step):
                ret, frame = video.read()
            if ret:
                result.append(format_frame(frame, output_size))
            else:
                # ran past the end of the video: pad with a black frame
                result.append(np.zeros_like(result[0]))
    finally:
        video.release()

    # Stack as float32 (the dtype the tf.data pipeline declares), then reorder
    # the last axis with [2, 1, 0] to turn BGR into RGB.
    result_arr = np.asarray(result, dtype=np.float32)[..., [2, 1, 0]]

    return result_arr

In [43]:
# Keep only the rows whose action is "JavelinThrow".
is_javelin = train_df["action"] == "JavelinThrow"
jav_df = train_df[is_javelin]

In [77]:
# Clip geometry; these mirror capture_frames' defaults and are passed to it
# explicitly so the static shapes below cannot drift from the decoded clips.
CLIP_FRAMES = 32
CLIP_SIZE = (224, 224)
CLIP_SHAPE = (CLIP_FRAMES, *CLIP_SIZE, 3)
batch_size = 32


def _load_clip(path, label):
    """Decode one video inside tf.py_function and flag unreadable ones."""
    frames = capture_frames(path.numpy().decode('utf-8'), n_frames=CLIP_FRAMES, output_size=CLIP_SIZE)
    readable = frames is not None
    if not readable:
        # Placeholder so py_function still returns a tensor; the `readable`
        # flag lets the pipeline drop this element in the filter step.
        frames = np.zeros(CLIP_SHAPE, dtype=np.float32)
    return np.asarray(frames, dtype=np.float32), label, readable


def _restore_shapes(frames, label, readable):
    """Reattach the static shapes that tf.py_function outputs lose."""
    return (
        tf.ensure_shape(frames, CLIP_SHAPE),
        tf.ensure_shape(label, ()),
        tf.ensure_shape(readable, ()),
    )


dataset = tf.data.Dataset.from_tensor_slices((jav_df.path.values, jav_df.class_idx.values))
dataset = (
    dataset
    # Shuffle the cheap (path, label) pairs before decoding and batching;
    # shuffling after batch() would only reorder whole batches.
    .shuffle(max(len(jav_df), 1))
    .map(lambda path, label: tf.py_function(_load_clip, [path, label], [tf.float32, tf.int64, tf.bool]))
    .map(_restore_shapes)
    .filter(lambda frames, label, readable: readable)
    .map(lambda frames, label, readable: (frames, label))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [118]:
# Pull one batch from the pipeline; x is the (frames, labels) pair that the
# dataset yields.
x = next(iter(dataset))

In [86]:
def build_cnn_backbone(input_shape=(224, 224, 3)):
    """Build the per-frame convolutional feature extractor.

    Ten 3x3 convolutions (two each with 32, 64, 128, 256 and 512 filters),
    each followed by ReLU and batch normalization, with 2x2 max pooling
    after every second convolution and global average pooling at the end.

    Args:
        input_shape: ``(height, width, channels)`` of one frame. The five
            pooling stages halve height and width each time, so both should
            be at least 32.

    Returns:
        A ``tf.keras.Model`` mapping a batch of frames to one 512-dimensional
        feature vector per frame.
    """
    input_layer = layers.Input(shape=input_shape)

    x = input_layer  # running tensor that each layer below transforms

    # two layers per width: 32, 32, 64, 64, 128, 128, 256, 256, 512, 512
    filters = [2**(5 + i) for i in range(5) for _ in range(2)]

    for i, filter_size in enumerate(filters):
        x = layers.Conv2D(filters=filter_size, kernel_size=(3, 3), padding='same')(x)
        x = layers.ReLU()(x)
        x = layers.BatchNormalization()(x)

        # pool at indices 1, 3, 5, 7, 9, i.e. after the second layer of each
        # filter width
        if i % 2 != 0:
            x = layers.MaxPooling2D((2, 2))(x)

    # Global average pooling averages each feature map over its spatial
    # extent, so the output is one value per channel.
    x = layers.GlobalAveragePooling2D()(x)

    model = tf.keras.Model(inputs=input_layer, outputs=x)

    return model

In [87]:
# Instantiate the per-frame CNN feature extractor.
cnn_backbone = build_cnn_backbone()

In [88]:
def build_cnn_lstm_model(cnn_backbone, num_classes=101, n_frames=32):
    """Build a clip classifier: per-frame CNN features fed to an LSTM.

    Args:
        cnn_backbone: Keras model applied to every frame; its input shape
            (without the batch axis) defines the frame shape of the clip.
        num_classes: Width of the final Dense layer.
        n_frames: Number of frames per input clip.

    Returns:
        A ``tf.keras.Model`` taking clips of shape ``(n_frames, *frame_shape)``
        and returning ``num_classes`` raw scores per clip. The last Dense
        layer has no activation, so the outputs are logits, not probabilities.
    """
    # Take the frame shape from the backbone so the two models cannot disagree.
    frame_shape = tuple(cnn_backbone.input_shape[1:])
    input_layer = layers.Input(shape=(n_frames, *frame_shape))

    # run the backbone independently on each time step
    x = layers.TimeDistributed(cnn_backbone)(input_layer)

    # summarize the frame sequence; only the final LSTM state is kept
    x = layers.LSTM(256, return_sequences=False)(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dense(num_classes)(x)

    model = tf.keras.Model(inputs=input_layer, outputs=x)

    return model

In [125]:
# The labels fed to the model are the integer `class_idx` values, so the output
# layer needs one unit per possible index; a single unit cannot represent them.
# NOTE(review): assumes class_idx is a 0-based integer index - confirm.
num_classes = int(train_df.class_idx.max()) + 1
model = build_cnn_lstm_model(cnn_backbone, num_classes=num_classes)

In [126]:
# The dataset yields integer class indices and the model's last Dense layer has
# no activation, so the loss must be the sparse variant computed from logits.
# 'categorical_crossentropy' expects one-hot targets and probabilities.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

In [127]:
# Print the model's layers, output shapes and parameter counts.
model.summary()

In [128]:
# x holds one dataset batch of clips. With Keras' default batch_size of 32 for
# in-memory inputs, every clip in it would go through the network in a single
# step: 32 clips x 32 frames = 1024 frames at 224x224 through the CNN at once.
# A small explicit batch size keeps the activations within memory.
model.fit(x[0], x[1], batch_size=2)

: 

: 

: 