# Video Classification with a CNN-RNN Architecture


In [7]:
# !wget -q https://git.io/JGc31 -O ucf101_top5.tar.gz
# !tar xf ucf101_top5.tar.gz

from google.colab import drive
drive.mount('/content/drive')
!pip install -q git+https://github.com/tensorflow/docs

# !pip install -q py-feat
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths
from google.colab.patches import cv2_imshow
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os
import glob

IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 100
MAX_SEQ_LENGTH = 300 #20
NUM_FEATURES = 2048

# ls 'drive/MyDrive/seqimg_youtube/unzip/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# ls 'drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/cropped_v1/'

In [9]:
# ls 'drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/'
train_df = pd.read_csv("drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/labels_5classes.csv")
# test_df = pd.read_csv("drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/test_df_5class.csv")
print(f"Total videos for training: {len(train_df)}")
# print(f"Total videos for testing: {len(test_df)}")
# A = glob.glob('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/cropped_v1/*.mp4')
# len(A)

Total videos for training: 1394


In [10]:
# def crop_center_square(frame):
#     # cropped = cv2.resize(frame, (224, 224), interpolation = cv2.INTER_AREA)
#     faceimg = detector.detect_faces(frame)
#     try:
#       pos = faceimg[0][:4]
#       cropimg =frame[int(pos[1]):int(pos[3]),int(pos[0]):int(pos[2])] 
#       cropped = cv2.resize(cropimg, (224, 224), interpolation=cv2.INTER_CUBIC)
#     except:
#       cropped = frame
#       cropped = cv2.resize(cropped, (224, 224), interpolation = cv2.INTER_AREA)
#       print('error')    
#     return cropped

def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)
            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(weights="imagenet",include_top=False,pooling="avg",input_shape=(IMG_SIZE, IMG_SIZE, 3),)
    preprocess_input = keras.applications.inception_v3.preprocess_input
    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")

feature_extractor = build_feature_extractor()
label_processor = keras.layers.experimental.preprocessing.StringLookup(num_oov_indices=0, vocabulary=np.unique(train_df["tag"]))
print(label_processor.get_vocabulary())
print(pd.unique(train_df['tag']))

['', 'angry', 'happy', 'neutral', 'relax', 'sad']
['neutral' 'relax' 'angry' 'happy' 'sad']


In [None]:
def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels[..., None]).numpy()
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for idx, path in tqdm(enumerate(video_paths)):
        print('idx', idx, 'path', path)
        frames = load_video(os.path.join(root_dir, path))
        frames = frames[None, ...]
        temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
        temp_frame_featutes = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")
        # print('frames', frames.shape)

        for i, batch in enumerate(frames):
            # print('batch', batch.shape)
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            # print('video_length', video_length)
            # print('###########')
            
            for j in range(length):
                # print('j =', j, 'batch[None, j, :] =', batch[None, j, :].shape)
                temp_frame_featutes[i, j, :] = feature_extractor.predict(batch[None, j, :])
            temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

        frame_features[idx,] = temp_frame_featutes.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()
        # np.save('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/video_classification_croped/frame_features_crop'+str(idx+1079)+'.npy', frame_features)
        # np.save('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/video_classification_croped/frame_masks_crop'+str(idx+1079)+'.npy', frame_masks)
        # np.save('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/video_classification_croped/labels_crop'+str(idx+1079)+'.npy', labels)
    return (frame_features, frame_masks), labels

train_data, train_labels = prepare_all_videos(train_df, "drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/cropped_v1/")
# test_data, test_labels = prepare_all_videos(test_df, "drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/videos/test")
# print(f"Frame features in train set: {train_data[0].shape}")
# print(f"Frame masks in train set: {train_data[1].shape}")

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

idx 0 path face2_01_0_59_3296_neutral.mp4
idx 1 path face2_01_1_3356_4075_relax.mp4
idx 2 path face2_01_2_4135_5034_neutral.mp4
idx 3 path face2_01_3_5514_5754_stress.mp4
idx 4 path face2_01_4_5814_6053_stress.mp4
idx 5 path face2_01_5_6053_6233_happy.mp4
idx 6 path face2_01_6_6293_7012_neutral.mp4
idx 7 path face2_01_7_7072_7252_stress.mp4
idx 8 path face2_01_8_7312_7672_neutral.mp4
idx 9 path face2_01_9_7732_8031_stress.mp4
idx 10 path face2_01_10_8091_8451_happy.mp4
idx 11 path face2_01_11_8451_8871_happy.mp4
idx 12 path face2_01_12_9050_9230_happy.mp4
idx 13 path face2_01_13_9410_9650_happy.mp4
idx 14 path face2_01_14_9710_10129_neutral.mp4
idx 15 path face2_01_15_11448_11568_stress.mp4
idx 16 path face2_01_16_11628_11928_neutral.mp4
idx 17 path face2_01_17_10909_12647_stress.mp4
idx 18 path face2_01_18_12707_12887_sad.mp4
idx 19 path face2_01_19_12947_14565_sad.mp4
idx 20 path face2_01_20_14625_14685_sad.mp4
idx 21 path face2_01_21_14745_14925_angry.mp4
idx 22 path face2_01_22_144

In [5]:
np.save('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/frame_features.npy', train_data[0])
np.save('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/frame_masks.npy', train_data[1])
np.save('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/labels.npy', train_labels)

In [6]:
np.save('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/frame_features.npy', train_data[0])
np.save('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/frame_masks.npy', train_data[1])
np.save('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/labels.npy', train_labels)

In [None]:
#Train
frame_features  = np.load("drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/videos/video_classification_croped/frame_features_total.npy")
frame_masks  = np.load("drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/videos/video_classification_croped/frame_masks_total.npy")
train_data = (frame_features, frame_masks)
train_labels = np.load('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/videos/video_classification_croped/labels0.npy')
#Test
test_data =(frame_features, frame_masks)
test_labels = np.load('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/videos/video_classification_croped/labels0.npy')

In [None]:
# ls 'drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/videos/video_classification_croped/frame_features196.npy'
# for i in range(A.shape[0]): #278, 783  
#   A = np.load('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/videos/video_classification_croped/frame_features'+str(i)+'.npy')
#   print('i=' , i, ' value = ' , A[i-1].sum())
# X1 = np.load('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/videos/video_classification_croped/labels277.npy')
# print('X1', X1.shape)
# X2 = np.load('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/videos/video_classification_croped/labels782.npy')
# print('X2', X2.shape)
# X3 = np.load('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/videos/video_classification_croped/labels1193.npy')
# print('X3', X3.shape)
# X4 = np.load('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/videos/video_classification_croped/labels783.npy')[783:784]
# print(X4.shape)

# print(X2[783].shape)
# X2[783] = X4
# print(X2[783] .shape)
# X1_new = X1[:278]
# X2_new = X2[278:783]
# X3_new = X3[783:]
# print('total', X1_new.shape[0]+X2_new.shape[0]+X3_new.shape[0])
# print('X1_new', X1_new.shape)
# print('X2_new', X2_new.shape)
# print('X3_new', X3_new.shape)

# np.load('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/videos/video_classification_croped/labels0.npy').shape
# Xtotal = np.concatenate((X1_new,X2_new,X3_new),axis=0)
# np.save('drive/MyDrive/AIHealthcare/AIcare_Phrase1/data/season2/file_v1/csv_frame/trainv1/videos/video_classification_croped/label_total.npy', X1)
# print('Xtotal', Xtotal.shape)

In [None]:
def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()
    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")
    x = keras.layers.GRU(16, return_sequences=True)(frame_features_input, mask=mask_input)
    x = keras.layers.GRU(8)(x)
    x = keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(8, activation="relu")(x)
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)
    rnn_model = keras.Model([frame_features_input, mask_input], output)
    rnn_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return rnn_model

def run_experiment():
    filepath = "/tmp/video_classifier/"
    checkpoint = keras.callbacks.ModelCheckpoint(filepath, save_weights_only=True, save_best_only=True, verbose=1)
    seq_model = get_sequence_model()
    history = seq_model.fit( [train_data[0], train_data[1]], train_labels, validation_split=0.3, epochs=EPOCHS, callbacks=[checkpoint],)
    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    return history, seq_model

# _, sequence_model = run_experiment()

In [None]:
num_folds = 10
acc_per_fold = []
loss_per_fold = []
fold_no = 1

kfold = KFold(n_splits=num_folds, shuffle=True)
for train, test in kfold.split(train_data[0], train_labels):
    seq_model = get_sequence_model()
    history = seq_model.fit( [train_data[0][train], train_data[1][train]], train_labels[train], epochs=EPOCHS)
    _, accuracy = seq_model.evaluate([train_data[0][test], train_data[1][test]], train_labels[test])
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    acc_per_fold.append(round(accuracy * 100, 2))
    print(f'Training for fold {fold_no} ...')
    fold_no = fold_no + 1

In [None]:
def prepare_single_video(frames):
    frames = frames[None, ...]
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_featutes = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[1]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_featutes[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked
    return frame_featutes, frame_mask

def sequence_prediction(path):
    class_vocab = label_processor.get_vocabulary()
    frames = load_video(os.path.join("test", path))
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]
    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames

def to_gif(images):
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return embed.embed_file("animation.gif")

test_video = np.random.choice(test_df["video_name"].values.tolist())
print(f"Test video path: {test_video}")
test_frames = sequence_prediction(test_video)
to_gif(test_frames[:MAX_SEQ_LENGTH])

## Next steps

* In this example, we made use of transfer learning for extracting meaningful features
from video frames. You could also fine-tune the pre-trained network to notice how that
affects the end results.
* For speed-accuracy trade-offs, you can try out other models present inside
`tf.keras.applications`.
* Try different combinations of `MAX_SEQ_LENGTH` to observe how that affects the
performance.
* Train on a higher number of classes and see if you are able to get good performance.
* Following [this tutorial](https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub), try a
[pre-trained action recognition model](https://arxiv.org/abs/1705.07750) from DeepMind.
* Rolling-averaging can be useful technique for video classification and it can be
combined with a standard image classification model to infer on videos.
[This tutorial](https://www.pyimagesearch.com/2019/07/15/video-classification-with-keras-and-deep-learning/)
will help understand how to use rolling-averaging with an image classifier.
* When there are variations in between the frames of a video not all the frames might be
equally important to decide its category. In those situations, putting a
[self-attention layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Attention) in the
sequence model will likely yield better results.
* Following [this book chapter](https://livebook.manning.com/book/deep-learning-with-python-second-edition/chapter-11),
you can implement Transformers-based models for processing videos.