# Installing TensorFlow Docs

In [1]:
!pip install -q git+https://github.com/tensorflow/docs

In [2]:
pip install stn

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install googlenet-pytorch

Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.


# Imports

In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import imageio
import cv2
import os
import torch

from googlenet_pytorch import GoogLeNet
from PIL import Image
from tensorflow_docs.vis import embed
from tensorflow.keras import layers
from tensorflow import keras
from stn import spatial_transformer_network as Stransformer
from PIL import Image as im

# Define hyperparameters

In [2]:
IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 50

MAX_SEQ_LENGTH = 50
NUM_FEATURES = 2048

# Lookups 

In [4]:
tf.test.is_gpu_available(
    cuda_only=False, min_cuda_compute_capability=None
)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [5]:
print(tf.__version__)

2.5.0


In [3]:
tf.test.is_gpu_available(cuda_only=True)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

# Data preparation

In [3]:
train_df = pd.read_csv("exp_train.csv")
test_df = pd.read_csv("exp_test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

train_df.sample(10)

Total videos for training: 256
Total videos for testing: 47


Unnamed: 0,video_name,tag
112,explosions/videoplayback_2. - 041.mp4,explosions
206,Torturing and bulling/Fantasy Island (2020 ) -...,Torturing and bulling
182,Sport fights/v_Punch_g08_c01.avi,Sport fights
24,blood and gore/Top 10 Movie Massacres - 081.mp4,blood and gore
145,firearms/vlc-record-2018-08-06-23h25m23s-EPhAe...,firearms
177,Sport fights/fight_g21 (2).mp4,Sport fights
178,Sport fights/fight_g21 (4).mp4,Sport fights
190,Sport fights/v_SumoWrestling_g12_c03.avi,Sport fights
243,Violence fights/Nobody (2021) - Bus Fight Scen...,Violence fights
25,blood and gore/Top 10 Movie Massacres - 090.mp4,blood and gore


In [4]:
exp_validation_df = pd.read_csv("exp_validation.csv")

# Video frame Processinng Functions

In [5]:
center_crop_layer = layers.experimental.preprocessing.CenterCrop(IMG_SIZE, IMG_SIZE)


def crop_center(frame):
    cropped = center_crop_layer(frame[None, ...])
    cropped = cropped.numpy().squeeze()
    return cropped


# Following method is modified from this tutorial:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
def load_video(path, max_frames=0):
    
    cap = cv2.VideoCapture(path)
    
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center(frame)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

# Feature Extractor

In [6]:
def build_feature_extractor():
   
    feature_extractor = keras.applications.Xception(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    
    preprocess_input = keras.applications.xception.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()

In [56]:
def build_feature_extractor():
    feature_extractor = keras.applications.ResNet152V2(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.resnet_v2.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet152v2_weights_tf_dim_ordering_tf_kernels_notop.h5


In [58]:
feature_extractor.summary()

Model: "feature_extractor"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        [(None, 224, 224, 3)]     0         
_________________________________________________________________
tf.math.truediv_3 (TFOpLambd (None, 224, 224, 3)       0         
_________________________________________________________________
tf.math.subtract_3 (TFOpLamb (None, 224, 224, 3)       0         
_________________________________________________________________
resnet152v2 (Functional)     (None, 2048)              58331648  
Total params: 58,331,648
Trainable params: 58,187,904
Non-trainable params: 143,744
_________________________________________________________________


# Label preprocessing

In [8]:
label_processor = keras.layers.experimental.preprocessing.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"]), mask_token=None
)
print(label_processor.get_vocabulary())

['Sport fights', 'Torturing and bulling', 'Violence fights', 'blood and gore', 'carchase', 'coldarms', 'explosions', 'firearms', 'nonViolent']


In [10]:
def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    print(num_samples)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_features` are what we will feed to our sequence model.
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # For each video.
    for idx, path in enumerate(video_paths):
        # Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir, path))
        
        

        if len(frames) == 0 : 
            continue
        
        print(idx)
        
        frame_feature_maps = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")
        
        # Pad shorter videos.
        if len(frames) < MAX_SEQ_LENGTH:
            
            diff = MAX_SEQ_LENGTH - len(frames)
            padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
            frames = np.concatenate((frames, padding))
        
        frames = frames[None, ...]

        # Initialize placeholder to store the features of the current video.
        temp_frame_featutes = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[1]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                if np.mean(batch[j, :]) > 0.0:
                    temp_frame_featutes[i, j, :] = feature_extractor.predict(
                        batch[None, j, :]
                    )

                else:
                    temp_frame_featutes[i, j, :] = 0.0

        frame_features[idx,] = temp_frame_featutes.squeeze()
        

    return frame_features, labels

In [13]:
train_data, train_labels = prepare_all_videos(train_df, "experiment/train")
#test_data, test_labels = prepare_all_videos(test_df, "experiment/test")

print(f"Frame features in train set: {train_data[0].shape}")
#print(f"Frame masks in train set: {train_data[1].shape}")

256
0
1
2


KeyboardInterrupt: 

In [14]:
validation_data, validation_labels = prepare_all_videos(exp_validation_df, "experiment/val")

print(f"Frame features in train set: {validation_data[0].shape}")

71
0


KeyboardInterrupt: 

In [15]:
print(f"Frame features in train set: {train_data.shape}")

Frame features in train set: (256, 50, 2048)


In [16]:
print(f"Frame features in train set: {test_data.shape}")

Frame features in train set: (47, 50, 2048)


In [17]:
print(f"Frame features in train set: {train_data[0].shape}")

Frame features in train set: (50, 2048)


In [18]:
print(f"Frame features in train set: {validation_data.shape}")

Frame features in train set: (71, 50, 2048)


In [15]:
from numpy import save

train_data

save('exp_50_train_data.npy', train_data)
save('exp_50_train_labels.npy', train_labels)
save('exp_50_test_data.npy', test_data)
save('exp_50_test_labels.npy', test_labels)
save('exp_50_validation_data.npy', validation_data)
save('exp_50_validation_labels.npy', validation_labels)

NameError: name 'train_data' is not defined

In [9]:
train_data, train_labels = np.load("exp_50_train_data.npy"), np.load("exp_50_train_labels.npy")
test_data, test_labels = np.load("exp_50_test_data.npy"), np.load("exp_50_test_labels.npy")
validation_data, validation_labels = np.load("exp_50_validation_data.npy"), np.load("exp_50_validation_labels.npy")

In [10]:
print(f"Frame features in train set: {train_data.shape}")
print(f"Frame features in train set: {test_data.shape}")
print(f"Frame features in train set: {validation_data.shape}")

Frame features in train set: (256, 50, 2048)
Frame features in train set: (47, 50, 2048)
Frame features in train set: (71, 50, 2048)


In [11]:
print(f"Frame features in train set: {train_labels.shape}")
print(f"Frame features in train set: {test_labels.shape}")
print(f"Frame features in train set: {validation_labels.shape}")

Frame features in train set: (256, 1)
Frame features in train set: (47, 1)
Frame features in train set: (71, 1)


In [12]:
print(f"Frame features in train set: {train_data.shape[1]}")
print(f"Frame features in train set: {train_data.shape[2]}")

Frame features in train set: 50
Frame features in train set: 2048


In [None]:
>>> inputs = tf.random.normal([32, 10, 8])
>>> lstm = tf.keras.layers.LSTM(4)
>>> output = lstm(inputs)
>>> print(output.shape)
(32, 4)
>>> lstm = tf.keras.layers.LSTM(4, return_sequences=True, return_state=True)
>>> whole_seq_output, final_memory_state, final_carry_state = lstm(inputs)
>>> print(whole_seq_output.shape)
(32, 10, 4)
>>> print(final_memory_state.shape)
(32, 4)
>>> print(final_carry_state.shape)

In [48]:
inputs = train_data
lstm = layers.LSTM(256,return_sequences=True,input_shape=(MAX_SEQ_LENGTH,NUM_FEATURES))
output = lstm(inputs)
x = layers.Dense(6, activation='softmax')(output)
print(x.shape)

(256, 50, 6)


In [49]:
p = tf. make_tensor_proto(x)
x = tf.make_ndarray(p)
x = x.reshape(x.shape[0]*x.shape[1],2,3)
f = inputs.reshape(inputs.shape[0]*inputs.shape[1],inputs.shape[2])

print(x)
print(x.shape)
print(f.shape)

[[[0.15592754 0.17151207 0.16156733]
  [0.16798759 0.17005064 0.17295486]]

 [[0.15217018 0.17256998 0.1580989 ]
  [0.1699029  0.17231143 0.17494659]]

 [[0.15371743 0.17088374 0.15289803]
  [0.1727466  0.1731145  0.17663972]]

 ...

 [[0.14657235 0.14747484 0.21345176]
  [0.10557949 0.25916094 0.12776062]]

 [[0.13333859 0.1630908  0.19489412]
  [0.10527468 0.28080896 0.12259284]]

 [[0.1300137  0.1536531  0.19105616]
  [0.11031277 0.29052696 0.12443729]]]
(12800, 2, 3)
(12800, 2048)


In [50]:
tranformed_features = Stransformer(f,x)

InvalidArgumentError: slice index 2 of dimension 0 out of bounds. [Op:StridedSlice] name: strided_slice/

In [45]:
rand_int = np.random.randn(2,6)
print(rand_int)

[[-0.48486672  0.94373425  1.05978332 -0.30593336  1.00844242  0.84385549]
 [-0.33081138  0.82345387  0.13053626 -0.71942256  1.43448078 -0.80042563]]


In [47]:
rand_int = rand_int.reshape(rand_int.shape[0],2,3)
print(rand_int)

[[[-0.48486672  0.94373425  1.05978332]
  [-0.30593336  1.00844242  0.84385549]]

 [[-0.33081138  0.82345387  0.13053626]
  [-0.71942256  1.43448078 -0.80042563]]]


In [33]:
rand_int = np.random.randn(2,4,5)
print(rand_int)
print(rand_int.shape)
print(rand_int.shape[0])
print(rand_int.shape[1])
rand_int = rand_int.reshape(rand_int.shape[0]*rand_int.shape[1],5)
print(rand_int.shape)
print(rand_int)

(2, 4, 5)
2
4
(8, 5)
[[ 0.64500756 -0.63740419 -0.81135399  0.19377677 -0.77418602]
 [-0.66396687 -0.62143017  1.04838916  1.11598405  0.34800483]
 [-0.45548252 -0.02031571  1.16169277  1.14426899  1.51808414]
 [ 1.24463263 -0.61181416 -0.49328212 -1.28572211  1.12544586]
 [-0.0125484  -0.85252744 -0.44338665 -1.3459441  -0.63386733]
 [ 0.65441263  0.64755902 -0.12523646 -0.87122525  0.36713315]
 [-0.9398293  -0.66768179  0.2966116  -0.28655874  0.72352219]
 [ 0.67588898  1.39819707  0.78635672  1.62267612 -1.67252424]]


In [16]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.output_dim = output_dim

    def call(self, inputs):
        # The inputs are of shape: `(batch_size, frames, num_features)`
        length = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embeddings(positions)
        return inputs + embedded_positions

    def compute_mask(self, inputs, mask=None):
        mask = tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)
        return mask

In [17]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.3
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation=tf.nn.gelu), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]

        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


In [31]:
from matplotlib import pyplot

def get_compiled_model():
    sequence_length = MAX_SEQ_LENGTH
    embed_dim = NUM_FEATURES
    dense_dim = 4
    num_heads = 1
    classes = len(label_processor.get_vocabulary())
    
    n_timesteps, n_features, n_outputs = train_data.shape[1], train_data.shape[2], train_data.shape[1]

    inputs = keras.Input(shape=(train_data.shape[1],n_timesteps,n_features))
#     x = layers.LSTM(256,return_sequences=True,input_shape=(train_data.shape[0],sequence_length,NUM_FEATURES))(inputs)
#     x = layers.Dense(6, activation='softmax')(x)
#     x = Stransformer(inputs,x.output)
    
    x = layers.ConvLSTM2D(96, kernel_size=(7, 7), strides=(2, 2), padding='same', name='tmp_conv1')(inputs)
    x = layers.Dropout(0.5)(x)
    x = layers.BatchNormalization(axis=3)(x)
    x = layers.Activation('relu')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)

    x = layers.ConvLSTM2D(256, kernel_size=(5, 5), strides=(2, 2), padding='same', name='tmp_conv2')(x)
    x = layers.Dropout(0.5)(x)
    x = layers.BatchNormalization(axis=3)(x)
    x = layers.Activation('relu')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)

    x = layers.ConvLSTM2D(512, kernel_size=(3, 3), strides=(1, 1), padding='same', name='tmp_conv3')(x)
    x = layers.Dropout(0.5)(x)
    x = layers.BatchNormalization(axis=3)(x)
    x = layers.Activation('relu')(x)

    x = layers.ConvLSTM2D(512, kernel_size=(3, 3), strides=(1, 1), padding='same', name='tmp_conv4')(x)
    x = layers.Dropout(0.5)(x)
    x = layers.BatchNormalization(axis=3)(x)
    x = layers.Activation('relu')(x)
    
    x = layers.Flatten()(x)
    x = layers.Dense(4096, activation='relu', name='tmp_fc6')(x)
    x = layers.Dropout(0.9)(x)

    x = layers.Dense(2048, activation='relu', name='tmp_fc7')(x)
    x = layers.Dropout(0.9)(x)

    outputs = layers.Dense(classes, activation="softmax")(x)
    model = keras.Model(inputs, outputs)

    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )
    return model


def run_experiment():
#     filepath = "/tmp/video_classifier"
#     checkpoint = keras.callbacks.ModelCheckpoint(
#         filepath, save_weights_only=True, save_best_only=True, verbose=1
#     )

    model = get_compiled_model()
    history = model.fit(
        train_data,
        train_labels,
        epochs=EPOCHS,
        batch_size = 32,
        validation_data=(validation_data, validation_labels)
    )
    
    #model.load_weights(filepath)
    _, accuracy = model.evaluate(test_data, test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    
     # plot learning curves
    pyplot.title('Learning Curves')
    pyplot.xlabel('Epoch')
    pyplot.ylabel('Root Mean Squared Error')
    pyplot.plot(history.history['loss'], label='train')
    pyplot.plot(history.history['val_loss'], label='val')
    pyplot.legend()
    pyplot.show()

    return model

In [32]:
trained_model = run_experiment()

ValueError: Input 0 of layer tmp_conv1 is incompatible with the layer: expected ndim=5, found ndim=4. Full shape received: (None, 50, 50, 2048)