### Import 

In [2]:
from skimage.io import imread
from skimage.transform import resize
import numpy as np
import math
import os
import glob
import pandas as pd
import cv2
import gc
import numpy as np
import random
import imageio
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

### Extract frames from video function

In [3]:
def format_frames(frame, output_size):
  """
    Convert one video frame to float32 and resize it, with padding, to a fixed size.

    Args:
      frame: Image (a single decoded video frame) to convert and resize.
      output_size: Target size of the output frame, unpacked into the
        positional size arguments of `tf.image.resize_with_pad`.

    Return:
      The float32 frame, resized with padding to the requested output size.
  """
  as_float = tf.image.convert_image_dtype(frame, tf.float32)
  padded = tf.image.resize_with_pad(as_float, *output_size)
  return padded

def frames_from_video_file(video_path, n_frames, output_size = (120,180), frame_step = 5, random_start = False):
  """
    Sample a fixed number of evenly spaced frames from a video file.

    Args:
      video_path: File path to the video.
      n_frames: Number of frames to be returned for the video.
      output_size: Pixel size of the output frame image.
      frame_step: Number of frames to advance between two sampled frames.
      random_start: When True and the video is long enough to hold the whole
        sampled window, start at a random frame; otherwise (the default, and
        the behaviour before this flag existed) always start at frame 0.

    Return:
      An NumPy array of frames in the shape of (n_frames, height, width, channels).
      Frames that could not be read (video too short) are filled with zeros.
      Frames keep the channel order delivered by cv2; no reordering is applied.

    Raises:
      ValueError: If not even the first frame can be read from `video_path`.
  """
  src = cv2.VideoCapture(str(video_path))
  try:
    # CAP_PROP_FRAME_COUNT is reported as a float; use an int for index arithmetic.
    video_length = int(src.get(cv2.CAP_PROP_FRAME_COUNT))

    # Number of source frames spanned by the sampled window.
    need_length = 1 + (n_frames - 1) * frame_step

    if random_start and need_length <= video_length:
      max_start = video_length - need_length
      start = random.randint(0, max_start)
    else:
      start = 0

    src.set(cv2.CAP_PROP_POS_FRAMES, start)
    # ret is a boolean indicating whether read was successful, frame is the image itself
    ret, frame = src.read()
    if not ret:
      # Fail with a clear message instead of passing None to format_frames.
      raise ValueError(f"Could not read any frame from video: {video_path}")
    result = [format_frames(frame, output_size)]

    for _ in range(n_frames - 1):
      # Skip ahead; only the last frame read in this inner loop is kept.
      for _ in range(frame_step):
        ret, frame = src.read()
      if ret:
        result.append(format_frames(frame, output_size))
      else:
        # Past the end of the video: pad with a black frame of the same shape.
        result.append(np.zeros_like(result[0]))
  finally:
    # Always free the capture handle, even when reading fails.
    src.release()

  return np.array(result)

#def to_gif(images):
 # converted_images = np.clip(images * 255, 0, 255).astype(np.uint8)
 # imageio.mimsave('./animation.gif', converted_images, fps=10)
 # return embed.embed_file('./animation.gif')

### Get class names

In [4]:
class CFG:
    """Notebook-wide configuration values."""
    # Default number of epochs and batch size.
    epochs = 10
    batch_size = 32
    # One class per entry of the dataset directory. os.listdir() returns entries in
    # arbitrary order, so sort them to keep the class-index -> name mapping identical
    # across sessions.
    # NOTE: a checkpoint trained under a different class order will map its outputs
    # to the wrong names; verify the ordering before reusing old checkpoints.
    classes = sorted(os.listdir("/kaggle/input/ucf21-new/UCF101_new"))
    #classes = ["FloorGymnastics","PullUps"]

### Get video paths 

In [5]:
# Collect every video path together with the integer index of its class.
file_paths = []
targets = []
target_name = []
for i, cls in enumerate(CFG.classes):
    # glob returns paths in filesystem-dependent order; sorting keeps the seeded
    # train/val/test splits below reproducible from one session to the next.
    sub_file_paths = sorted(glob.glob(f"/kaggle/input/ucf21-new/UCF101_new/{cls}/**.avi"))
    file_paths += sub_file_paths
    targets += [i] * len(sub_file_paths)
    target_name += [cls]

In [6]:
len(file_paths)

4249

In [7]:
len(targets)

4249

In [8]:
# Map each class index to its class name.
label_dict = dict(enumerate(target_name))

### Split the paths dataset to train and test

In [9]:
# Hold out 1% of the videos (and their labels) as a test set; random_state is fixed
# so the same shuffle seed is used on every run.
train_paths, test_paths, train_targets, test_targets = train_test_split(file_paths, targets, test_size=0.01, random_state=143)
# Display the sizes of the resulting splits.
len(train_paths), len(test_paths), len(train_targets), len(test_targets)

(4206, 43, 4206, 43)

### Split the train_paths dataset to train and val

In [10]:
# Split 20% of the remaining training videos off as a validation set.
# NOTE: this rebinds train_paths/train_targets, so the cell is not idempotent —
# re-running it without re-running the train/test split first shrinks the training set again.
train_paths, val_paths, train_targets, val_targets = train_test_split(train_paths, train_targets, test_size=0.2, random_state=143)
# Display the sizes of the resulting splits.
len(train_paths), len(val_paths), len(train_targets), len(val_targets)

(3364, 842, 3364, 842)

### Data loader

In [11]:
# using a data loader to load videos in batches
# `x_set` is the list of paths to the video files
# `y_set` are the associated integer class indices.

class train_DataLoader(tf.keras.utils.Sequence):
    """Keras Sequence that decodes a batch of videos into frame stacks on demand.

    Args:
      x_set: List of video file paths.
      y_set: List of integer class indices, aligned with `x_set`.
      batch_size: Number of videos per batch.
      n_frames: Number of frames sampled from each video (default 10).
      output_size: Frame size passed to `frames_from_video_file` (default (120, 180)).
      **kwargs: Forwarded to the base class constructor.
    """

    def __init__(self, x_set, y_set, batch_size, n_frames=10, output_size=(120, 180), **kwargs):
        # The base class must be initialised so Keras can set up its own state.
        super().__init__(**kwargs)
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.n_frames = n_frames
        self.output_size = output_size

    def __len__(self):
        """Return the number of batches per epoch (the last batch may be smaller)."""
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        """Return batch `idx` as a (frames, labels) pair of NumPy arrays."""
        first = idx * self.batch_size
        last = first + self.batch_size
        batch_x = self.x[first:last]
        batch_y = self.y[first:last]
        # Labels are returned as integer class indices (no one-hot encoding),
        # which is what a sparse categorical loss expects.
        frames_batch_x = [
            frames_from_video_file(video_path, n_frames=self.n_frames, output_size=self.output_size)
            for video_path in batch_x
        ]
        return np.array(frames_batch_x), np.array(batch_y)


In [12]:
# using a data loader to load the validation videos in batches
# `x_set` is the list of paths to the video files
# `y_set` are the associated integer class indices.

class val_DataLoader(train_DataLoader):
    """Validation data loader.

    Batching and frame extraction are exactly those of `train_DataLoader`, so the
    implementation is inherited instead of being duplicated; the class is kept
    under its own name so existing `val_DataLoader(x_set, y_set, batch_size)`
    calls keep working.
    """


### CNN

In [13]:
from tensorflow.keras.models import Model
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Sequential
from tensorflow.keras.applications.vgg16 import preprocess_input
from keras.layers import Flatten,Dense,BatchNormalization,Activation,Dropout,GlobalAveragePooling2D

In [14]:
# Loading VGG16 model
base_model = VGG16(weights = "imagenet", include_top = False, input_shape = (120,180,3))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [15]:
# Copy every layer of the VGG16 base into a Sequential model and flatten its output.
# NOTE(review): `cnn_model` is not referenced by any later cell visible in this
# notebook (the trained model comes from get_transformer_model) — presumably a
# leftover experiment; confirm before relying on it.
cnn_model = Sequential()
for layer in base_model.layers:
    cnn_model.add(layer)
cnn_model.add(Flatten())

### Transformer model

In [13]:
import tensorflow as tf
from tensorflow.keras import layers

def transformer_block(inputs, num_heads, ff_dim, dropout_rate):
    """Apply one encoder-style block: self-attention, then a feed-forward network.

    Each sub-layer is followed by dropout, a residual addition and layer
    normalization.

    Args:
      inputs: Input tensor; its last dimension is used as the attention
        `key_dim` and as the output width of the feed-forward network.
      num_heads: Number of attention heads.
      ff_dim: Width of the hidden feed-forward layer.
      dropout_rate: Dropout rate applied after each sub-layer.

    Returns:
      Tensor produced by the second layer normalization.
    """
    feature_dim = inputs.shape[-1]

    # Self-attention sub-layer (query and value are both `inputs`)
    self_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=feature_dim)
    attended = self_attention(inputs, inputs)
    attended = layers.Dropout(dropout_rate)(attended)
    attention_norm = layers.LayerNormalization(epsilon=1.001e-5)
    normed_attention = attention_norm(inputs + attended)

    # Position-wise feed-forward sub-layer
    feed_forward = tf.keras.Sequential(
        [
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(feature_dim),
        ]
    )
    projected = feed_forward(normed_attention)
    projected = layers.Dropout(dropout_rate)(projected)
    output_norm = layers.LayerNormalization(epsilon=1.001e-5)
    return output_norm(normed_attention + projected)


In [14]:
import tensorflow as tf
from tensorflow.keras import layers

def get_transformer_model(num_frames, height, width, channels, num_classes):
    """Build the video classifier: per-frame CNN, transformer block, dense head.

    Args:
      num_frames: Number of frames per video clip.
      height: Frame height in pixels.
      width: Frame width in pixels.
      channels: Number of channels per frame.
      num_classes: Number of output classes of the softmax layer.

    Returns:
      An uncompiled `tf.keras.Model` taking inputs of shape
      (num_frames, height, width, channels) per sample.
    """
    # (conv filters, pooling stride) for each Conv2D + MaxPooling2D stage, in order
    conv_stages = [(64, 3), (128, 3), (256, 3), (512, 2), (512, 2)]

    video_input = layers.Input(shape=(num_frames, height, width, channels))

    # The same convolutional stack is applied to every frame independently.
    features = video_input
    for filters, stride in conv_stages:
        conv = layers.Conv2D(filters, (3, 3), activation='relu', padding='same')
        features = layers.TimeDistributed(conv)(features)
        pool = layers.MaxPooling2D((2, 2), strides=(stride, stride))
        features = layers.TimeDistributed(pool)(features)

    # One feature vector per frame, mixed across frames by the transformer block.
    features = layers.TimeDistributed(layers.Flatten())(features)
    features = transformer_block(inputs=features, num_heads=8, ff_dim=512, dropout_rate=0.1)
    features = layers.GlobalAveragePooling1D()(features)

    # Classification head
    for units in (1024, 512):
        features = layers.Dense(units, activation='relu')(features)
        features = layers.Dropout(0.5)(features)
    predictions = layers.Dense(num_classes, activation='softmax')(features)

    return tf.keras.Model(inputs=video_input, outputs=predictions)


In [15]:
# Adam optimizer with a learning rate of 1e-4; it is passed to model.compile() below.
# NOTE(review): `a` is a very short global name — any later cell that rebinds `a`
# silently replaces this optimizer, so re-run this cell before compiling again.
a=tf.keras.optimizers.Adam(
    learning_rate=0.0001)

In [16]:
# build and compile the model
# The output width is derived from the class list instead of a hard-coded 21,
# so the softmax layer always has one unit per entry of CFG.classes.
model = get_transformer_model(num_frames=10, height=120, width=180, channels=3, num_classes=len(CFG.classes))
# Targets are integer class indices, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer=a, metrics=['accuracy'])

In [17]:
# Save the model to "model.h5" whenever the validation accuracy improves.
# `restore_best_weights` is an EarlyStopping option, not a ModelCheckpoint one
# (it is ignored or rejected depending on the Keras version), so it is not passed here.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "model.h5",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
)

In [32]:
# fit the model
history = model.fit(train_DataLoader(train_paths, train_targets,batch_size=4), 
                    validation_data=val_DataLoader(val_paths, val_targets,batch_size=4), 
                    epochs=5,callbacks=[checkpoint])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [33]:
model.save("/kaggle/working/transformer_model.h5")

In [34]:
model = tf.keras.models.load_model("/kaggle/input/transformer-model-1/transformer_model.h5")

In [35]:
# fit the model
history = model.fit(train_DataLoader(train_paths, train_targets,batch_size=4), 
                    validation_data=val_DataLoader(val_paths, val_targets,batch_size=4), 
                    epochs=3,callbacks=[checkpoint])

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [36]:
model.save("/kaggle/working/transformer_model_2.h5")

In [37]:
model = tf.keras.models.load_model("/kaggle/input/transformer-model-2/transformer_model_2.h5")

In [38]:
# fit the model
history = model.fit(train_DataLoader(train_paths, train_targets,batch_size=4), 
                    validation_data=val_DataLoader(val_paths, val_targets,batch_size=4), 
                    epochs=3,callbacks=[checkpoint])

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [39]:
model.save("/kaggle/working/transformer_model_3.h5")

In [40]:
model = tf.keras.models.load_model("/kaggle/input/transformer-model-3/transformer_model_3.h5")

In [41]:
# fit the model
history = model.fit(train_DataLoader(train_paths, train_targets,batch_size=4), 
                    validation_data=val_DataLoader(val_paths, val_targets,batch_size=4), 
                    epochs=4,callbacks=[checkpoint])

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [42]:
model.save("/kaggle/working/transformer_model_4.h5")

In [43]:
model = tf.keras.models.load_model("/kaggle/input/transformer-model-4/transformer_model_4.h5")

In [44]:
# fit the model
history = model.fit(train_DataLoader(train_paths, train_targets,batch_size=4), 
                    validation_data=val_DataLoader(val_paths, val_targets,batch_size=4), 
                    epochs=5,callbacks=[checkpoint])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [45]:
model.save("/kaggle/working/transformer_model_5.h5")

In [18]:
model = tf.keras.models.load_model("/kaggle/input/transformer-final/transformer_conv2D_final.h5")

In [None]:
# fit the model
history = model.fit(train_DataLoader(train_paths, train_targets,batch_size=4), 
                    validation_data=val_DataLoader(val_paths, val_targets,batch_size=4), 
                    epochs=5,callbacks=[checkpoint])

Epoch 1/5
Epoch 2/5
Epoch 3/5

In [28]:
# fit the model
history = model.fit(train_DataLoader(train_paths, train_targets,batch_size=4), 
                    validation_data=val_DataLoader(val_paths, val_targets,batch_size=4), 
                    epochs=5,callbacks=[checkpoint])

Epoch 1/5

KeyboardInterrupt: 

In [None]:
# fit the model
history = model.fit(train_DataLoader(train_paths, train_targets,batch_size=4), 
                    validation_data=val_DataLoader(val_paths, val_targets,batch_size=4), 
                    epochs=10,callbacks=[checkpoint])


In [None]:
# fit the model
history = model.fit(train_DataLoader(train_paths, train_targets,batch_size=4), 
                    validation_data=val_DataLoader(val_paths, val_targets,batch_size=4), 
                    epochs=10,callbacks=[checkpoint])


Epoch 1/10
Epoch 2/10
Epoch 3/10
 86/841 [==>...........................] - ETA: 8:44 - loss: 2.4679 - accuracy: 0.2791

In [None]:
#accuracy plot
plt.figure(figsize = (8,5))
# One x position per recorded epoch, so the plot works for any number of epochs
# (a hard-coded range of 10 fails when the history has a different length).
x = np.arange(len(history.history["accuracy"]))
plt.plot(x,history.history["accuracy"],label = "training accuracy",c = 'blue')
plt.plot(x,history.history["val_accuracy"],label = "validation accuracy",c = 'red')
plt.legend()
plt.title("Accuracy vs epochs (Transformer model)")
plt.xlabel("epochs")
plt.ylabel("Accuracy")
plt.show()

In [None]:
#loss plot
plt.figure(figsize = (8,5))
# One x position per recorded epoch, so the plot works for any number of epochs
# (a hard-coded range of 10 fails when the history has a different length).
x = np.arange(len(history.history["loss"]))
plt.plot(x,history.history["loss"],label = "training loss",c = 'blue')
plt.plot(x,history.history["val_loss"],label = "validation loss",c = 'red')
plt.legend()
plt.title("Loss vs epochs (Transformer model)")
plt.xlabel("epochs")
plt.ylabel("Loss")
plt.show()

# Prediction

In [46]:
test_videos_path = test_paths
num = np.random.randint(0,len(test_videos_path),1)
test_file_path = test_videos_path[num[0]]
test_file_path

'/kaggle/input/ucf21-new/UCF101_new/PlayingCricket/v_CricketShot_g11_c03.avi'

In [47]:
features = []
features.append(frames_from_video_file(test_file_path, n_frames = 10))
features = np.array(features)


In [48]:
predicted_class = model.predict(np.array(features))



In [49]:
# Getting indices of the N = 3 highest-scoring classes, best first
index = np.argsort(predicted_class[0])[::-1][:3]

# Getting the scores of those N classes
percentage_class = predicted_class[0][index]

# Descriptive loop names are used on purpose: the single-letter global `a`
# holds the optimizer and must not be overwritten here.
for class_index, score in zip(index, percentage_class):
    class_name = label_dict[class_index]
    print(f"{class_name} : {score * 100:.3f} percent")

PlayingCricket : 99.942 percent
ApplyingMakeup : 0.042 percent
SkyDiving : 0.007 percent


In [53]:
model.save("/kaggle/working/transformer_conv2D_final.h5")

### Test video

In [38]:
model = tf.keras.models.load_model("/kaggle/input/transformer-final/transformer_conv2D_final.h5")

In [39]:
test_videos_path = r"/kaggle/input/test-video-3/test3.mp4"

In [43]:
features = []
features.append(frames_from_video_file(test_videos_path, n_frames = 10))
features = np.array(features)

In [44]:
predicted_class = model.predict(np.array(features))



In [45]:
# Getting indices of the N = 5 highest-scoring classes, best first
index = np.argsort(predicted_class[0])[::-1][:5]

# Getting the scores of those N classes
percentage_class = predicted_class[0][index]

# Descriptive loop names are used on purpose: the single-letter global `a`
# holds the optimizer and must not be overwritten here.
for class_index, score in zip(index, percentage_class):
    class_name = label_dict[class_index]
    print(f"{class_name} : {score * 100:.3f} percent")

ApplyingMakeup : 36.324 percent
PullUps : 34.044 percent
SalsaSpin : 9.698 percent
PlayingMusicalInstrument : 5.971 percent
SkyDiving : 3.804 percent


In [50]:
predicted_class = model.predict(np.array(features))



In [51]:
# Getting indices of the N = 5 highest-scoring classes, best first
index = np.argsort(predicted_class[0])[::-1][:5]

# Getting the scores of those N classes
percentage_class = predicted_class[0][index]

# Descriptive loop names are used on purpose: the single-letter global `a`
# holds the optimizer and must not be overwritten here.
for class_index, score in zip(index, percentage_class):
    class_name = label_dict[class_index]
    print(f"{class_name} : {score * 100:.3f} percent")

ApplyingMakeup : 36.324 percent
PullUps : 34.044 percent
SalsaSpin : 9.698 percent
PlayingMusicalInstrument : 5.971 percent
SkyDiving : 3.804 percent
