In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import cv2
import numpy as np
from keras.applications import VGG16
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Flatten
from keras.applications.vgg16 import preprocess_input

In [3]:
# importing the required libraries
%matplotlib inline
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from PIL import Image

# importing libraries for defining the architecture of model
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.nn import Linear, ReLU, BCELoss, Sequential, Sigmoid

# import torchvision
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

## Storing all video names and assigning labels to it

In [4]:
#data-preparation
dir = "/content/drive/MyDrive/dl_proj/UCF-21-modified/UCF21"
subdirectory = [os.path.join(dir, d) for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))]
video_paths = []
labels = []
for class_idx, subdir in enumerate((subdirectory)):
    videos = [os.path.join(subdir, f) for f in os.listdir(subdir) if os.path.isfile(os.path.join(subdir, f))]
    video_paths.extend(videos)
    labels.extend([class_idx] * len(videos))

In [5]:
df = pd.DataFrame({'video_path': video_paths,'label': labels})
# Extract the class name from the 'video_path' column
df['class_name'] = df['video_path'].apply(lambda x: x.split('/')[-2])
# Define the CSV file name
csv_file_name = 'video_paths_labels.csv'

# Save DataFrame to CSV
df.to_csv(csv_file_name, index=False)

print(f"CSV file '{csv_file_name}' created successfully.")
df.head()

CSV file 'video_paths_labels.csv' created successfully.


Unnamed: 0,video_path,label,class_name
0,/content/drive/MyDrive/dl_proj/UCF-21-modified...,0,WalkingWithDog
1,/content/drive/MyDrive/dl_proj/UCF-21-modified...,0,WalkingWithDog
2,/content/drive/MyDrive/dl_proj/UCF-21-modified...,0,WalkingWithDog
3,/content/drive/MyDrive/dl_proj/UCF-21-modified...,0,WalkingWithDog
4,/content/drive/MyDrive/dl_proj/UCF-21-modified...,0,WalkingWithDog


In [6]:
# Create a dictionary mapping video names to labels
video_label_dict = dict(zip(df['label'],df['class_name'], ))
video_label_dict

{0: 'WalkingWithDog',
 1: 'WritingOnBoard',
 2: 'SoccerPenalty',
 3: 'SkyDiving',
 4: 'SumoWrestling',
 5: 'SoccerJuggling',
 6: 'SalsaSpin',
 7: 'Rafting',
 8: 'PushUps',
 9: 'SkateBoarding',
 10: 'Shotput',
 11: 'PullUps',
 12: 'IceDancing',
 13: 'JavelinThrow',
 14: 'LongJump',
 15: 'FieldHockeyPenalty',
 16: 'FloorGymnastics',
 17: 'ApplyingMakeUp',
 18: 'PlayingBasketball',
 19: 'PlayingCricket',
 20: 'PlayingMusicalInstrument'}

## Custom DataLoader to load videos batchwise and preprocess images from videos along with extracting features using pretrained model VGG16

In [7]:
import cv2
import pandas as pd
import numpy as np
import os
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.models import Model

class VideoDataset:
    def __init__(self, csv_path=None, videos_dir=None, frame_count=10):
        if csv_path is not None and os.path.exists(csv_path):
            self.data = pd.read_csv(csv_path)
        else:
            self.data = None

        self.videos_dir = videos_dir
        self.frame_count = frame_count
        self.base_model = VGG16(weights='imagenet', include_top=False)
        self.model = Model(inputs=self.base_model.input, outputs=self.base_model.output)


    def extract_frames(self, video_path):
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        step = max(1, total_frames // self.frame_count)
        frames = []

        for count in range(self.frame_count):
            cap.set(cv2.CAP_PROP_POS_FRAMES, count * step)
            ret, frame = cap.read()
            if not ret:
                break

            frame = cv2.resize(frame, (224, 224))  # Resize frame
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert to RGB
            frame = np.expand_dims(frame, axis=0)  # Add batch dimension
            frame = preprocess_input(frame)  # Preprocess input for VGG16
            frames.append(frame)

        cap.release()
        return np.vstack(frames)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        video_path = os.path.join(self.videos_dir, self.data.iloc[idx, 0])
        label = self.data.iloc[idx, 1]

        frames = self.extract_frames(video_path)

        # Extract features using VGG16
        features = self.model.predict(frames)

        return features, label

In [30]:
# Create an empty array to store features and targets
features = []
target = []

# Instantiate dataset and DataLoader
train_dataset = VideoDataset(csv_path='/content/video_paths_labels.csv',
                             videos_dir='/content/drive/MyDrive/dl_proj/UCF-21-modified/UCF21')
train_loader = DataLoader(dataset=train_dataset, batch_size=32)

for batch_idx, (batch_features, batch_labels) in enumerate(train_loader):
    # Append in list
    features.append(batch_features)
    target.append(batch_labels)

# Save to the array
features = np.concatenate(features, axis=0)
target = np.concatenate(target, axis=0)




In [31]:
# shape of the features
print(features.shape,target.shape)

(1600, 10, 7, 7, 512) (1600,)


In [32]:
# creating the training and validation data
X_train, X_valid, y_train, y_valid = train_test_split(features, target, test_size=0.3, random_state=42)

In [33]:
# shape of training and validation set
(X_train.shape, y_train.shape), (X_valid.shape, y_valid.shape)

(((1120, 10, 7, 7, 512), (1120,)), ((480, 10, 7, 7, 512), (480,)))

# Model Architecture
## Includes LSTM, Self Attention and Fully Connected Layers

In [34]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Flatten, TimeDistributed, Dropout, Attention

# Define the model
model = Sequential()

# TimeDistributed Flatten layer to flatten each frame
model.add(TimeDistributed(Flatten(), input_shape=(10, 7, 7, 512)))  # Correct input_shape

# LSTM layer - return sequences for subsequent LSTM layer
model.add(LSTM(512))  # Set return_sequences=True
model.add(Dropout(0.5))

# Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))

# Classification layer
model.add(Dense(21, activation='softmax'))  # 21 classes

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_distributed_2 (TimeDi  (None, 10, 25088)         0         
 stributed)                                                      
                                                                 
 lstm_2 (LSTM)               (None, 512)               52430848  
                                                                 
 dropout_4 (Dropout)         (None, 512)               0         
                                                                 
 dense_6 (Dense)             (None, 128)               65664     
                                                                 
 dropout_5 (Dropout)         (None, 128)               0         
                                                                 
 dense_7 (Dense)             (None, 128)               16512     
                                                      

In [35]:
import numpy as np

# X_train[0] is of shape (10, 7, 7, 512)
input_video = X_train[0]

# Add the batch dimension
input_video = np.expand_dims(input_video, axis=0)

# Pass the input to the model
output = model.predict(input_video)

# Print the output
print(output)

[[0.04336837 0.06653976 0.03839356 0.05139139 0.05941158 0.03208438
  0.03826306 0.0495029  0.041029   0.03575246 0.04545109 0.05950084
  0.06728184 0.05032486 0.03917891 0.04465079 0.03694846 0.04422449
  0.05877724 0.06400048 0.03392449]]


In [36]:
# Fit the model
history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=5)

# Print training and validation loss
print(f"Training Loss: {history.history['loss'][-1]}")
print(f"Validation Loss: {history.history['val_loss'][-1]}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training Loss: 0.9079968333244324
Validation Loss: 0.44189614057540894


# Inferencing of any new video

In [37]:
# Create an instance of the VideoDataset class
# Assuming you have a dummy CSV path and videos directory (these won't be used in this case)
video_dataset = VideoDataset()
# Path to the new video
new_video_path = '/content/drive/MyDrive/dl_proj/UCF-21-modified/UCF21/SoccerJuggling/v_SoccerJuggling_g01_c01.avi'

# Process the new video
processed_frames = video_dataset.extract_frames(new_video_path)
# Extract features using VGG16
video_features = video_dataset.model.predict(processed_frames)



In [38]:
video_features.shape

(10, 7, 7, 512)

In [39]:
# Reshape video_features to include the time step dimension
# Assuming you have 10 frames per video and each frame feature is of shape (7, 7, 512)
video_features_reshaped = video_features.reshape(1, 10, 7, 7, 512)  # Add batch dimension as well

# Make a prediction
predictions = model.predict(video_features_reshaped)

# Assuming a single video input, get the predicted class
# Get top 3 predicted class indices
top_3_indices = np.argsort(predictions[0])[-3:][::-1]
print(f"Predicted Class: {top_3_indices}")
print("Top 3 Predictions:")
for i, index in enumerate(top_3_indices):
    class_name = video_label_dict.get(index)
    print(f"{i+1}: Class Index: {index}, Class Name: {class_name}")

Predicted Class: [ 5 10  6]
Top 3 Predictions:
1: Class Index: 5, Class Name: SoccerJuggling
2: Class Index: 10, Class Name: Shotput
3: Class Index: 6, Class Name: SalsaSpin
