In [2]:
import math   # for mathematical operations
import os     # for portable path handling and creating output folders
from glob import glob

import cv2     # for capturing videos
import matplotlib.pyplot as plt    # for plotting the images
%matplotlib inline
import numpy as np    # for mathematical operations
import pandas as pd
from keras.preprocessing import image   # for preprocessing the images
from keras.utils import np_utils
from skimage.transform import resize   # for resizing images
from sklearn.model_selection import train_test_split
from tqdm import tqdm

### Read file names into the train dataframe

In [6]:
# Read the list of training videos. Each line looks like
# "<Class>/<file>.avi <label>". The context manager closes the file handle
# as soon as its contents have been read.
with open("trainlist01.txt", "r") as f:
    temp = f.read()

# Drop blank lines explicitly (e.g. the empty string produced by a trailing
# newline) instead of blindly discarding the last entry, which would lose a
# real video whenever the file does not end with a newline.
videos = [line for line in temp.splitlines() if line.strip()]

# creating a dataframe having video names
train = pd.DataFrame({'video_name': videos})
train.head()

Unnamed: 0,video_name
0,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi 1
1,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c02.avi 1
2,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c03.avi 1
3,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c04.avi 1
4,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c05.avi 1


### Get the tagnames from folder names

In [15]:
# The tag (activity name) is the folder part of each entry,
# i.e. everything before the first '/'.
train_video_tag = [entry.split('/')[0] for entry in train['video_name']]

train['tag'] = train_video_tag
train.head()

Unnamed: 0,video_name,tag
0,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi 1,ApplyEyeMakeup
1,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c02.avi 1,ApplyEyeMakeup
2,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c03.avi 1,ApplyEyeMakeup
3,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c04.avi 1,ApplyEyeMakeup
4,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c05.avi 1,ApplyEyeMakeup


### Create test data frame and corresponding tags

In [14]:
# Read the list of test videos, one "<Class>/<file>.avi" entry per line.
# The context manager closes the file handle as soon as it has been read.
with open("testlist01.txt", "r") as f:
    temp = f.read()

# Keep only non-blank lines rather than dropping the last entry unconditionally,
# so a file without a trailing newline does not lose its final video.
videos = [line for line in temp.splitlines() if line.strip()]

# creating a dataframe having video names
test = pd.DataFrame({'video_name': videos})

# creating tags for test videos: the tag is the folder name before the first '/'
test_video_tag = [entry.split('/')[0] for entry in videos]

test['tag'] = test_video_tag
test.head()

Unnamed: 0,video_name,tag
0,ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01.avi,ApplyEyeMakeup
1,ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c02.avi,ApplyEyeMakeup
2,ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c03.avi,ApplyEyeMakeup
3,ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c04.avi,ApplyEyeMakeup
4,ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c05.avi,ApplyEyeMakeup


## Create a new folder 'train_1' to contain extracted frames
Use `cap.get()` from `cv2` to read properties of the video capture, such as the frame rate and the index of the current frame.

In [57]:
# storing the frames from training videos
# Roughly one frame per second of video is written to train_1/ as
# "<video file name>_frame<N>.jpg".

# cv2.imwrite returns False instead of raising when the target folder does not
# exist, so make sure it is there before extracting anything.
os.makedirs('train_1', exist_ok=True)

for i in tqdm(range(train.shape[0])):
    count = 0
    videoFile = train['video_name'][i]
    # entries look like "<Class>/<file>.avi <label>"; strip the trailing label
    videoPath = videoFile.split(' ')[0]
    cap = cv2.VideoCapture('UCF-101/' + videoPath)
    try:
        frameRate = cap.get(cv2.CAP_PROP_FPS)  # frames per second
        print(f'The video is taking at {frameRate} frames per second')

        # Sample every floor(fps)-th frame. If the reported rate is below 1
        # (or missing: 0 / NaN) the floor would be 0 and the modulo below would
        # raise ZeroDivisionError, so fall back to keeping every frame.
        frameStep = math.floor(frameRate) if frameRate >= 1 else 1

        while cap.isOpened():
            # index of the frame that the next read() call will return
            frameId = cap.get(cv2.CAP_PROP_POS_FRAMES)
            ret, frame = cap.read()
            if not ret:
                break
            if frameId % frameStep == 0:
                # storing the frames in a new folder named train_1
                filename = 'train_1/' + videoPath.split('/')[1] + "_frame%d.jpg" % count
                count += 1
                cv2.imwrite(filename, frame)
    finally:
        # release the capture even if reading or writing a frame fails
        cap.release()

100%|██████████| 9537/9537 [06:39<00:00, 23.86it/s]


## Next, create a `.csv` file that contains paths to these images as well as their `class`

In [64]:
# get the names of all the images
# NOTE: glob returns paths in arbitrary (filesystem) order; that order is kept as-is.
images = glob('train_1/*.jpg')

# create the image name: the file name without its folder. os.path.basename
# is used instead of splitting on '/' so the platform's own path separator is handled.
train_image = [os.path.basename(path) for path in images]

# create the class of this image, the activity name. Frames are named
# "v_<Class>_gXX_cYY.avi_frameN.jpg", so the class is the second '_'-separated field.
train_class = [name.split('_')[1] for name in train_image]

# storing the images and their class in a dataframe
train_data = pd.DataFrame({'image': train_image, 'class': train_class})

# save dataframe into `.csv` file
train_data.to_csv('UCF-101/train_new.csv', header = True, index=False)

100%|██████████| 73844/73844 [00:00<00:00, 600840.34it/s]


# Training most basic video classification model

## Here we will consider the most basic architecture: a frame-level CNN built on a pre-trained VGG-16 base (VGG-16 is a 2D CNN, so each extracted frame is classified independently)
We have stored our training image names and their corresponding classes in a dataframe.
Now we just need to:
* Define model architecture
* Train and validate performance using unseen data
* Hyper-parameter tuning
* Upgrade model capability and repeat process for better accuracy

In [1]:
import keras
from keras.models import Sequential
from keras.applications.vgg16 import VGG16
from keras.layers import Dense, InputLayer, Dropout, Flatten
from keras.layers import Conv2D, MaxPool2D, GlobalMaxPool2D
from keras.preprocessing import image
from sklearn.model_selection import train_test_split

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Load the frame index (image file name + class label) saved as train_new.csv.
# NOTE: this rebinds `train`, which earlier in the notebook held the list of training videos.
train = pd.read_csv('UCF-101/train_new.csv')
train.head()

Unnamed: 0,image,class
0,v_BlowDryHair_g10_c03.avi_frame0.jpg,BlowDryHair
1,v_CleanAndJerk_g11_c02.avi_frame12.jpg,CleanAndJerk
2,v_RockClimbingIndoor_g24_c06.avi_frame8.jpg,RockClimbingIndoor
3,v_PlayingCello_g10_c05.avi_frame6.jpg,PlayingCello
4,v_CuttingInKitchen_g09_c03.avi_frame6.jpg,CuttingInKitchen


In [4]:
# total number of extracted frames listed in the dataframe
len(train)

73844

In [5]:
# Number of frames to load into memory; capped at the number of rows available
# so a shorter CSV does not raise a KeyError part-way through the loop.
N_SAMPLES = min(10000, len(train))

train_image = []

# for loop to read and store frames
for name in tqdm(train['image'][:N_SAMPLES]):
    # loading the image resized to 224x224, the input size used with VGG-16.
    # target_size is (height, width); the channel count is not part of it.
    img = image.load_img('train_1/' + name, target_size=(224, 224))
    # converting it to array
    img = image.img_to_array(img)
    # normalizing the pixel to 0-1
    # NOTE(review): the ImageNet VGG-16 weights are normally paired with
    # keras.applications.vgg16.preprocess_input rather than a plain /255
    # rescale - confirm which one is intended before training.
    img = img/255
    # appending the image to the train_image list
    train_image.append(img)

# converting the list to numpy array
X = np.array(train_image)
X.shape

100%|██████████| 10000/10000 [01:54<00:00, 87.01it/s]


(10000, 224, 224, 3)

### Note: we create a stratified split on the labels `y`, so that the training and validation sets keep the same class proportions despite class imbalance

In [13]:
# labels for the same first 10000 frames that were loaded into X
y = train['class'][:10000]

# creating the training and validation set: 20% held out, stratified on the
# labels, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [15]:
# shapes of the four splits, printed on one line
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(8000, 224, 224, 3) (2000, 224, 224, 3) (8000,) (2000,)


### We have 101 categories
Here we apply `get_dummies()` for one-hot encoding.

In [16]:
# One-hot encode the labels. Encoding each split on its own only creates
# columns for the classes that happen to appear in that split, so a class
# missing from one of them would leave train and test with different
# widths/column order. Build the class list from both splits and align
# each encoding to it.
classes = sorted(set(y_train) | set(y_test))

# A fixed integer dtype keeps the added all-zero columns the same type as the rest.
y_train = pd.get_dummies(y_train, dtype='uint8').reindex(columns=classes, fill_value=0)
y_test  = pd.get_dummies(y_test, dtype='uint8').reindex(columns=classes, fill_value=0)

In [17]:
# one-hot-encoded dataframe: one indicator column per class, rows keep their original index labels
y_train.head()

Unnamed: 0,ApplyEyeMakeup,ApplyLipstick,Archery,BabyCrawling,BalanceBeam,BandMarching,BaseballPitch,Basketball,BasketballDunk,BenchPress,...,TennisSwing,ThrowDiscus,TrampolineJumping,Typing,UnevenBars,VolleyballSpiking,WalkingWithDog,WallPushups,WritingOnBoard,YoYo
9753,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5051,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4754,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5382,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3047,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Define model base architecture

In [None]:
# Convolutional base: VGG-16 with weights pre-trained on ImageNet.
# include_top=False leaves out the original classification layers,
# because our own classifier will be trained on top of it.
base_model = VGG16(
    include_top=False,
    weights='imagenet',
)


Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5