In [1]:
# Commonly used CV tools
import os
import cv2     # for capturing videos
import math   # for mathematical operations
import pickle
import matplotlib.pyplot as plt    # for plotting the images
%matplotlib inline
import pandas as pd
import numpy as np    # for mathematical operations
from skimage.transform import resize   # for resizing images
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from glob import glob
from tqdm import tqdm

# for model architectures
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
import torch.utils.data as data
import torchvision
from torch.autograd import Variable
from functions import *


### Read file names into the train dataframe

In [2]:
# open the .txt file which have names of training videos
f = open("trainlist01.txt", "r")
temp = f.read()
videos = temp.split('\n')

# creating a dataframe having video names
train = pd.DataFrame()
train['video_name'] = videos
train = train[:-1]
train.head()

Unnamed: 0,video_name
0,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi 1
1,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c02.avi 1
2,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c03.avi 1
3,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c04.avi 1
4,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c05.avi 1


### Get the tagnames from folder names

In [3]:
train_video_tag = []
for i in range(train.shape[0]):
    train_video_tag.append(train['video_name'][i].split('/')[0])

train['tag'] = train_video_tag
train.head()

Unnamed: 0,video_name,tag
0,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi 1,ApplyEyeMakeup
1,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c02.avi 1,ApplyEyeMakeup
2,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c03.avi 1,ApplyEyeMakeup
3,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c04.avi 1,ApplyEyeMakeup
4,ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c05.avi 1,ApplyEyeMakeup


### Create test data frame and corresponding tags

In [4]:
# open the .txt file which have names of test videos
f = open("testlist01.txt", "r")
temp = f.read()
videos = temp.split('\n')

# creating a dataframe having video names
test = pd.DataFrame()
test['video_name'] = videos
test = test[:-1]

# creating tags for test videos
test_video_tag = []
for i in range(test.shape[0]):
    test_video_tag.append(test['video_name'][i].split('/')[0])
    
test['tag'] = test_video_tag
test.head()

Unnamed: 0,video_name,tag
0,ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01.avi,ApplyEyeMakeup
1,ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c02.avi,ApplyEyeMakeup
2,ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c03.avi,ApplyEyeMakeup
3,ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c04.avi,ApplyEyeMakeup
4,ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c05.avi,ApplyEyeMakeup


## Create a new folder 'train_1' to contain extracted frames
use `cap.get( )` from `cv2` to get certain properties of the video capture

In [57]:
# storing the frames from training videos
for i in tqdm(range(train.shape[0])):
    count = 0
    videoFile = train['video_name'][i]
    cap = cv2.VideoCapture('UCF-101/'+videoFile.split(' ')[0])
    frameRate = cap.get(5) # get frames per second
    print(f'The video is taking at {frameRate} frames per second')
    
    while(cap.isOpened()):
        frameId = cap.get(1) # get current frame number
        ret, frame = cap.read()
        if(ret != True):
            break
        if (frameId % math.floor(frameRate) == 0):
            # storing the frames in a new folder named train_1
            filename = 'train_1/' + videoFile.split('/')[1].split(' ')[0] +"_frame%d.jpg" % count;count+=1
            cv2.imwrite(filename, frame)
    cap.release()

100%|██████████| 9537/9537 [06:39<00:00, 23.86it/s]


## Next, create a `.csv` file that contains paths to these images as well as their `class`

In [64]:
# get the names of all the images
images = glob('train_1/*.jpg')
train_image = []
train_class = []
for i in tqdm(range(len(images))):
    # create the image name
    train_image.append(images[i].split('/')[1])
    # create the class of this image, the activity name
    train_class.append(images[i].split('/')[1].split('_')[1])
    
# storing the images and their class in a dataframe
train_data = pd.DataFrame()
train_data['image'] = train_image
train_data['class'] = train_class

# save dataframe into `.csv` file
train_data.to_csv('UCF-101/train_new.csv', header = True, index=False)

100%|██████████| 73844/73844 [00:00<00:00, 600840.34it/s]


# Training most basic video classification model

## Here we will consider using the most basic architecure 3D-CNN with a very light base architecture: VGG-16
We have created our training image names are corresponding classes in a dataframe.
Now we just need to:
* Define model architecture
* Train and validate performance using unseen data
* Hyper-parameter tuning
* Upgrade model capability and repeat process for better accuracy

In [5]:
data_path = './jpegs_256'                 # preprocessed RGB images
action_name_path = './UCF101actions.pkl' # preprocessed action names
save_model_path = './Conv3D_ckpt'

#### 3D CNN patameters

In [6]:
# architecture parameters
fc_hidden1, fc_hidden2 = 256, 256
dropout = 0.0        # dropout probability

# training parameters
k = 101            # number of target category
epochs = 10        # do adjust here for shorter training period
batch_size = 8
learning_rate = 1e-4
log_interval = 10
img_x, img_y = 256, 342  # resize video 2d frame size

#### For 3D CNN we will need to determine a fixed No.frames and here we take 28 for UCF 101

In [7]:
begin_frame, end_frame, skip_frame = 1, 29, 1

In [8]:
def train(log_interval, model, device, train_loader, optimizer, epoch):
    model.train() # training model, enable dropout and 
    
    losses = []
    scores = []
    N_count = 0 # count total trained sample in one epoch
    
    for batch_idx, (X, y) in enumerate(train_loader):
        X, y = X.to(device), y.to(device).view(-1,)
        
        N_count += X.size(0)
        
        optimizer.zero_grad()
        output = model(X)    # output size = (batch, number of classes)
        
        loss = F.cross_entropy(output, y)
        losses.append(loss.item())
        
        # to compute accuracy
        y_pred = torch.max(output, 1)[1]  # y_pred != output
        step_score = accuracy_score(y.cpu().data.squeeze().numpy(), y_pred.cpu().data.squeeze().numpy()) # computed on CPU
        scores.append(step_score)         
        
        loss.backward()
        optimizer.step()
        
        # show information
        if (batch_idx + 1) % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, Accu: {:.2f}%'.format(
                epoch + 1, N_count, len(train_loader.dataset), 100. * (batch_idx + 1) / len(train_loader), loss.item(), 100 * step_score))

    return losses, scores

In [None]:
def validation(model, device, optimizer, test_loader):
    # set model as testing mode
    model.eval()

    test_loss = 0
    all_y = []
    all_y_pred = []
    with torch.no_grad():
        for X, y in test_loader:
            # distribute data to device
            X, y = X.to(device), y.to(device).view(-1, )

            output = model(X)

            loss = F.cross_entropy(output, y, reduction='sum')
            test_loss += loss.item()                 # sum up batch loss
            y_pred = output.max(1, keepdim=True)[1]  # (y_pred != output) get the index of the max log-probability

            # collect all y and y_pred in all batches
            all_y.extend(y)
            all_y_pred.extend(y_pred)

    test_loss /= len(test_loader.dataset)

    # to compute accuracy
    all_y = torch.stack(all_y, dim=0)
    all_y_pred = torch.stack(all_y_pred, dim=0)
    test_score = accuracy_score(all_y.cpu().data.squeeze().numpy(), all_y_pred.cpu().data.squeeze().numpy())

    # show information
    print('\nTest set ({:d} samples): Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(len(all_y), test_loss, 100* test_score))

    # save Pytorch models of best record
    torch.save(model.state_dict(), os.path.join(save_model_path, '3dcnn_epoch{}.pth'.format(epoch + 1)))  # save spatial_encoder
    torch.save(optimizer.state_dict(), os.path.join(save_model_path, '3dcnn_optimizer_epoch{}.pth'.format(epoch + 1)))      # save optimizer
    print("Epoch {} model saved!".format(epoch + 1))

    return test_loss, test_score

## Prepare for training

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 4, 'pin_memory': True} if torch.cuda.is_available() else {}

# load UCF101 label categories
with open(action_name_path, 'rb') as f:
    action_names = pickle.load(f)
    
print(f'we have {len(action_names)} categories')

# conver labels to categories using LabelEncoder()
le = LabelEncoder()
le.fit(action_names)

# Then one-hot-encoding the categorical labels
action_category = le.transform(action_names).reshape(-1,1)  # map from names to a number from [0,100]
enc = OneHotEncoder()
enc.fit(action_category)

actions = []
fnames = os.listdir(data_path)

all_names = []
for f in fnames:
    loc1 = f.find('v_')
    loc2 = f.find('_g')
    actions.append(f[(loc1 + 2): loc2])

    all_names.append(f)

all_X_list = all_names
all_y_list = labels2cat(le, actions)

# train, test split
train_list, test_list, train_label, test_label = train_test_split(all_X_list, all_y_list, test_size=0.25, random_state=42)

# image transformation
transform = transforms.Compose([transforms.Resize([img_x, img_y]),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.5], std=[0.5])])

selected_frames = np.arange(begin_frame, end_frame, skip_frame).tolist() # a list from 1 to 28

train_set = Dataset_3DCNN(data_path, train_list, train_label, selected_frames, transform=transform)
valid_set = Dataset_3DCNN(data_path, test_list, test_list,    selected_frames, transform=transform) 

# adjust params from above
train_loader = data.DataLoader(train_set, **params)
valid_loader = data.DataLoader(valid_set, **params)

cnn3d = CNN3D(t_dim=len(selected_frames), 
              img_x=img_x, 
              img_y=img_y, 
              drop_p=dropout, 
              fc_hidden1=fc_hidden1,
              fc_hidden2=fc_hidden2,
              num_classes=k
             ).to(device)

# use classic adam optimizer
optimizer = torch.optim.Adam(cnn3d.parameters(), lr=learning_rate)

we have 101 categories


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


## Actual training

In [11]:
# record training process
epoch_train_losses = []
epoch_train_scores = []
epoch_test_losses = []
epoch_test_scores = []

# start training
for epoch in range(epochs):
    # train, test model
    train_losses, train_scores = train(log_interval, cnn3d, device, train_loader, optimizer, epoch)
    epoch_test_loss, epoch_test_score = validation(cnn3d, device, optimizer, valid_loader)

    # save results
    epoch_train_losses.append(train_losses)
    epoch_train_scores.append(train_scores)
    epoch_test_losses.append(epoch_test_loss)
    epoch_test_scores.append(epoch_test_score)

    # save all train test results
    A = np.array(epoch_train_losses)
    B = np.array(epoch_train_scores)
    C = np.array(epoch_test_losses)
    D = np.array(epoch_test_scores)
    np.save('./3DCNN_epoch_training_losses.npy', A)
    np.save('./3DCNN_epoch_training_scores.npy', B)
    np.save('./3DCNN_epoch_test_loss.npy', C)
    np.save('./3DCNN_epoch_test_score.npy', D)



ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/projectx/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/projectx/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/projectx/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/projectx/Documents/GitHub repos/Video-Classification/functions.py", line 62, in __getitem__
    y = torch.LongTensor([self.labels[index]])                             # (labels) LongTensor are for int64 instead of FloatTensor
ValueError: too many dimensions 'str'
