In [1]:
import os
import torch
import torch.nn as nn
import numpy as np
import time
import torch.optim as optim
import natsort
import bezier

from sklearn.decomposition import PCA
from dataset import encoder
from model import get_pretrained_model
from dataset import jpg2np, get_loader
from torchvision import models
from bezier.hazmat.curve_helpers import evaluate_hodograph, get_curvature
from torch.utils.data import TensorDataset, DataLoader
from torch.utils import data
from UCF_dataset import UCFdataset
from model import UCF_DNN, UCF_CNN1D, UCF_CNN2D
from sklearn.model_selection import train_test_split
from model import UCF_DNN, UCF_CNN1D, UCF_CNN2D
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [3]:
cut = 9812
upper, lower = 150, 10000
GPU_NUM = 1
bs=1
upper, lower, dir_path = 150, 10000, './ucf_image'
datastyle ='std' # 'embeddings', 'elementwise', 'minmax', 'std'

device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device) # change allocation of current GPU
print('Current cuda device ', torch.cuda.current_device())

Current cuda device  1


# VGG16 Embedding

In [4]:
dataset = UCFdataset(upper, lower, dir_path)
loader = data.DataLoader(dataset=dataset, batch_size=bs, shuffle=True)

num_of_classes = 101
epochs=10


## Embedding ##
# vgg16 = models.vgg16(pretrained=True)
# vgg_embedding = nn.Sequential(vgg16.features,
#                    nn.AdaptiveAvgPool2d((1,1))).to(device)

# bs_step = 0
# for x, y, video_name in loader:
#     if '{}.npy'.format(list(video_name)[0]) in  os.listdir('./ucf_embeddings/'): 
#         bs_step += bs
#         continue
#     x, y = x.to(device).squeeze(), y.squeeze().long().to(device)


#     ## VGG Embedding ##
#     embedding = vgg_embedding(x).squeeze().cpu().detach().numpy() # [30, 512]
#     np.save('./ucf_embeddings/{}.npy'.format(list(video_name)[0]), embedding)    
#     bs_step += bs
#     if bs_step % 10 == 0: 
#         print("Batch {}/{} | {}".format(bs_step, 9812, video_name))

Select The number of frames between [150, 10000] of UCF101 Dataset
The number of selected videos is 9812


# Bezier Approximation and Curvature

In [5]:
## Load vgg16 embedding data ##
video_names = []
embeddings = []
labels = []
for embedding in os.listdir('./ucf_embeddings'):
    if embedding == '.ipynb_checkpoints': continue
    loc1 = embedding.find('_')
    loc2 = loc1 + embedding[loc1+1:].find('_')
    labels.append(embedding[loc1+1:loc2+1])
    video_names.append(embedding[:-4])
  

    embeddings.append(np.load('./ucf_embeddings/{}'.format(embedding)))


y, _ = encoder(labels)

embeddings = np.stack(embeddings)
embeddings_pca = embeddings.reshape(embeddings.shape[0]*embeddings.shape[1],-1)

## PCA ##
pca = PCA(3)
embeddings_pca = pca.fit_transform(embeddings_pca).reshape(embeddings.shape[0], embeddings.shape[1],-1)
embeddings_pca.shape

## Bezier Curve and Curvature ##
k = dict()
curves = []
for i, embedding in enumerate(embeddings_pca):
#     print(embedding.shape)
    curves.append(bezier.Curve.from_nodes(embedding.T))
    kappa = []
    for s in range(30):
        t = s / 30
        tangent_vec = curves[i].evaluate_hodograph(t)
        kappa.append(get_curvature(embedding.T, tangent_vec, t))
    k[video_names[i]] = kappa
    if i % 1000 == 0:
        print(i, len(embeddings_pca))

K = np.stack(list(k.values())) # [N, 30]
K.shape

0 9812
1000 9812
2000 9812
3000 9812
4000 9812
5000 9812
6000 9812
7000 9812
8000 9812
9000 9812


(9812, 30)

# Normalize the kappa vector

In [6]:
## Elementwise operation ##
if datastyle == 'elementwise':
    print('just elementwise operation')
    result = list()
    for video in range(len(embeddings)):
        elementwise = list()
        for frame in range(30):
            elementwise.append(embeddings[video][frame] * K[video][frame])        
        result.append(np.stack(elementwise))
        if (video+1) % 1000 == 0: 
            print("[%d / %d] video processing!" %(video+1, len(embeddings)))
    result = torch.from_numpy(np.stack(result))
#     print(embeddings.shape, embeddings_pca.shape, K.shape, result.shape)
if datastyle == 'minmax': 
    print('minmax scaling and operation')
    min_max_scaler = MinMaxScaler()
    K = min_max_scaler.fit_transform(K.T).T
    result = list()
    for video in range(len(embeddings)):
        elementwise = list()
        for frame in range(30):
            elementwise.append(embeddings[video][frame] * K[video][frame])        
        result.append(np.stack(elementwise))
        if (video+1) % 1000 == 0: 
            print("[%d / %d] video processing!" %(video+1, len(embeddings)))
    result = torch.from_numpy(np.stack(result))
    print(embeddings.shape, embeddings_pca.shape, K.shape, result.shape)
if datastyle == 'std':
    print('standard norm and operation')
    standard_scaler = StandardScaler()
    K = standard_scaler.fit_transform(K.T).T
    result = list()
    for video in range(len(embeddings)):
        elementwise = list()
        for frame in range(30):
            elementwise.append(embeddings[video][frame] * K[video][frame])        
        result.append(np.stack(elementwise))
        if (video+1) % 1000 == 0: 
            print("[%d / %d] video processing!" %(video+1, len(embeddings)))
    result = torch.from_numpy(np.stack(result))
#     print(embeddings.shape, embeddings_pca.shape, K.shape, result.shape)

standard norm and operation
[1000 / 9812] video processing!
[2000 / 9812] video processing!
[3000 / 9812] video processing!
[4000 / 9812] video processing!
[5000 / 9812] video processing!
[6000 / 9812] video processing!
[7000 / 9812] video processing!
[8000 / 9812] video processing!
[9000 / 9812] video processing!


# Split the data with Train and Test 

In [7]:
## Data Setting for Action Recognition(Prediction) ##

print(datastyle)
if datastyle =='embeddings':
    X = torch.from_numpy(embeddings)
if datastyle == 'elementwise' or 'minmax' or 'std':
    X = result
Y = torch.from_numpy(y)

## train / test split ##
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8, shuffle=True, random_state=123)
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
test_dataset = TensorDataset(x_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=bs)
print(X.shape, Y.shape)

std
torch.Size([9812, 30, 512]) torch.Size([9812, 1])


In [8]:
## DNN ##
h_in, h_out =  30*512, num_of_classes # UCF101
h1, h2, h3, h4, h5 = 512, 256, 128, 128, 100
DNN = UCF_DNN(h_in, h1, h2, h3, h4, h5, h_out).to(device)

DNN_criterion = nn.CrossEntropyLoss()
DNN_optimizer = optim.Adam(DNN.parameters(), lr=0.0001,  weight_decay=1e-5)



## Training the DNN ##
DNN.train()
print("\n\n\{}\n".format(DNN.__class__.__name__ ))
epochs=10
DNN.train()
for epoch in range(epochs):
    bs_step = 0
    correct = 0
    for x, y in train_loader: 

        x, y_true = x.to(device), y.squeeze().long().to(device)
        DNN_optimizer.zero_grad()

        y_pred = DNN(x)
        _, predicted = torch.max(y_pred, 1)
    
        loss = DNN_criterion(y_pred, y_true.unsqueeze(0))
        loss.backward()
    
        DNN_optimizer.step()
        bs_step += bs
        correct += (predicted == y_true).sum().item()
        if bs_step %200 == 0:
            print("Epoch {}| Batch {}/{} | Loss {:.4f} | Accuracy {:2.2f}".format(
            epoch, bs_step, 9812, loss.item(), 100 * correct / bs_step))


    train_acc = 100 * correct / bs_step

    print("\n\nEpoch {}/{} | Loss {:.4f} | Accuracy {:2.2f}".format(
            epoch, epochs, loss.item(), train_acc))

    
    



\UCF_DNN

Epoch 0| Batch 200/9812 | Loss 4.7225 | Accuracy 0.00
Epoch 0| Batch 400/9812 | Loss 4.8616 | Accuracy 0.25
Epoch 0| Batch 600/9812 | Loss 4.7972 | Accuracy 0.67
Epoch 0| Batch 800/9812 | Loss 4.7185 | Accuracy 0.88
Epoch 0| Batch 1000/9812 | Loss 4.7808 | Accuracy 1.10
Epoch 0| Batch 1200/9812 | Loss 4.4807 | Accuracy 1.33
Epoch 0| Batch 1400/9812 | Loss 4.7006 | Accuracy 1.43
Epoch 0| Batch 1600/9812 | Loss 4.4621 | Accuracy 1.50
Epoch 0| Batch 1800/9812 | Loss 4.4294 | Accuracy 1.50
Epoch 0| Batch 2000/9812 | Loss 4.1868 | Accuracy 1.40
Epoch 0| Batch 2200/9812 | Loss 4.3210 | Accuracy 1.32
Epoch 0| Batch 2400/9812 | Loss 4.7901 | Accuracy 1.29
Epoch 0| Batch 2600/9812 | Loss 4.7330 | Accuracy 1.31
Epoch 0| Batch 2800/9812 | Loss 4.4767 | Accuracy 1.32
Epoch 0| Batch 3000/9812 | Loss 4.6540 | Accuracy 1.33
Epoch 0| Batch 3200/9812 | Loss 4.8509 | Accuracy 1.41
Epoch 0| Batch 3400/9812 | Loss 4.6134 | Accuracy 1.35
Epoch 0| Batch 3600/9812 | Loss 4.5825 | Accuracy 1.36
Ep

In [9]:
DNN.eval()
with torch.no_grad():
    correct = 0
    total = 0

    for x, y in test_loader:
        x, labels = x.to(device), y.squeeze().long().to(device)

        y_pred = DNN(x)
        _, predicted = torch.max(y_pred, 1)
        total += 1
        correct += (predicted == labels).sum().item()

test_acc = 100 * correct / total

print("\n\n\n  The test accuracy {:2.2f}%".format(test_acc)) 




  The test accuracy 14.72%


# 1D CNN

In [10]:
## UCF_CNN1D ##
h_in, h_out = 30, num_of_classes # UCF101
h1, h2, h3, h4, h5 = 256, 128, 64, 1000, 500
CNN_1D = UCF_CNN1D(h_in, h1, h2, h3, h4, h5, h_out).to(device)



criterion_1D = nn.CrossEntropyLoss()
optimizer_1D = optim.Adam(CNN_1D.parameters(), lr=0.0001,  weight_decay=1e-5)

print("\n\n\{}\n".format(CNN_1D.__class__.__name__ ))

## Training the DNN ##
CNN_1D.train()
epochs=10

for epoch in range(epochs):
    bs_step = 0
    correct = 0
    for x, y in train_loader: 

        x, y_true = x.to(device), y.squeeze().long().to(device)
        optimizer_1D.zero_grad()

        y_pred = CNN_1D(x)
        _, predicted = torch.max(y_pred, 1)
    
        loss = criterion_1D(y_pred, y_true.unsqueeze(0))
        loss.backward()
    
        optimizer_1D.step()
        bs_step += bs
        correct += (predicted == y_true).sum().item()
        if bs_step %200 == 0:
            print("Epoch {}| Batch {}/{} | Loss {:.4f} | Accuracy {:2.2f}".format(
            epoch, bs_step, 9812, loss.item(), 100 * correct / bs_step))


    train_acc = 100 * correct / bs_step

    print("\n\nEpoch {}/{} | Loss {:.4f} | Accuracy {:2.2f}".format(
            epoch, epochs, loss.item(), train_acc))

    
    



\UCF_CNN1D

Epoch 0| Batch 200/9812 | Loss 4.5827 | Accuracy 1.50
Epoch 0| Batch 400/9812 | Loss 4.3081 | Accuracy 0.75
Epoch 0| Batch 600/9812 | Loss 4.3630 | Accuracy 1.00
Epoch 0| Batch 800/9812 | Loss 4.6849 | Accuracy 1.00
Epoch 0| Batch 1000/9812 | Loss 4.5445 | Accuracy 1.10
Epoch 0| Batch 1200/9812 | Loss 4.4805 | Accuracy 1.00
Epoch 0| Batch 1400/9812 | Loss 4.4598 | Accuracy 1.36
Epoch 0| Batch 1600/9812 | Loss 4.4295 | Accuracy 1.69
Epoch 0| Batch 1800/9812 | Loss 3.3189 | Accuracy 2.39
Epoch 0| Batch 2000/9812 | Loss 3.9807 | Accuracy 3.45
Epoch 0| Batch 2200/9812 | Loss 3.9307 | Accuracy 4.41
Epoch 0| Batch 2400/9812 | Loss 7.4732 | Accuracy 5.50
Epoch 0| Batch 2600/9812 | Loss 3.7324 | Accuracy 6.69
Epoch 0| Batch 2800/9812 | Loss 2.0401 | Accuracy 7.79
Epoch 0| Batch 3000/9812 | Loss 4.2827 | Accuracy 8.70
Epoch 0| Batch 3200/9812 | Loss 1.9187 | Accuracy 10.06
Epoch 0| Batch 3400/9812 | Loss 4.6383 | Accuracy 11.35
Epoch 0| Batch 3600/9812 | Loss 1.5163 | Accuracy 12.

In [11]:
CNN_1D.eval()
with torch.no_grad():
    correct = 0
    total = 0

    for x, y in test_loader:
        x, labels = x.to(device), y.squeeze().long().to(device)

        y_pred = CNN_1D(x)
        _, predicted = torch.max(y_pred, 1)
        total += 1
        correct += (predicted == labels).sum().item()

test_acc = 100 * correct / total

print("\n\n\n  The test accuracy {:2.2f}%".format(test_acc)) 




  The test accuracy 83.70%


# UCF 2D CNN

In [12]:
## UCF_CNN2D ##
num_of_classes = 101
h_in, h_out = 30, num_of_classes # UCF101
h1, h2, h3, h4, h5 = 256, 128, 64, 1000, 500
CNN_2D = UCF_CNN2D(h_in, h1, h2, h3, h4, h5, h_out).to(device)


criterion_2D = nn.CrossEntropyLoss()
optimizer_2D = optim.Adam(CNN_2D.parameters(), lr=0.0001,  weight_decay=1e-5)

print("\n\n\{}\n".format(CNN_2D.__class__.__name__ ))

## Training the 2DCNN ##
CNN_2D.train()
epochs=10

for epoch in range(epochs):
    bs_step = 0
    correct = 0
    for x, y in train_loader: 

        x, y_true = x.to(device), y.squeeze().long().to(device)
        optimizer_2D.zero_grad()

        y_pred = CNN_2D(x)
        _, predicted = torch.max(y_pred, 1)
    
        loss = criterion_2D(y_pred, y_true.unsqueeze(0))
        loss.backward()
    
        optimizer_2D.step()
        bs_step += bs
        correct += (predicted == y_true).sum().item()
        if bs_step %200 == 0:
            print("Epoch {}| Batch {}/{} | Loss {:.4f} | Accuracy {:2.2f}".format(
            epoch, bs_step, 9812, loss.item(), 100 * correct / bs_step))


    train_acc = 100 * correct / bs_step

    print("\n\nEpoch {}/{} | Loss {:.4f} | Accuracy {:2.2f}".format(
            epoch, epochs, loss.item(), train_acc))

    



\UCF_CNN2D

Epoch 0| Batch 200/9812 | Loss 4.5393 | Accuracy 1.00
Epoch 0| Batch 400/9812 | Loss 4.6083 | Accuracy 1.25
Epoch 0| Batch 600/9812 | Loss 4.1054 | Accuracy 1.33
Epoch 0| Batch 800/9812 | Loss 4.3296 | Accuracy 1.38
Epoch 0| Batch 1000/9812 | Loss 4.4556 | Accuracy 1.30
Epoch 0| Batch 1200/9812 | Loss 4.5529 | Accuracy 1.33
Epoch 0| Batch 1400/9812 | Loss 4.0666 | Accuracy 1.21
Epoch 0| Batch 1600/9812 | Loss 4.3370 | Accuracy 1.19
Epoch 0| Batch 1800/9812 | Loss 4.8201 | Accuracy 1.11
Epoch 0| Batch 2000/9812 | Loss 4.5090 | Accuracy 1.25
Epoch 0| Batch 2200/9812 | Loss 5.9918 | Accuracy 1.27
Epoch 0| Batch 2400/9812 | Loss 4.9907 | Accuracy 1.25
Epoch 0| Batch 2600/9812 | Loss 4.4234 | Accuracy 1.27
Epoch 0| Batch 2800/9812 | Loss 4.8921 | Accuracy 1.32
Epoch 0| Batch 3000/9812 | Loss 4.0373 | Accuracy 1.47
Epoch 0| Batch 3200/9812 | Loss 4.4098 | Accuracy 1.56
Epoch 0| Batch 3400/9812 | Loss 5.2937 | Accuracy 1.62
Epoch 0| Batch 3600/9812 | Loss 4.5386 | Accuracy 1.61


In [13]:
CNN_2D.eval()
with torch.no_grad():
    correct = 0
    total = 0

    for x, y in test_loader:
        x, labels = x.to(device), y.squeeze().long().to(device)

        y_pred = CNN_2D(x)
        _, predicted = torch.max(y_pred, 1)
        total += 1
        correct += (predicted == labels).sum().item()

test_acc = 100 * correct / total

print("\n\n\n  The test accuracy {:2.2f}%".format(test_acc)) 




  The test accuracy 65.97%


In [89]:
# class UCFdataset(data.Dataset):
#     def __init__(self, upper, lower, dir_path): # 'train', 'validation'
#         super(UCFdataset, self).__init__()
        
#         self.file_list, self.y, self.video_names = self.file_load(upper, lower, dir_path)
        
        
#     def __getitem__(self, index):
        
#         x = jpg2np(self.file_list[index]) / 255. # (30, 3, 240, 320)
#         self.x_data = torch.from_numpy(x).float()
#         self.y_data = torch.from_numpy(self.y[index]).float()
#         return self.x_data, self.y_data, self.video_names[index]

#     def __len__(self):
#         return self.y.shape[0]
    
#     def file_load(self, upper, lower, dir_path):
#         """
#         return the input file path list
#         """
#         data_path = []
#         video_imgs_path = os.path.join(os.getcwd(), dir_path)
#         folders = os.listdir(video_imgs_path)

#         frames = {}
#         for folder in folders:
#             path = os.path.join(video_imgs_path, folder)
#             frames[folder] = len(os.listdir(path))

#         video_names = []
#         for video_name, num_of_frames in zip(list(frames.keys()), list(frames.values())):
#             if upper <= num_of_frames and num_of_frames <= lower:
#                 video_names.append(video_name)
#         video_names = natsort.natsorted(video_names)
                
#         print("Select The number of frames between [%d, %d] of UCF101 Dataset" %(upper, lower))
#         print('The number of selected videos is', len(video_names))


#         data_path = [os.path.join(video_imgs_path, video_name) for video_name in video_names]

#         labels = []
#         for label in video_names:
#             loc1 = label.find('_')
#             loc2 = loc1 + label[loc1+1:].find('_')
#             labels.append(label[loc1+1:loc2+1])
#         y, _ = encoder(labels)
# #         y = torch.tensor(y, dtype=torch.float) # [N(13320, 1)

#         return data_path, y, video_names
