# Action Detection

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torchvision
import numpy as np
from scipy.io import loadmat
from random import shuffle
from torch.utils.data import Dataset, DataLoader
import cv2
import os
from sklearn.model_selection import train_test_split

## Data Loader

In [2]:
video_rootdir="./ReCompress_Videos"
mask_rootdir="./puppet_mask"
pose_rootdir="./joint_positions"

video_pathes=[]
mask_pathes=[]
pose_pathes=[]
for root, dirs, files in os.walk(video_rootdir):
    for file in files:
        if file[0].startswith(".") or root.endswith('.AppleDouble'):
            continue
        video_pathes.append(os.path.join(root, file))

for root, dirs, files in os.walk(mask_rootdir):
    for file in files:
        if file[0].startswith(".") or root.endswith('.AppleDouble'):
            continue
        mask_pathes.append(os.path.join(root, file)) 

for root, dirs, files in os.walk(pose_rootdir):
    for file in files:
        if file[0].startswith(".") or root.endswith('.AppleDouble'):
            continue
        pose_pathes.append(os.path.join(root, file))         



In [3]:
from sklearn.model_selection import train_test_split

In [44]:
video_pathes_train, video_pathes_valid, mask_pathes_train, mask_pathes_valid, pose_pathes_train, pose_pathes_valid = \
    train_test_split(video_pathes, mask_pathes, pose_pathes, test_size=0.01)
class_names=[name for name in os.listdir(video_rootdir) if not name.startswith(".")]


In [158]:
class JHMDB(torch.utils.data.Dataset):
    def __init__(self, video_pathes, mask_pathes, pose_pathes, class_names):
        
        self.data = {'video': [], 'label': [], 'mask':[], 'pose':[], 'scale':[]}
        
        self.classdict = {}
        for i, x in enumerate(class_names):
            self.classdict[x] = i

        video_num=len(video_pathes)
        mask_num=len(mask_pathes)

        for i in range(video_num):
            video=[]
            cap = cv2.VideoCapture(video_pathes[i])
            has_frame=True
            while(has_frame):
                _, frame = cap.read()
                has_frame = frame is not None

                if has_frame:
                    video.append(frame)
            cap.release()
            self.data['video'].append(video)

            mask_mat = loadmat(mask_pathes[i]) 
            self.data['mask'].append(mask_mat['part_mask'])
            self.data['label'].append(video_pathes[i].split('/')[-2])
            pose_mat = loadmat(pose_pathes[i])['pos_img']
            scale = loadmat(pose_pathes[i])['scale']
            self.data['pose'].append(pose_mat)
            self.data['scale'].append(scale[0]) # redundant dim
            
            
    #def _compute_mean(self):
    #    meanstd_file = './data/jhmdbmean'
    #    if isfile(meanstd_file):
    #        meanstd = torch.load(meanstd_file)
    #    else:
    #        mean = torch.zeros(3)
    #        std = torch.zeros(3)
    #        for index in self.train:
    #            a = self.anno[index]
    #            img_path = os.path.join(self.img_folder, a['img_paths'])
    #            img = load_image(img_path) # CxHxW
    #            mean += img.view(img.size(0), -1).mean(1)
    #            std += img.view(img.size(0), -1).std(1)
    #        mean /= len(self.train)
    #        std /= len(self.train)
    #        meanstd = {
    #            'mean': mean,
    #            'std': std,
    #            }
    #        torch.save(meanstd, meanstd_file)
    #    if self.is_train:
    #        print('    Mean: %.4f, %.4f, %.4f' % (meanstd['mean'][0], meanstd['mean'][1], meanstd['mean'][2]))
    #        print('    Std:  %.4f, %.4f, %.4f' % (meanstd['std'][0], meanstd['std'][1], meanstd['std'][2]))
    #        
    #    return meanstd['mean'], meanstd['std']
    #
    def __getitem__(self, index):
        # scala, (240, 320, F), (2, 15, F), (F)
        # randomly select 15 consecutive frames (F = 15)
        frame_num = self.data['mask'][index].shape[2]
        F = 15
        start_frame = np.random.randint(0, high=frame_num-F+1)
        

        return torch.from_numpy(np.array(self.data['video'][index][start_frame:start_frame+F])), \
            torch.LongTensor([self.classdict[self.data['label'][index]]]), \
            torch.from_numpy(self.data['mask'][index][:,:,start_frame:start_frame+F]), \
            torch.from_numpy(self.data['pose'][index][:,:,start_frame:start_frame+F]), \
            torch.from_numpy(self.data['scale'][index][start_frame:start_frame+F])
        
    def __len__(self):
        return len(self.data['scale'])

In [24]:
train_dataset = JHMDB(video_pathes_train, mask_pathes_train, pose_pathes_train, class_names)


In [159]:
valid_dataset = JHMDB(video_pathes_valid, mask_pathes_valid, pose_pathes_valid, class_names)


In [147]:
i = 0
for x in valid_dataset:
    #print(x)
    #if i == 100:
    print(x)
    break
    

<class 'torch.ByteTensor'>
<class 'torch.ByteTensor'>
(
( 0 , 0 ,.,.) = 
    5   14    8
    5   14    8
    3   14    8
       ⋮       
    8   19   13
   13   24   18
   16   27   21

( 0 , 1 ,.,.) = 
    6   15    9
    6   15    9
    7   18   12
       ⋮       
   10   21   15
   15   26   20
   17   28   22

( 0 , 2 ,.,.) = 
    8   17   11
    9   18   12
   13   24   18
       ⋮       
   13   24   18
   16   27   21
   18   29   23
    ... 

( 0 ,237,.,.) = 
   22   36   21
   51   65   50
   79   95   80
       ⋮       
   18   30   22
   16   27   21
   28   39   33

( 0 ,238,.,.) = 
   17   26   14
   36   45   33
   55   69   56
       ⋮       
   26   35   29
   34   43   37
   60   69   63

( 0 ,239,.,.) = 
   13   22   10
   24   33   21
   36   50   37
       ⋮       
   20   29   23
   32   41   35
   66   75   69
      ⋮  

( 1 , 0 ,.,.) = 
    0   11    5
    6   17   11
   12   24   21
       ⋮       
   14   25   19
   15   26   20
   10   21   15

( 1 , 1 ,.,.) =

In [25]:
train_loader = torch.utils.data.DataLoader(train_dataset,
    batch_size=10, shuffle=True)
    #num_workers=2, pin_memory=True) 

In [160]:
valid_loader = torch.utils.data.DataLoader(valid_dataset,
    batch_size=5, shuffle=True)
    #num_workers=2, pin_memory=True)

In [161]:
for x in valid_loader:
    print(x)
    break

[
( 0 , 0 , 0 ,.,.) = 
  221  225  213
  221  225  213
  221  225  213
       ⋮       
  218  222  210
  218  222  210
  218  222  210

( 0 , 0 , 1 ,.,.) = 
  221  225  213
  221  225  213
  221  225  213
       ⋮       
  218  222  210
  218  222  210
  218  222  210

( 0 , 0 , 2 ,.,.) = 
  221  225  213
  221  225  213
  221  225  213
       ⋮       
  218  222  210
  218  222  210
  218  222  210
        ... 

( 0 , 0 ,237,.,.) = 
   47   90   63
   47   90   63
   48   91   64
       ⋮       
   33   78   53
   35   77   53
   35   77   53

( 0 , 0 ,238,.,.) = 
   45   88   61
   45   88   61
   45   88   61
       ⋮       
   32   77   52
   35   77   53
   35   77   53

( 0 , 0 ,239,.,.) = 
   44   87   60
   44   87   60
   44   87   60
       ⋮       
   32   77   52
   35   77   53
   35   77   53
          ⋮  

( 0 , 1 , 0 ,.,.) = 
  221  225  213
  221  225  213
  221  225  213
       ⋮       
  220  224  212
  220  224  212
  220  224  212

( 0 , 1 , 1 ,.,.) = 
  221  225  

## Model

In [0]:
class C3D(nn.Module):
    """
    The C3D network as described in [1].
    """

    def __init__(self):
        super(C3D, self).__init__()

        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))

        self.fc6 = nn.Linear(8192, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, 21)

        self.dropout = nn.Dropout(p=0.5)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

    def forward(self, x):

        h = self.relu(self.conv1(x))
        h = self.pool1(h)

        h = self.relu(self.conv2(h))
        h = self.pool2(h)

        h = self.relu(self.conv3a(h))
        h = self.relu(self.conv3b(h))
        h = self.pool3(h)

        h = self.relu(self.conv4a(h))
        h = self.relu(self.conv4b(h))
        h = self.pool4(h)

        h = self.relu(self.conv5a(h))
        h = self.relu(self.conv5b(h))
        h = self.pool5(h)

        h = h.view(-1, 8192)
        h = self.relu(self.fc6(h))
        h = self.dropout(h)
        h = self.relu(self.fc7(h))
        h = self.dropout(h)

        logits = self.fc8(h)

        return logits