# Action Detection

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torchvision
import numpy as np
from scipy.io import loadmat
from random import shuffle
from torch.utils.data import Dataset, DataLoader
import cv2
import os
from sklearn.model_selection import train_test_split

## Data Loader

In [2]:
video_rootdir="./ReCompress_Videos"
mask_rootdir="./puppet_mask"
pose_rootdir="./joint_positions"

video_pathes=[]
mask_pathes=[]
pose_pathes=[]
for root, dirs, files in os.walk(video_rootdir):
    for file in files:
        if file[0].startswith(".") or root.endswith('.AppleDouble'):
            continue
        video_pathes.append(os.path.join(root, file))

for root, dirs, files in os.walk(mask_rootdir):
    for file in files:
        if file[0].startswith(".") or root.endswith('.AppleDouble'):
            continue
        mask_pathes.append(os.path.join(root, file)) 

for root, dirs, files in os.walk(pose_rootdir):
    for file in files:
        if file[0].startswith(".") or root.endswith('.AppleDouble'):
            continue
        pose_pathes.append(os.path.join(root, file))         



In [3]:
from sklearn.model_selection import train_test_split

In [44]:
video_pathes_train, video_pathes_valid, mask_pathes_train, mask_pathes_valid, pose_pathes_train, pose_pathes_valid = \
    train_test_split(video_pathes, mask_pathes, pose_pathes, test_size=0.01)
class_names=[name for name in os.listdir(video_rootdir) if not name.startswith(".")]


In [407]:
class JHMDB(torch.utils.data.Dataset):
    def __init__(self, video_pathes, mask_pathes, pose_pathes, class_names):
        
        self.data = {'video': [], 'label': [], 'mask':[], 'pose':[], 'scale':[]}
        
        self.classdict = {}
        for i, x in enumerate(class_names):
            self.classdict[x] = i

        video_num=len(video_pathes)
        mask_num=len(mask_pathes)

        for i in range(video_num):
            video=[]
            cap = cv2.VideoCapture(video_pathes[i])
            has_frame=True
            while(has_frame):
                _, frame = cap.read()
                has_frame = frame is not None

                if has_frame:
                    frame = cv2.resize(frame, (112, 112), interpolation = cv2.INTER_CUBIC) # (112, 112, 3)
                    #frame = np.transpose(frame, (2, 0, 1)) # (3, 112, 112)
                    video.append(frame)
            cap.release()
            self.data['video'].append(video)

            mask_mat = loadmat(mask_pathes[i]) 
            masks = cv2.resize(mask_mat['part_mask'], (112, 112), interpolation = cv2.INTER_CUBIC) # (112, 112, F)
            self.data['mask'].append(masks)
            self.data['label'].append(video_pathes[i].split('/')[-2])
            pose_mat = loadmat(pose_pathes[i])['pos_img']
            scale = loadmat(pose_pathes[i])['scale']
            self.data['pose'].append(pose_mat)
            self.data['scale'].append(scale[0]) # redundant dim
            
            
    def _compute_mean(self):
        meanstd_file = './data/jhmdbmean'
        if isfile(meanstd_file):
            meanstd = torch.load(meanstd_file)
        else:
            mean = torch.zeros(3)
            std = torch.zeros(3)
            for videos in self.data['video']:
                for img in videos:
                    # HxWxC by now
                    # TODO: to be changed
                    mean += img.view(img.size(0), -1).mean(1)
                    std += img.view(img.size(0), -1).std(1)
            mean /= len(self.train)
            std /= len(self.train)
            meanstd = {
                'mean': mean,
                'std': std,
                }
            torch.save(meanstd, meanstd_file)
        if self.is_train:
            print('    Mean: %.4f, %.4f, %.4f' % (meanstd['mean'][0], meanstd['mean'][1], meanstd['mean'][2]))
            print('    Std:  %.4f, %.4f, %.4f' % (meanstd['std'][0], meanstd['std'][1], meanstd['std'][2]))
            
        return meanstd['mean'], meanstd['std']
    
    def __getitem__(self, index):
        # video (F, C, 112, 112) to be reshaped on the fly
        # label scala, 
        # mask (112, 112, F)
        # pose (2, 15, F)
        # scale (F)
        # randomly select 15 consecutive frames (F = 15)
        frame_num = self.data['mask'][index].shape[2]
        F = 15
        start_frame = np.random.randint(0, high=frame_num-F+1)
        
        # change pose position according to resize
        pose_data = torch.from_numpy(self.data['pose'][index][:,:,start_frame:start_frame+F].astype('float'))
        pose_data[0,:,:] = pose_data[0,:,:] * 112 / 240
        pose_data[1,:,:] = pose_data[1,:,:] * 112 / 320
        
        return torch.from_numpy(np.array(self.data['video'][index][start_frame:start_frame+F])).int(), \
            torch.LongTensor([self.classdict[self.data['label'][index]]]), \
            torch.from_numpy(self.data['mask'][index][:,:,start_frame:start_frame+F].astype('float')), \
            pose_data, \
            torch.from_numpy(self.data['scale'][index][start_frame:start_frame+F].astype('float'))
        
    def __len__(self):
        return len(self.data['scale'])

In [164]:
len(valid_dataset.data['video'][0])

20

In [24]:
train_dataset = JHMDB(video_pathes_train, mask_pathes_train, pose_pathes_train, class_names)


In [408]:
valid_dataset = JHMDB(video_pathes_valid, mask_pathes_valid, pose_pathes_valid, class_names)


In [404]:
i = 0
for x in valid_dataset:
    #print(x)
    #if i == 100:
    #print(x)
    break
    

pose0 
(0 ,.,.) = 

Columns 0 to 6 
    73.2988   73.1375   75.1579   75.1575   85.1226   85.1210   81.2326
   85.9737   82.8679   87.3984   87.3984   85.7549   85.7581   76.3919
   78.9094   78.9198   77.2249   77.2266   80.0737   80.0678   74.1840
   89.2000   89.7714   89.7714   89.7714  100.6286  100.6286   99.4857
   52.5143   52.5143   52.5143   52.5143   59.3714   64.5143   64.5143
  100.9143  100.9143  103.2000  103.2000  103.2000  100.3429   93.4858
   77.0857   79.3711   81.6572   81.6572   81.0857   81.0857   73.0857
   79.0161   71.7272   70.5931   70.5938   99.8437   99.8411  115.2831
   35.4909   32.6325   32.5019   32.4997   39.5206   52.0288   55.4611
  108.5010  114.2172  121.6837  121.6854  127.5623  128.8135  122.5601
   59.1240   60.3645   59.3239   59.3240   65.2265   68.6559   73.5048
   36.5838   35.3158   32.9698   32.9401   68.6881   82.3877   89.8667
   33.0639   30.7856   24.8418   24.8338   69.0412   76.1817   87.1440
  133.4709  135.1547  136.1002  136.0993

In [25]:
train_loader = torch.utils.data.DataLoader(train_dataset,
    batch_size=10, shuffle=True)
    #num_workers=2, pin_memory=True) 

In [409]:
valid_loader = torch.utils.data.DataLoader(valid_dataset,
    batch_size=5, shuffle=True)
    #num_workers=2, pin_memory=True)

In [410]:
for x in valid_loader:
    # transpose 
    print(x[0].transpose(4,3).transpose(3,2).transpose(2,1).size())
    for t in x[1:]:
        print(t.size())
    
    break

torch.Size([5, 3, 15, 112, 112])
torch.Size([5, 1])
torch.Size([5, 112, 112, 15])
torch.Size([5, 2, 15, 15])
torch.Size([5, 15])


## Model

In [0]:
class C3D(nn.Module):
    """
    The C3D network as described in [1].
    """

    def __init__(self):
        super(C3D, self).__init__()

        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))

        self.fc6 = nn.Linear(8192, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, 21)

        self.dropout = nn.Dropout(p=0.5)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

    def forward(self, x):

        h = self.relu(self.conv1(x))
        h = self.pool1(h)

        h = self.relu(self.conv2(h))
        h = self.pool2(h)

        h = self.relu(self.conv3a(h))
        h = self.relu(self.conv3b(h))
        h = self.pool3(h)

        h = self.relu(self.conv4a(h))
        h = self.relu(self.conv4b(h))
        h = self.pool4(h)

        h = self.relu(self.conv5a(h))
        h = self.relu(self.conv5b(h))
        h = self.pool5(h)

        h = h.view(-1, 8192)
        h = self.relu(self.fc6(h))
        h = self.dropout(h)
        h = self.relu(self.fc7(h))
        h = self.dropout(h)

        logits = self.fc8(h)

        return logits