## Load 3D CNN Model

In [1]:
import torch
from torch import nn
import torchvision

#This downloads the model from torchvision
model = torchvision.models.video.r3d_18(pretrained=True, progress=True)

print ('Total number of parameters: {}'.format(sum(p.numel() for p in model.parameters())))

Total number of parameters: 33371472


## Modify the Last Layer

In [2]:
import torch.optim as optim
# for name,param in model.named_parameters():
#     param.requires_grad = False
model.fc = nn.Linear(in_features=512, out_features=10, bias=True)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# model = nn.DataParallel(model,device_ids=[2,4])

## Set the data

In [3]:
# Generating random data for testing purposes
torch.manual_seed(6)
trainData = torch.rand(10, 3, 2, 4, 4)
trainLabels = torch.randint(0, 10, (10,))

testData = torch.rand(10, 3, 2, 4, 4)
testLabels = torch.randint(0, 10, (10,))

# trainset_loader = DataLoader(trainset, batch_size=64, shuffle=True, num_workers=1)
# testset_loader = DataLoader(testset, batch_size=64, shuffle=True, num_workers=1)

In [4]:
import torch.nn.functional as F

def train_model(x, y, model, epochs=5):
    model.train()
    for t in range(epochs):
        output = model(x)
        
        loss = F.cross_entropy(output, y)
        
        optimizer.zero_grad()
        loss.backward()
        print ('Epoch: {}, Loss {}'.format(t, loss))

        optimizer.step()

def check_accuracy(x, y, model):
    num_correct = 0
    num_samples = 0
    test_loss = 0
#     model.eval()  # set model to evaluation mode
    with torch.no_grad():
        scores = model(x)
        test_loss += F.cross_entropy(scores, y).item() # sum up batch loss
        _, preds = scores.max(1)
        num_correct += (preds == y).sum()
        num_samples += preds.size(0)
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f), Loss: (%.2f)' % 
              (num_correct, num_samples, 100 * acc, float(test_loss) / num_samples))

## Train (Finetune) the model

In [5]:
train_model(trainData, trainLabels, model)

Epoch: 0, Loss 2.751695394515991
Epoch: 1, Loss 2.0147290229797363
Epoch: 2, Loss 1.0442476272583008
Epoch: 3, Loss 0.44752272963523865
Epoch: 4, Loss 0.18156075477600098


## Test the model

In [6]:
check_accuracy(trainData, trainLabels, model)

Got 10 / 10 correct (100.00), Loss: (0.01)


## Data Loader

In [3]:
import torch.nn as nn            # containing various building blocks for your neural networks
import torch.optim as optim      # implementing various optimization algorithms
import torch.nn.functional as F  # a lower level (compared to torch.nn) interface

# torchvision: popular datasets, model architectures, and common image transformations for computer vision.
import torchvision
# transforms: transformations useful for image processing
import torchvision.transforms as transforms

from torch.utils.data import Dataset, DataLoader

import glob
import os.path as osp
import numpy as np
from PIL import Image

import os
import skvideo
from skvideo import io as vp

class Cichlids(Dataset):
    """
    A customized data loader for Cichlids.
    """
    def __init__(self,
                 root,
                 transform=None,
                 spatial_transform=None,
                 preload=False):
        """ Intialize the Cichlids dataset
        
        Args:
            - root: root directory of the dataset
            - tranform: a custom tranform function
            - preload: if preload the dataset into memory
        """
        self.videos = None
        self.labels = None
        self.filenames = []
        self.root = root
        self.transform = transform
        self.spatial_transform = spatial_transform

        # read filenames
        for i, class_dir in enumerate(os.listdir(root)):
#         for i in range(10):
            filenames = glob.glob(osp.join(root, class_dir, '*.mp4'))
            for fn in filenames:
                self.filenames.append((fn, i)) # (filename, label) pair
                
        # if preload dataset into memory
        if preload:
            self._preload()
            
        self.len = len(self.filenames)
                              
    def _preload(self):
        """
        Preload dataset to memory
        """
        self.labels = []
        self.images = []
        for image_fn, label in self.filenames:            
            # load images
            video = vp.vread(image_fn)
#             video = video / 255
            video = np.reshape(video, (video.shape[3], video.shape[0], video.shape[1], video.shape[2]))
            self.videos.append(video.copy())
            # avoid too many opened files bug
            # image.close()
            self.labels.append(label)

    # probably the most important to customize.
    def __getitem__(self, index):
        """ Get a sample from the dataset
        """
        if self.videos is not None:
            # If dataset is preloaded
            video = self.videos[index]
            label = self.labels[index]
        else:
            # If on-demand data loading
            video_fn, label = self.filenames[index]
            video = vp.vread(video_fn)
            video = np.reshape(video, (video.shape[3], video.shape[0], video.shape[1], video.shape[2]))
            
        # May use transform function to transform samples
        # e.g., random crop, whitening
#         if self.transform is not None:
#             clip = [self.transform(img) for img in video]
#         video = torch.stack(clip, 0).permute(0, 2, 1, 3)
        
        if self.spatial_transform is not None:
            self.spatial_transform.randomize_parameters()
            clip = [self.spatial_transform(img) for img in video]
            video = torch.stack(clip, 0).permute(0, 2, 1, 3)
        
        # return image and label
        return video, label

    def __len__(self):
        """
        Total number of samples in the dataset
        """
        return self.len

In [4]:
# transforms.ToTensor() automatically converts PIL images to
# torch tensors with range [0, 1]
from spatial_transforms import (
    Compose, Normalize, Scale, CenterCrop, CornerCrop, MultiScaleCornerCrop,
    MultiScaleRandomCrop, RandomHorizontalFlip, ToTensor)

# spatial_transform = Compose([
#             crop_method,
#             RandomHorizontalFlip(),
#             ToTensor(1), norm_method
#         ])

trainset = Cichlids(
    root='MLclips/training',
    preload=False, transform=transforms.ToTensor(),
)

# Use the torch dataloader to iterate through the dataset
# We want the dataset to be shuffled during training.
trainset_loader = DataLoader(trainset, batch_size=1, shuffle=True, num_workers=1)

# Load the testset
testset = Cichlids(
    root='MLclips/testing',
    preload=False, transform=transforms.ToTensor(),
)

# Use the torch dataloader to iterate through the dataset
testset_loader = DataLoader(testset, batch_size=10, shuffle=False, num_workers=1)

# Use GPU if available, otherwise stick with cpu
use_cuda = torch.cuda.is_available()
torch.manual_seed(123)
device1 = torch.device("cuda: 2" if use_cuda else "cpu")
device2 = torch.device("cuda: 4" if use_cuda else "cpu")

In [5]:
import torch.nn.functional as F
from time import time

def train_model(epochs=5, log_interval=1):
    iteration = 0
    torch.cuda.empty_cache()
#     model = nn.DataParallel(model)
    model.to(device1)
    model.train()
    for t in range(epochs):
        start = time()
        for batch_idx, (data, target) in enumerate(trainset_loader):
            
            data, target = data.to(device1), target.to(device1)
            
            data = data.float()

            output = model(data)
            
            loss = F.cross_entropy(output, target)
            optimizer.zero_grad()
            loss.backward()

            optimizer.step()
            
            if iteration % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    t, batch_idx * len(data), len(trainset_loader.dataset),
                    100. * batch_idx / len(trainset_loader), loss.item()))
            iteration += 1
            
            break
        end = time()
#         print('Time taken for this epoch: {:.2f}s'.format(end-start))
        check_accuracy(data, target, model) # evaluate at the end of epoch
    torch.cuda.empty_cache()

def check_accuracy(x, y, model):
    num_correct = 0
    num_samples = 0
    test_loss = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        scores = model(x)
        test_loss += F.cross_entropy(scores, y).item() # sum up batch loss
        _, preds = scores.max(1)
        num_correct += (preds == y).sum()
        num_samples += preds.size(0)
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f), Loss: (%.2f)' % 
              (num_correct, num_samples, 100 * acc, float(test_loss) / num_samples))

In [6]:
train_model()

Got 0 / 1 correct (0.00), Loss: (2.28)
Got 1 / 1 correct (100.00), Loss: (0.00)
Got 1 / 1 correct (100.00), Loss: (0.00)
Got 1 / 1 correct (100.00), Loss: (0.00)
Got 0 / 1 correct (0.00), Loss: (2.50)


In [2]:
import skvideo
from skvideo import io as vp

vdata = vp.vread('MLclips/training/b/1025_4473_3982_439_811.mp4')

In [8]:
for state in optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

In [16]:
for i in range(3):
    for idx, (data, labels) in enumerate(trainset_loader):
        print (labels)
        break

tensor([0, 0, 0])
tensor([0, 0, 0])
tensor([0, 0, 0])


In [18]:
vdata.shape

(120, 200, 200, 3)

## Sample Keras Model

In [256]:
import keras

from keras.layers import Conv3D, MaxPooling3D, Flatten, Dense, Dropout
from keras.models import Sequential

model = Sequential()

model.add(Conv3D(32, (3,3,3), activation='relu', input_shape=(120, 200, 200, 3)))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))
model.add(Conv3D(64, (3,3,3), activation='relu'))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))
model.add(Conv3D(128, (3,3,3), activation='relu'))
model.add(Conv3D(128, (3,3,3), activation='relu'))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))
model.add(Conv3D(256, (2,2,2), activation='relu'))
model.add(Conv3D(256, (2,2,2), activation='relu'))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))

model.add(Flatten())
model.add(Dense(1024))
model.add(Dropout(0.5))
model.add(Dense(1024))
model.add(Dropout(0.5))
model.add(Dense(10, activation='softmax'))

model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv3d_17 (Conv3D)           (None, 118, 198, 198, 32) 2624      
_________________________________________________________________
max_pooling3d_11 (MaxPooling (None, 118, 99, 99, 32)   0         
_________________________________________________________________
conv3d_18 (Conv3D)           (None, 116, 97, 97, 64)   55360     
_________________________________________________________________
max_pooling3d_12 (MaxPooling (None, 116, 48, 48, 64)   0         
_________________________________________________________________
conv3d_19 (Conv3D)           (None, 114, 46, 46, 128)  221312    
_________________________________________________________________
conv3d_20 (Conv3D)           (None, 112, 44, 44, 128)  442496    
_________________________________________________________________
max_pooling3d_13 (MaxPooling (None, 112, 22, 22, 128) 