In [7]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
import torch.utils.data as data
# import torchvision
# import torchvision.models as models
# from torch.autograd import Variable

import matplotlib.pyplot as plt
import pickle

from functions import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from model import FineTuneModel, resnet152FineTune

In [13]:
## set path ##
data_path = "./ucf_image/"    # define UCF-101 spatial data path
action_name_path = "./UCF101actions.pkl"  # load preprocessed action names
save_model_path = "./record/"  # save Pytorch models

## gpu setting ##
GPU_NUM = 1
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device) # change allocation of current GPU
print('Current cuda device ', torch.cuda.current_device())


## training parameters ##

epochs = 10
batch_size = 3
learning_rate = 1e-4
log_interval = 10
img_x, img_y = 240, 320  # resize video 2d frame size

## model parameter ##
dim = 25
k = 101                 # number of target category

## Loader parameter ##
params = {'batch_size': batch_size, 'shuffle': True}

Current cuda device  1


In [14]:
## load UCF101 actions names ##
with open(action_name_path, 'rb') as f:
    action_names = pickle.load(f)   # load UCF101 actions names    
action_names[58] = 'HandStandPushups' # fix the label name to match the folder name

## convert labels -> category ##
le = LabelEncoder()
le.fit(action_names)
action_category = le.transform(action_names).reshape(-1, 1)
enc = OneHotEncoder()
enc.fit(action_category)
actions = []
fnames = os.listdir(data_path)
all_names = []
for f in fnames:
    loc1 = f.find('v_')
    loc2 = f.find('_g')
    actions.append(f[(loc1 + 2): loc2]) # 저장되어 있는 파일 순서대로 labeling

    all_names.append(f)
    
## list all data files ##
all_X_list = all_names              # all video file names
all_y_list = labels2cat(le, actions)    # all video labels

## train, test split ##
train_list, test_list, train_label, test_label = train_test_split(all_X_list, all_y_list, test_size=0.25, random_state=42)

## total 25 frames (min size) ##
begin_frame, end_frame, skip_frame = 1, 26, 1 
selected_frames = np.arange(begin_frame, end_frame, skip_frame).tolist()
print("The total Number of selected frame is", len(selected_frames))


## image transformation ##
transform = transforms.Compose([transforms.Resize([img_x, img_y]),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.5], std=[0.5])])

## UCF101 30 images dataset ##
train_set, valid_set = Dataset_3DCNN(data_path, train_list, train_label, selected_frames, transform=transform), \
                       Dataset_3DCNN(data_path, test_list, test_label, selected_frames, transform=transform)


train_loader = data.DataLoader(train_set, **params)
valid_loader = data.DataLoader(valid_set, **params)

The total Number of selected frame is 25


In [15]:
def train(log_interval, model, device, train_loader, optimizer, epoch):
    # set model as training mode
    model.train()

    losses = []
    scores = []
    N_count = 0   # counting total trained sample in one epoch
    model.train()
    for epoch in range(epochs):
        for batch_idx, (Xs, y) in enumerate(train_loader):
            y = y.to(device).view(-1, )
            N_count += Xs.size(0)

            output = []
            for i, X in enumerate(Xs):
                X = X.to(device)
                output.append(model(X))  
                
            output = torch.stack(output)  # output size = (batch, number of classes)

            optimizer.zero_grad()
            loss = F.cross_entropy(output, y)
            losses.append(loss.item())

            # to compute accuracy
            y_pred = torch.max(output, 1)[1]  # y_pred != output
            step_score = accuracy_score(y.cpu().data.squeeze().numpy(), y_pred.cpu().data.squeeze().numpy())
            scores.append(step_score)         # computed on CPU

            loss.backward()
            optimizer.step()

            # show information
            if (batch_idx + 1) % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, Accuracy: {:.2f}%'.format(
                    epoch + 1, N_count, len(train_loader.dataset), 100. * (batch_idx + 1) / len(train_loader), loss.item(), 100 * step_score))

    return losses, scores

def validation(model, device, optimizer, test_loader):
    # set model as testing mode
    model.eval()

    test_loss = 0
    all_y = []
    all_y_pred = []
    with torch.no_grad():
        for i, (Xs, y) in enumerate(test_loader):
            # distribute data to device
            y =  y.to(device).view(-1, )
            
            output = []
            for i, X in enumerate(Xs):
                X = X.to(device)
                output.append(model(X))  

            output = torch.stack(output)  # output size = (batch, number of classes)


            loss = F.cross_entropy(output, y, reduction='sum')
            test_loss += loss.item()                 # sum up batch loss
            y_pred = output.max(1, keepdim=True)[1]  # (y_pred != output) get the index of the max log-probability

            # collect all y and y_pred in all batches
            all_y.extend(y)
            all_y_pred.extend(y_pred)

    test_loss /= len(test_loader.dataset)

    # to compute accuracy
    all_y = torch.stack(all_y, dim=0)
    all_y_pred = torch.stack(all_y_pred, dim=0)
    test_score = accuracy_score(all_y.cpu().data.squeeze().numpy(), all_y_pred.cpu().data.squeeze().numpy())

    # show information
    print('\nTest set ({:d} samples): Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(len(all_y), test_loss, 100* test_score))
    
    # save Pytorch models of best record
    torch.save({
        'model': model.state_dict(),
        'optimizer':optimizer.state_dict()
    }, './record/FT_{}_epoch_{}.pth'.format(dim, epoch + 1))
    
    print("Epoch {} model saved!".format(epoch + 1))


    return test_loss, test_score

In [16]:
model = resnet152FineTune(dim, k).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)   # optimize all cnn parameters


# record training process
epoch_train_losses = []
epoch_train_scores = []
epoch_test_losses = []
epoch_test_scores = []


for epoch in range(epochs):
    # train, test model
    train_losses, train_scores = train(log_interval, model, device, train_loader, optimizer, epoch)
    epoch_test_loss, epoch_test_score = validation(model, device, optimizer, valid_loader)
    
    # save results
    epoch_train_losses.append(train_losses)
    epoch_train_scores.append(train_scores)
    epoch_test_losses.append(epoch_test_loss)
    epoch_test_scores.append(epoch_test_score)

    # save all train test results
    A = np.array(epoch_train_losses)
    B = np.array(epoch_train_scores)
    C = np.array(epoch_test_losses)
    D = np.array(epoch_test_scores)
    np.save('./record/FTmodel_epoch_training_losses.npy', A)
    np.save('./record/FTmodel_epoch_training_scores.npy', B)
    np.save('./record/FTmodel_epoch_test_loss.npy', C)
    np.save('./record/FTmodel_epoch_test_score.npy', D)

    # plot
fig = plt.figure(figsize=(10, 4))
plt.subplot(121)
plt.plot(np.arange(1, epochs + 1), A[:, -1])  # train loss (on epoch end)
plt.plot(np.arange(1, epochs + 1), C)         #  test loss (on epoch end)
plt.title("model loss")
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc="upper left")
# 2nd figure
plt.subplot(122)
plt.plot(np.arange(1, epochs + 1), B[:, -1])  # train accuracy (on epoch end)
plt.plot(np.arange(1, epochs + 1), D)         #  test accuracy (on epoch end)
# plt.plot(histories.losses_val)
plt.title("training scores")
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc="upper left")
title = "./record/fig_UCF101_FT.png"
plt.savefig(title, dpi=600)
# plt.close(fig)
plt.show()

Get Pre-trained Model: resnet


RuntimeError: CUDA error: out of memory

In [42]:
class ResCNNEncoder(nn.Module):
    def __init__(self, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(ResCNNEncoder, self).__init__()

        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]      # delete the last fc layer.
        self.resnet = nn.Sequential(*modules)
        self.fc1 = nn.Linear(resnet.fc.in_features, fc_hidden1)
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)
        
    def forward(self, x_3d):
        cnn_embed_seq = []
        for t in range(x_3d.size(1)):
            # ResNet CNN
            with torch.no_grad():
                x = self.resnet(x_3d[:, t, :, :, :])  # ResNet
                x = x.view(x.size(0), -1)             # flatten output of conv

            # FC layers
            x = self.bn1(self.fc1(x))
            x = F.relu(x)
            x = self.bn2(self.fc2(x))
            x = F.relu(x)
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)

            cnn_embed_seq.append(x)

        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)

        return cnn_embed_seq

In [44]:
model = ResCNNEncoder()
y = model(x[0])

IndexError: too many indices for tensor of dimension 3

In [41]:
x.shape

torch.Size([25, 3, 240, 320])

In [16]:
import torch
import numpy as np

x = np.random.rand(25, 3, 240, 320)
x = torch.from_numpy(x).float()
x.shape

torch.Size([25, 3, 240, 320])

In [32]:
import torch.nn as nn
model.fc = nn.Linear(in_features=2048, out_features=1000, bias=True)

Linear(in_features=2048, out_features=1000, bias=True)

In [30]:
# model = get_pretrained_model('resnet')
y = model(x)


In [31]:
y.shape

torch.Size([25, 1000])

In [33]:
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 