In [1]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch
import time
import math
import random
from timesformer_pytorch import TimeSformer

In [2]:
ucfPath = 'D:/Files/Datasets/UCF-101'
framesPerVideo = 10
maxVideoPerClass = 5
numEpochs = 10
batchSize = 8

In [3]:
class AdvancedTimeSformer(TimeSformer):
    def __init__(
        self,
        *,
        dim,
        num_frames,
        num_classes,
        image_size = 320,
        patch_size = 16,
        channels = 3,
        depth = 12,
        heads = 8,
        dim_head = 64,
        attn_dropout = 0.,
        ff_dropout = 0.
    ):
        super().__init__(dim = dim, 
                         num_frames = num_frames, 
                         num_classes = num_classes, 
                         image_size = image_size, 
                         patch_size = patch_size,
                         channels = channels, 
                         depth = depth,
                         heads = heads,
                         dim_head = dim_head,
                         attn_dropout = attn_dropout,
                         ff_dropout = ff_dropout)
        self.to_out = torch.nn.Sequential(
            torch.nn.LayerNorm(dim),
            torch.nn.Linear(dim, num_classes),
            #torch.nn.Softmax(num_classes)
        )

In [4]:
model = AdvancedTimeSformer(
    dim = 64,
    image_size = 320,
    patch_size = 16,
    num_frames = framesPerVideo,
    num_classes = 101,
    depth = 4,
    heads = 4,
    dim_head = 16,
    attn_dropout = 0.1,
    ff_dropout = 0.1
)
print(model)

AdvancedTimeSformer(
  (to_patch_embedding): Linear(in_features=768, out_features=64, bias=True)
  (pos_emb): Embedding(4001, 64)
  (layers): ModuleList(
    (0): ModuleList(
      (0): PreNorm(
        (fn): Attention(
          (to_qkv): Linear(in_features=64, out_features=192, bias=False)
          (to_out): Sequential(
            (0): Linear(in_features=64, out_features=64, bias=True)
            (1): Dropout(p=0.1, inplace=False)
          )
        )
        (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (1): PreNorm(
        (fn): Attention(
          (to_qkv): Linear(in_features=64, out_features=192, bias=False)
          (to_out): Sequential(
            (0): Linear(in_features=64, out_features=64, bias=True)
            (1): Dropout(p=0.1, inplace=False)
          )
        )
        (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (2): PreNorm(
        (fn): FeedForward(
          (net): Sequential(
            (0): Line

In [5]:
trainData = []
trainLabels = []
trainLabelsNames = {}
testData = []
testLabels = []

for k, classFolderName in enumerate(sorted(os.listdir(ucfPath))):
    print('Process class ' + classFolderName)
    trainLabelsNames[classFolderName] = k
    for i, videoName in enumerate(sorted(os.listdir(os.path.join(ucfPath, classFolderName)))):
        if i >= maxVideoPerClass:
            break
                    
        if i == 0:
            testLabels.append(k)
            testData.append(np.zeros((framesPerVideo, 3, 240, 320), dtype = np.float32))
        else:
            trainLabels.append(k)
            trainData.append(np.zeros((framesPerVideo, 3, 240, 320), dtype = np.float32))

        count = 0
        video = cv2.VideoCapture(os.path.join(ucfPath, classFolderName, videoName))
        numberOfFrames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        
        for j in range(framesPerVideo):
            video.set(cv2.CAP_PROP_POS_FRAMES, count)
            success, image = video.read()
            if i == 0:
                testData[-1][j] = np.swapaxes(
                                       np.swapaxes(image, 
                                           0, 2),
                                       1, 2)
            else:
                trainData[-1][j] = np.swapaxes(
                                       np.swapaxes(image, 
                                           0, 2),
                                       1, 2)
            count += numberOfFrames // framesPerVideo

Process class ApplyEyeMakeup
Process class ApplyLipstick
Process class Archery
Process class BabyCrawling
Process class BalanceBeam
Process class BandMarching
Process class BaseballPitch
Process class Basketball
Process class BasketballDunk
Process class BenchPress
Process class Biking
Process class Billiards
Process class BlowDryHair
Process class BlowingCandles
Process class BodyWeightSquats
Process class Bowling
Process class BoxingPunchingBag
Process class BoxingSpeedBag
Process class BreastStroke
Process class BrushingTeeth
Process class CleanAndJerk
Process class CliffDiving
Process class CricketBowling
Process class CricketShot
Process class CuttingInKitchen
Process class Diving
Process class Drumming
Process class Fencing
Process class FieldHockeyPenalty
Process class FloorGymnastics
Process class FrisbeeCatch
Process class FrontCrawl
Process class GolfSwing
Process class Haircut
Process class HammerThrow
Process class Hammering
Process class HandstandPushups
Process class Hand

In [6]:
lossFunc = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [8]:
model.to(device)

AdvancedTimeSformer(
  (to_patch_embedding): Linear(in_features=768, out_features=64, bias=True)
  (pos_emb): Embedding(4001, 64)
  (layers): ModuleList(
    (0): ModuleList(
      (0): PreNorm(
        (fn): Attention(
          (to_qkv): Linear(in_features=64, out_features=192, bias=False)
          (to_out): Sequential(
            (0): Linear(in_features=64, out_features=64, bias=True)
            (1): Dropout(p=0.1, inplace=False)
          )
        )
        (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (1): PreNorm(
        (fn): Attention(
          (to_qkv): Linear(in_features=64, out_features=192, bias=False)
          (to_out): Sequential(
            (0): Linear(in_features=64, out_features=64, bias=True)
            (1): Dropout(p=0.1, inplace=False)
          )
        )
        (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (2): PreNorm(
        (fn): FeedForward(
          (net): Sequential(
            (0): Line

In [9]:
print('Learnable params: ' + str(sum(p.numel() for p in model.parameters() if p.requires_grad)))

for epoch in range(numEpochs):  # loop over the dataset multiple times
    start_time = time.time()
    indices = [i for i in range(len(trainData))]
    random.shuffle(indices)
    
    #Train
    train_loss = 0.0
    for batchNumber in range(len(trainData) // batchSize):
        inputs = torch.tensor([trainData[i] for i in indices[batchNumber * batchSize : (batchNumber + 1) * batchSize]]) / 255.0
        labels = torch.tensor([trainLabels[i] for i in indices[batchNumber * batchSize : (batchNumber + 1) * batchSize]])
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = lossFunc(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
    
    indices = [i for i in range(len(testData))]
    random.shuffle(indices)
    
    #Validation
    val_loss = 0.0
    for batchNumber in range(len(testData) // batchSize):
        inputs = torch.tensor([testData[i] for i in indices[batchNumber * batchSize : (batchNumber + 1) * batchSize]]) / 255.0
        labels = torch.tensor([testLabels[i] for i in indices[batchNumber * batchSize : (batchNumber + 1) * batchSize]])
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        outputs = model(inputs)
        loss = lossFunc(outputs, labels)
        
        val_loss += loss.item()
    
    print('Epoch: %d Train loss: %.3f Val loss: %.3f' %
          (epoch + 1, train_loss / len(trainData), val_loss / len(testData)))
    
    print("%s seconds for epoch" % (time.time() - start_time))

print('Finished Training')

Learnable params: 644069
Epoch: 1 Train loss: 0.607 Val loss: 0.555
119.85035634040833 seconds for epoch
Epoch: 2 Train loss: 0.586 Val loss: 0.546
120.83449792861938 seconds for epoch
Epoch: 3 Train loss: 0.568 Val loss: 0.508
119.89849901199341 seconds for epoch
Epoch: 4 Train loss: 0.536 Val loss: 0.493
122.3320004940033 seconds for epoch
Epoch: 5 Train loss: 0.519 Val loss: 0.480
120.68999934196472 seconds for epoch
Epoch: 6 Train loss: 0.500 Val loss: 0.464
120.19100046157837 seconds for epoch
Epoch: 7 Train loss: 0.476 Val loss: 0.440
120.33250045776367 seconds for epoch
Epoch: 8 Train loss: 0.452 Val loss: 0.420
120.9659993648529 seconds for epoch
Epoch: 9 Train loss: 0.418 Val loss: 0.388
120.12649965286255 seconds for epoch
Epoch: 10 Train loss: 0.372 Val loss: 0.390
120.2930006980896 seconds for epoch
Finished Training
