In [None]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch
import time
import math
import random
import pickle
import sys
from timesformer_pytorch import TimeSformer
from collections import defaultdict
from sklearn.cluster import MiniBatchKMeans

In [9]:
ucfPath = 'D:/Files/Datasets/UCF-101'
ucfSplitNumber = 1
#ucfPath = 'UCF-101'
modelPath = 'model_1'
framesPerVideo = 8
maxVideoPerClass = 5
maxClasses = 98
valPerClass = 3
embeddingsSize = 256
batchSize = 4

In [3]:
class AdvancedTimeSformer(TimeSformer):
    def __init__(
        self,
        *,
        dim,
        num_frames,
        num_classes,
        image_width = 320,
        image_height = 240,
        patch_size = 16,
        channels = 3,
        depth = 12,
        heads = 8,
        dim_head = 64,
        attn_dropout = 0.,
        ff_dropout = 0.
    ):
        super().__init__(dim = dim, 
                         num_frames = num_frames, 
                         num_classes = num_classes, 
                         image_width = image_width, 
                         image_height = image_height,
                         patch_size = patch_size,
                         channels = channels, 
                         depth = depth,
                         heads = heads,
                         dim_head = dim_head,
                         attn_dropout = attn_dropout,
                         ff_dropout = ff_dropout)
        self.to_out = torch.nn.Sequential(
            torch.nn.LayerNorm(dim),
            torch.nn.Linear(dim, num_classes),
            #torch.nn.Softmax(num_classes)
        )

In [None]:
if __name__ == '__main__':
    if len(sys.argv) > 1:
        modelPath = sys.argv[1]
        embeddingsSize = int(sys.argv[2])

    print("%d %d %d %d %d" % (tsf_dim, tsf_patch_size, tsf_depth, tsf_heads, tsf_dim_head))

In [4]:
model = torch.load(modelPath)

  "type " + container_type.__name__ + ". It won't be checked "


In [5]:
model.eval()

DataParallel(
  (module): AdvancedTimeSformer(
    (to_patch_embedding): Linear(in_features=768, out_features=128, bias=True)
    (pos_emb): Embedding(2401, 128)
    (layers): ModuleList(
      (0): ModuleList(
        (0): PreNorm(
          (fn): Attention(
            (to_qkv): Linear(in_features=128, out_features=384, bias=False)
            (to_out): Sequential(
              (0): Linear(in_features=128, out_features=128, bias=True)
              (1): Dropout(p=0.1, inplace=False)
            )
          )
          (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        )
        (1): PreNorm(
          (fn): Attention(
            (to_qkv): Linear(in_features=128, out_features=384, bias=False)
            (to_out): Sequential(
              (0): Linear(in_features=128, out_features=128, bias=True)
              (1): Dropout(p=0.1, inplace=False)
            )
          )
          (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        )
        (2): 

In [6]:
class DataStorage():
    def __init__(self, ucfDataPath, framesPerVideo, ucfSplitNumber = 1, maxVideoPerClass = None, maxClasses = None):
        
        ucfFullSize = 13320
        self.trainLabelsNames = {}
        lastTrainIndex = 0
        lastTestIndex = 0
        
        self.trainDict = defaultdict(list)
        self.testDict = defaultdict(list)
        if maxClasses is None:
            self.trainData = np.zeros((ucfFullSize - 101 * valPerClass, framesPerVideo, 3, 240, 320), dtype=np.uint8)
            self.trainLabels = np.zeros((ucfFullSize - 101 * valPerClass), dtype=int)
            self.testData = np.zeros((101 * valPerClass, framesPerVideo, 3, 240, 320), dtype=np.uint8)
            self.testLabels = np.zeros((101 * valPerClass), dtype=int)
        else:
            if maxVideoPerClass is None:
                numberOfVideos = 0
                for k, classFolderName in enumerate(sorted(os.listdir(ucfPath))):
                    if k >= maxClasses:
                        break
                    numberOfVideos += len([name for name in os.listdir(os.path.join(ucfPath, classFolderName)) if os.path.isfile(os.path.join(ucfPath, classFolderName, name))])

                print('Number of training and validation videos: %d' % numberOfVideos)
                self.trainData = np.zeros((numberOfVideos - valPerClass * maxClasses, framesPerVideo, 3, 240, 320), dtype=np.uint8)
                self.trainLabels = np.zeros((numberOfVideos - valPerClass * maxClasses), dtype=int)
                self.testData = np.zeros((maxClasses * valPerClass, framesPerVideo, 3, 240, 320), dtype=np.uint8)
                self.testLabels = np.zeros((maxClasses * valPerClass), dtype=int)
            else:
                self.trainData = np.zeros((maxClasses * (maxVideoPerClass - valPerClass), framesPerVideo, 3, 240, 320), dtype=np.uint8)
                self.trainLabels = np.zeros((maxClasses * (maxVideoPerClass - valPerClass)), dtype=int)
                self.testData = np.zeros((maxClasses * valPerClass, framesPerVideo, 3, 240, 320), dtype=np.uint8)
                self.testLabels = np.zeros((maxClasses * valPerClass), dtype=int)                
        
        for k, classFolderName in enumerate(sorted(os.listdir(ucfPath))):
            if maxClasses is not None and k >= maxClasses:
                break
            
            print('Process class ' + classFolderName)
            self.trainLabelsNames[classFolderName] = k
            for i, videoName in enumerate(sorted(os.listdir(os.path.join(ucfPath, classFolderName)))):
                if maxVideoPerClass is not None and i >= maxVideoPerClass:
                    break

                if i < valPerClass:
                    self.testLabels[lastTestIndex] = k
                    self.testDict[k].append(lastTestIndex)
                else:
                    self.trainLabels[lastTrainIndex] = k
                    self.trainDict[k].append(lastTrainIndex)

                count = 0
                video = cv2.VideoCapture(os.path.join(ucfPath, classFolderName, videoName))
                numberOfFrames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

                for j in range(framesPerVideo):
                    video.set(cv2.CAP_PROP_POS_FRAMES, count)
                    success, image = video.read()
                    if success:
                        if image.shape != (240, 320, 3):
                            image = cv2.resize(image, (320, 240))
                        if i < valPerClass:
                            self.testData[lastTestIndex][j] = np.swapaxes(
                                                np.swapaxes(image, 
                                                    0, 2),
                                                1, 2)
                        else:
                            self.trainData[lastTrainIndex][j] = np.swapaxes(
                                                np.swapaxes(image, 
                                                    0, 2),
                                                1, 2)
                    count += numberOfFrames // framesPerVideo
                    
                if i < valPerClass:
                    lastTestIndex += 1
                else:
                    lastTrainIndex += 1
            
        assert lastTrainIndex == self.trainData.shape[0], "Error in train data length"
        assert lastTestIndex == self.testData.shape[0], "Error in test data length"

In [10]:
dataStorage = DataStorage(ucfPath, framesPerVideo, maxClasses=maxClasses)

Number of training and validation videos: 12910
Process class ApplyEyeMakeup
Process class ApplyLipstick
Process class Archery
Process class BabyCrawling
Process class BalanceBeam
Process class BandMarching
Process class BaseballPitch
Process class Basketball
Process class BasketballDunk
Process class BenchPress
Process class Biking
Process class Billiards
Process class BlowDryHair
Process class BlowingCandles
Process class BodyWeightSquats
Process class Bowling
Process class BoxingPunchingBag
Process class BoxingSpeedBag
Process class BreastStroke
Process class BrushingTeeth
Process class CleanAndJerk
Process class CliffDiving
Process class CricketBowling
Process class CricketShot
Process class CuttingInKitchen
Process class Diving
Process class Drumming
Process class Fencing
Process class FieldHockeyPenalty
Process class FloorGymnastics
Process class FrisbeeCatch
Process class FrontCrawl
Process class GolfSwing
Process class Haircut
Process class HammerThrow
Process class Hammering
P

KeyboardInterrupt: 

In [40]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
print(torch.cuda.device_count())

cuda
1


In [41]:
model = model.module
model.to(device)

AdvancedTimeSformer(
  (to_patch_embedding): Linear(in_features=768, out_features=128, bias=True)
  (pos_emb): Embedding(2401, 128)
  (layers): ModuleList(
    (0): ModuleList(
      (0): PreNorm(
        (fn): Attention(
          (to_qkv): Linear(in_features=128, out_features=384, bias=False)
          (to_out): Sequential(
            (0): Linear(in_features=128, out_features=128, bias=True)
            (1): Dropout(p=0.1, inplace=False)
          )
        )
        (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
      (1): PreNorm(
        (fn): Attention(
          (to_qkv): Linear(in_features=128, out_features=384, bias=False)
          (to_out): Sequential(
            (0): Linear(in_features=128, out_features=128, bias=True)
            (1): Dropout(p=0.1, inplace=False)
          )
        )
        (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
      (2): PreNorm(
        (fn): FeedForward(
          (net): Sequential(
           

In [42]:
trainEmbeddings = np.zeros((dataStorage.trainData.shape[0], embeddingsSize), dtype=np.float)
#testEmbeddings = np.zeros((dataStorage.testData.shape[0], embeddingsSize), dtype=np.float)

In [44]:
kmeans = MiniBatchKMeans(n_clusters=maxClasses)

indices = [i for i in range(len(dataStorage.trainData))]

for batchNumber in range(len(dataStorage.trainData) // batchSize):
    inputs = torch.tensor([dataStorage.trainData[i] for i in indices[batchNumber * batchSize : (batchNumber + 1) * batchSize]], dtype = torch.float32)
    inputs = inputs.to(device)

    outputs = model(inputs)
    outputs = outputs.cpu().detach().numpy()
    trainEmbeddings[batchNumber * batchSize : (batchNumber + 1) * batchSize] = outputs

kmeans.fit(trainEmbeddings, dataStorage.trainLabels)

correctPreds = 0
allPreds = 0

indices = [i for i in range(len(dataStorage.testData))]

for batchNumber in range(len(dataStorage.testData) // batchSize):
    inputs = torch.tensor([dataStorage.testData[i] for i in indices[batchNumber * batchSize : (batchNumber + 1) * batchSize]], dtype = torch.float)
    labels = [dataStorage.testLabels[i] for i in indices[batchNumber * batchSize : (batchNumber + 1) * batchSize]]
    inputs = inputs.to(device)

    outputs = model(inputs)
    outputs = outputs.cpu().detach().numpy()
    preds = kmeans.predict(outputs)
    allPreds += len(labels)
    correctPreds += len([i for i in range(len(labels)) if preds[i] == labels[i]])

print(correctPreds / allPreds)
pickle.dump(kmeans, open('KMeans', 'wb'))

0.7866666666666666
