<a href="https://colab.research.google.com/github/ShreeyashGo/ADifferentVideoVisionTransformer/blob/main/VideoViTsModified.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downloading the dataset
I have implemented the modified Video Vision Transformer on two classes taken from the Kinetics dataset. I used an existing downloader and downloaded about 150 videos from each class. Downloading even that many videos takes a considerable amount of time, so I have uploaded the data to a Drive folder, which you may link into the Colab notebook in order to skip this section.

The Drive: [drive](https://drive.google.com/drive/u/0/folders/1-1BGLNghpKtHQ0AdfsNyWThzy2HnEpBq)

In [None]:
!git clone 'https://github.com/Showmax/kinetics-downloader.git'
%cd kinetics-downloader

In [None]:
!pip install --upgrade youtube-dl
!pip install --upgrade ffmpeg

In [None]:
!python list_classes.py

In [None]:
import json 
import os


# Trim the downloader's training index down to the two basketball classes,
# keeping at most `number_of_vids_perclass` entries per class, and write the
# trimmed index back over the original file.
number_of_vids_perclass = 161
train_index_path = '/content/kinetics-downloader/resources/kinetics_train.json'

# Read inside a context manager so the handle is closed before the same file
# is reopened for writing below.
with open(train_index_path) as f:
    traindat = json.load(f)

dicti = {}
dunkCount = 0
dribbleCount = 0
for video_id, entry in traindat.items():
    video_label = entry['annotations']['label']
    if video_label == 'dunking basketball' and dunkCount < number_of_vids_perclass:
        dicti[video_id] = entry
        dunkCount += 1
    elif video_label == 'dribbling basketball' and dribbleCount < number_of_vids_perclass:
        dicti[video_id] = entry
        dribbleCount += 1
print(dunkCount, dribbleCount)
json_train = json.dumps(dicti, indent = 4)

with open(train_index_path, "w") as outfile:
    outfile.write(json_train)


In [None]:
# The following command takes a long time (almost 3 hours)
!python download.py --classes 'dribbling basketball' 'dunking basketball' --num-workers 20


In [None]:
print(len(os.listdir('/content/kinetics-downloader/dataset/train/dribbling_basketball')))
print(len(os.listdir('/content/kinetics-downloader/dataset/train/dunking_basketball')))

In [None]:
# Rewrite the training index so that it only lists the videos that are not
# yet present in the download folders.
train_index_path = '/content/kinetics-downloader/resources/kinetics_train.json'
train_dir = '/content/kinetics-downloader/dataset/train'

# Read inside a context manager so the handle is closed before the same file
# is reopened for writing below.
with open(train_index_path) as f:
    traindat = json.load(f)

# List each folder once; the original re-listed the directories for every entry.
dribbling_files = set(os.listdir(f'{train_dir}/dribbling_basketball'))
dunking_files = set(os.listdir(f'{train_dir}/dunking_basketball'))

dicti = {}
dunkCount = 0
dribbleCount = 0
for video_id, entry in traindat.items():
    # NOTE(review): dribbling files are matched on the full id, dunking files
    # on the id without its first character. This asymmetry is kept exactly
    # as in the original - confirm it matches the downloader's file naming.
    if video_id + '.mp4' in dribbling_files:
        continue
    if video_id[1:] + '.mp4' in dunking_files:
        continue

    # Everything that reaches this point is still missing on disk.
    dicti[video_id] = entry
    if entry['annotations']['label'] == 'dunking basketball':
        dunkCount += 1
    else:
        # As in the original, any label other than dunking is counted here.
        dribbleCount += 1
print(dunkCount, dribbleCount)
json_train = json.dumps(dicti, indent = 4)

with open(train_index_path, "w") as outfile:
    outfile.write(json_train)


In [None]:
!python videos_to_frames.py --classes 'dribbling basketball' 'dunking basketball' --num-workers 20

# The Implementation




In [1]:
from google.colab import drive
import os
from PIL import Image
import numpy as np
import tensorflow as tf
import keras
from keras import layers


In [None]:
# Mount Google Drive under /content/gdrive; the dataset path below lives inside it.
drive.mount('/content/gdrive')
# If you are using the shared Drive link, add a shortcut to that folder in your
# "MyDrive" folder; otherwise you will have to change the path below.
# If you are downloading the data again, change the path so that it does not
# clash with the code further down.
from_path='/content/gdrive/MyDrive/KineticsBasketBall'


In [3]:
# Find the smallest frame width and height in the dataset so that a common frame size can be chosen.
# If you are using the data from the Drive folder there is no need to run this cell again, as the smallest size has already been found.
# If you are using any other data, please uncomment the code below.


# os.chdir(from_path)
# for classes in os.listdir('./'):
#     path2vids = f'./{classes}'
#     mindim = []
#     for videos in os.listdir(path2vids):
#         minx = 1080
#         miny = 1080
        
#         path2imgs = path2vids +f'/{videos}'
#         for images in os.listdir(path2imgs):
#             finpath = path2imgs+f'/{images}'
#             with Image.open(finpath) as im:
#                 if minx>=im.size[0]:
#                     minx = im.size[0]
#                 if miny>=im.size[1]:
#                     miny = im.size[1]
#         mindim.append([minx, miny])

# mindim = np.array(mindim)
# print(min(mindim[:,0]), min(mindim[:,1]))


In [4]:
class Embedder(layers.Layer):
    """Embed every frame of a clip and add a learned positional term.

    Each frame is flattened, passed through two ReLU dense layers
    (2048 units, then ``embeddims`` units) and summed with a learned
    positional value for its index in the clip.

    Args:
        frame_num: number of frames in one clip. Positions
            ``0 .. frame_num - 1`` are looked up on every call.
        embeddims: size of the per-frame embedding vector.
    """

    def __init__(self, frame_num, embeddims):
        super().__init__()
        self.framenum = frame_num
        self.flat = layers.Flatten()
        self.linear1 = layers.Dense(2048, activation='relu')
        self.linear2 = layers.Dense(embeddims, activation = 'relu')
        # Size the positional table from the constructor argument. The
        # original read the notebook-level global `numframes` here, which
        # silently tied the layer to a variable defined in another cell.
        self.posdims = layers.Embedding(input_dim=frame_num, output_dim = 1)

    def call(self, batch):
        """Return the frame embeddings with the positional term added.

        Args:
            batch: tensor of frames.
                NOTE(review): assumes the leading axis holds exactly
                ``frame_num`` frames so the positional values line up -
                confirm against the caller.
        """
        position = tf.range(start = 0, limit = self.framenum, delta = 1)
        embedding = self.flat(batch)
        embedding = self.linear1(embedding)
        embedding = self.linear2(embedding)
        # One learned scalar per frame position (output_dim is 1).
        posembeds = self.posdims(position)
        embedding = layers.Add()([embedding, posembeds])
        return embedding

In [5]:
class TransformerBlock(tf.keras.layers.Layer):
    """Two self-attention passes followed by a feed-forward stage.

    Both attention passes go through the same ``MultiHeadAttention``,
    ``layernorm1`` and ``dropout1`` instances, so they share weights.
    ``layernorm2`` is likewise applied both before the feed-forward network
    and to the final output.

    Args:
        embed_dim: length of each incoming embedding vector; also used as
            the attention ``key_dim`` and the feed-forward output width.
        num_heads: number of attention heads.
        ff_dim: width of the hidden feed-forward layer.
        dropout: dropout rate used inside the attention layer and after the
            attention and feed-forward stages.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        # Pass the `dropout` argument through; the original hard-coded 0.1
        # here, so the attention layer ignored a caller-supplied rate.
        self.multiheadSelf = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim, dropout=dropout)
        self.ffn = tf.keras.Sequential(
            [layers.Dense(ff_dim, activation = 'sigmoid'),
             layers.Dense(embed_dim, activation = 'sigmoid'),]
        )
        self.embedDIM = embed_dim
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout)
        self.dropout2 = layers.Dropout(dropout)

    def call(self, inputs):
        """Apply the block to ``inputs`` and return the normalised result.

        Args:
            inputs: tensor that can be reshaped to ``(embed_dim, 1)`` per
                sample.
        """
        # Turn each embedding vector into a sequence of `embed_dim` tokens
        # of width 1, so attention runs across the embedding components.
        # NOTE(review): layernorm1 then normalises an axis of size 1 (if the
        # default last-axis setting applies) - confirm this is intended.
        inputs = layers.Reshape(target_shape = (self.embedDIM, 1))(inputs)

        # First attention pass with a residual connection.
        x1 = self.layernorm1(inputs)
        attention_op1 = self.multiheadSelf(x1, x1)
        attention_op1 = self.dropout1(attention_op1)
        spatioOP = layers.Add()([attention_op1, inputs])

        # Second attention pass; reuses the same sub-layers as the first.
        x2 = self.layernorm1(spatioOP)
        attention_op2 = self.multiheadSelf(x2, x2)
        attention_op2 = self.dropout1(attention_op2)
        tempoOP = layers.Add()([attention_op2, spatioOP])

        # Feed-forward stage on the flattened tokens; the residual is taken
        # from the normalised input.
        ff_inp = layers.Flatten()(tempoOP)
        ff_inp = self.layernorm2(ff_inp)
        ff_op = self.ffn(ff_inp)
        ff_op = self.dropout2(ff_op)
        ff_op = layers.Add()([ff_inp, ff_op])
        return self.layernorm2(ff_op)
    


In [6]:
## Author's note: a minimum size of 256, 256, 3 can be extracted for all the images.
## Collect, per class and per video, the frame paths in frame order
## (`finalclass`) together with the number of frames in each video (`framenum`).

finalclass = []
framenum = []
for classes in os.listdir(from_path):
    # Weight files (.npy) are saved into the same folder; they are not classes.
    if '.npy' in classes:
        continue
    class_dir = f'{from_path}/{classes}'
    final = []
    midframe = []
    for video in os.listdir(class_dir):
        video_dir = f'{class_dir}/{video}'
        frame_names = np.asarray(os.listdir(video_dir))
        midframe.append(len(frame_names))
        # The frame index is the integer between a 5-character prefix and a
        # 4-character suffix of the file name.
        # NOTE(review): assumes every frame file follows that naming - confirm.
        frame_index = np.array([name[5:-4] for name in frame_names])
        frame_index = frame_index.astype(int)
        # Sort numerically by frame index, with the file name as tie-breaker.
        order = np.lexsort((frame_names, frame_index))
        final.append([f'{video_dir}/{frame_names[k]}' for k in order])
    finalclass.append(final)
    framenum.append(midframe)

In [7]:
# Cut every video into consecutive, non-overlapping clips of `numframes`
# frame paths; each clip is fed to the transformer as one batch.
embedDim = 512
numframes = 32


batcheddata = []
for class_videos, class_frame_counts in zip(finalclass, framenum):
    classbatch = []
    for frame_paths, frame_count in zip(class_videos, class_frame_counts):
        # The `+ 1` keeps the final clip when the frame count is an exact
        # multiple of `numframes`; the original `<` comparison dropped it.
        # Leftover frames that do not fill a whole clip are discarded.
        for start in range(0, frame_count - numframes + 1, numframes):
            classbatch.append(frame_paths[start:start + numframes])
    batcheddata.append(classbatch)

# Shuffle the clips of each class independently, in place.
for classbatch in batcheddata:
    np.random.shuffle(classbatch)

In [8]:
# Assemble the classifier. One clip is presented as a fixed-size batch of
# `numframes` frames, each 128 x 128 x 3.
inp = layers.Input(shape = (128, 128, 3), batch_size = numframes)
embeddings = Embedder(numframes, embedDim)(inp)
transformed = TransformerBlock(embedDim, num_heads = 3, ff_dim = 128)(embeddings)

# Collapse the outputs of all frames into a single row so that the head below
# produces one prediction per clip rather than one per frame.
transformed = tf.reshape(transformed, (1, -1))
# MLP head: two GELU layers followed by a single sigmoid unit.
mlpLevel1 = layers.Dense(64, activation = tf.keras.activations.gelu)(transformed)
mlpLevel2 = layers.Dense(32, activation = tf.keras.activations.gelu)(mlpLevel1)
op = layers.Dense(1, activation = 'sigmoid')(mlpLevel2)

model = keras.models.Model(inp, op)

In [None]:
model.summary()

# keras.utils.vis_utils.plot_model(model, to_file= 'model_plot.png', show_shapes = True, show_layer_names = True)

In [None]:
# Hand-written training loop: one clip (`numframes` frames) per optimiser step.
# Training takes a really long time because every step loads and resizes a
# whole clip of images from disk.
# NOTE(review): the names assume batcheddata[0] holds the dunking clips and
# batcheddata[1] the dribbling clips; the order comes from os.listdir - confirm.
num_epochs = 1
lenDunking = len(batcheddata[0])
lenDribbling = len(batcheddata[1])
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
# Keeps predictions away from exactly 0 and 1 so the logarithms stay finite.
prob_eps = 1e-7


for i in range(num_epochs):
    print(f'Epoch {i+1}')
    print('----------------------')
    label = 0  # class whose turn it is; flipped after every step
    dunkingPtr = 0
    dribblingPtr = 0
    losses = []
    while dunkingPtr<lenDunking or dribblingPtr<lenDribbling:
        # Alternate between the classes, falling back to the other class once
        # the preferred one is exhausted. `target` is the class the clip is
        # actually drawn from; the original kept using `label` in the loss, so
        # fallback clips were trained with the wrong label.
        if label == 0:
            target = 0 if dunkingPtr<lenDunking else 1
        else:
            target = 1 if dribblingPtr<lenDribbling else 0

        if target == 0:
            batch = batcheddata[0][dunkingPtr]
            dunkingPtr+=1
        else:
            batch = batcheddata[1][dribblingPtr]
            dribblingPtr+=1

        imbatch = []
        for image in batch:
            # Context manager closes the image file once its pixels are copied.
            with Image.open(image) as im:
                imbatch.append(np.array(im.resize((128, 128))))
        imbatch = np.array(imbatch)

        with tf.GradientTape() as tape:
            # training=True so the dropout layers are active during training.
            y_pred = model(imbatch, training=True)
            # Binary cross-entropy on the clipped probability; without the
            # clip a saturated sigmoid yields log(0) and inf/NaN gradients.
            y_pred = tf.clip_by_value(y_pred, prob_eps, 1.0 - prob_eps)
            loss = -(target * tf.math.log(y_pred) + (1-target) * tf.math.log(1-y_pred))

        trainable_vars_tranformer = model.trainable_variables
        gradients_transformer = tape.gradient(loss, trainable_vars_tranformer)

        optimizer.apply_gradients(grads_and_vars =zip(gradients_transformer, trainable_vars_tranformer))
        # Store a plain float so the history does not keep tensors alive.
        loss_value = float(tf.reduce_mean(loss))
        losses.append(loss_value)
        label = label^1

        if (dunkingPtr + dribblingPtr) % 20 == 0:
            print(f'the loss on the {dunkingPtr + dribblingPtr}/{lenDunking + lenDribbling} step is: {loss_value}')
            if (dunkingPtr + dribblingPtr)%100 == 0:
                np.save(f'{from_path}/weightschkpt', np.array(model.get_weights(), dtype=object))
                print(f"the checkpoint pointers are: DribblingPtr: {dribblingPtr}, DunkingPtr: {dunkingPtr}")


        if (np.mean(losses[-10:])<=0.01):
            print('changing the lr')
            # True division: the original `// 2` floored 0.001 down to 0.0,
            # which froze training.
            # NOTE(review): this halves the rate on every step while the
            # recent mean loss stays below the threshold - confirm intended.
            optimizer.learning_rate = optimizer.learning_rate / 2


    print(f'the loss at the end of epoch{i+1}: {np.mean(losses)}')
    print('----------------------')
    

In [10]:
os.chdir(from_path)

In [18]:
np.save(f'{from_path}/weights2', np.array(model.get_weights(), dtype=object))

In [11]:
# Restore previously saved weights into the model.
# NOTE(review): allow_pickle=True unpickles the file's contents, which can
# execute arbitrary code - only load weight files you created yourself.
# NOTE(review): this reads 'weights1.npy', while the notebook's save cell
# writes 'weights2' - confirm which file is intended.
weights = np.load('/content/gdrive/MyDrive/KineticsBasketBall/weights1.npy', allow_pickle=True)
model.set_weights(weights)

In [20]:
# Take random clips from the train set to check the accuracy.
# Class 0 clips are the negatives and class 1 clips the positives.
samples = 30


def count_positive_predictions(clips, indices):
    """Return how many of the selected clips the model scores at >= 0.5.

    Args:
        clips: list of clips, each a list of frame image paths.
        indices: positions in ``clips`` to evaluate (repeats allowed).
    """
    positives = 0
    for idx in indices:
        imbatch = []
        for image in clips[idx]:
            # Context manager closes the image file once its pixels are copied.
            with Image.open(image) as im:
                imbatch.append(np.array(im.resize((128, 128))))
        score = float(np.squeeze(model.predict(np.array(imbatch))))
        # One shared threshold; the original counted a score of exactly 0.5
        # as correct for both classes.
        if score >= 0.5:
            positives += 1
    return positives


# Negative clips: a positive prediction is a false positive. The original
# stored these in `falseNegs` and the misses below in `falsePos` (swapped).
# np.random.choice samples with replacement by default, so clips may repeat.
test = np.random.choice(len(batcheddata[0]),samples)
falsePos = count_positive_predictions(batcheddata[0], test)
truNegs = len(test) - falsePos

# Positive clips: a negative prediction is a false negative.
test = np.random.choice(len(batcheddata[1]), samples)
truPos = count_positive_predictions(batcheddata[1], test)
falseNegs = len(test) - truPos

In [21]:
# trained on fewer epochs so the accuracy is not that great
print(f'acc = {(truPos+truNegs)/(truPos+truNegs+falsePos+falseNegs)}')
print(f'f1Score = {(2*truPos)/(2*truPos+falsePos+falseNegs)}')

acc = 0.7166666666666667
f1Score = 0.6666666666666666
