In [1]:
import torchvision
from torchvision.models import resnet18, ResNet18_Weights


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# torchvision.models.list_models()

In [3]:
weights = ResNet18_Weights.verify("ResNet18_Weights.IMAGENET1K_V1")
state_dict = weights.get_state_dict(True)

In [4]:
embed_size = 96

backbone = torchvision.models.resnet18(num_classes = embed_size)

In [5]:
import torch
with torch.no_grad():
    for i, j in backbone.named_parameters():
        if i in state_dict.keys() and j.shape == state_dict[i].shape:
            j[:] = state_dict[i]
            # print(i)

In [6]:
backbone.state_dict()['conv1.weight'].view((-1,))[:10]

tensor([-0.0104, -0.0061, -0.0018,  0.0748,  0.0566,  0.0171, -0.0127,  0.0111,
         0.0095, -0.1099])

In [7]:
state_dict['conv1.weight'].view((-1,))[:10]

tensor([-0.0104, -0.0061, -0.0018,  0.0748,  0.0566,  0.0171, -0.0127,  0.0111,
         0.0095, -0.1099], grad_fn=<SliceBackward0>)

In [8]:
backbone(torch.zeros(size=(1, 3, 640, 640))).shape

torch.Size([1, 96])

In [9]:
# torchvision.models.retinanet_resnet50_fpn
# torchvision.models.resnet18.__name__
# torchvision.models.resnet

In [10]:
import requests

headers = {
    'Authorization': 'Token e4342ac4fcf98c2e1910b122cb4103c059f8bbfc',
}

response = requests.get('https://bilishorturl.ml/api/projects/3/export?exportType=JSON', headers=headers)

import json
annotations = json.loads(response.content)

In [11]:
import os
import numpy as np

keypoints_mapping = {}

def getCenter(keypoints):
    for point in keypoints:
        point['center_x'] = point['x'] + point['width'] / 2 
        point['center_y'] = point['y'] + point['height'] / 2

# return_interpolation: When true append whether interpolated at the end
# 1 means exist, 0 means missing
def interpolation(keypoints, frames, return_interpolation):
    prev = keypoints[0]['frame'] - 1
    prev_x = 0
    prev_y = 0
    res = np.zeros((frames, 3 if return_interpolation else 2))
    for i in keypoints:
        diff = i['frame'] - prev
        cur_x = i['center_x']
        cur_y = i['center_y']
        cur = i['frame']
        for j in range(prev + 1, i['frame']):
            # tmp = {'frame': j}
            tmp_x = (prev_x * (cur - j) + cur_x * (j - prev)) / diff
            tmp_y = (prev_y * (cur - j) + cur_y * (j - prev)) / diff

            res[j - 1, :2] = (tmp_x / 100, tmp_y / 100)
            if return_interpolation:
                res[j - 1, -1] = 1
            # tmp['interpolated'] = True
            # res.append(tmp)
        res[cur - 1, :2] = (cur_x / 100, cur_y / 100)
        if return_interpolation:
            res[cur - 1, -1] = 1
        prev_x = cur_x
        prev_y = cur_y
        prev = i['frame']

    return res


labels_name = ['wand tip', 'wand end']
for annotation in annotations:
    vid_name = annotation['file_upload']

    boxes = annotation['annotations'][0]['result']
    
    wand_end_keypoint = None
    wand_tip_keypoint = None
    wand_end_framesCount = None
    wand_tip_framesCount = None

    for i in boxes:
        if 'labels' not in i['value'].keys():
            continue
        if i['value']['labels'][0] == labels_name[0]:
            wand_tip_keypoint = i['value']['sequence']
            wand_tip_framesCount = i['value']['framesCount']
        elif i['value']['labels'][0] == labels_name[1]:
            wand_end_keypoint = i['value']['sequence']
            wand_end_framesCount = i['value']['framesCount']
    
    assert wand_tip_keypoint and wand_end_keypoint, f"missing annotations for {annotation['id']}"
    assert wand_end_framesCount == wand_tip_framesCount, f'frames not matched for {annotation["id"]}'

    framesCount = wand_end_framesCount
    # assert boxes[0]['value']['framesCount'] == boxes[1]['value']['framesCount'], f'frames not matched for {annotation["id"]}'
    # assert len(boxes) >= 2, f"missing annotations for {annotation['id']}"

    
    getCenter(wand_end_keypoint)

    wand_end_keypoint = interpolation(wand_end_keypoint, framesCount, False)


    getCenter(wand_tip_keypoint)

    wand_tip_keypoint = interpolation(wand_tip_keypoint, framesCount, True)


    concat_keypoint = np.zeros((framesCount, 5))

    concat_keypoint[:, :2] = wand_end_keypoint
    concat_keypoint[:, 2:] = wand_tip_keypoint

    
    keypoints_mapping[vid_name] = torch.tensor(concat_keypoint)


In [12]:
# the directory that contains original videos

import os
source_dir = "G:/.shortcut-targets-by-id/1eyTB0qCfXgrxNsrmWNeLNbd5sTKzP5HT/Data Wizards/dataset/processed_vid"
category_mapping = {"3-24 V": 0, "3-25 bridge": 1, "3-25 R": 2, "Accio": 1, "Avada Kedavra": 3, "Invalid": 4, "Lumos": 0, "Revelio": 2}

vid_class = {} # name in processed_vid : category


for root, dirs, files in os.walk(source_dir):
    tmp_root = root[root.rfind('/') + 1: ]
    tmp_root = tmp_root[tmp_root.rfind('\\') + 1: ]
    category = None if tmp_root not in category_mapping.keys() else category_mapping[tmp_root]
    for name in files:
        if not name.endswith('mp4'):
            continue
        assert category is not None, f"No label at{os.path.join(root, name)} {tmp_root}"

        vid_class[name] = category

In [13]:
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
) 
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)

model_name = "x3d_m"
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
frames_per_second = 30
model_transform_params  = {
    "x3d_xs": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 4,
        "sampling_rate": 12,
    },
    "x3d_s": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 13,
        "sampling_rate": 6,
    },
    "x3d_m": {
        "side_size": 224,
        "crop_size": 224,
        "num_frames": 16,
        "sampling_rate": 5,
    }
}

# Get transform parameters based on model
transform_params = model_transform_params[model_name]

# Note that this transform is specific to the slow_R50 model.
transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            # UniformTemporalSubsample(transform_params["num_frames"]),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(size=transform_params["side_size"]),
            CenterCropVideo(
                crop_size=(transform_params["crop_size"], transform_params["crop_size"])
            )
        ]
    ),
)

# The duration of the input clip is also specific to the model.
# clip_duration = (transform_params["num_frames"] * transform_params["sampling_rate"])/frames_per_second



In [14]:
def uniform_temporal_subsample(
    x: torch.Tensor, num_samples: int, temporal_dim: int = -3
) -> torch.Tensor:
    """
    Uniformly subsamples num_samples indices from the temporal dimension of the video.
    When num_samples is larger than the size of temporal dimension of the video, it
    will sample frames based on nearest neighbor interpolation.

    Args:
        x (torch.Tensor): A video tensor with dimension larger than one with torch
            tensor type includes int, long, float, complex, etc.
        num_samples (int): The number of equispaced samples to be selected
        temporal_dim (int): dimension of temporal to perform temporal subsample.

    Returns:
        An x-like Tensor with subsampled temporal dimension.
    """
    t = x.shape[temporal_dim]
    assert num_samples > 0 and t > 0
    # Sample by nearest neighbor interpolation if num_samples > t.
    indices = torch.linspace(0, t - 1, num_samples)
    indices = torch.clamp(indices, 0, t - 1).long()
    return torch.index_select(x, temporal_dim, indices), indices

In [15]:
import os
import json
from pytorchvideo.data.encoded_video import EncodedVideo
import gc
import json
import tqdm


vid_file = 'G:\\.shortcut-targets-by-id\\1eyTB0qCfXgrxNsrmWNeLNbd5sTKzP5HT\\Data Wizards\\dataset\\videoSync'

vids_tensor = []
vids_category = []
vids_keypoints = []


for root, dirs, files in os.walk(vid_file):
    for name in tqdm.tqdm(files):
        vid_path = os.path.join(root, name)
        trim_name = name[name.find('-') + 1:]
        if not vid_path.endswith('.mp4') or trim_name not in vid_class.keys():
            print(name)
            continue
        video = EncodedVideo.from_path(vid_path)
        video_data = video.get_clip(start_sec=0, end_sec=3)
        del video
        gc.collect()
        video_cropped, indices = uniform_temporal_subsample(video_data['video'], transform_params["num_frames"])
        vids_tensor.append(transform({'video':video_cropped})['video'])
        vids_category.append(vid_class[trim_name])
        if name in keypoints_mapping:
            vids_keypoints.append(keypoints_mapping[name][indices])
        else:
            vids_keypoints.append(torch.zeros(size=(transform_params["num_frames"], 5)))
        

 73%|███████▎  | 289/395 [05:46<02:13,  1.26s/it]

647db7de-0a1aad14-IMG_1629.mp4


 80%|███████▉  | 315/395 [06:20<01:46,  1.33s/it]

4ec71381-04-06-2023-31_1.mp4


 81%|████████  | 318/395 [06:22<01:24,  1.10s/it]

05c834cd-04-06-2023-32_1.mp4


 84%|████████▍ | 333/395 [06:41<01:22,  1.33s/it]

8728fbe4-04-06-2023-30_1.mp4


 91%|█████████ | 360/395 [07:16<00:46,  1.33s/it]

a3da0b39-04-06-2023-8_1.mp4
475a799b-04-06-2023-4_1.mp4


 95%|█████████▍| 374/395 [07:31<00:27,  1.32s/it]

68bf2412-04-06-2023-6_1.mp4
6f9f9743-04-06-2023-15_1.mp4


 96%|█████████▌| 378/395 [07:34<00:16,  1.06it/s]

e61c79ee-04-06-2023-7_1.mp4


 97%|█████████▋| 385/395 [07:42<00:11,  1.18s/it]

f4da5e75-04-06-2023-3_1.mp4


100%|██████████| 395/395 [07:52<00:00,  1.20s/it]


desktop.ini


100%|██████████| 1/1 [00:00<?, ?it/s]

desktop.ini





In [16]:
import random
size = len(vid_class)

vids_concat = list(zip(vids_tensor, vids_keypoints, vids_category))

random.shuffle(vids_concat)

train_data = vids_concat[:int(size * 0.85)]
val_data = vids_concat[int(size * 0.85):]

In [17]:
from torch.utils.data import Dataset

class VidClsDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, data):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        return self.data[idx]
    
train_dataset = VidClsDataset(train_data)
val_dataset = VidClsDataset(val_data)

In [18]:
from torch.utils.data import DataLoader
batch_size = 4

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

In [19]:
import torch
from torch import nn

y = None

class lstm(nn.Module):
    def __init__(self, backbone,embed_size = embed_size, in_between = 64, out_features = 4, num_layers = 2):
        super(lstm, self).__init__()
        self.embed_size = embed_size
        self.backbone = backbone
        # self.embedding = nn.Embedding(x_grid_size * y_grid_size + 1, embed_size)
        self.rnn = nn.LSTM(input_size = embed_size, hidden_size = 150, batch_first = True, bidirectional = True, dropout = 0.1, num_layers = num_layers)
        # self.rnn2 = nn.LSTM(input_size = embed_size, hidden_size = 100, batch_first = True, bidirectional = True, dropout = 0.1, num_layers = num_layers)
        self.dropout = nn.Dropout(p = 0.1)
        self.relu = nn.LeakyReLU()
        self.fc1 = nn.Linear(300, 200)
        self.dropout2 = nn.Dropout(p = 0.1)
        self.fc2 = nn.Linear(200, 5)


        self.keypoint_head_1 = nn.Linear(in_features=embed_size, out_features=in_between)
        self.keypoint_head_2 = nn.Linear(in_features=in_between, out_features = out_features)

    def forward(self, x):
        # print(self.rnn(x)[1][0].shape)
        # return self.fc(torch.squeeze(self.rnn(x)[1][0], dim=0))
        # print(self.rnn(x)[0].shape)
        global y
        y = x
        a, b, c, d, e = x.shape
        features = backbone(x.view((a * b, c, d, e))).view((a, b, self.embed_size))
        return self.fc2(self.dropout2(self.relu( \
            self.fc1(self.dropout(self.rnn(features)[0][:, -1, :]))))) , \
         self.keypoint_head_2(nn.ReLU()(self.keypoint_head_1(features.view((a * b, self.embed_size)))))



def get_grouped_params(model, weight_decay, no_decay=["bias", "rnn"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

In [20]:
from torch.nn import CrossEntropyLoss
from torch.optim import SGD, Adam
import numpy as np
import math

beta = 0.1
epoch = 2
num_categories = 5



train_size = len(vids_tensor)
steps = math.ceil(train_size / batch_size)
crossEntropy = CrossEntropyLoss()

device = 'cuda'
model = lstm(backbone=backbone).to(device)
optimizer = Adam(get_grouped_params(model, weight_decay = 0.01), lr= 1e-5)


In [21]:
def mse_loss(input, target, mask, reduction="mean"):
    out = (input - target.view((-1, 4)))**2
    # print(out.view((-1, 4)).shape)
    # print(mask.view((-1,)).shape)
    out = out.view((-1, 4)) * mask.view((-1, 1))
    if reduction == "mean":
        return out[out != 0].mean()
    elif reduction == "None":
        return out

In [24]:
output = None

def train(model, epoch, optimizer, train_dataloader, val_dataloader):
    global output
    for epoch_i in range(0, epoch):
        model.train()
    
        loss_list = []
        all = 0
        correct = 0

    
        for vid, keypoint, target in tqdm.tqdm(train_dataloader):

            # print(input[0].shape)
            vid = vid.permute((0,2,1,3,4)).contiguous().to(device)
            keypoint = keypoint.to(device)
            target = target.to(device)


            output = model(vid)

            
        
        
            cls_loss = crossEntropy(output[0], target)

            reg_loss = mse_loss(output[1], keypoint[:,:,:4], keypoint[:,:,4]) * beta

            loss = cls_loss
            if not reg_loss.isnan().cpu().item():
                loss += reg_loss


            # print(reg_loss)

            correct += torch.sum(torch.argmax(output[0], dim= 1) == target).item()
            all += vid.shape[0]


        
            optimizer.zero_grad()
            loss.backward()
            loss_list.append(loss.item())
            optimizer.step()

            del vid
            del keypoint
            del target

            torch.cuda.empty_cache()
            gc.collect()
        
        print(epoch_i,' train loss:', np.mean(loss_list))
        print('train acc:', correct / all)
    

        correct = 0
        all = 0

        model.eval()
        for vid, keypoint, target in tqdm.tqdm(val_dataloader):

            # print(input[0].shape)
            vid = vid.permute((0,2,1,3,4)).contiguous().to(device)
            keypoint = keypoint.to(device)
            target = target.to(device)


            output = model(vid)

            # print(reg_loss)

            correct += torch.sum(torch.argmax(output[0], dim= 1) == target).item()
            all += vid.shape[0]

            del vid
            del keypoint
            del target

            torch.cuda.empty_cache()
            gc.collect()
        print('val acc:', correct / all)



In [25]:
train(model, 20, optimizer, train_dataloader, val_dataloader)

100%|██████████| 70/70 [00:18<00:00,  3.78it/s]


0  train loss: 1.6094906040600367
train acc: 0.20863309352517986


100%|██████████| 106/106 [00:14<00:00,  7.28it/s]


val acc: 0.27358490566037735


100%|██████████| 70/70 [00:18<00:00,  3.74it/s]


1  train loss: 1.5955458964620317
train acc: 0.31654676258992803


100%|██████████| 106/106 [00:12<00:00,  8.49it/s]


val acc: 0.44339622641509435


100%|██████████| 70/70 [00:18<00:00,  3.75it/s]


2  train loss: 1.5769921728542873
train acc: 0.42805755395683454


100%|██████████| 106/106 [00:12<00:00,  8.42it/s]


val acc: 0.5188679245283019


100%|██████████| 70/70 [00:18<00:00,  3.79it/s]


3  train loss: 1.5458644134657724
train acc: 0.5251798561151079


100%|██████████| 106/106 [00:11<00:00,  9.14it/s]


val acc: 0.5754716981132075


100%|██████████| 70/70 [00:17<00:00,  3.97it/s]


4  train loss: 1.4890140226909092
train acc: 0.6151079136690647


100%|██████████| 106/106 [00:11<00:00,  9.31it/s]


val acc: 0.660377358490566


100%|██████████| 70/70 [00:17<00:00,  3.94it/s]


5  train loss: 1.4075806907245092
train acc: 0.6798561151079137


100%|██████████| 106/106 [00:12<00:00,  8.75it/s]


val acc: 0.6886792452830188


100%|██████████| 70/70 [00:18<00:00,  3.78it/s]


6  train loss: 1.2842665902205876
train acc: 0.737410071942446


100%|██████████| 106/106 [00:12<00:00,  8.62it/s]


val acc: 0.6886792452830188


100%|██████████| 70/70 [00:18<00:00,  3.76it/s]


7  train loss: 1.1817974303449903
train acc: 0.7338129496402878


100%|██████████| 106/106 [00:12<00:00,  8.39it/s]


val acc: 0.7075471698113207


100%|██████████| 70/70 [00:18<00:00,  3.77it/s]


8  train loss: 1.0606905102729798
train acc: 0.7949640287769785


100%|██████████| 106/106 [00:12<00:00,  8.70it/s]


val acc: 0.839622641509434


100%|██████████| 70/70 [00:17<00:00,  3.96it/s]


9  train loss: 0.9228447462831225
train acc: 0.8597122302158273


100%|██████████| 106/106 [00:11<00:00,  9.27it/s]


val acc: 0.9056603773584906


100%|██████████| 70/70 [00:17<00:00,  3.95it/s]


10  train loss: 0.8171799578836986
train acc: 0.8741007194244604


100%|██████████| 106/106 [00:11<00:00,  9.34it/s]


val acc: 0.9433962264150944


100%|██████████| 70/70 [00:18<00:00,  3.86it/s]


11  train loss: 0.7264001535517829
train acc: 0.8848920863309353


100%|██████████| 106/106 [00:11<00:00,  9.14it/s]


val acc: 0.8679245283018868


100%|██████████| 70/70 [00:17<00:00,  3.97it/s]


12  train loss: 0.589730503303664
train acc: 0.9172661870503597


100%|██████████| 106/106 [00:11<00:00,  9.25it/s]


val acc: 0.8962264150943396


100%|██████████| 70/70 [00:17<00:00,  3.97it/s]


13  train loss: 0.49310784637928007
train acc: 0.9496402877697842


100%|██████████| 106/106 [00:11<00:00,  9.30it/s]


val acc: 0.9528301886792453


100%|██████████| 70/70 [00:18<00:00,  3.83it/s]


14  train loss: 0.4203102420483317
train acc: 0.9640287769784173


100%|██████████| 106/106 [00:11<00:00,  9.19it/s]


val acc: 0.9056603773584906


100%|██████████| 70/70 [00:17<00:00,  3.96it/s]


15  train loss: 0.3752299483333315
train acc: 0.9568345323741008


100%|██████████| 106/106 [00:11<00:00,  9.17it/s]


val acc: 0.9339622641509434


100%|██████████| 70/70 [00:17<00:00,  3.96it/s]


16  train loss: 0.32764080315828326
train acc: 0.9712230215827338


100%|██████████| 106/106 [00:11<00:00,  9.17it/s]


val acc: 0.9339622641509434


100%|██████████| 70/70 [00:18<00:00,  3.77it/s]


17  train loss: 0.2665068550833634
train acc: 0.9820143884892086


100%|██████████| 106/106 [00:13<00:00,  8.02it/s]


val acc: 0.8867924528301887


100%|██████████| 70/70 [00:18<00:00,  3.69it/s]


18  train loss: 0.24312072470784188
train acc: 0.9748201438848921


100%|██████████| 106/106 [00:12<00:00,  8.49it/s]


val acc: 0.9339622641509434


100%|██████████| 70/70 [00:18<00:00,  3.85it/s]


19  train loss: 0.1958825582904475
train acc: 0.9820143884892086


100%|██████████| 106/106 [00:12<00:00,  8.51it/s]

val acc: 0.9245283018867925





In [26]:
output

tensor([[ 0.0065, -0.0238, -0.0570, -0.1022, -0.0679],
        [ 0.0018, -0.0277, -0.0492, -0.0985, -0.0915],
        [-0.0071, -0.0186, -0.0418, -0.0832, -0.0755],
        [-0.0008, -0.0127, -0.0558, -0.1048, -0.0637]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

In [41]:
y.shape

torch.Size([4, 16, 3, 224, 224])

In [60]:
9633792/96

100352.0

In [43]:
a, b, c, d, e = y.shape

In [44]:
(a * b, c, d, e)

(64, 3, 224, 224)

In [45]:
y.view((a * b, c, d, e)).shape

torch.Size([64, 3, 224, 224])

In [52]:
model.rnn(model.backbone(y.view((a * b, c, d, e))).view((4,16,96)))

(tensor([[[ 0.0381, -0.0470,  0.0389,  ..., -0.1045, -0.0727,  0.0261],
          [ 0.0414, -0.0652,  0.0639,  ..., -0.0921, -0.0698,  0.0198],
          [ 0.0383, -0.0728,  0.0718,  ..., -0.0754, -0.0642,  0.0262],
          ...,
          [ 0.0623, -0.0517,  0.1028,  ..., -0.0656, -0.0451,  0.0191],
          [ 0.0457, -0.0425,  0.1122,  ..., -0.0500, -0.0242,  0.0156],
          [ 0.0275, -0.0305,  0.1014,  ..., -0.0336, -0.0203,  0.0078]],
 
         [[ 0.0089, -0.0407,  0.0113,  ..., -0.1021, -0.1198, -0.0263],
          [ 0.0377, -0.0600,  0.0329,  ..., -0.1021, -0.1205, -0.0343],
          [ 0.0695, -0.0767,  0.0213,  ..., -0.1146, -0.1365, -0.0347],
          ...,
          [ 0.0434, -0.0558,  0.0411,  ..., -0.0812, -0.0770, -0.0266],
          [ 0.0370, -0.0373,  0.0368,  ..., -0.0478, -0.0658, -0.0040],
          [ 0.0338, -0.0288,  0.0367,  ..., -0.0515, -0.0481,  0.0009]],
 
         [[ 0.0119, -0.0397,  0.0095,  ..., -0.0550, -0.1150,  0.0045],
          [ 0.0088, -0.0602,