In [1]:
from typing import List, Optional

import torch
import torch.nn as nn
from pytorchvideo.layers.utils import set_attributes
from pytorchvideo.models.weight_init import init_net_weights

class Net(nn.Module):
    """
    Build a general Net models with a list of blocks for video recognition.

    ::

                                         Input
                                           ↓
                                         Block 1
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                         Block N
                                           ↓

    The ResNet builder can be found in `create_resnet`.
    """

    def __init__(self, *, blocks: nn.ModuleList) -> None:
        """
        Args:
            blocks (torch.nn.module_list): the list of block modules.
        """
        super().__init__()
        assert blocks is not None
        self.blocks = blocks
        init_net_weights(self)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for idx in range(len(self.blocks)):
            x = self.blocks[idx](x)
            # print(x.shape)
        return x


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

import math
from typing import Callable, Tuple

import numpy as np
import torch
import torch.nn as nn
from fvcore.nn.squeeze_excitation import SqueezeExcitation
from pytorchvideo.layers.convolutions import Conv2plus1d
from pytorchvideo.layers.swish import Swish
from pytorchvideo.layers.utils import round_repeats, round_width, set_attributes
from pytorchvideo.models.head import ResNetBasicHead
# from pytorchvideo.models.net import Net
from pytorchvideo.models.resnet import BottleneckBlock, ResBlock, ResStage
from pytorchvideo.models.stem import ResNetBasicStem


def create_x3d_stem(
    *,
    # Conv configs.
    in_channels: int,
    out_channels: int,
    conv_kernel_size: Tuple[int] = (5, 3, 3),
    conv_stride: Tuple[int] = (1, 2, 2),
    conv_padding: Tuple[int] = (2, 1, 1),
    # BN configs.
    norm: Callable = nn.BatchNorm3d,
    norm_eps: float = 1e-5,
    norm_momentum: float = 0.1,
    # Activation configs.
    activation: Callable = nn.ReLU,
) -> nn.Module:
    """
    Creates the stem layer for X3D. It performs spatial Conv, temporal Conv, BN, and Relu.

    ::

                                        Conv_xy
                                           ↓
                                        Conv_t
                                           ↓
                                     Normalization
                                           ↓
                                       Activation

    Args:
        in_channels (int): input channel size of the convolution.
        out_channels (int): output channel size of the convolution.
        conv_kernel_size (tuple): convolutional kernel size(s).
        conv_stride (tuple): convolutional stride size(s).
        conv_padding (tuple): convolutional padding size(s).

        norm (callable): a callable that constructs normalization layer, options
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.

        activation (callable): a callable that constructs activation layer, options
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).

    Returns:
        (nn.Module): X3D stem layer.
    """
    conv_xy_module = nn.Conv3d(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=(1, conv_kernel_size[1], conv_kernel_size[2]),
        stride=(1, conv_stride[1], conv_stride[2]),
        padding=(0, conv_padding[1], conv_padding[2]),
        bias=False,
    )
    conv_t_module = nn.Conv3d(
        in_channels=out_channels,
        out_channels=out_channels,
        kernel_size=(conv_kernel_size[0], 1, 1),
        stride=(conv_stride[0], 1, 1),
        padding=(conv_padding[0], 0, 0),
        bias=False,
        groups=out_channels,
    )
    stacked_conv_module = Conv2plus1d(
        conv_t=conv_xy_module,
        norm=None,
        activation=None,
        conv_xy=conv_t_module,
    )

    norm_module = (
        None
        if norm is None
        else norm(num_features=out_channels, eps=norm_eps, momentum=norm_momentum)
    )
    activation_module = None if activation is None else activation()

    return ResNetBasicStem(
        conv=stacked_conv_module,
        norm=norm_module,
        activation=activation_module,
        pool=None,
    )


def create_x3d_bottleneck_block(
    *,
    # Convolution configs.
    dim_in: int,
    dim_inner: int,
    dim_out: int,
    conv_kernel_size: Tuple[int] = (3, 3, 3),
    conv_stride: Tuple[int] = (1, 2, 2),
    # Norm configs.
    norm: Callable = nn.BatchNorm3d,
    norm_eps: float = 1e-5,
    norm_momentum: float = 0.1,
    se_ratio: float = 0.0625,
    # Activation configs.
    activation: Callable = nn.ReLU,
    inner_act: Callable = Swish,
) -> nn.Module:
    """
    Bottleneck block for X3D: a sequence of Conv, Normalization with optional SE block,
    and Activations repeated in the following order:

    ::

                                    Conv3d (conv_a)
                                           ↓
                                 Normalization (norm_a)
                                           ↓
                                   Activation (act_a)
                                           ↓
                                    Conv3d (conv_b)
                                           ↓
                                 Normalization (norm_b)
                                           ↓
                                 Squeeze-and-Excitation
                                           ↓
                                   Activation (act_b)
                                           ↓
                                    Conv3d (conv_c)
                                           ↓
                                 Normalization (norm_c)

    Args:
        dim_in (int): input channel size to the bottleneck block.
        dim_inner (int): intermediate channel size of the bottleneck.
        dim_out (int): output channel size of the bottleneck.
        conv_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        conv_stride (tuple): convolutional stride size(s) for conv_b.

        norm (callable): a callable that constructs normalization layer, examples
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.
        se_ratio (float): if > 0, apply SE to the 3x3x3 conv, with the SE
            channel dimensionality being se_ratio times the 3x3x3 conv dim.

        activation (callable): a callable that constructs activation layer, examples
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).
        inner_act (callable): whether use Swish activation for act_b or not.

    Returns:
        (nn.Module): X3D bottleneck block.
    """
    # 1x1x1 Conv
    conv_a = nn.Conv3d(
        in_channels=dim_in, out_channels=dim_inner, kernel_size=(1, 1, 1), bias=False
    )
    norm_a = (
        None
        if norm is None
        else norm(num_features=dim_inner, eps=norm_eps, momentum=norm_momentum)
    )
    act_a = None if activation is None else activation()

    # 3x3x3 Conv
    conv_b = nn.Conv3d(
        in_channels=dim_inner,
        out_channels=dim_inner,
        kernel_size=conv_kernel_size,
        stride=conv_stride,
        padding=[size // 2 for size in conv_kernel_size],
        bias=False,
        groups=dim_inner,
        dilation=(1, 1, 1),
    )
    se = (
        SqueezeExcitation(
            num_channels=dim_inner,
            num_channels_reduced=round_width(dim_inner, se_ratio),
            is_3d=True,
        )
        if se_ratio > 0.0
        else nn.Identity()
    )
    norm_b = nn.Sequential(
        (
            nn.Identity()
            if norm is None
            else norm(num_features=dim_inner, eps=norm_eps, momentum=norm_momentum)
        ),
        se,
    )
    act_b = None if inner_act is None else inner_act()

    # 1x1x1 Conv
    conv_c = nn.Conv3d(
        in_channels=dim_inner, out_channels=dim_out, kernel_size=(1, 1, 1), bias=False
    )
    norm_c = (
        None
        if norm is None
        else norm(num_features=dim_out, eps=norm_eps, momentum=norm_momentum)
    )

    return BottleneckBlock(
        conv_a=conv_a,
        norm_a=norm_a,
        act_a=act_a,
        conv_b=conv_b,
        norm_b=norm_b,
        act_b=act_b,
        conv_c=conv_c,
        norm_c=norm_c,
    )


def create_x3d_res_block(
    *,
    # Bottleneck Block configs.
    dim_in: int,
    dim_inner: int,
    dim_out: int,
    bottleneck: Callable = create_x3d_bottleneck_block,
    use_shortcut: bool = True,
    # Conv configs.
    conv_kernel_size: Tuple[int] = (3, 3, 3),
    conv_stride: Tuple[int] = (1, 2, 2),
    # Norm configs.
    norm: Callable = nn.BatchNorm3d,
    norm_eps: float = 1e-5,
    norm_momentum: float = 0.1,
    se_ratio: float = 0.0625,
    # Activation configs.
    activation: Callable = nn.ReLU,
    inner_act: Callable = Swish,
) -> nn.Module:
    """
    Residual block for X3D. Performs a summation between an identity shortcut in branch1 and a
    main block in branch2. When the input and output dimensions are different, a
    convolution followed by a normalization will be performed.

    ::

                                         Input
                                           |-------+
                                           ↓       |
                                         Block     |
                                           ↓       |
                                       Summation ←-+
                                           ↓
                                       Activation

    Args:
        dim_in (int): input channel size to the bottleneck block.
        dim_inner (int): intermediate channel size of the bottleneck.
        dim_out (int): output channel size of the bottleneck.
        bottleneck (callable): a callable for create_x3d_bottleneck_block.

        conv_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        conv_stride (tuple): convolutional stride size(s) for conv_b.

        norm (callable): a callable that constructs normalization layer, examples
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.
        se_ratio (float): if > 0, apply SE to the 3x3x3 conv, with the SE
            channel dimensionality being se_ratio times the 3x3x3 conv dim.

        activation (callable): a callable that constructs activation layer, examples
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).
        inner_act (callable): whether use Swish activation for act_b or not.

    Returns:
        (nn.Module): X3D block layer.
    """

    norm_model = None
    if norm is not None and dim_in != dim_out:
        norm_model = norm(num_features=dim_out)

    return ResBlock(
        branch1_conv=nn.Conv3d(
            dim_in,
            dim_out,
            kernel_size=(1, 1, 1),
            stride=conv_stride,
            bias=False,
        )
        if (dim_in != dim_out or np.prod(conv_stride) > 1) and use_shortcut
        else None,
        branch1_norm=norm_model if dim_in != dim_out and use_shortcut else None,
        branch2=bottleneck(
            dim_in=dim_in,
            dim_inner=dim_inner,
            dim_out=dim_out,
            conv_kernel_size=conv_kernel_size,
            conv_stride=conv_stride,
            norm=norm,
            norm_eps=norm_eps,
            norm_momentum=norm_momentum,
            se_ratio=se_ratio,
            activation=activation,
            inner_act=inner_act,
        ),
        activation=None if activation is None else activation(),
        branch_fusion=lambda x, y: x + y,
    )


def create_x3d_res_stage(
    *,
    # Stage configs.
    depth: int,
    # Bottleneck Block configs.
    dim_in: int,
    dim_inner: int,
    dim_out: int,
    bottleneck: Callable = create_x3d_bottleneck_block,
    # Conv configs.
    conv_kernel_size: Tuple[int] = (3, 3, 3),
    conv_stride: Tuple[int] = (1, 2, 2),
    # Norm configs.
    norm: Callable = nn.BatchNorm3d,
    norm_eps: float = 1e-5,
    norm_momentum: float = 0.1,
    se_ratio: float = 0.0625,
    # Activation configs.
    activation: Callable = nn.ReLU,
    inner_act: Callable = Swish,
) -> nn.Module:
    """
    Create Residual Stage, which composes sequential blocks that make up X3D.

    ::

                                        Input
                                           ↓
                                       ResBlock
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                       ResBlock

    Args:

        depth (init): number of blocks to create.

        dim_in (int): input channel size to the bottleneck block.
        dim_inner (int): intermediate channel size of the bottleneck.
        dim_out (int): output channel size of the bottleneck.
        bottleneck (callable): a callable for create_x3d_bottleneck_block.

        conv_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        conv_stride (tuple): convolutional stride size(s) for conv_b.

        norm (callable): a callable that constructs normalization layer, examples
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.
        se_ratio (float): if > 0, apply SE to the 3x3x3 conv, with the SE
            channel dimensionality being se_ratio times the 3x3x3 conv dim.

        activation (callable): a callable that constructs activation layer, examples
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).
        inner_act (callable): whether use Swish activation for act_b or not.

    Returns:
        (nn.Module): X3D stage layer.
    """
    res_blocks = []
    for idx in range(depth):
        block = create_x3d_res_block(
            dim_in=dim_in if idx == 0 else dim_out,
            dim_inner=dim_inner,
            dim_out=dim_out,
            bottleneck=bottleneck,
            conv_kernel_size=conv_kernel_size,
            conv_stride=conv_stride if idx == 0 else (1, 1, 1),
            norm=norm,
            norm_eps=norm_eps,
            norm_momentum=norm_momentum,
            se_ratio=(se_ratio if (idx + 1) % 2 else 0.0),
            activation=activation,
            inner_act=inner_act,
        )
        res_blocks.append(block)

    return ResStage(res_blocks=nn.ModuleList(res_blocks))


def create_x3d_head(
    *,
    # Projection configs.
    dim_in: int,
    dim_inner: int,
    dim_out: int,
    num_classes: int,
    # Pooling configs.
    pool_act: Callable = nn.ReLU,
    pool_kernel_size: Tuple[int] = (13, 5, 5),
    # BN configs.
    norm: Callable = nn.BatchNorm3d,
    norm_eps: float = 1e-5,
    norm_momentum: float = 0.1,
    bn_lin5_on=False,
    # Dropout configs.
    dropout_rate: float = 0.5,
    # Activation configs.
    activation: Callable = nn.Softmax,
    # Output configs.
    output_with_global_average: bool = True,
) -> nn.Module:
    """
    Creates X3D head. This layer performs an projected pooling operation followed
    by an dropout, a fully-connected projection, an activation layer and a global
    spatiotemporal averaging.

    ::

                                     ProjectedPool
                                           ↓
                                        Dropout
                                           ↓
                                       Projection
                                           ↓
                                       Activation
                                           ↓
                                       Averaging

    Args:
        dim_in (int): input channel size of the X3D head.
        dim_inner (int): intermediate channel size of the X3D head.
        dim_out (int): output channel size of the X3D head.
        num_classes (int): the number of classes for the video dataset.

        pool_act (callable): a callable that constructs resnet pool activation
            layer such as nn.ReLU.
        pool_kernel_size (tuple): pooling kernel size(s) when not using adaptive
            pooling.

        norm (callable): a callable that constructs normalization layer, examples
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.
        bn_lin5_on (bool): if True, perform normalization on the features
            before the classifier.

        dropout_rate (float): dropout rate.

        activation (callable): a callable that constructs resnet head activation
            layer, examples include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not
            applying activation).

        output_with_global_average (bool): if True, perform global averaging on temporal
            and spatial dimensions and reshape output to batch_size x out_features.

    Returns:
        (nn.Module): X3D head layer.
    """
    # print(locals())
    pre_conv_module = nn.Conv3d(
        in_channels=dim_in, out_channels=dim_inner, kernel_size=(1, 1, 1), bias=False
    )

    pre_norm_module = norm(num_features=dim_inner, eps=norm_eps, momentum=norm_momentum)
    pre_act_module = None if pool_act is None else pool_act()

    if pool_kernel_size is None:
        pool_module = nn.AdaptiveAvgPool3d((1, 1, 1))
    else:
        pool_module = nn.AvgPool3d(pool_kernel_size, stride=1)

    post_conv_module = nn.Conv3d(
        in_channels=dim_inner, out_channels=dim_out, kernel_size=(1, 1, 1), bias=False
    )

    if bn_lin5_on:
        post_norm_module = norm(
            num_features=dim_out, eps=norm_eps, momentum=norm_momentum
        )
    else:
        post_norm_module = None
    post_act_module = None if pool_act is None else pool_act()

    projected_pool_module = ProjectedPool(
        pre_conv=pre_conv_module,
        pre_norm=pre_norm_module,
        pre_act=pre_act_module,
        pool=pool_module,
        post_conv=post_conv_module,
        post_norm=post_norm_module,
        post_act=post_act_module,
    )

    if activation is None:
        activation_module = None
    elif activation == nn.Softmax:
        activation_module = activation(dim=1)
    elif activation == nn.Sigmoid:
        activation_module = activation()
    else:
        raise NotImplementedError(
            "{} is not supported as an activation" "function.".format(activation)
        )

    if output_with_global_average:
        output_pool = nn.AdaptiveAvgPool3d(1)
    else:
        output_pool = None

    return ResNetBasicHead(
        proj=nn.Linear(dim_out, num_classes, bias=True),
        activation=activation_module,
        pool=projected_pool_module,
        dropout=nn.Dropout(dropout_rate) if dropout_rate > 0 else None,
        output_pool=output_pool,
    )


def create_x3d(
    *,
    # Input clip configs.
    input_channel: int = 3,
    input_clip_length: int = 13,
    input_crop_size: int = 160,
    # Model configs.
    model_num_class: int = 400,
    dropout_rate: float = 0.5,
    width_factor: float = 2.0,
    depth_factor: float = 2.2,
    # Normalization configs.
    norm: Callable = nn.BatchNorm3d,
    norm_eps: float = 1e-5,
    norm_momentum: float = 0.1,
    # Activation configs.
    activation: Callable = nn.ReLU,
    # Stem configs.
    stem_dim_in: int = 12,
    stem_conv_kernel_size: Tuple[int] = (5, 3, 3),
    stem_conv_stride: Tuple[int] = (1, 2, 2),
    # Stage configs.
    stage_conv_kernel_size: Tuple[Tuple[int]] = (
        (3, 3, 3),
        (3, 3, 3),
        (3, 3, 3),
        (3, 3, 3),
    ),
    stage_spatial_stride: Tuple[int] = (2, 2, 2, 2),
    stage_temporal_stride: Tuple[int] = (1, 1, 1, 1),
    bottleneck: Callable = create_x3d_bottleneck_block,
    bottleneck_factor: float = 2.25,
    se_ratio: float = 0.0625,
    inner_act: Callable = Swish,
    # Head configs.
    head_dim_out: int = 2048,
    head_pool_act: Callable = nn.ReLU,
    head_bn_lin5_on: bool = False,
    head_activation: Callable = nn.Softmax,
    head_output_with_global_average: bool = True,
) -> nn.Module:
    """
    X3D model builder. It builds a X3D network backbone, which is a ResNet.

    Christoph Feichtenhofer.
    "X3D: Expanding Architectures for Efficient Video Recognition."
    https://arxiv.org/abs/2004.04730

    ::

                                         Input
                                           ↓
                                         Stem
                                           ↓
                                         Stage 1
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                         Stage N
                                           ↓
                                         Head

    Args:
        input_channel (int): number of channels for the input video clip.
        input_clip_length (int): length of the input video clip. Value for
            different models: X3D-XS: 4; X3D-S: 13; X3D-M: 16; X3D-L: 16.
        input_crop_size (int): spatial resolution of the input video clip.
            Value for different models: X3D-XS: 160; X3D-S: 160; X3D-M: 224;
            X3D-L: 312.

        model_num_class (int): the number of classes for the video dataset.
        dropout_rate (float): dropout rate.
        width_factor (float): width expansion factor.
        depth_factor (float): depth expansion factor. Value for different
            models: X3D-XS: 2.2; X3D-S: 2.2; X3D-M: 2.2; X3D-L: 5.0.

        norm (callable): a callable that constructs normalization layer.
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.

        activation (callable): a callable that constructs activation layer.

        stem_dim_in (int): input channel size for stem before expansion.
        stem_conv_kernel_size (tuple): convolutional kernel size(s) of stem.
        stem_conv_stride (tuple): convolutional stride size(s) of stem.

        stage_conv_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        stage_spatial_stride (tuple): the spatial stride for each stage.
        stage_temporal_stride (tuple): the temporal stride for each stage.
        bottleneck_factor (float): bottleneck expansion factor for the 3x3x3 conv.
        se_ratio (float): if > 0, apply SE to the 3x3x3 conv, with the SE
            channel dimensionality being se_ratio times the 3x3x3 conv dim.
        inner_act (callable): whether use Swish activation for act_b or not.

        head_dim_out (int): output channel size of the X3D head.
        head_pool_act (callable): a callable that constructs resnet pool activation
            layer such as nn.ReLU.
        head_bn_lin5_on (bool): if True, perform normalization on the features
            before the classifier.
        head_activation (callable): a callable that constructs activation layer.
        head_output_with_global_average (bool): if True, perform global averaging on
            the head output.

    Returns:
        (nn.Module): the X3D network.
    """

    torch._C._log_api_usage_once("PYTORCHVIDEO.model.create_x3d")

    blocks = []
    # Create stem for X3D.
    stem_dim_out = round_width(stem_dim_in, width_factor)
    stem = create_x3d_stem(
        in_channels=input_channel,
        out_channels=stem_dim_out,
        conv_kernel_size=stem_conv_kernel_size,
        conv_stride=stem_conv_stride,
        conv_padding=[size // 2 for size in stem_conv_kernel_size],
        norm=norm,
        norm_eps=norm_eps,
        norm_momentum=norm_momentum,
        activation=activation,
    )
    blocks.append(stem)

    # Compute the depth and dimension for each stage
    stage_depths = [1, 2, 5, 3]
    exp_stage = 2.0
    stage_dim1 = stem_dim_in
    stage_dim2 = round_width(stage_dim1, exp_stage, divisor=8)
    stage_dim3 = round_width(stage_dim2, exp_stage, divisor=8)
    stage_dim4 = round_width(stage_dim3, exp_stage, divisor=8)
    stage_dims = [stage_dim1, stage_dim2, stage_dim3, stage_dim4]

    dim_in = stem_dim_out
    # Create each stage for X3D.
    for idx in range(len(stage_depths)):
        dim_out = round_width(stage_dims[idx], width_factor)
        dim_inner = int(bottleneck_factor * dim_out)
        depth = round_repeats(stage_depths[idx], depth_factor)

        stage_conv_stride = (
            stage_temporal_stride[idx],
            stage_spatial_stride[idx],
            stage_spatial_stride[idx],
        )

        stage = create_x3d_res_stage(
            depth=depth,
            dim_in=dim_in,
            dim_inner=dim_inner,
            dim_out=dim_out,
            bottleneck=bottleneck,
            conv_kernel_size=stage_conv_kernel_size[idx],
            conv_stride=stage_conv_stride,
            norm=norm,
            norm_eps=norm_eps,
            norm_momentum=norm_momentum,
            se_ratio=se_ratio,
            activation=activation,
            inner_act=inner_act,
        )
        blocks.append(stage)
        dim_in = dim_out

    # Create head for X3D.
    total_spatial_stride = stem_conv_stride[1] * np.prod(stage_spatial_stride)
    total_temporal_stride = stem_conv_stride[0] * np.prod(stage_temporal_stride)

    assert (
        input_clip_length >= total_temporal_stride
    ), "Clip length doesn't match temporal stride!"
    assert (
        input_crop_size >= total_spatial_stride
    ), "Crop size doesn't match spatial stride!"

    head_pool_kernel_size = (
        input_clip_length // total_temporal_stride,
        int(math.ceil(input_crop_size / total_spatial_stride)),
        int(math.ceil(input_crop_size / total_spatial_stride)),
    )

    head = create_x3d_head(
        dim_in=dim_out,
        dim_inner=dim_inner,
        dim_out=head_dim_out,
        num_classes=model_num_class,
        pool_act=head_pool_act,
        pool_kernel_size=head_pool_kernel_size,
        norm=norm,
        norm_eps=norm_eps,
        norm_momentum=norm_momentum,
        bn_lin5_on=head_bn_lin5_on,
        dropout_rate=dropout_rate,
        activation=head_activation,
        output_with_global_average=head_output_with_global_average,
    )
    blocks.append(head)
    return Net(blocks=nn.ModuleList(blocks))


class ProjectedPool(nn.Module):
    """
    A pooling module augmented with Conv, Normalization and Activation both
    before and after pooling for the head layer of X3D.

    ::

                                    Conv3d (pre_conv)
                                           ↓
                                 Normalization (pre_norm)
                                           ↓
                                   Activation (pre_act)
                                           ↓
                                        Pool3d
                                           ↓
                                    Conv3d (post_conv)
                                           ↓
                                 Normalization (post_norm)
                                           ↓
                                   Activation (post_act)
    """

    def __init__(
        self,
        *,
        pre_conv: nn.Module = None,
        pre_norm: nn.Module = None,
        pre_act: nn.Module = None,
        pool: nn.Module = None,
        post_conv: nn.Module = None,
        post_norm: nn.Module = None,
        post_act: nn.Module = None,
    ) -> None:
        """
        Args:
            pre_conv (torch.nn.modules): convolutional module.
            pre_norm (torch.nn.modules): normalization module.
            pre_act (torch.nn.modules): activation module.
            pool (torch.nn.modules): pooling module.
            post_conv (torch.nn.modules): convolutional module.
            post_norm (torch.nn.modules): normalization module.
            post_act (torch.nn.modules): activation module.
        """
        super().__init__()
        set_attributes(self, locals())
        assert self.pre_conv is not None
        assert self.pool is not None
        assert self.post_conv is not None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.pre_conv(x)

        if self.pre_norm is not None:
            x = self.pre_norm(x)
        if self.pre_act is not None:
            x = self.pre_act(x)

        x = self.pool(x)
        x = self.post_conv(x)

        if self.post_norm is not None:
            x = self.post_norm(x)
        if self.post_act is not None:
            x = self.post_act(x)
        return x


In [3]:
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

from typing import Any, Optional

import torch.nn as nn
# from pytorchvideo.models.x3d import create_x3d
from torch.hub import load_state_dict_from_url


root_dir = "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics"
checkpoint_paths = {
    "x3d_xs": f"{root_dir}/X3D_XS.pyth",
    "x3d_s": f"{root_dir}/X3D_S.pyth",
    "x3d_m": f"{root_dir}/X3D_M.pyth",
    "x3d_l": f"{root_dir}/X3D_L.pyth",
}


def _x3d(
    pretrained: bool = False,
    progress: bool = True,
    checkpoint_path: Optional[str] = None,
    **kwargs: Any,
) -> nn.Module:
    model = create_x3d(**kwargs)
    if pretrained and checkpoint_path is not None:
        # All models are loaded onto CPU by default
        checkpoint = load_state_dict_from_url(
            checkpoint_path, progress=progress, map_location="cpu"
        )
        state_dict = checkpoint["model_state"]
        model.load_state_dict(state_dict)
    return model


def x3d_xs(
    pretrained: bool = False,
    progress: bool = True,
    **kwargs,
):
    r"""
    X3D-XS model architecture [1] trained on the Kinetics dataset.
    Model with pretrained weights has top1 accuracy of 69.12.

    [1] Christoph Feichtenhofer, "X3D: Expanding Architectures for
    Efficient Video Recognition." https://arxiv.org/abs/2004.04730

    Args:
        pretrained (bool): If True, returns a model pre-trained on the Kinetics dataset
        progress (bool): If True, displays a progress bar of the download to stderr
        kwargs: use these to modify any of the other model settings. All the
            options are defined in pytorchvideo/models/x3d.py

    NOTE: to use the pretrained model, do not modify the model configuration
    via the kwargs. Only modify settings via kwargs to initialize a new model
    without pretrained weights.
    """
    return _x3d(
        pretrained=pretrained,
        progress=progress,
        checkpoint_path=checkpoint_paths["x3d_xs"],
        input_clip_length=4,
        input_crop_size=160,
        **kwargs,
    )


def x3d_s(
    pretrained: bool = False,
    progress: bool = True,
    **kwargs,
):
    """
    X3D-XS model architecture [1] trained on the Kinetics dataset.
    Model with pretrained weights has top1 accuracy of 73.33.

    [1] Christoph Feichtenhofer, "X3D: Expanding Architectures for
    Efficient Video Recognition." https://arxiv.org/abs/2004.04730

    Args:
        pretrained (bool): If True, returns a model pre-trained on the Kinetics dataset
        progress (bool): If True, displays a progress bar of the download to stderr
        kwargs: use these to modify any of the other model settings. All the
            options are defined in pytorchvideo/models/x3d.py

    NOTE: to use the pretrained model, do not modify the model configuration
    via the kwargs. Only modify settings via kwargs to initialize a new model
    without pretrained weights.
    """
    return _x3d(
        pretrained=pretrained,
        progress=progress,
        checkpoint_path=checkpoint_paths["x3d_s"],
        input_clip_length=13,
        input_crop_size=160,
        **kwargs,
    )


def x3d_m(
    pretrained: bool = False,
    progress: bool = True,
    **kwargs,
):
    """
    X3D-XS model architecture [1] trained on the Kinetics dataset.
    Model with pretrained weights has top1 accuracy of 75.94.

    [1] Christoph Feichtenhofer, "X3D: Expanding Architectures for
    Efficient Video Recognition." https://arxiv.org/abs/2004.04730

    Args:
        pretrained (bool): If True, returns a model pre-trained on the Kinetics dataset
        progress (bool): If True, displays a progress bar of the download to stderr
        kwargs: use these to modify any of the other model settings. All the
            options are defined in pytorchvideo/models/x3d.py

    NOTE: to use the pretrained model, do not modify the model configuration
    via the kwargs. Only modify settings via kwargs to initialize a new model
    without pretrained weights.
    """
    return _x3d(
        pretrained=pretrained,
        progress=progress,
        checkpoint_path=checkpoint_paths["x3d_m"],
        input_clip_length=16,
        input_crop_size=224,
        **kwargs,
    )


def x3d_l(
    pretrained: bool = False,
    progress: bool = True,
    **kwargs,
):
    """
    X3D-XS model architecture [1] trained on the Kinetics dataset.
    Model with pretrained weights has top1 accuracy of 77.44.

    [1] Christoph Feichtenhofer, "X3D: Expanding Architectures for
    Efficient Video Recognition." https://arxiv.org/abs/2004.04730

    Args:
        pretrained (bool): If True, returns a model pre-trained on the Kinetics dataset
        progress (bool): If True, displays a progress bar of the download to stderr
        kwargs: use these to modify any of the other model settings. All the
            options are defined in pytorchvideo/models/x3d.py

    NOTE: to use the pretrained model, do not modify the model configuration
    via the kwargs. Only modify settings via kwargs to initialize a new model
    without pretrained weights.
    """
    return _x3d(
        pretrained=pretrained,
        progress=progress,
        checkpoint_path=checkpoint_paths["x3d_l"],
        input_clip_length=16,
        input_crop_size=312,
        depth_factor=5.0,
        **kwargs,
    )


In [4]:
def create_x3d_headless(
    *,
    # Input clip configs.
    input_channel: int = 3,
    input_clip_length: int = 13,
    input_crop_size: int = 160,
    # Model configs.
    model_num_class: int = 400,
    dropout_rate: float = 0.5,
    width_factor: float = 2.0,
    depth_factor: float = 2.2,
    # Normalization configs.
    norm: Callable = nn.BatchNorm3d,
    norm_eps: float = 1e-5,
    norm_momentum: float = 0.1,
    # Activation configs.
    activation: Callable = nn.ReLU,
    # Stem configs.
    stem_dim_in: int = 12,
    stem_conv_kernel_size: Tuple[int] = (5, 3, 3),
    stem_conv_stride: Tuple[int] = (1, 2, 2),
    # Stage configs.
    stage_conv_kernel_size: Tuple[Tuple[int]] = (
        (3, 3, 3),
        (3, 3, 3),
        (3, 3, 3),
        (3, 3, 3),
    ),
    stage_spatial_stride: Tuple[int] = (2, 2, 2, 2),
    stage_temporal_stride: Tuple[int] = (1, 1, 1, 1),
    bottleneck: Callable = create_x3d_bottleneck_block,
    bottleneck_factor: float = 2.25,
    se_ratio: float = 0.0625,
    inner_act: Callable = Swish,
    # Head configs.
    head_dim_out: int = 2048,
    head_pool_act: Callable = nn.ReLU,
    head_bn_lin5_on: bool = False,
    head_activation: Callable = nn.Softmax,
    head_output_with_global_average: bool = True,
) -> nn.Module:

    torch._C._log_api_usage_once("PYTORCHVIDEO.model.create_x3d")

    blocks = []
    # Create stem for X3D.
    stem_dim_out = round_width(stem_dim_in, width_factor)
    stem = create_x3d_stem(
        in_channels=input_channel,
        out_channels=stem_dim_out,
        conv_kernel_size=stem_conv_kernel_size,
        conv_stride=stem_conv_stride,
        conv_padding=[size // 2 for size in stem_conv_kernel_size],
        norm=norm,
        norm_eps=norm_eps,
        norm_momentum=norm_momentum,
        activation=activation,
    )
    blocks.append(stem)

    # Compute the depth and dimension for each stage
    stage_depths = [1, 2, 5, 3]
    exp_stage = 2.0
    stage_dim1 = stem_dim_in
    stage_dim2 = round_width(stage_dim1, exp_stage, divisor=8)
    stage_dim3 = round_width(stage_dim2, exp_stage, divisor=8)
    stage_dim4 = round_width(stage_dim3, exp_stage, divisor=8)
    stage_dims = [stage_dim1, stage_dim2, stage_dim3, stage_dim4]

    dim_in = stem_dim_out
    # Create each stage for X3D.
    for idx in range(len(stage_depths)):
        dim_out = round_width(stage_dims[idx], width_factor)
        dim_inner = int(bottleneck_factor * dim_out)
        depth = round_repeats(stage_depths[idx], depth_factor)

        stage_conv_stride = (
            stage_temporal_stride[idx],
            stage_spatial_stride[idx],
            stage_spatial_stride[idx],
        )

        stage = create_x3d_res_stage(
            depth=depth,
            dim_in=dim_in,
            dim_inner=dim_inner,
            dim_out=dim_out,
            bottleneck=bottleneck,
            conv_kernel_size=stage_conv_kernel_size[idx],
            conv_stride=stage_conv_stride,
            norm=norm,
            norm_eps=norm_eps,
            norm_momentum=norm_momentum,
            se_ratio=se_ratio,
            activation=activation,
            inner_act=inner_act,
        )
        blocks.append(stage)
        dim_in = dim_out

    # Create head for X3D.
    total_spatial_stride = stem_conv_stride[1] * np.prod(stage_spatial_stride)
    total_temporal_stride = stem_conv_stride[0] * np.prod(stage_temporal_stride)

    assert (
        input_clip_length >= total_temporal_stride
    ), "Clip length doesn't match temporal stride!"
    assert (
        input_crop_size >= total_spatial_stride
    ), "Crop size doesn't match spatial stride!"

    head_pool_kernel_size = (
        input_clip_length // total_temporal_stride,
        int(math.ceil(input_crop_size / total_spatial_stride)),
        int(math.ceil(input_crop_size / total_spatial_stride)),
    )

    head = create_x3d_head(
        dim_in=dim_out,
        dim_inner=dim_inner,
        dim_out=head_dim_out,
        num_classes=model_num_class,
        pool_act=head_pool_act,
        pool_kernel_size=head_pool_kernel_size,
        norm=norm,
        norm_eps=norm_eps,
        norm_momentum=norm_momentum,
        bn_lin5_on=head_bn_lin5_on,
        dropout_rate=dropout_rate,
        activation=head_activation,
        output_with_global_average=head_output_with_global_average,
    )
    # blocks.append(head)
    return Net(blocks=nn.ModuleList(blocks)), head

checkpoint = None
def _x3d_headless(
    pretrained: bool = False,
    progress: bool = True,
    checkpoint_path: Optional[str] = None,
    **kwargs: Any,
) -> nn.Module:
    global checkpoint
    model, head = create_x3d_headless(**kwargs)
    if pretrained and checkpoint_path is not None:
        # All models are loaded onto CPU by default
        checkpoint = load_state_dict_from_url(
            checkpoint_path, progress=progress, map_location="cpu"
        )
        state_dict = checkpoint["model_state"]
        model.load_state_dict(state_dict, strict=False)
        for k in head.state_dict().keys():
            if head.state_dict()[k].shape and head.state_dict()[k].shape == state_dict['blocks.5.' + k].shape:
                head.state_dict()[k][:] = state_dict['blocks.5.' + k]
    return model, head


def x3d_s_headless(
    pretrained: bool = False,
    progress: bool = True,
    **kwargs,
):
    """
    X3D-XS model architecture [1] trained on the Kinetics dataset.
    Model with pretrained weights has top1 accuracy of 73.33.

    [1] Christoph Feichtenhofer, "X3D: Expanding Architectures for
    Efficient Video Recognition." https://arxiv.org/abs/2004.04730

    Args:
        pretrained (bool): If True, returns a model pre-trained on the Kinetics dataset
        progress (bool): If True, displays a progress bar of the download to stderr
        kwargs: use these to modify any of the other model settings. All the
            options are defined in pytorchvideo/models/x3d.py

    NOTE: to use the pretrained model, do not modify the model configuration
    via the kwargs. Only modify settings via kwargs to initialize a new model
    without pretrained weights.
    """
    return _x3d_headless(
        pretrained=pretrained,
        progress=progress,
        checkpoint_path=checkpoint_paths["x3d_s"],
        input_clip_length=13,
        input_crop_size=160,
        **kwargs,
    )


In [5]:
# import torch.nn as nn
# import torch
# nn.ReLU()(torch.ones(size = (1, 100)))

In [6]:
class x3d_with_regression(nn.Module):

    def __init__(self, pretrained = True, hidden_dim = 4800, out_features = 4, in_between = 128, **kwargs,) -> None:
        super().__init__()

        self.hidden_dim = hidden_dim
        self.model, self.head = x3d_s_headless(pretrained = pretrained, **kwargs,)
        self.keypoint_head_1 = nn.Linear(in_features=hidden_dim, out_features=in_between)
        self.keypoint_head_2 = nn.Linear(in_features=in_between, out_features = out_features)

        

    def forward(self, input):
        output = self.model(input)
        # print(output.shape)
        output_reshape = output.permute((0, 2, 1, 3, 4)).contiguous().view((-1, self.hidden_dim))
        return self.head(output), self.keypoint_head_2(nn.ReLU()(self.keypoint_head_1(output_reshape)))


In [7]:
def mse_loss(input, target, mask, reduction="mean"):
    out = (input - target.view((-1, 4)))**2
    # print(out.view((-1, 4)).shape)
    # print(mask.view((-1,)).shape)
    out = out.view((-1, 4)) * mask.view((-1, 1))
    if reduction == "mean":
        return out[out != 0].mean()
    elif reduction == "None":
        return out

http://api.labelstud.io/api/projects/{id}/export

In [8]:
import requests

headers = {
    'Authorization': 'Token e4342ac4fcf98c2e1910b122cb4103c059f8bbfc',
}

response = requests.get('https://bilishorturl.ml/api/projects/3/export?exportType=JSON', headers=headers)

import json
annotations = json.loads(response.content)

In [9]:
import os

keypoints_mapping = {}

def getCenter(keypoints):
    for point in keypoints:
        point['center_x'] = point['x'] + point['width'] / 2 
        point['center_y'] = point['y'] + point['height'] / 2

# return_interpolation: When true append whether interpolated at the end
# 1 means exist, 0 means missing
def interpolation(keypoints, frames, return_interpolation):
    prev = keypoints[0]['frame'] - 1
    prev_x = 0
    prev_y = 0
    res = np.zeros((frames, 3 if return_interpolation else 2))
    for i in keypoints:
        diff = i['frame'] - prev
        cur_x = i['center_x']
        cur_y = i['center_y']
        cur = i['frame']
        for j in range(prev + 1, i['frame']):
            # tmp = {'frame': j}
            tmp_x = (prev_x * (cur - j) + cur_x * (j - prev)) / diff
            tmp_y = (prev_y * (cur - j) + cur_y * (j - prev)) / diff

            res[j - 1, :2] = (tmp_x / 100, tmp_y / 100)
            if return_interpolation:
                res[j - 1, -1] = 1
            # tmp['interpolated'] = True
            # res.append(tmp)
        res[cur - 1, :2] = (cur_x / 100, cur_y / 100)
        if return_interpolation:
            res[cur - 1, -1] = 1
        prev_x = cur_x
        prev_y = cur_y
        prev = i['frame']

    return res


labels_name = ['wand tip', 'wand end']
for annotation in annotations:
    vid_name = annotation['file_upload']

    boxes = annotation['annotations'][0]['result']
    
    wand_end_keypoint = None
    wand_tip_keypoint = None
    wand_end_framesCount = None
    wand_tip_framesCount = None

    for i in boxes:
        if 'labels' not in i['value'].keys():
            continue
        if i['value']['labels'][0] == labels_name[0]:
            wand_tip_keypoint = i['value']['sequence']
            wand_tip_framesCount = i['value']['framesCount']
        elif i['value']['labels'][0] == labels_name[1]:
            wand_end_keypoint = i['value']['sequence']
            wand_end_framesCount = i['value']['framesCount']
    
    assert wand_tip_keypoint and wand_end_keypoint, f"missing annotations for {annotation['id']}"
    assert wand_end_framesCount == wand_tip_framesCount, f'frames not matched for {annotation["id"]}'

    framesCount = wand_end_framesCount
    # assert boxes[0]['value']['framesCount'] == boxes[1]['value']['framesCount'], f'frames not matched for {annotation["id"]}'
    # assert len(boxes) >= 2, f"missing annotations for {annotation['id']}"

    
    getCenter(wand_end_keypoint)

    wand_end_keypoint = interpolation(wand_end_keypoint, framesCount, False)


    getCenter(wand_tip_keypoint)

    wand_tip_keypoint = interpolation(wand_tip_keypoint, framesCount, True)


    concat_keypoint = np.zeros((framesCount, 5))

    concat_keypoint[:, :2] = wand_end_keypoint
    concat_keypoint[:, 2:] = wand_tip_keypoint

    
    keypoints_mapping[vid_name] = torch.tensor(concat_keypoint)


In [10]:
# keypoints_mapping['1f0d52f0-IMG_6458_9.mp4']

In [11]:
annotation['file_upload']

'49926b6d-2023-03-26_05_23_04.mp4'

In [12]:
# the directory that contains original videos

import os
source_dir = "G:/.shortcut-targets-by-id/1eyTB0qCfXgrxNsrmWNeLNbd5sTKzP5HT/Data Wizards/dataset/processed_vid"
category_mapping = {"3-24 V": 0, "3-25 bridge": 1, "3-25 R": 2, "Accio": 1, "Avada Kedavra": 3, "Invalid": 4, "Lumos": 0, "Revelio": 2}

vid_class = {} # name in processed_vid : category


for root, dirs, files in os.walk(source_dir):
    tmp_root = root[root.rfind('/') + 1: ]
    tmp_root = tmp_root[tmp_root.rfind('\\') + 1: ]
    category = None if tmp_root not in category_mapping.keys() else category_mapping[tmp_root]
    for name in files:
        if not name.endswith('mp4'):
            continue
        assert category is not None, f"No label at{os.path.join(root, name)} {tmp_root}"

        vid_class[name] = category

In [13]:
# label_key = 'ground_truth'
# vid_class_full = {} # name in video_sync : category 

# for i in annotations:
#     assert 'annotations' in i.keys() and i['annotations'], f"Empty annotations at {i['id']}"
#     assert 'video' in i['data'].keys()
    
#     file_path = i['data']['video']

#     file_name = file_path[file_path.rfind('/') + 1: ]

#     file_name_trim =  file_name[file_name.find('-') + 1:]

#     assert file_name_trim in vid_class.keys(), f"labels not found for {i['id']}"


In [14]:
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
) 
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)

model_name = "x3d_s"
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
frames_per_second = 30
model_transform_params  = {
    "x3d_xs": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 4,
        "sampling_rate": 12,
    },
    "x3d_s": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 13,
        "sampling_rate": 6,
    },
    "x3d_m": {
        "side_size": 256,
        "crop_size": 256,
        "num_frames": 16,
        "sampling_rate": 5,
    }
}

# Get transform parameters based on model
transform_params = model_transform_params[model_name]

# Note that this transform is specific to the slow_R50 model.
transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            # UniformTemporalSubsample(transform_params["num_frames"]),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(size=transform_params["side_size"]),
            CenterCropVideo(
                crop_size=(transform_params["crop_size"], transform_params["crop_size"])
            )
        ]
    ),
)

# The duration of the input clip is also specific to the model.
# clip_duration = (transform_params["num_frames"] * transform_params["sampling_rate"])/frames_per_second



In [15]:
def uniform_temporal_subsample(
    x: torch.Tensor, num_samples: int, temporal_dim: int = -3
) -> torch.Tensor:
    """
    Uniformly subsamples num_samples indices from the temporal dimension of the video.
    When num_samples is larger than the size of temporal dimension of the video, it
    will sample frames based on nearest neighbor interpolation.

    Args:
        x (torch.Tensor): A video tensor with dimension larger than one with torch
            tensor type includes int, long, float, complex, etc.
        num_samples (int): The number of equispaced samples to be selected
        temporal_dim (int): dimension of temporal to perform temporal subsample.

    Returns:
        An x-like Tensor with subsampled temporal dimension.
    """
    t = x.shape[temporal_dim]
    assert num_samples > 0 and t > 0
    # Sample by nearest neighbor interpolation if num_samples > t.
    indices = torch.linspace(0, t - 1, num_samples)
    indices = torch.clamp(indices, 0, t - 1).long()
    return torch.index_select(x, temporal_dim, indices), indices

In [16]:
# from pytorchvideo.data.encoded_video import EncodedVideo

# vid_path = 'G:\\.shortcut-targets-by-id\\1eyTB0qCfXgrxNsrmWNeLNbd5sTKzP5HT\\Data Wizards\\dataset\\videoSync\\0a1aad14-IMG_1629.mp4'

# video = EncodedVideo.from_path(vid_path)

# video_data = video.get_clip(start_sec=0, end_sec=3)

# uniform_temporal_subsample(video_data['video'], 13)[1]

In [17]:
import os
import json
from pytorchvideo.data.encoded_video import EncodedVideo
import gc
import json
import tqdm


vid_file = 'G:\\.shortcut-targets-by-id\\1eyTB0qCfXgrxNsrmWNeLNbd5sTKzP5HT\\Data Wizards\\dataset\\videoSync'

vids_tensor = []
vids_category = []
vids_keypoints = []


for root, dirs, files in os.walk(vid_file):
    for name in tqdm.tqdm(files):
        vid_path = os.path.join(root, name)
        trim_name = name[name.find('-') + 1:]
        if not vid_path.endswith('.mp4') or trim_name not in vid_class.keys():
            print(name)
            continue
        video = EncodedVideo.from_path(vid_path)
        video_data = video.get_clip(start_sec=0, end_sec=3)
        del video
        gc.collect()
        video_cropped, indices = uniform_temporal_subsample(video_data['video'], transform_params["num_frames"])
        vids_tensor.append(transform({'video':video_cropped})['video'])
        vids_category.append(vid_class[trim_name])
        if name in keypoints_mapping:
            vids_keypoints.append(keypoints_mapping[name][indices])
        else:
            vids_keypoints.append(torch.zeros(size=(transform_params["num_frames"], 5)))
        

 73%|███████▎  | 289/395 [05:21<01:56,  1.10s/it]

647db7de-0a1aad14-IMG_1629.mp4


 80%|███████▉  | 315/395 [05:51<01:36,  1.21s/it]

4ec71381-04-06-2023-31_1.mp4


 81%|████████  | 318/395 [05:54<01:16,  1.01it/s]

05c834cd-04-06-2023-32_1.mp4


 84%|████████▍ | 333/395 [06:11<01:15,  1.21s/it]

8728fbe4-04-06-2023-30_1.mp4


 91%|█████████ | 360/395 [06:43<00:43,  1.23s/it]

a3da0b39-04-06-2023-8_1.mp4
475a799b-04-06-2023-4_1.mp4


 95%|█████████▍| 374/395 [06:57<00:25,  1.20s/it]

68bf2412-04-06-2023-6_1.mp4
6f9f9743-04-06-2023-15_1.mp4


 96%|█████████▌| 378/395 [06:59<00:14,  1.18it/s]

e61c79ee-04-06-2023-7_1.mp4


 97%|█████████▋| 385/395 [07:07<00:10,  1.07s/it]

f4da5e75-04-06-2023-3_1.mp4


100%|██████████| 395/395 [07:16<00:00,  1.11s/it]


desktop.ini


100%|██████████| 1/1 [00:00<?, ?it/s]

desktop.ini





In [18]:
import random
size = len(vid_class)

vids_concat = list(zip(vids_tensor, vids_keypoints, vids_category))

random.shuffle(vids_concat)

train_data = vids_concat[:int(size * 0.85)]
val_data = vids_concat[int(size * 0.85):]

In [19]:
from torch.utils.data import Dataset

class VidClsDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, data):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        return self.data[idx]
    
train_dataset = VidClsDataset(train_data)
val_dataset = VidClsDataset(val_data)

In [20]:
from torch.utils.data import DataLoader
batch_size = 4

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

In [21]:
device = "cuda"
model = x3d_with_regression(pretrained = True, hidden_dim= 6912, model_num_class = 5, head_activation = None).to(device)
# model = x3d_s(pretrained=True)

In [22]:
from torch.nn import CrossEntropyLoss
from torch.optim import SGD, Adam
import numpy as np
import math


beta = 0.1
epoch = 2
num_categories = 5


optimizer = Adam(model.parameters(), lr= 1e-5)
train_size = len(vids_tensor)
steps = math.ceil(train_size / batch_size)
crossEntropy = CrossEntropyLoss()

# input = [torch.zeros(size=(batch_size, 3, slow_num_frames, crop_size, crop_size), device= device), 
#         torch.zeros(size=(batch_size, 3, num_frames, crop_size, crop_size), device= device)]


In [23]:
def train(model, epoch, optimizer, train_dataloader, val_dataloader):
    for epoch_i in range(0, epoch):
        model.train()
    
        loss_list = []
        all = 0
        correct = 0

    
        for vid, keypoint, target in tqdm.tqdm(train_dataloader):

            # print(input[0].shape)
            vid = vid.to(device)
            keypoint = keypoint.to(device)
            target = target.to(device)


            output = model(vid)
        
        
            cls_loss = crossEntropy(output[0], target)

            reg_loss = mse_loss(output[1], keypoint[:,:,:4], keypoint[:,:,4]) * beta

            loss = cls_loss
            if not reg_loss.isnan().cpu().item():
                loss += reg_loss


            # print(reg_loss)

            correct += torch.sum(torch.argmax(output[0], dim= 1) == target).item()
            all += vid.shape[0]


        
            optimizer.zero_grad()
            loss.backward()
            loss_list.append(loss.item())
            optimizer.step()

            del vid
            del keypoint
            del target

            torch.cuda.empty_cache()
            gc.collect()
        
        print(epoch_i,' train loss:', np.mean(loss_list))
        print('train acc:', correct / all)
    

        correct = 0
        all = 0

        model.eval()
        for vid, keypoint, target in tqdm.tqdm(val_dataloader):

            # print(input[0].shape)
            vid = vid.to(device)
            keypoint = keypoint.to(device)
            target = target.to(device)


            output = model(vid)

            # print(reg_loss)

            correct += torch.sum(torch.argmax(output[0], dim= 1) == target).item()
            all += vid.shape[0]

            del vid
            del keypoint
            del target

            torch.cuda.empty_cache()
            gc.collect()
        print('val acc:', correct / all)

In [24]:
train(model, 20, optimizer, train_dataloader, val_dataloader)

100%|██████████| 70/70 [00:20<00:00,  3.36it/s]


0  train loss: 1.5955128261021205
train acc: 0.28776978417266186


100%|██████████| 106/106 [00:10<00:00, 10.13it/s]


val acc: 0.330188679245283


100%|██████████| 70/70 [00:19<00:00,  3.55it/s]


1  train loss: 1.4486249583108084
train acc: 0.49640287769784175


100%|██████████| 106/106 [00:10<00:00, 10.49it/s]


val acc: 0.5754716981132075


100%|██████████| 70/70 [00:19<00:00,  3.52it/s]


2  train loss: 1.2992426242147173
train acc: 0.7050359712230215


100%|██████████| 106/106 [00:10<00:00, 10.52it/s]


val acc: 0.6792452830188679


100%|██████████| 70/70 [00:19<00:00,  3.58it/s]


3  train loss: 1.144862448317664
train acc: 0.7266187050359713


100%|██████████| 106/106 [00:10<00:00, 10.51it/s]


val acc: 0.6698113207547169


100%|██████████| 70/70 [00:19<00:00,  3.58it/s]


4  train loss: 1.0108079663344791
train acc: 0.7553956834532374


100%|██████████| 106/106 [00:10<00:00, 10.53it/s]


val acc: 0.6698113207547169


100%|██████████| 70/70 [00:19<00:00,  3.57it/s]


5  train loss: 0.8881192105157035
train acc: 0.7985611510791367


100%|██████████| 106/106 [00:10<00:00, 10.51it/s]


val acc: 0.6792452830188679


100%|██████████| 70/70 [00:19<00:00,  3.56it/s]


6  train loss: 0.7715329787560872
train acc: 0.8309352517985612


100%|██████████| 106/106 [00:10<00:00, 10.58it/s]


val acc: 0.7547169811320755


100%|██████████| 70/70 [00:19<00:00,  3.54it/s]


7  train loss: 0.6423448319946017
train acc: 0.8129496402877698


100%|██████████| 106/106 [00:10<00:00, 10.51it/s]


val acc: 0.7924528301886793


100%|██████████| 70/70 [00:19<00:00,  3.59it/s]


8  train loss: 0.577091532094138
train acc: 0.841726618705036


100%|██████████| 106/106 [00:10<00:00, 10.51it/s]


val acc: 0.7830188679245284


100%|██████████| 70/70 [00:19<00:00,  3.60it/s]


9  train loss: 0.5078424049275262
train acc: 0.8812949640287769


100%|██████████| 106/106 [00:10<00:00, 10.57it/s]


val acc: 0.8584905660377359


100%|██████████| 70/70 [00:19<00:00,  3.55it/s]


10  train loss: 0.4668771888528551
train acc: 0.8848920863309353


100%|██████████| 106/106 [00:09<00:00, 10.66it/s]


val acc: 0.8584905660377359


100%|██████████| 70/70 [00:19<00:00,  3.57it/s]


11  train loss: 0.44497772572296007
train acc: 0.8669064748201439


100%|██████████| 106/106 [00:10<00:00, 10.57it/s]


val acc: 0.8584905660377359


100%|██████████| 70/70 [00:19<00:00,  3.55it/s]


12  train loss: 0.38802718264716013
train acc: 0.9028776978417267


100%|██████████| 106/106 [00:10<00:00, 10.51it/s]


val acc: 0.8679245283018868


100%|██████████| 70/70 [00:19<00:00,  3.55it/s]


13  train loss: 0.34742201971156256
train acc: 0.9100719424460432


100%|██████████| 106/106 [00:10<00:00, 10.56it/s]


val acc: 0.8584905660377359


100%|██████████| 70/70 [00:19<00:00,  3.56it/s]


14  train loss: 0.31858099890606745
train acc: 0.9316546762589928


100%|██████████| 106/106 [00:10<00:00, 10.51it/s]


val acc: 0.8584905660377359


100%|██████████| 70/70 [00:19<00:00,  3.54it/s]


15  train loss: 0.282576433090227
train acc: 0.9172661870503597


100%|██████████| 106/106 [00:10<00:00, 10.49it/s]


val acc: 0.8962264150943396


100%|██████████| 70/70 [00:19<00:00,  3.52it/s]


16  train loss: 0.25782322686697756
train acc: 0.9460431654676259


100%|██████████| 106/106 [00:10<00:00, 10.42it/s]


val acc: 0.8962264150943396


100%|██████████| 70/70 [00:20<00:00,  3.43it/s]


17  train loss: 0.2595900952283825
train acc: 0.9316546762589928


100%|██████████| 106/106 [00:10<00:00, 10.33it/s]


val acc: 0.9056603773584906


100%|██████████| 70/70 [00:19<00:00,  3.52it/s]


18  train loss: 0.2208694228636367
train acc: 0.935251798561151


100%|██████████| 106/106 [00:10<00:00, 10.55it/s]


val acc: 0.9056603773584906


100%|██████████| 70/70 [00:19<00:00,  3.61it/s]


19  train loss: 0.1833762344771198
train acc: 0.9568345323741008


100%|██████████| 106/106 [00:10<00:00, 10.56it/s]

val acc: 0.8962264150943396





In [25]:
train(model, 20, optimizer, train_dataloader, val_dataloader)

100%|██████████| 70/70 [00:19<00:00,  3.60it/s]


0  train loss: 0.18964495190552302
train acc: 0.9712230215827338


100%|██████████| 106/106 [00:10<00:00, 10.56it/s]


val acc: 0.9056603773584906


100%|██████████| 70/70 [00:19<00:00,  3.56it/s]


1  train loss: 0.15392648026879344
train acc: 0.9784172661870504


100%|██████████| 106/106 [00:10<00:00, 10.53it/s]


val acc: 0.8962264150943396


100%|██████████| 70/70 [00:19<00:00,  3.57it/s]


2  train loss: 0.1527777915288295
train acc: 0.9748201438848921


100%|██████████| 106/106 [00:10<00:00, 10.47it/s]


val acc: 0.9339622641509434


100%|██████████| 70/70 [00:19<00:00,  3.59it/s]


3  train loss: 0.10323269681206772
train acc: 0.9928057553956835


100%|██████████| 106/106 [00:10<00:00, 10.38it/s]


val acc: 0.9150943396226415


100%|██████████| 70/70 [00:19<00:00,  3.56it/s]


4  train loss: 0.097722680707063
train acc: 0.9928057553956835


100%|██████████| 106/106 [00:10<00:00, 10.47it/s]


val acc: 0.9433962264150944


100%|██████████| 70/70 [00:20<00:00,  3.42it/s]


5  train loss: 0.1012622277252376
train acc: 0.9784172661870504


100%|██████████| 106/106 [00:10<00:00,  9.66it/s]


val acc: 0.9339622641509434


100%|██████████| 70/70 [00:20<00:00,  3.50it/s]


6  train loss: 0.0953673047412719
train acc: 0.9892086330935251


100%|██████████| 106/106 [00:10<00:00,  9.76it/s]


val acc: 0.9245283018867925


100%|██████████| 70/70 [00:20<00:00,  3.46it/s]


7  train loss: 0.09923220352668848
train acc: 0.9892086330935251


100%|██████████| 106/106 [00:11<00:00,  9.44it/s]


val acc: 0.9150943396226415


100%|██████████| 70/70 [00:21<00:00,  3.31it/s]


8  train loss: 0.11529366948774883
train acc: 0.9784172661870504


100%|██████████| 106/106 [00:11<00:00,  9.27it/s]


val acc: 0.9150943396226415


100%|██████████| 70/70 [00:20<00:00,  3.49it/s]


9  train loss: 0.07950218069101019
train acc: 0.9892086330935251


100%|██████████| 106/106 [00:10<00:00,  9.65it/s]


val acc: 0.9056603773584906


100%|██████████| 70/70 [00:20<00:00,  3.46it/s]


10  train loss: 0.06356943374765771
train acc: 0.9964028776978417


100%|██████████| 106/106 [00:11<00:00,  9.60it/s]


val acc: 0.9150943396226415


100%|██████████| 70/70 [00:20<00:00,  3.39it/s]


11  train loss: 0.05154777321565364
train acc: 0.9964028776978417


100%|██████████| 106/106 [00:10<00:00,  9.72it/s]


val acc: 0.9150943396226415


100%|██████████| 70/70 [00:20<00:00,  3.48it/s]


12  train loss: 0.04982759889348277
train acc: 0.9964028776978417


100%|██████████| 106/106 [00:11<00:00,  9.47it/s]


val acc: 0.9150943396226415


100%|██████████| 70/70 [00:20<00:00,  3.44it/s]


13  train loss: 0.05568362303809928
train acc: 0.9856115107913669


100%|██████████| 106/106 [00:11<00:00,  9.23it/s]


val acc: 0.9245283018867925


100%|██████████| 70/70 [00:20<00:00,  3.49it/s]


14  train loss: 0.031931557369950625
train acc: 1.0


100%|██████████| 106/106 [00:11<00:00,  9.47it/s]


val acc: 0.9150943396226415


100%|██████████| 70/70 [00:20<00:00,  3.49it/s]


15  train loss: 0.044167224261244496
train acc: 1.0


100%|██████████| 106/106 [00:11<00:00,  9.36it/s]


val acc: 0.9339622641509434


100%|██████████| 70/70 [00:19<00:00,  3.53it/s]


16  train loss: 0.03647206047815936
train acc: 1.0


100%|██████████| 106/106 [00:10<00:00, 10.51it/s]


val acc: 0.9150943396226415


100%|██████████| 70/70 [00:19<00:00,  3.58it/s]


17  train loss: 0.049310640039454616
train acc: 0.9856115107913669


100%|██████████| 106/106 [00:10<00:00, 10.43it/s]


val acc: 0.9150943396226415


100%|██████████| 70/70 [00:19<00:00,  3.55it/s]


18  train loss: 0.03312025060211973
train acc: 0.9964028776978417


100%|██████████| 106/106 [00:10<00:00, 10.59it/s]


val acc: 0.9150943396226415


100%|██████████| 70/70 [00:19<00:00,  3.59it/s]


19  train loss: 0.04727803177000689
train acc: 0.9964028776978417


100%|██████████| 106/106 [00:10<00:00, 10.41it/s]

val acc: 0.9150943396226415





In [22]:
tmp = torch.zeros((1,1), device=device)
type(tmp[tmp != 0].mean().isnan().cpu().item())

bool

In [23]:
loss

[1.6006215810775757,
 1.6522184610366821,
 1.5508222579956055,
 1.5435348749160767,
 1.5520023107528687,
 1.5453161001205444,
 1.5513572692871094,
 1.5396989583969116,
 1.5261726379394531,
 1.53245210647583,
 1.4759129285812378,
 1.4179316759109497,
 1.5488102436065674,
 1.526351809501648,
 1.5203030109405518,
 1.4540609121322632,
 1.5517607927322388,
 1.5476685762405396,
 1.5017987489700317,
 1.5352576971054077,
 1.560221791267395,
 1.5328083038330078,
 1.5226811170578003,
 1.502122402191162,
 1.433959722518921,
 1.4614757299423218,
 1.5298354625701904,
 1.3777358531951904,
 1.4819597005844116,
 1.6072654724121094,
 1.4554775953292847,
 1.4510821104049683,
 1.4250015020370483,
 1.4322353601455688,
 1.4579918384552002]

In [24]:
output[1].shape

torch.Size([13, 4])

In [25]:
keypoint[:,:,:4].view((-1, 4)).shape

NameError: name 'keypoint' is not defined

In [None]:
keypoint[:,:,:4]

tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]], device='cuda:0')

In [None]:
vids_tensor[0].shape

In [None]:
torch.save(model.state_dict(), "x3d_regression_4_11.pt")

In [None]:
model(torch.zeros(size=vids_tensor[0].shape, device=device).unsqueeze(0))

(tensor([[-0.1644, -0.0496, -0.0880,  0.0246, -0.2376]], device='cuda:0',
        grad_fn=<ViewBackward0>),
 tensor([[ 0.7603, -1.8393, -2.2067],
         [ 0.8296, -1.9309, -2.0818],
         [ 0.8647, -0.9451, -1.9459],
         [ 1.3491, -0.5821, -1.8753],
         [ 1.3601, -0.4220, -2.1563],
         [ 1.4539, -0.9777, -2.2140],
         [ 1.6425, -0.9868, -2.0187],
         [ 1.5443, -0.8109, -2.4173],
         [ 1.7484, -0.7446, -1.9516],
         [ 1.8273, -0.3302, -2.0197],
         [ 1.3747, -1.2899, -1.1885],
         [ 0.3637, -2.4861, -0.8878],
         [-0.0671, -2.1500, -0.9555]], device='cuda:0',
        grad_fn=<AddmmBackward0>))

In [None]:
target.shape

torch.Size([2])