In [2]:
import os
os.chdir("..")
import logging
import time
import numpy as np
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from SoccerNet.Evaluation.MV_FoulRecognition import evaluate
import torch
from dataset import MultiViewDataset
from train import trainer, evaluation
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from model import MVNetwork
from config.classes import EVENT_DICTIONARY, INVERSE_EVENT_DICTIONARY
from torchvision.models.video import R3D_18_Weights, MC3_18_Weights
from torchvision.models.video import R2Plus1D_18_Weights, S3D_Weights
from torchvision.models.video import MViT_V2_S_Weights, MViT_V1_B_Weights
from torchvision.models.video import mvit_v2_s, MViT_V2_S_Weights, mvit_v1_b, MViT_V1_B_Weights
import einops

```python
# Command-line interface of the training script: every tunable is declared here
# together with its default value. The notebook cells further down assign
# same-named plain variables (LR, gamma, step_size, ...) by hand instead of
# reading them from `args`.
parser = ArgumentParser(description='my method', formatter_class=ArgumentDefaultsHelpFormatter)    
parser.add_argument('--path',   required=True, type=str, help='Path to the dataset folder' )
parser.add_argument('--max_epochs',   required=False, type=int,   default=60,     help='Maximum number of epochs' )
parser.add_argument('--model_name',   required=False, type=str,   default="VARS",     help='named of the model to save' )
parser.add_argument('--batch_size', required=False, type=int,   default=2,     help='Batch size' )
parser.add_argument('--LR',       required=False, type=float,   default=1e-04, help='Learning Rate' )
parser.add_argument('--GPU',        required=False, type=int,   default=-1,     help='ID of the GPU to use' )
parser.add_argument('--max_num_worker',   required=False, type=int,   default=1, help='number of worker to load data')
parser.add_argument('--loglevel',   required=False, type=str,   default='INFO', help='logging level')
parser.add_argument("--continue_training", required=False, action='store_true', help="Continue training")
parser.add_argument("--num_views", required=False, type=int, default=5, help="Number of views")
# The flags below that default to "Yes" are plain strings, not booleans.
parser.add_argument("--data_aug", required=False, type=str, default="Yes", help="Data augmentation")
parser.add_argument("--pre_model", required=False, type=str, default="r2plus1d_18", help="Name of the pretrained model")
parser.add_argument("--pooling_type", required=False, type=str, default="max", help="Which type of pooling should be done")
parser.add_argument("--weighted_loss", required=False, type=str, default="Yes", help="If the loss should be weighted")
parser.add_argument("--start_frame", required=False, type=int, default=0, help="The starting frame")
parser.add_argument("--end_frame", required=False, type=int, default=125, help="The ending frame")
parser.add_argument("--fps", required=False, type=int, default=25, help="Number of frames per second")
parser.add_argument("--step_size", required=False, type=int, default=3, help="StepLR parameter")
parser.add_argument("--gamma", required=False, type=float, default=0.1, help="StepLR parameter")
parser.add_argument("--weight_decay", required=False, type=float, default=0.001, help="Weight decacy")

# Run mode selector; the meaning of each integer is spelled out in the help text.
parser.add_argument("--only_evaluation", required=False, type=int, default=3, help="Only evaluation, 0 = on test set, 1 = on chall set, 2 = on both sets and 3 = train/valid/test")
parser.add_argument("--path_to_model_weights", required=False, type=str, default="", help="Path to the model weights")

# Parses the process command line; --path is the only argument declared as required.
args = parser.parse_args()
```

In [3]:
# --- Optimiser / LR-scheduler settings --------------------------------------
LR = 1e-04
gamma = 0.1
step_size = 3
weight_decay = 0.001

# --- Clip window and temporal sampling --------------------------------------
start_frame = 60
end_frame = 87
# Frame rate the clip window is expressed in. This used to be a bare literal 25
# repeated inside the frame-count expressions below.
source_fps = 25
fps = 10

# Frame count of the window after resampling from source_fps to fps.
# The previous expression divided the window length by (window length / x),
# which is simply x again; it is written directly here.
# number_of_frames2 keeps the fractional value, number_of_frames truncates it.
# NOTE(review): with the values above this yields 10, while the dataset sample
# printed further down in this notebook has 11 steps on its time axis. In the
# cells visible here the value is only used to name the output folder - confirm
# which count is intended.
number_of_frames2 = ((end_frame - start_frame) / source_fps) * fps
number_of_frames = int(number_of_frames2)

# --- Model / data settings --------------------------------------------------
model_name = "VARS"
pre_model = "mvit_v2_s"
num_views = 2
batch_size = 4
data_aug = 'Yes'
path = 'dataset224p'
pooling_type = "max"
weighted_loss = "Yes"
max_num_worker = 0
max_epochs = 50

# --- Run mode ---------------------------------------------------------------
only_evaluation = 3
path_to_model_weights = ""

In [4]:
    # Translate the textual level name into the numeric constant used by logging.
    level_name = 'INFO'
    numeric_level = getattr(logging, level_name.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % level_name)

    # Output folder layout: models/<model_name>/<num_views>/<pre_model>/<LR>/<run tag>.
    # The run tag encodes batch size, frame count, gamma and scheduler step size.
    run_tag = "_B" + str(batch_size) + "_F" + str(number_of_frames) + "_S" + "_G" + str(gamma) + "_Step" + str(step_size)
    best_model_path = os.path.join("models", model_name, str(num_views), pre_model, str(LR), run_tag)
    os.makedirs(best_model_path, exist_ok=True)

    log_path = os.path.join(best_model_path, "logging.log")

    # Send every record both to the log file inside the run folder and to the stream handler.
    log_handlers = [logging.FileHandler(log_path), logging.StreamHandler()]
    logging.basicConfig(
        level=numeric_level,
        format="%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s",
        handlers=log_handlers,
    )

In [5]:
    # Augmentation pipeline handed to the training dataset as `transform`.
    # It stays None unless data_aug is exactly 'Yes'.
    transformAug = None
    if data_aug == 'Yes':
        augmentation_steps = [
            transforms.RandomAffine(degrees=(0, 0), translate=(0.1, 0.1), scale=(0.9, 1)),
            transforms.RandomPerspective(distortion_scale=0.3, p=0.5),
            transforms.RandomRotation(degrees=5),
            transforms.ColorJitter(brightness=0.5, saturation=0.5, contrast=0.5),
            transforms.RandomHorizontalFlip(),
        ]
        transformAug = transforms.Compose(augmentation_steps)

    # Preprocessing preset that ships with each supported backbone's weights.
    kinetics_weights = {
        "r3d_18": R3D_18_Weights.KINETICS400_V1,
        "s3d": S3D_Weights.KINETICS400_V1,
        "mc3_18": MC3_18_Weights.KINETICS400_V1,
        "r2plus1d_18": R2Plus1D_18_Weights.KINETICS400_V1,
        "mvit_v2_s": MViT_V2_S_Weights.KINETICS400_V1,
    }
    backbone_known = pre_model in kinetics_weights
    # Unknown names fall back to the r2plus1d_18 preset.
    transforms_model = kinetics_weights.get(pre_model, R2Plus1D_18_Weights.KINETICS400_V1).transforms()
    if not backbone_known:
        print("Warning: Could not find the desired pretrained model")
        print("Possible options are: r3d_18, s3d, mc3_18, mvit_v2_s and r2plus1d_18")
        print("We continue with r2plus1d_18")

In [6]:
transforms_model

VideoClassification(
    crop_size=[224, 224]
    resize_size=[256]
    mean=[0.45, 0.45, 0.45]
    std=[0.225, 0.225, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [7]:
        # Arguments shared by the training and validation datasets.
        common_dataset_args = dict(path=path, start=start_frame, end=end_frame, fps=fps,
                                   transform_model=transforms_model)

        # Training split: configured number of views plus the augmentation pipeline.
        dataset_Train = MultiViewDataset(split='train', num_views=num_views,
                                         transform=transformAug, **common_dataset_args)
        # Validation split: num_views fixed at 5 and no augmentation transform passed.
        dataset_Valid2 = MultiViewDataset(split='valid', num_views=5, **common_dataset_args)

2319
321


In [8]:
print(dataset_Train.__getitem__(49)[2].shape)


torch.Size([2, 3, 11, 224, 224])


In [9]:
        # Build the loaders for the training and validation datasets.
        # Worker count and pinned memory are identical for both.
        loader_options = dict(num_workers=max_num_worker, pin_memory=True)

        # NOTE(review): the training loader is not shuffled while the validation
        # loader is. That keeps the first training batch fixed for the inspection
        # cells that follow, but it is the opposite of the usual setup - confirm
        # it is intentional before reusing this cell for real training.
        train_loader = torch.utils.data.DataLoader(
            dataset_Train, batch_size=batch_size, shuffle=False, **loader_options)

        val_loader2 = torch.utils.data.DataLoader(
            dataset_Valid2, batch_size=1, shuffle=True, **loader_options)

In [10]:
train_iter = next(iter(train_loader))

In [11]:
train_iter[0], train_iter[0].shape

(tensor([[0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]]),
 torch.Size([4, 4]))

In [12]:
train_iter[1], train_iter[1].shape

(tensor([[0., 0., 0., 0., 0., 0., 1., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0.]]),
 torch.Size([4, 8]))

In [13]:
train_iter[2].shape # batch size, views, channels, time/depth, height, width

torch.Size([4, 2, 3, 11, 224, 224])

In [14]:
train_iter[3] # list of images for the given batch

['0', '1', '2', '3']

In [15]:
val_iter = next(iter(val_loader2))


In [16]:
val_iter[2].shape, val_iter[3]

(torch.Size([1, 2, 3, 11, 224, 224]), ['71'])

In [17]:
from model import MVNetwork

In [18]:
pre_model = "mvit_v2_s"

In [19]:
model = MVNetwork(net_name=pre_model)

In [20]:
videos = torch.randn(4,2, 3, 11, 224, 224)
output = model(videos)

ValueError: not enough values to unpack (expected 6, got 5)

## Metoda ze starego algorytmu 

In [22]:
from torchvision.models.video import r2plus1d_18, R2Plus1D_18_Weights

In [23]:
weights_model = R2Plus1D_18_Weights.DEFAULT
network = r2plus1d_18(weights=weights_model)

In [24]:
from utils import batch_tensor

In [71]:
batch_tensor(train_iter[2],dim=1,squeeze=True).shape

torch.Size([8, 3, 11, 224, 224])

In [None]:
network(batch_tensor(train_iter[2],dim=1,squeeze=True)).shape

In [None]:
# network(val_iter[2]).shape  # would raise an error: the network takes a 5-D tensor, and val_iter[2] still carries the extra views axis

## Nowy algorytm - siec debug

In [27]:
weights_model = MViT_V2_S_Weights.DEFAULT
model = mvit_v2_s(weights=weights_model)
n=2


In [28]:
from torchsummary import summary

In [29]:
for name, params in model.named_parameters():
    print(name, params.shape)

conv_proj.weight torch.Size([96, 3, 3, 7, 7])
conv_proj.bias torch.Size([96])
pos_encoding.class_token torch.Size([96])
blocks.0.norm1.weight torch.Size([96])
blocks.0.norm1.bias torch.Size([96])
blocks.0.norm2.weight torch.Size([96])
blocks.0.norm2.bias torch.Size([96])
blocks.0.attn.rel_pos_h torch.Size([111, 96])
blocks.0.attn.rel_pos_w torch.Size([111, 96])
blocks.0.attn.rel_pos_t torch.Size([15, 96])
blocks.0.attn.qkv.weight torch.Size([288, 96])
blocks.0.attn.qkv.bias torch.Size([288])
blocks.0.attn.project.0.weight torch.Size([96, 96])
blocks.0.attn.project.0.bias torch.Size([96])
blocks.0.attn.pool_q.pool.weight torch.Size([96, 1, 3, 3, 3])
blocks.0.attn.pool_q.norm_act.0.weight torch.Size([96])
blocks.0.attn.pool_q.norm_act.0.bias torch.Size([96])
blocks.0.attn.pool_k.pool.weight torch.Size([96, 1, 3, 3, 3])
blocks.0.attn.pool_k.norm_act.0.weight torch.Size([96])
blocks.0.attn.pool_k.norm_act.0.bias torch.Size([96])
blocks.0.attn.pool_v.pool.weight torch.Size([96, 1, 3, 3, 3])

In [30]:
model.head[1].in_features # was unsure whether this is 96 or 768; the output below shows 768

768

In [31]:
input_x = train_iter[2]
input_x.shape

torch.Size([4, 2, 3, 11, 224, 224])

In [32]:
input_x = einops.rearrange(input_x, 'b n c d h w -> (b n) c d h w')

In [33]:
input_x.shape

torch.Size([8, 3, 11, 224, 224])

In [34]:
x = model.conv_proj(input_x)
print(x.shape)
thw_shape = x.shape[2:]
thw_shape

torch.Size([8, 96, 6, 56, 56])


torch.Size([6, 56, 56])

In [35]:
B, C, D, H, W = x.shape

In [36]:
x = x.view(B, C, D * H * W).transpose(1,2)
x.shape

torch.Size([8, 18816, 96])

In [37]:
x_pos=model.pos_encoding(x)

In [38]:
x_pos.shape

torch.Size([8, 18817, 96])

In [39]:
tokens_per_frame = x_pos.shape[1];tokens_per_frame

18817

In [40]:
output_dict = {'single': {}}
if n > 1:
    output_dict['mv_collection'] = {}

In [41]:
output_dict

{'single': {}, 'mv_collection': {}}

In [42]:
# Run the single-view token sequence through every MViT block and trace the shapes.
tokens = x_pos.clone()
# Each block returns the grid it pooled down to. Track it in a cell-local name so
# the notebook-level `thw_shape` keeps the grid measured right after conv_proj.
# Overwriting it made this cell fail on a second run, because the final grid was
# then paired with the full-length token sequence.
block_thw = tuple(thw_shape)
for block in model.blocks:
    tokens, block_thw = block(tokens, block_thw)
    print(tokens.shape, block_thw)

tokens = model.norm(tokens)
print(tokens.shape)

torch.Size([8, 18817, 96]) (6, 56, 56)
torch.Size([8, 4705, 192]) (6, 28, 28)
torch.Size([8, 4705, 192]) (6, 28, 28)
torch.Size([8, 1177, 384]) (6, 14, 14)
torch.Size([8, 1177, 384]) (6, 14, 14)
torch.Size([8, 1177, 384]) (6, 14, 14)
torch.Size([8, 1177, 384]) (6, 14, 14)
torch.Size([8, 1177, 384]) (6, 14, 14)
torch.Size([8, 1177, 384]) (6, 14, 14)
torch.Size([8, 1177, 384]) (6, 14, 14)
torch.Size([8, 1177, 384]) (6, 14, 14)
torch.Size([8, 1177, 384]) (6, 14, 14)
torch.Size([8, 1177, 384]) (6, 14, 14)
torch.Size([8, 1177, 384]) (6, 14, 14)
torch.Size([8, 295, 768]) (6, 7, 7)
torch.Size([8, 295, 768]) (6, 7, 7)
torch.Size([8, 295, 768])


In [43]:
tokens.shape

torch.Size([8, 295, 768])

In [44]:
model.pos_encoding(x_pos).shape

torch.Size([8, 18818, 96])

In [45]:
    def format_multi_frame_tokens(x, batch_size, tokens_per_frame):
        """
        Merge the token sequences of the n views of each sample into one sequence.

        The views are laid end to end along the token axis. When the backbone's
        positional encoding has a class token, only the class token of the first
        view is kept. Every remaining token then receives the L2-normalised
        embedding of the view it came from.

        Reads the notebook-level names `n`, `model` and `img_embed_matrix`.

        Args:
            x (Tensor): Input tensor with shape [batch_size * n, tokens, channels].
            batch_size (int): Original batch size.
            tokens_per_frame (int): Number of tokens per view as found in `x`
                (i.e. x.shape[1]); this count includes the class token when the
                positional encoding adds one.

        Returns:
            Tensor: New tensor of shape
            [batch_size, n * patch_tokens + (1 if class token else 0), channels].
            The input tensor is not modified.
        """
        print(f"Initial shape of x: {x.shape}")
        # Lay the n views of every sample end to end along the token axis.
        x = einops.rearrange(x, '(b n) s c -> b (n s) c', b=batch_size, n=n)
        print(f"Shape after rearrange: {x.shape}")

        first_img_token_idx = 0
        patch_tokens_per_frame = tokens_per_frame

        # Handle cls_token if present
        if hasattr(model.pos_encoding, 'class_token'):
            # tokens_per_frame counts the class token, so the patch tokens are one fewer.
            patch_tokens_per_frame = tokens_per_frame - 1
            for i in range(1, n):
                # Once the i - 1 earlier duplicates are gone, the class token of
                # view i sits right after the kept class token and i full runs of
                # patch tokens.
                excess_cls_index = i * patch_tokens_per_frame + 1
                # Drop exactly that one token. The previous index was computed
                # from the count that includes the class token and skipped two
                # entries, so it removed patch tokens and kept the class token.
                x = torch.cat((x[:, :excess_cls_index], x[:, excess_cls_index + 1:]), dim=1)
                print(f"Shape after removing cls token at frame {i}: {x.shape}")
            first_img_token_idx = 1

        # Normalize and add image embeddings
        image_embeddings = F.normalize(img_embed_matrix, dim=-1)
        print(f"Image embeddings shape: {image_embeddings.shape}")
        # One copy of each view's embedding per patch token of that view.
        repeated_embeddings = torch.repeat_interleave(image_embeddings, patch_tokens_per_frame, dim=1)
        print(f"Repeated embeddings shape: {repeated_embeddings.shape}")

        # Out-of-place add: the rearranged tensor can share storage with the
        # caller's tensor, which an in-place += would overwrite.
        x = torch.cat((x[:, :first_img_token_idx],
                       x[:, first_img_token_idx:] + repeated_embeddings), dim=1)
        print(f"Shape after adding image embeddings: {x.shape}")
        return x

In [46]:
hasattr(model.pos_encoding, 'class_token')

True

In [47]:
embed_dim = 96

In [48]:
img_embed_matrix = nn.Parameter(torch.zeros(1, n, embed_dim), requires_grad=True)
nn.init.xavier_uniform_(img_embed_matrix)

Parameter containing:
tensor([[[-8.7425e-02, -7.0894e-02,  6.7221e-02,  1.3171e-01,  1.1597e-01,
           7.3686e-02,  3.2308e-02,  8.2645e-02, -8.2328e-02,  7.1367e-02,
          -3.5731e-02,  6.5479e-02, -1.1185e-01,  2.1948e-02,  5.6373e-02,
          -1.1769e-02, -2.1720e-02, -1.1829e-01, -4.9559e-02, -2.9565e-03,
          -9.4372e-02,  2.0462e-02, -5.5385e-02, -5.1856e-02, -4.8786e-02,
          -1.2847e-01,  1.2518e-01, -3.7235e-02,  3.6176e-02,  4.1613e-02,
          -8.1544e-02, -4.9512e-03,  7.6256e-02,  1.3083e-01,  1.2910e-01,
          -2.4532e-02,  1.2459e-01, -2.8243e-02, -7.8253e-03,  9.3521e-02,
           1.6084e-02,  1.0721e-01,  1.3785e-01, -4.4411e-02, -9.4780e-02,
           1.2711e-02,  9.7135e-02, -9.8067e-02,  1.0470e-01,  2.4907e-02,
          -2.5587e-02,  1.3250e-01, -4.6502e-02,  4.9412e-02, -1.2213e-01,
          -4.6024e-02, -1.2305e-02, -5.9592e-02, -6.8150e-02,  3.2270e-02,
          -8.0385e-02, -1.4049e-01,  6.2135e-02, -1.7336e-02, -8.2488e-02,
   

In [None]:
img_embed_matrix.shape

In [None]:
tokens_per_frame

In [None]:
t1 = format_multi_frame_tokens( x_pos, batch_size, tokens_per_frame)

In [None]:
# Run the merged multi-view sequence through every MViT block and trace the shapes.
# `x` was flattened to [B, tokens, C] in an earlier cell, so x.shape[2:] no longer
# holds a T/H/W grid. D, H and W are the values unpacked from the conv_proj output.
# This expects t1 to hold one class token followed by n * D * H * W patch tokens,
# with the n views stacked one after another - hence n * D temporal steps.
thw_shape = (n * D, H, W)
tokens = t1
for block in model.blocks:
    # Feed each block the output of the previous one, not t1 again.
    tokens, thw_shape = block(tokens, thw_shape)
    print(tokens.shape, thw_shape)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import einops
from torchvision.models.video import mvit_v2_s, MViT_V2_S_Weights

class MultiVideoHybridMVit2(nn.Module):
    """
    Multi-view video classifier built on torchvision's ``mvit_v2_s`` backbone.

    Each forward pass produces two sets of logits from the same patch embeddings:

    * ``'single'``: every view is classified on its own.
    * ``'mv_collection'`` (only when ``n > 1``): the tokens of all views of a
      sample are merged into one sequence, tagged with a learnable per-view
      embedding, and classified jointly.

    Args:
        num_classes (int): Number of output classes.
        n (int): Number of views per sample.
        pretrained_weights: Value forwarded as ``weights=`` to ``mvit_v2_s``.
            ``None`` selects ``MViT_V2_S_Weights.DEFAULT``.
        verbose (bool): When True (default, matching the previous behaviour) the
            intermediate tensor shapes are printed on every call. Pass False to
            silence the trace, e.g. for training.
    """

    def __init__(self, num_classes, n, pretrained_weights=None, verbose=True):
        super().__init__()

        self.n = n
        self.num_classes = num_classes
        self.verbose = verbose

        # Initialize the base MVit_v2_s model
        weights = MViT_V2_S_Weights.DEFAULT if pretrained_weights is None else pretrained_weights
        self.model = mvit_v2_s(weights=weights)
        # Token width right after the patch projection, read from the projection
        # weights instead of hard-coding 96.
        self.embed_dim = self.model.conv_proj.weight.shape[0]

        # Initialize the learnable image embedding matrix: one vector per view
        self.img_embed_matrix = nn.Parameter(torch.zeros(1, n, self.embed_dim), requires_grad=True)
        nn.init.xavier_uniform_(self.img_embed_matrix)

        # Initialize the classification head; it starts from all-zero weights and bias
        self.classifier = nn.Linear(self.model.head[1].in_features, self.num_classes)
        nn.init.zeros_(self.classifier.weight)
        nn.init.zeros_(self.classifier.bias)

    def _log(self, message):
        """Print a shape-trace message when ``verbose`` is enabled."""
        if self.verbose:
            print(message)

    def format_multi_frame_tokens(self, x, batch_size, tokens_per_frame):
        """
        Merge the token sequences of the n views of each sample into one sequence.

        The views are laid end to end along the token axis. When the backbone's
        positional encoding has a class token, only the class token of the first
        view is kept. Every remaining token then receives the L2-normalised
        embedding of the view it came from.

        Args:
            x (Tensor): Input tensor with shape [batch_size * n, tokens, embed_dim].
            batch_size (int): Original batch size.
            tokens_per_frame (int): Number of tokens per view as found in ``x``
                (i.e. ``x.shape[1]``); this count includes the class token when
                the positional encoding adds one.

        Returns:
            Tensor: New tensor of shape
            [batch_size, n * patch_tokens + (1 if class token else 0), embed_dim].
            The input tensor is not modified.

        Raises:
            ValueError: If the leading dimension of ``x`` is not ``batch_size * n``.
        """
        self._log(f"Initial shape of x: {x.shape}")
        if x.shape[0] != batch_size * self.n:
            raise ValueError(
                f"Expected {batch_size * self.n} sequences (batch_size * n), got {x.shape[0]}"
            )

        # Lay the n views of every sample end to end along the token axis
        # (same result as einops '(b n) s c -> b (n s) c').
        x = x.reshape(batch_size, -1, x.shape[-1])
        self._log(f"Shape after rearrange: {x.shape}")

        first_img_token_idx = 0
        patch_tokens_per_frame = tokens_per_frame

        # Handle cls_token if present
        if hasattr(self.model.pos_encoding, 'class_token'):
            # tokens_per_frame counts the class token, so the patch tokens are one fewer.
            patch_tokens_per_frame = tokens_per_frame - 1
            for i in range(1, self.n):
                # Once the i - 1 earlier duplicates are gone, the class token of
                # view i sits right after the kept class token and i full runs of
                # patch tokens.
                excess_cls_index = i * patch_tokens_per_frame + 1
                # Drop exactly that one token. The previous index was computed
                # from the count that includes the class token and skipped two
                # entries, so it removed patch tokens and kept the class token.
                x = torch.cat((x[:, :excess_cls_index], x[:, excess_cls_index + 1:]), dim=1)
                self._log(f"Shape after removing cls token at frame {i}: {x.shape}")
            first_img_token_idx = 1

        # Normalize and add image embeddings
        image_embeddings = F.normalize(self.img_embed_matrix, dim=-1)
        self._log(f"Image embeddings shape: {image_embeddings.shape}")

        # One copy of each view's embedding per patch token of that view
        repeated_embeddings = torch.repeat_interleave(image_embeddings, patch_tokens_per_frame, dim=1)
        self._log(f"Repeated embeddings shape: {repeated_embeddings.shape}")

        # Out-of-place add: the reshaped tensor can share storage with the
        # caller's tensor, which an in-place += would overwrite.
        x = torch.cat((x[:, :first_img_token_idx],
                       x[:, first_img_token_idx:] + repeated_embeddings), dim=1)
        self._log(f"Shape after adding image embeddings: {x.shape}")

        return x

    def forward(self, x):
        """
        Forward pass of the model.

        Args:
            x (Tensor): Input tensor with shape [batch_size, n, channels, depth, height, width].

        Returns:
            dict: ``{'single': {'logits': Tensor}}`` with one row of logits per
            view, i.e. shape [batch_size * n, num_classes]. When ``n > 1`` the
            dict also has ``'mv_collection': {'logits': Tensor}`` with one row
            per sample, i.e. shape [batch_size, num_classes].
        """
        batch_size = x.shape[0]
        output_dict = {'single': {}}
        if self.n > 1:
            output_dict['mv_collection'] = {}

        # Flatten the views into individual clips: [b, n, c, d, h, w] -> [b * n, c, d, h, w]
        x = einops.rearrange(x, 'b n c d h w -> (b n) c d h w')
        self._log(f"Shape after rearrange (views to individual images): {x.shape}")

        # Patch embedding through the backbone's projection convolution
        x = self.model.conv_proj(x)
        self._log(f"Shape after conv_proj: {x.shape}")

        # Temporal / height / width grid of the patch tokens of ONE view
        init_thw_shape = tuple(int(size) for size in x.shape[2:])
        self._log(f"THW shape after conv_proj: {init_thw_shape}")

        # Flatten the grid and bring channels last: [b * n, tokens, embed_dim]
        x = x.flatten(2).transpose(1, 2)
        self._log(f"Shape after view and transpose: {x.shape}")

        # Positional encoding (for this backbone the traced shapes show it
        # prepending one class token per sequence)
        x = self.model.pos_encoding(x)
        self._log(f"Shape after adding positional encoding: {x.shape}")

        tokens_per_frame = x.shape[1]  # tokens per view, class token included

        for view_type in output_dict:
            if view_type == 'mv_collection':
                tokens = self.format_multi_frame_tokens(x, batch_size, tokens_per_frame)
                self._log(f"Shape after format_multi_frame_tokens: {tokens.shape}")
                # The merged sequence holds the patch tokens of n views back to
                # back, so the grid handed to the blocks is n times longer on
                # the temporal axis. Reusing the single-view grid here would not
                # match the token count.
                # NOTE(review): this relies on the torchvision blocks accepting a
                # temporal extent other than the pretrained one; they already do
                # for the shorter clips traced in this notebook. Temporal pooling
                # will span view boundaries - confirm that is acceptable.
                thw_shape = (self.n * init_thw_shape[0],) + init_thw_shape[1:]
                self._log(f"Updated thw_shape for mv_collection: {thw_shape}")
            else:
                tokens = x
                # Start every branch from the grid measured after conv_proj;
                # the block loop below overwrites thw_shape as it pools.
                thw_shape = init_thw_shape

            # Sequentially pass the tokens through each block with the thw argument
            for block in self.model.blocks:
                tokens, thw_shape = block(tokens, thw_shape)
                self._log(f"{tokens.shape} {thw_shape}")

            tokens = self.model.norm(tokens)
            self._log(f"Shape after normalization: {tokens.shape}")

            # Classify from the first token of every sequence
            logits = self.classifier(tokens[:, 0])
            self._log(f"Shape of logits: {logits.shape}")

            output_dict[view_type]['logits'] = logits

        return output_dict


In [None]:
# Usage example:
# Initialize the model
model = MultiVideoHybridMVit2(num_classes=10, n=3)
# Example input: [batch_size, num_views, channels, depth, height, width]
videos = torch.randn(4, 3, 3, 11, 224, 224)
output = model(videos)

In [None]:
37633/96/4