Note: Before running this code, you need to prepare json files for video scene classification datasets. The keys are filenames and the keys are class names.

In [None]:
# Import libraries
import os
import json
import torch
import torch.nn as nn
import pytorchvideo.data
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
from IPython.display import Image
import imageio
import decord  # Added import statement
DATA_ROOT = 'FILL_IN_YOUR_PATH'
# Select dataset. Options: "Hollywood2", "YUP++", "360x"
dataset = "360x"

if dataset == "Hollywood2":
    with open(f'{DATA_ROOT}/Hollywood2/hollywood2.json') as f:
        hollywood2_data = json.load(f)

    # Create label mappings
    labels = set()
    for v in hollywood2_data.values():
        for k, v in v['label'].items():
            if v:
                labels.add(k)
    labels = sorted(list(labels))
    label2id = {label: i for i, label in enumerate(labels)}
    id2label = {i: label for label, i in label2id.items()}

    print(f"Unique classes: {labels}.")

elif dataset == "YUP++":
    with open(f'{DATA_ROOT}/YUP++/test.json') as f:
        yup_data = json.load(f)
    # simply read all the labels
    labels = set()
    for entry in yup_data:
        labels.add(entry['label'])
    labels = sorted(list(labels))
    label2id = {label: i for i, label in enumerate(labels)}
    id2label = {i: label for label, i in label2id.items()}
    print(f"Unique classes: {labels}.")
elif dataset == "360x":
    path = f'{DATA_ROOT}/360x/index.json'
    with open(path) as f:
        yup_data = json.load(f)
    # simply read all the labels
    labels = set()
    for entry in yup_data:
        temp_label = yup_data[entry]['label']
        labels.add(temp_label)
    labels = sorted(list(labels))
    label2id = {label: i for i, label in enumerate(labels)}
    id2label = {i: label for label, i in label2id.items()}
    print(f"Unique classes: {labels}.")
else:
    raise ValueError(f"Unknown dataset: {dataset}")


  from .autonotebook import tqdm as notebook_tqdm


Unique classes: ['Agriculture & Rural', 'Artistic Spaces', 'Bars & Nightlife', 'Campus', 'Dining & Food Outlets', 'Elevators & Escalators&Stairs', 'Historic & Religious Sites', 'Hotel & Temporary Stay', 'Indoor Educational Spaces', 'Indoor Entertainment Venues', 'Indoor Residential Spaces', 'Indoor Shops & Retail& Commercial', 'Indoor sports venues', 'Kitchen', 'Nature', 'Open Public Spaces', 'Outdoor Commercial & Markets', 'Outdoor Residences & Living', 'Outdoor Sports & Athletic Fields', 'Outdoor Transportation', 'Parks & Recreational Areas', 'Public Gathering & Conference Spaces', 'Scientific interior space', 'Storage & Utility', 'Transportation Interiors', 'Urban Constructions & street', 'Waterfronts & Water Bodies', 'Workspaces'].


In [None]:
model_checkpoint = "MCG-NJU/videomae-base-finetuned-kinetics"
image_processor = VideoMAEImageProcessor.from_pretrained(model_checkpoint)
model = VideoMAEForVideoClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

# Extract configurations
mean = image_processor.image_mean
std = image_processor.image_std
if "shortest_edge" in image_processor.size:
    height = width = image_processor.size["shortest_edge"]
else:
    height = image_processor.size["height"]
    width = image_processor.size["width"]
resize_to = (height, width)

num_frames_to_sample = 16
sample_rate = 4
fps = 30
clip_duration = num_frames_to_sample * sample_rate / fps
print(f"Number of frames to sample: {num_frames_to_sample}"
      f"\nSample rate: {sample_rate}"
      f"\nClip duration: {clip_duration} seconds"
      f"\nFrame size: {height}x{width}"
      f"\nMean: {mean}"
      f"\nStd: {std}")
# Dataset transformations
train_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    # RandomShortSideScale(min_size=256, max_size=320),
                    Resize(resize_to),
                    RandomCrop(resize_to),
                    RandomHorizontalFlip(p=0.5),
                ]
            ),
        ),
    ]
)

val_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    Resize(resize_to),
                    # CenterCrop(resize_to),
                ]
            ),
        ),
    ]
)


class Hollywood2Dataset(torch.utils.data.Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform
        self.video_paths = [item["path"] for item in self.data]

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_info = self.data[idx]
    
        video_path = os.path.join(f"{DATA_ROOT}/Hollywood2/AVIClipsScenes/", os.path.basename(self.video_paths[idx]))
        label = [k for k, v in video_info["label"].items() if v][0]
        label_id = label2id[label]

        # Load video
        video = decord.VideoReader(video_path)
        # Simple sampling method (take every nth frame, n = len(video) // num_frames_to_sample)
        frames = [video[i].asnumpy() for i in range(0, len(video), max(1, len(video) // num_frames_to_sample))]

        # Stack frames to form a tensor
        video_tensor = torch.tensor(np.stack(frames))
        if self.transform:
            video_tensor = video_tensor.permute(3, 0, 1, 2)  # Convert (T, H, W, C) to (C, T, H, W)
            video_tensor = self.transform({"video": video_tensor})["video"]

        return {"video": video_tensor, "label": label_id}

class YUPDataset(torch.utils.data.Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform
        # video path are ['path'] for each entries in the list
        self.video_paths = [entry['path'] for entry in self.data]
    def __len__(self):
        return len(self.video_paths)
    def __getitem__(self, idx):
        # strip ./ from the path
        video_path = os.path.join(f"{DATA_ROOT}/YUP++/", self.video_paths[idx].strip("./"))
        # video_path = os.path.join("./YUP++", self.video_paths[idx])
        #  self.data[self.video_paths[idx]]["path"]
        label = self.data[idx]['label']
        # print(label)
        label_id = label2id[label]
        video = decord.VideoReader(video_path)
        frames = [video[i].asnumpy() for i in range(0, len(video), len(video) // num_frames_to_sample)]
        video_tensor = torch.tensor(np.stack(frames))
        # print(video_tensor.shape)
        if self.transform:
            video_tensor = video_tensor.permute(3, 0, 1, 2)  # Convert (T, H, W, C) to (C, T, H, W)
            video_tensor = self.transform({"video": video_tensor})["video"]

        return {"video": video_tensor, "label": label_id}

    
class x360Dataset(torch.utils.data.Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform
        self.video_paths = [item["video_name"] for item in self.data]
        self.labels = [item["category"] for item in self.data]
    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_info = self.data[idx]
        temp_path = f'{DATA_ROOT}/x360/third_person/'
        video_path = temp_path +'.mp4'
        # print(os.path.exists(video_path))
        label = self.labels[idx]
        label_id = label2id[label]

        # Load video
        video = decord.VideoReader(video_path)
        # Simple sampling method (take every nth frame, n = len(video) // num_frames_to_sample)
        frames = [video[i].asnumpy() for i in range(0, len(video), max(1, len(video) // num_frames_to_sample))]
        #print(frames[1].shape)
        # Stack frames to form a tensor
        video_tensor = torch.tensor(np.stack(frames),dtype=torch.uint8)
        if self.transform:
            video_tensor = video_tensor.permute(3, 0, 1, 2)  # Convert (T, H, W, C) to (C, T, H, W)
            video_tensor = self.transform({"video": video_tensor})["video"]

        return {"video": video_tensor, "label": label_id}
    
# Load the data and ensure it is in the required structure

def load_data(file_path):
    with open(file_path) as f:
        data = json.load(f)
    
    if isinstance(data, dict):
        data = list(data.values())
    return data


def load_x360_data(file_path):
    with open(file_path) as f:
        data = json.load(f)
        
    if isinstance(data, dict):
        data = list(data.values())
    return data

# Load data
if dataset == "Hollywood2":
    train_data = load_data(f'{DATA_ROOT}/Hollywood2/train_split_metadata.json')
    val_data = load_data(f'{DATA_ROOT}/Hollywood2/val_split_metadata.json')
    test_data = load_data(f'{DATA_ROOT}/Hollywood2/test_data.json')
    train_dataset = Hollywood2Dataset(data=train_data, transform=train_transform)
    val_dataset = Hollywood2Dataset(data=val_data, transform=val_transform)
    test_dataset = Hollywood2Dataset(data=test_data, transform=val_transform)

elif dataset == "YUP++":
    yup_test_data = load_data(f"{DATA_ROOT}/YUP++/yup_test_data.json")
    yup_val_data = load_data(f"{DATA_ROOT}/YUP++/yup_val_data.json")
    yup_train_data = load_data(f"{DATA_ROOT}/YUP++/yup_train_data.json")
    train_dataset = YUPDataset(data=yup_train_data, transform=train_transform)
    val_dataset = YUPDataset(data=yup_val_data, transform=val_transform)
    test_dataset = YUPDataset(data=yup_test_data, transform=val_transform)
elif dataset == "360x":
    test_data = load_x360_data(f'{DATA_ROOT}/360x_test_data.json')
    val_data = load_x360_data(f'{DATA_ROOT}/360x_test_data.json')
    train_data = load_x360_data(f'{DATA_ROOT}/360x_train_data.json')
    train_dataset = x360Dataset(data=train_data, transform=train_transform)
    val_dataset = x360Dataset(data=val_data, transform=val_transform)
    test_dataset = x360Dataset(data=test_data, transform=val_transform)
else:
    raise ValueError(f"Unknown dataset: {dataset}")


    
    

# Visualization function
def unnormalize_img(img):
    img = (img * std) + mean
    img = (img * 255).astype("uint8")
    return img.clip(0, 255)
    
def create_gif(video_tensor, filename="sample.gif"):
    frames = []
    for video_frame in video_tensor:
        frame_unnormalized = unnormalize_img(video_frame.permute(1, 2, 0).numpy())
        frames.append(frame_unnormalized)
    kargs = {"duration": 0.25}
    imageio.mimsave(filename, frames, "GIF", **kargs)
    return filename

def display_gif(video_tensor, gif_name="sample.gif"):
    video_tensor = video_tensor.permute(1, 0, 2, 3)
    gif_filename = create_gif(video_tensor, gif_name)
    return Image(filename=gif_filename)

sample_video = next(iter(train_dataset))
video_tensor = sample_video["video"]



Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base-finetuned-kinetics and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([28]) in the model instantiated
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([28, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of frames to sample: 16
Sample rate: 4
Clip duration: 2.1333333333333333 seconds
Frame size: 224x224
Mean: [0.485, 0.456, 0.406]
Std: [0.229, 0.224, 0.225]
Removed 1 entries.
[{'binocular_files_number': 2, 'capture_time': '20231022T1130742', 'category': 'Dining & Food Outlets', 'weather': 'indoor', 'text_description': 'Eating in Five Guys', 'gps': [52.4827, -1.89761], 'video_name': 'd86992ee-abc2-4dc2-9617-17263257d201'}] removed
Removed 1 entries.
[{'binocular_files_number': 2, 'capture_time': '20231022T1130742', 'category': 'Dining & Food Outlets', 'weather': 'indoor', 'text_description': 'Eating in Five Guys', 'gps': [52.4827, -1.89761], 'video_name': 'd86992ee-abc2-4dc2-9617-17263257d201'}] removed
Removed 0 entries.
[] removed




In [5]:
sample_video = next(iter(val_dataset))
video_tensor = sample_video["video"]
# display_gif(video_tensor)

In [6]:
sample_video = next(iter(test_dataset))
video_tensor = sample_video["video"]
# display_gif(video_tensor)

In [None]:

import json
import os
import numpy as np
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import CLIPModel, AutoProcessor
from torchvision.transforms import ToPILImage
from PIL import Image
import warnings
from torchvision import models
import timm

warnings.filterwarnings("ignore", category=UserWarning, module='huggingface_hub.*')

class PureImageEncoder(nn.Module):
    def __init__(self):
        super(PureImageEncoder, self).__init__()
        self.CLIP = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
        self.image_processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")
        c_in = 768
        reduction = 4

        # Freeze CLIP
        for param in self.CLIP.parameters():
            param.requires_grad = False

    def preprocess_image(self, image):
        x = self.image_processor(images=image, return_tensors="pt")["pixel_values"]
        return x

    def forward(self, x):
        # if input is pre-computed features of CLIP, skip the get_features step
        if x.shape[-1] == 768:
            pass
        else:
            x = self.CLIP.get_image_features(pixel_values=x)
        return x

from ..model.TICL import TICL



In [None]:
# Custom branch in the model
# add an additional branch to the model
# it should override class transformers.VideoMAEForVideoClassification
# Custom branch in the model
print("""\

                                       ._ o o                                               
                                       \_`-)|_
                                    ,""       \ 
                                  ,"  ## |   ಠ ಠ. 
                                ," ##   ,-\__    `.
                              ,"       /     `--._;)      ///// Please wait, and...
                            ,"     ## /
                          ,"   ##    /
 _____ _               _                                                                    _                  _          _               
/  __ \ |             | |                                                                  | |                | |        | |              
| /  \/ |__   ___  ___| | __  _   _  ___  _   _ _ __   _ __   __ _ _ __ __ _ _ __ ___   ___| |_ ___ _ __ ___  | |__   ___| | _____      __
| |   | '_ \ / _ \/ __| |/ / | | | |/ _ \| | | | '__| | '_ \ / _` | '__/ _` | '_ ` _ \ / _ \ __/ _ \ '__/ __| | '_ \ / _ \ |/ _ \ \ /\ / /
| \__/\ | | |  __/ (__|   <  | |_| | (_) | |_| | |    | |_) | (_| | | | (_| | | | | | |  __/ ||  __/ |  \__ \ | |_) |  __/ | (_) \ V  V / 
 \____/_| |_|\___|\___|_|\_\  \__, |\___/ \__,_|_|    | .__/ \__,_|_|  \__,_|_| |_| |_|\___|\__\___|_|  |___/ |_.__/ \___|_|\___/ \_/\_/  
                               __/ |                  | |                                                                                 
                              |___/                   |_|                                                                                 

                    """)
from transformers import VideoMAEForVideoClassification, VideoMAEPreTrainedModel, Trainer, TrainingArguments
from transformers.modeling_outputs import ImageClassifierOutput
from typing import Optional, Tuple, Union
from torch.nn import MSELoss, CrossEntropyLoss, BCEWithLogitsLoss
import wandb
def compute_metrics(eval_pred):
    predictions = eval_pred.predictions
    if isinstance(predictions, list):
        predictions = np.concatenate(predictions, axis=0)
    if len(predictions.shape) == 3:
        predictions = predictions[:, :, 0]
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

def collate_fn(examples):
    pixel_values = torch.stack([example["video"].permute(1, 0, 2, 3) for example in examples])
    labels = torch.tensor([example["label"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}

class CustomVideoMAE(VideoMAEPreTrainedModel):
    def __init__(self, config, base_model, feature_extractor, label2id, id2label):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.videomae = base_model.videomae
        
        # Classifier head
        self.fc_norm = nn.LayerNorm(config.hidden_size) if config.use_mean_pooling else None
        self.feature_extractor = feature_extractor
        out_size = 768
        self.classifier = nn.Linear(
            # uncomment below to use the original VideoMAE branch and concatenate features
            # config.hidden_size + out_size,
            out_size,
            config.num_labels
        )
        self.conv1x1 = nn.Conv2d(4, 1, kernel_size=1)  # 1x1 convolution to reduce dimension
        self.config.label2id = label2id
        self.config.id2label = id2label

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutput]:

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # uncomment below to use the original VideoMAE branch
        # outputs = self.videomae(
        #     pixel_values,
        #     head_mask=head_mask,
        #     output_attentions=output_attentions,
        #     output_hidden_states=output_hidden_states,  # Ensure hidden states are returned
        #     return_dict=return_dict,
        # )
        # if output_hidden_states:
        #     sequence_output = outputs.hidden_states[-1]  # Ensure hidden states are returned
        # else:
        #     sequence_output = outputs[0]
        # if self.fc_norm is not None:
        #     sequence_output = self.fc_norm(sequence_output.mean(1))
        # else:
        #     sequence_output = sequence_output[:, 0]

        frame_count = pixel_values.shape[1]
        time_sample_interval = pixel_values.shape[1] // 4
        features = []
        for i in range(0, frame_count, time_sample_interval):
            frame_features = self.feature_extractor(pixel_values[:, i, :, :, :])
            features.append(frame_features)
        features = torch.stack(features, dim=1)
        features = self.conv1x1(features.unsqueeze(2)).squeeze(2)
        features = features.squeeze(1)
        # uncomment below to use the original VideoMAE branch and concatenate features
        # concatenated_features = torch.cat((sequence_output, features), dim=1)
        # logits = self.classifier(concatenated_features)
        logits = self.classifier(features)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,)  # + outputs[2:]  # Keep only logits
            return ((loss,) + output) if loss is not None else output
        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            # hidden_states=outputs.hidden_states,
            # attentions=outputs.attentions,
        )
    
    
    def get_features(self, pixel_values, feature_type="concatenated"):
        with torch.no_grad():
            outputs = self.videomae(pixel_values, output_hidden_states=True, return_dict=True)
            sequence_output = outputs.hidden_states[-1][:, 0, :]  # CLS token representation

            frame_count = pixel_values.shape[1]
            time_sample_interval = pixel_values.shape[1] // 4
            features = []
            for i in range(0, frame_count, time_sample_interval):
                frame_features = self.feature_extractor(pixel_values[:, i, :, :, :])
                features.append(frame_features)
            features = torch.stack(features, dim=1)
            features = self.conv1x1(features.unsqueeze(2)).squeeze(2)
            features = features.squeeze(1)

            if feature_type == "concatenated":
                return torch.cat((sequence_output, features), dim=1)
            elif feature_type == "TICL":
                return features
            else:
                raise ValueError("Invalid feature type. Choose either 'concatenated' or 'TICL'.")



base_model = VideoMAEForVideoClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)
# Feature extractor
time_feature = True
pure_CLIP = False

if time_feature == True:
    if not pure_CLIP:
        feature_extractor = TICL()
        model_path = 'TICL_adapter_v1_best.pth'
        feature_extractor.load_state_dict(torch.load(model_path))
        feature_extractor = feature_extractor.image_encoder
        feature_extractor.eval()
    else:
        feature_extractor = PureImageEncoder()
        feature_extractor.eval()
    custom_model = CustomVideoMAE(base_model.config, base_model, feature_extractor, label2id, id2label)
else:
    custom_model = base_model
print(custom_model)
# Training and evaluation
batch_size = 2
num_epochs = 20

metric = evaluate.load("accuracy")

if dataset == "Hollywood2":
    run_name = "hollywood2_videomae"
    # learning_rate = 1e-4
    num_epochs = 20
    learning_rate = 5e-5
    warmup_ratio = 0.0
elif dataset == "YUP++":
    run_name = "yup_videomae"
    learning_rate = 5e-5
    warmup_ratio = 0.0
    num_epochs = 10
elif dataset == "360x":
    batch_size = 2
    run_name = "360x_videomae"
    learning_rate = 7e-5
    warmup_ratio = 0.0
    num_epochs = 20
else:
    raise ValueError(f"Unknown dataset: {dataset}")
time_feature_type = "TICL" if time_feature else "None"
max_steps = (len(train_dataset) // batch_size) * num_epochs
steps_per_epoch = len(train_dataset) // batch_size
# add basic hyperparameters to the run name
# run_name += f"1:5val_TEST_resizeT_bs{batch_size}_epochs{num_epochs}_steps{max_steps}_lr{learning_rate}_wr{warmup_ratio}_Timefeature=4frame_{str(time_feature)}{time_feature_type}_third_view"
run_name += f"1:5val_TEST_resizeT_bs{batch_size}_epochs{num_epochs}_steps{max_steps}_lr{learning_rate}_wr{warmup_ratio}_Timefeature=4frame_{str(time_feature)}{time_feature_type}_LP_third_view"
if pure_CLIP:
    run_name += f"_pureCLIP={str(pure_CLIP)}"
# 4 epochs one eval
eval_steps = 1 * steps_per_epoch
# print(eval_steps)
out_dir = f"./out/{dataset}/{run_name}"
training_args = TrainingArguments(
    output_dir=out_dir,
    remove_unused_columns=False,
    # strategies must be the same
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps = eval_steps,  # Save every 4 epochs as well
    eval_steps=eval_steps,
    save_total_limit=1,  # Only keep the 1 most recent checkpoints
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=warmup_ratio,
    logging_steps=10,
    num_train_epochs=num_epochs,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    max_steps=max_steps,
    fp16=True,
    report_to="wandb",
    run_name=run_name,
)

trainer = Trainer(
    custom_model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # set evaluation interval to 1 to enable evaluation after each epoch
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

print(f"////Run {run_name} preparation finished...////")
print("""\
                              _- _ , - . _
                            `,% o` ~~-_,'.'
                            % %@ - % %, -'%,
                           ,-, . _ --\ -.%
                  P^=.     `'"   |+|'    `
                  ||             |+|
                  ||             |+|
                  ||             |+|
            ______/|             |+|
           `| ___ ,/             |+|
            ||   ||              |+|
            ||   ||              |+|
    ________||___||___.__________/H|____
    ______               _          __             _____         _       _             _ 
    | ___ \             | |        / _|           |_   _|       (_)     (_)           | |
    | |_/ /___  __ _  __| |_   _  | |_ ___  _ __    | |_ __ __ _ _ _ __  _ _ __   __ _| |
    |    // _ \/ _` |/ _` | | | | |  _/ _ \| '__|   | | '__/ _` | | '_ \| | '_ \ / _` | |
    | |\ \  __/ (_| | (_| | |_| | | || (_) | |      | | | | (_| | | | | | | | | | (_| |_|
    \_| \_\___|\__,_|\__,_|\__, | |_| \___/|_|      \_/_|  \__,_|_|_| |_|_|_| |_|\__, (_)
                            __/ |                                                 __/ |  
                           |___/                                                 |___/                                                                                                                                                                                                                                                              
        """)


                                       ._ o o                                               
                                       \_`-)|_
                                    ,""       \ 
                                  ,"  ## |   ಠ ಠ. 
                                ," ##   ,-\__    `.
                              ,"       /     `--._;)      ///// Please wait, and...
                            ,"     ## /
                          ,"   ##    /
 _____ _               _                                                                    _                  _          _               
/  __ \ |             | |                                                                  | |                | |        | |              
| /  \/ |__   ___  ___| | __  _   _  ___  _   _ _ __   _ __   __ _ _ __ __ _ _ __ ___   ___| |_ ___ _ __ ___  | |__   ___| | _____      __
| |   | '_ \ / _ \/ __| |/ / | | | |/ _ \| | | | '__| | '_ \ / _` | '__/ _` | '_ ` _ \ / _ \ __/ _ \ '__/ __| | '_ \ / _ \ |/ _

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base-finetuned-kinetics and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([28]) in the model instantiated
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([28, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.
`text_config_dict` is provided w

CustomVideoMAE(
  (videomae): VideoMAEModel(
    (embeddings): VideoMAEEmbeddings(
      (patch_embeddings): VideoMAEPatchEmbeddings(
        (projection): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
      )
    )
    (encoder): VideoMAEEncoder(
      (layer): ModuleList(
        (0-11): 12 x VideoMAELayer(
          (attention): VideoMAEAttention(
            (attention): VideoMAESelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): VideoMAESelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): VideoMAEIntermediate(
            (dense): Linear(in_features=

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


////Run 360x_videomae1:5val_TEST_resizeT_bs2_epochs20_steps1520_lr7e-05_wr0.0_Timefeature=4frame_True_LP_third_view preparation finished...////
                              _- _ , - . _
                            `,% o` ~~-_,'.'
                            % %@ - % %, -'%,
                           ,-, . _ --\ -.%
                  P^=.     `'"   |+|'    `
                  ||             |+|
                  ||             |+|
                  ||             |+|
            ______/|             |+|
           `| ___ ,/             |+|
            ||   ||              |+|
            ||   ||              |+|
    ________||___||___.__________/H|____
    ______               _          __             _____         _       _             _ 
    | ___ \             | |        / _|           |_   _|       (_)     (_)           | |
    | |_/ /___  __ _  __| |_   _  | |_ ___  _ __    | |_ __ __ _ _ _ __  _ _ __   __ _| |
    |    // _ \/ _` |/ _` | | | | |  _/ _ \| '__|   | | '__/ _` | | 

In [9]:
trainer.train()
# Log the final evaluation metrics
print("testing model...")

final_metrics = trainer.evaluate(eval_dataset=test_dataset)
wandb.log(final_metrics)
print(final_metrics)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33m505029658[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Accuracy
76,3.2264,3.249602,0.111111
152,3.1157,3.144314,0.111111
228,2.9813,3.079608,0.111111
304,2.6821,3.060203,0.148148
380,2.7604,3.028357,0.166667
456,2.3494,2.98065,0.166667
532,2.7413,2.91182,0.296296
608,2.4543,2.852078,0.296296
684,2.3929,2.785821,0.333333
760,1.4903,2.725021,0.351852




testing model...




{'eval_loss': 2.5075418949127197, 'eval_accuracy': 0.42592592592592593, 'eval_runtime': 53.5742, 'eval_samples_per_second': 1.008, 'eval_steps_per_second': 0.504, 'epoch': 20.0}
