In [1]:
!pip install av

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting av
  Downloading av-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.0/31.0 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av
Successfully installed av-10.0.0


In [2]:
# Mount Google Drive at /content/drive; the dataset archives and the CSV
# metadata used below are read from /content/drive/MyDrive/my_kinetics.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp /content/drive/MyDrive/my_kinetics/train.zip /content/train.zip
!unzip /content/train.zip

!cp /content/drive/MyDrive/my_kinetics/val.zip /content/val.zip
!unzip /content/val.zip

In [4]:
import av
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
import albumentations as A

import torch.nn as nn
import torchvision.models 
from sklearn.metrics import accuracy_score

from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

In [5]:
def read_video_pyav(container, indices):
    '''
    Decode the requested frames of a video with the PyAV decoder.

    The stream is decoded from the beginning and only the requested frames
    are converted to arrays. If the stream ends before every requested index
    has been reached, the missing positions are filled with the last frame
    that was decoded, so the result always has one frame per requested index.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Frame indices to decode, in the order the
            frames should appear in the result (normally ascending).
    Returns:
        result (np.ndarray): np array of decoded frames of shape
            (len(indices), height, width, 3).
    Raises:
        ValueError: if `indices` is empty or none of the requested frames
            could be decoded.
    '''
    wanted = [int(i) for i in indices]
    if not wanted:
        raise ValueError("indices must contain at least one frame index")

    # A set gives O(1) membership tests; `i in <ndarray>` rescans the whole
    # array for every decoded frame.
    wanted_set = set(wanted)
    last_wanted = max(wanted)

    decoded = {}
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_wanted:
            break
        if i in wanted_set:
            decoded[i] = frame.to_ndarray(format="rgb24")

    if not decoded:
        raise ValueError(
            "none of the requested frame indices could be decoded from the video"
        )

    # Requested indices beyond the end of the stream reuse the last decoded
    # frame so that every clip has the same number of frames.
    fallback = decoded[max(decoded)]
    return np.stack([decoded.get(i, fallback) for i in wanted])

def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Pick `clip_len` evenly spaced frame indices from the start of a video.

    The sampling window starts at frame 0 and normally spans
    `clip_len * frame_sample_rate` frames. When `seg_len` is a positive
    number smaller than that span, the window is shrunk to `seg_len` so that
    no index points past the end of the video.

    Args:
        clip_len (int): Number of indices to return.
        frame_sample_rate (int or float): Nominal stride between sampled frames.
        seg_len (int or None): Number of frames in the video. `None` or a
            non-positive value means "unknown" and disables the clamping.
    Returns:
        indices (np.ndarray): int64 array of `clip_len` frame indices in
            non-decreasing order.
    '''
    converted_len = int(clip_len * frame_sample_rate)
    start_idx = 0
    end_idx = converted_len
    if seg_len is not None and 0 < seg_len < converted_len:
        end_idx = int(seg_len)

    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # linspace includes the end point; clip so the last index stays inside
    # the window before truncating to integers.
    return np.clip(indices, start_idx, end_idx - 1).astype(np.int64)

def apply_video_augmentations(video, transform):
    '''
    Run a single transform call over all frames of a clip and restack them.

    Frame 0 is passed to `transform` under the keyword `image` and frame k
    (k >= 1) under `image{k}`; the values returned under the same keys are
    stacked back along a new leading axis in frame order.

    Args:
        video (np.ndarray): Clip whose first axis indexes frames.
        transform (callable): Called as `transform(image=..., image1=..., ...)`
            and expected to return a mapping with the same keys.
    Returns:
        np.ndarray: The transformed frames stacked along axis 0.
    '''
    num_frames = video.shape[0]
    keys = ['image'] + [f'image{k}' for k in range(1, num_frames)]

    # One call for the whole clip, so every frame goes through the same
    # transform invocation.
    result = transform(**{key: video[k] for k, key in enumerate(keys)})

    return np.stack([result[key] for key in keys], axis=0)

In [6]:
class MyKinetics(Dataset):
    '''
    Map-style dataset that yields fixed-length video clips with class labels.

    Each metadata row must provide the columns `split`, `label`,
    `youtube_id` and `num_label`; the clip for a row is read from
    `<root><split>/<label>/my_<youtube_id>.mp4`.

    Args:
        meta (pd.DataFrame): One row per video.
        transform (callable, optional): Callable invoked as
            `transform(image=frame0, image1=frame1, ...)` that returns a
            mapping with the same keys (e.g. an `A.Compose` built with
            matching `additional_targets`). When `None`, the default
            Resize(128, 171) -> CenterCrop(112, 112) -> Normalize pipeline
            is used.
        clip_len (int): Number of frames sampled per clip.
        frame_sample_rate (int): Nominal stride between sampled frames.
        root (str): Path prefix of the extracted dataset, including the
            trailing slash.
    '''

    def __init__(self, meta, transform=None, clip_len=8, frame_sample_rate=5,
                 root='/content/content/'):
        self.meta = meta
        self.transform = transform
        self.clip_len = clip_len
        self.frame_sample_rate = frame_sample_rate
        self.root = root
        # Build the pipeline once here instead of on every __getitem__ call.
        if transform is not None:
            self._pipeline = transform
        else:
            self._pipeline = self._default_transform(clip_len)

    @staticmethod
    def _default_transform(clip_len):
        '''Build the default per-frame preprocessing for `clip_len` frames.'''
        return A.Compose(
            [
                A.Resize(128, 171, always_apply=True),
                A.CenterCrop(112, 112, always_apply=True),
                # NOTE(review): presumably the normalization statistics the
                # pretrained video backbone expects - confirm against the
                # model's documentation.
                A.Normalize(mean=[0.43216, 0.394666, 0.37645],
                            std=[0.22803, 0.22145, 0.216989],
                            always_apply=True),
            ],
            # Frame 0 is the primary `image` target; frames 1..clip_len-1 are
            # registered as extra image targets.
            additional_targets={f'image{i}': 'image' for i in range(1, clip_len)},
        )

    def __len__(self):
        '''Return the number of metadata rows.'''
        return len(self.meta)

    def __getitem__(self, idx):
        '''
        Load, preprocess and return the clip described by row `idx`.

        Returns:
            (video, label): `video` is the transformed clip with the channel
            axis moved to the front; `label` is a scalar long tensor taken
            from the `num_label` column.
        '''
        if torch.is_tensor(idx):
            idx = idx.tolist()

        split = self.meta['split'].iloc[idx]
        label_name = self.meta['label'].iloc[idx]
        youtube_id = self.meta['youtube_id'].iloc[idx]
        file_path = f'{self.root}{split!s}/{label_name!s}/my_{youtube_id!s}.mp4'

        # Close the container as soon as the frames are decoded so file
        # handles do not accumulate over an epoch.
        with av.open(file_path) as container:
            indices = sample_frame_indices(
                clip_len=self.clip_len,
                frame_sample_rate=self.frame_sample_rate,
                seg_len=container.streams.video[0].frames,
            )
            video = read_video_pyav(container, indices)

        transformed = apply_video_augmentations(video, self._pipeline)

        label = torch.tensor(self.meta['num_label'].iloc[idx]).long()
        # (frames, height, width, channels) -> (channels, frames, height, width)
        video = torch.from_numpy(transformed).permute(3, 0, 1, 2)

        return video, label

In [7]:
# Per-split metadata tables read from the mounted Drive.
train_info = pd.read_csv('/content/drive/MyDrive/my_kinetics/train.csv', sep=',')
val_info = pd.read_csv('/content/drive/MyDrive/my_kinetics/val.csv', sep=',')

# One dataset per split.
train_dataset = MyKinetics(train_info)
val_dataset = MyKinetics(val_info)

# Only the training loader shuffles; the validation loader keeps file order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=2,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=16,
    num_workers=2,
)

In [11]:
# Number of full passes over the training set.
epochs = 5

# Use the GPU when torch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Pretrained r3d_18 backbone with its final layer replaced by a fresh
# 14-output linear head.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour
# of `weights=` - confirm against the installed version before changing it.
model = torchvision.models.video.r3d_18(pretrained=True)
model.fc = nn.Linear(in_features=model.fc.in_features, out_features=14)
model = model.to(device)

# All parameters (backbone and new head) are optimized with one learning rate.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-4)

cuda


In [12]:
# Fine-tune for `epochs` passes; after each training pass, evaluate on the
# validation loader and report mean batch losses and validation accuracy.
for epoch in range(epochs):
    # ---- training pass ----
    model.train()

    train_loss = []
    for batch, targets in tqdm(train_dataloader, desc=f"Epoch {epoch} | TRAIN"):
        batch = batch.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()

        # Call the module itself rather than .forward() so that registered
        # hooks are run.
        outputs = model(batch)
        loss = loss_fn(outputs, targets)

        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())

        # Drop the references to the device tensors before the next batch
        # is moved over.
        del batch, outputs

    # ---- validation pass ----
    model.eval()

    val_loss = []
    val_targets = []
    val_preds = []

    # No gradients are needed for evaluation; one context covers the loop.
    with torch.no_grad():
        for batch, targets in tqdm(val_dataloader, desc=f"Epoch {epoch} | VALID"):
            batch = batch.to(device)
            targets = targets.to(device)

            outputs = model(batch)
            loss = loss_fn(outputs, targets)

            # argmax over the logits picks the same class as argmax over
            # softmax(logits), so the softmax is not needed here.
            preds = outputs.argmax(dim=1)

            val_loss.append(loss.item())
            val_targets.extend(targets.cpu().numpy())
            val_preds.extend(preds.cpu().numpy())

            del batch, outputs

    print('Training loss:', np.mean(train_loss))
    print('Validation loss:', np.mean(val_loss))
    print('ACC:', accuracy_score(val_targets, val_preds), end='\n\n')

Epoch 0 | TRAIN: 100%|██████████| 68/68 [01:46<00:00,  1.57s/it]
Epoch 0 | VALID: 100%|██████████| 9/9 [00:09<00:00,  1.05s/it]


Training loss: 2.208620234447367
Validation loss: 1.997698148091634
ACC: 0.3923076923076923



Epoch 1 | TRAIN: 100%|██████████| 68/68 [01:49<00:00,  1.60s/it]
Epoch 1 | VALID: 100%|██████████| 9/9 [00:11<00:00,  1.31s/it]


Training loss: 1.0589971735196955
Validation loss: 1.888624217775133
ACC: 0.4



Epoch 2 | TRAIN: 100%|██████████| 68/68 [01:45<00:00,  1.55s/it]
Epoch 2 | VALID: 100%|██████████| 9/9 [00:11<00:00,  1.27s/it]


Training loss: 0.467551968553487
Validation loss: 1.8283549414740667
ACC: 0.4307692307692308



Epoch 3 | TRAIN: 100%|██████████| 68/68 [01:46<00:00,  1.56s/it]
Epoch 3 | VALID: 100%|██████████| 9/9 [00:11<00:00,  1.27s/it]


Training loss: 0.20146447825519478
Validation loss: 1.7948171430163913
ACC: 0.4230769230769231



Epoch 4 | TRAIN: 100%|██████████| 68/68 [01:47<00:00,  1.58s/it]
Epoch 4 | VALID: 100%|██████████| 9/9 [00:11<00:00,  1.26s/it]

Training loss: 0.12773847125251503
Validation loss: 1.8479702075322468
ACC: 0.3769230769230769






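По результатам эксперимента выше и других экспериментов, проведённых отдельно, можно сказать, что сеть достаточно быстро переобучается: функция потерь на обучении падает с 2.21 до 0.13, тогда как на валидации она остаётся в пределах 1.8–2.0. Скорее всего, это происходит из-за того, что бэкбон предобучен на том же наборе данных.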

Лучшая достигнутая точность (accuracy) на валидационной выборке составила 0.43.