In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

In [3]:
def mean_class_accuracy(predicts, labels):
    """Return the per-class accuracy averaged over all classes.

    For every class in the confusion matrix the fraction of its true samples
    that were predicted correctly is computed; the result is the unweighted
    mean of these fractions.

    Args:
        predicts: predicted class ids, one per sample.
        labels: ground-truth class ids, one per sample.

    Returns:
        The mean of the per-class accuracies as a float.
    """
    cm = confusion_matrix(y_true=labels, y_pred=predicts)

    # Row sums count the samples of each true class (sklearn puts true labels
    # on the rows); the diagonal holds the correctly classified ones.
    per_class_total = cm.sum(axis=1)
    per_class_correct = np.diag(cm)

    # A class with no true samples (row sum of zero) contributes 0.0 instead
    # of triggering a division by zero.
    per_class_acc = np.divide(
        per_class_correct,
        per_class_total,
        out=np.zeros(len(per_class_total), dtype=float),
        where=per_class_total != 0,
    )

    return np.mean(per_class_acc)

In [12]:
# Ground-truth labels, indexed by attachment id.
# NOTE(review): read_table defaults to a tab separator, so these ".csv" files
# are presumably tab-delimited - confirm.
gt_df = (
    pd.read_table('./data/test_18.csv')
    .loc[:, ['attachment_id', 'label']]
    .set_index('attachment_id')
)

# Model predictions, indexed the same way so they can be joined to the labels.
pr_df = (
    pd.read_table('./submit/predicts.csv')
    .loc[:, ['attachment_id', 'class_indx']]
    .set_index('attachment_id')
)

In [13]:
# Align predictions with the ground truth on the shared attachment_id index.
# The default left join keeps every labelled sample.
df = gt_df.join(pr_df)

# A labelled sample without a matching prediction ends up as NaN after the
# join; report that explicitly rather than feeding NaN into the metric.
missing_predictions = df['class_indx'].isna()
if missing_predictions.any():
    raise ValueError(
        f'{int(missing_predictions.sum())} ground-truth samples have no prediction'
    )

# Pass both arguments as 1-D arrays (the label column was previously passed
# as a single-column DataFrame).
mean_class_accuracy(df['class_indx'].values, df['label'].values)

0.8366633366633367

In [5]:
# Most common, no tta:
# 0.6583416583416584
#
# Most common, flip:
# 0.6733266733266733
#
# Most common, five crops 256 - 256: 
# 0.6858141858141859
#
# Most common, five crops 256 - 240: 
# 0.6758241758241759
#

# Max model Most common, no tta:
# 0.7447552447552448
# Max model Most common, flip:
# 0.7447552447552448
# Max model Most common, five crops 256 - 256: 
# 0.7527472527472527

# PreLast model Most common, no tta:
# 0.7937062937062938

# Last model Most common, no tta:
# 0.8381618381618382

# Last model Most common, no tta, repeat last:
# 0.8361638361638362

In [8]:
# show_video_in_jupyter('/home/user/datasets/slovo/dataset_15/45eb40a3-edd3-437f-9cfa-5040df72d275.mp4')

In [16]:
import torch
import numpy as np
import torchvision.transforms as transforms
from decord import VideoReader
from denku import show_video_in_jupyter, show_images

class VideoProcessor:
    """Cut a video into fixed-size, strided clips and preprocess them.

    Frames are read through ``decord.VideoReader``, scaled by 1/255, resized,
    center-cropped and normalized with mean 0.5 / std 0.5 per channel.

    Args:
        video_path: path passed to ``VideoReader``.
        min_side: size passed to ``transforms.Resize``.
        sample_size: side of the square center crop.
        sample_stride: distance (in source frames) between sampled frames.
        sample_n_frames: number of frames in every clip.
        batch_stride: distance (in source frames) between consecutive clip
            start positions in ``get_video_batch``.
    """

    def __init__(self, 
                 video_path, 
                 min_side=256,
                 sample_size=224, 
                 sample_stride=2, 
                 sample_n_frames=32,
                 batch_stride=8,
            ):
        
        self.video_reader = VideoReader(video_path)
        self.video_length = len(self.video_reader)

        self.sample_stride = sample_stride
        self.sample_n_frames = sample_n_frames
        self.batch_stride = batch_stride
        
        self.pixel_transforms = transforms.Compose([
            transforms.Resize(min_side, antialias=False),
            transforms.CenterCrop((sample_size, sample_size)),
            # Applied after the division by 255 in get_video: (x - 0.5) / 0.5.
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])

    def _full_clip_length(self):
        """Number of source frames spanned by one clip at full stride."""
        return (self.sample_n_frames - 1) * self.sample_stride + 1
    
    def get_frames(self, start_frame):
        """Read ``sample_n_frames`` frames beginning at ``start_frame``.

        When fewer source frames remain than a full clip spans, the indices
        are spread evenly over what is left, so some frames may repeat.

        Returns:
            Tensor with the channel axis moved to position 1
            (assumes decord yields frames as (N, H, W, C) - TODO confirm).
        """
        clip_length = min(self.video_length - start_frame, self._full_clip_length())
        batch_index = np.linspace(start_frame, start_frame + clip_length - 1, self.sample_n_frames, dtype=int)
        frames = self.video_reader.get_batch(batch_index).asnumpy()
        return torch.from_numpy(frames).permute(0, 3, 1, 2).contiguous()

    def get_video(self, start_frame=0):
        """Return one preprocessed clip starting at ``start_frame``.

        The first two axes of the transformed frames are swapped before
        returning (presumably giving (C, T, H, W) - verify against the model).
        """
        pixel_values = self.get_frames(start_frame)
        pixel_values = pixel_values / 255.0
        pixel_values = self.pixel_transforms(pixel_values)
        pixel_values = pixel_values.permute(1, 0, 2, 3)
        return pixel_values
    
    def get_video_batch(self):
        """Return preprocessed clips taken every ``batch_stride`` frames.

        Start positions run from 0 up to and including the last position at
        which a full clip still fits. A video shorter than one clip yields a
        single clip starting at frame 0.
        """
        # Last start index at which a whole clip fits (0 for short videos).
        last_start = max(self.video_length - self._full_clip_length(), 0)

        # range() excludes its stop value, hence the +1 so that the final
        # full window is not dropped.
        return [
            self.get_video(start_frame=start_frame)
            for start_frame in range(0, last_start + 1, self.batch_stride)
        ]
    
# Build a processor for one sample video and show how many clips it yields.
video_processor = VideoProcessor(
    video_path='/home/user/datasets/slovo/dataset_15/45eb40a3-edd3-437f-9cfa-5040df72d275.mp4',
)
len(video_processor.get_video_batch())

1