In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import re
import shutil
import subprocess
import threading
import time

import numpy as np
import pandas as pd
from tqdm import tqdm

- `metadata_renamed` — то, что делал Влад (наши переработанные данные)
- `metadata_old` — категории Rutube
- `metadata_orig` — категории Rutube + то, что делал Влад
- `metadata_filtered` — `metadata_orig`, в котором категории отфильтрованы по частоте встречаемости

In [3]:
# Root of the shared storage that every path in this notebook is built from.
BASE = '/mnt/nfs'

metadata_old_csv = f'{BASE}/data/metadata_old.csv'
metadata_renamed_csv = f'{BASE}/data/metadata_renamed.csv'
train_categories_csv = f'{BASE}/data/train_data_categories.csv'

# metadata_r: the Rutube-category metadata (metadata_old.csv).
# metadata_o: our own reworked metadata (metadata_renamed.csv).
metadata_r = pd.read_csv(metadata_old_csv)
metadata_o = pd.read_csv(metadata_renamed_csv)

# Rows with any missing value are discarded right after loading.
categories_raw = pd.read_csv(train_categories_csv)
categories = categories_raw.dropna()

In [24]:
def categories_extraction(x):
    """Collapse a raw tag string into its unique leading category names.

    The string is split on commas; every piece is cut at the first ``:`` or
    tab character and stripped of surrounding whitespace, so only the
    leading part of each piece is kept.

    Args:
        x: Comma-separated tag string.

    Returns:
        The distinct leading names joined with ``', '``, in order of first
        appearance. Empty names (e.g. produced by a trailing comma) are
        dropped.
    """
    leading_names = (
        re.split(r'[:\t]', piece)[0].strip() for piece in x.split(',')
    )
    # dict.fromkeys de-duplicates while keeping insertion order, so the
    # result no longer depends on set iteration order, which changes between
    # interpreter runs because of string hash randomisation.
    unique_names = dict.fromkeys(name for name in leading_names if name)
    return ', '.join(unique_names)

# Store each row's extracted leading categories next to the raw tag string.
lvl0_values = categories['tags'].apply(categories_extraction)
categories['lvl0'] = lvl0_values

In [25]:
# Every distinct category name that appears in any row's `lvl0` string;
# empty names are skipped.
all_categories = {
    name
    for joined_names in categories.lvl0
    for name in joined_names.split(', ')
    if name
}

## Merge videos

In [26]:
# A video is usable only if its frames directory holds at least this many entries.
MIN_FRAMES = 64


def _has_audio(path):
    """Return True when `path` exists and is not an empty file."""
    return os.path.exists(path) and os.path.getsize(path) > 0


def _has_frames(path, min_frames=MIN_FRAMES):
    """Return True when `path` is a directory with at least `min_frames` entries."""
    return os.path.isdir(path) and len(os.listdir(path)) >= min_frames


# Merge the datasets: drop every video of ours whose parsed frames or audio
# are missing or incomplete.
ids_to_delete = {
    video_id
    for video_id in tqdm(metadata_o.video_id)
    if not _has_frames(f'{BASE}/parsed_data/frames/{video_id}/')
    or not _has_audio(f'{BASE}/parsed_data/audio/{video_id}.mp3')
}

metadata_o = metadata_o[~metadata_o.video_id.isin(ids_to_delete)]
# ignore_index avoids duplicated index labels in the combined frame.
metadata = pd.concat((metadata_o, metadata_r), ignore_index=True)

# Copy audio and frames of the surviving videos next to the rest of the data.
os.makedirs(f'{BASE}/data/audio', exist_ok=True)
os.makedirs(f'{BASE}/data/frames', exist_ok=True)

for video_id in tqdm(metadata_o.video_id):
    audio_path = f'{BASE}/parsed_data/audio/{video_id}.mp3'
    new_audio_path = f'{BASE}/data/audio/{video_id}.mp3'

    if not _has_audio(new_audio_path):
        # shutil instead of `cp` through a shell: no quoting problems with
        # unusual ids, and a failed copy raises instead of passing silently.
        shutil.copy(audio_path, new_audio_path)

    frames_path = f'{BASE}/parsed_data/frames/{video_id}/'
    new_frames_path = f'{BASE}/data/frames/{video_id}/'

    if not _has_frames(new_frames_path):
        # dirs_exist_ok=True (Python 3.8+) tops up a partially copied
        # directory in place; `cp -r src/ dst/` would instead nest the source
        # directory inside an already existing destination.
        shutil.copytree(frames_path, new_frames_path, dirs_exist_ok=True)

  0%|          | 0/7220 [00:00<?, ?it/s]

100%|██████████| 7220/7220 [00:01<00:00, 6108.13it/s]
100%|██████████| 1675/1675 [00:00<00:00, 2162.76it/s]


In [27]:
# Persist the merged metadata; the index is not written to the file.
metadata_orig_csv = f'{BASE}/data/metadata_orig.csv'
metadata.to_csv(metadata_orig_csv, index=False)

## Filter by frequency

In [40]:
# Reload the merged metadata from disk, so the filtering below works on the saved file.
metadata_orig_csv = f'{BASE}/data/metadata_orig.csv'
metadata = pd.read_csv(metadata_orig_csv)

In [42]:
# Thresholds for the frequency filter.
MAX_VIDEOS_PER_CATEGORY = 100  # rows arriving after a category reached this count are dropped
MIN_VIDEOS_PER_CATEGORY = 10   # rows containing a category counted fewer times than this are dropped

# Split every row's category string once. Dedicated names are used so that
# neither the `all_categories` set nor the `categories` DataFrame defined in
# earlier cells is overwritten, which keeps this cell and those cells
# re-runnable.
row_categories = [joined.split(', ') for joined in metadata['category']]

# Row positions (0-based, as used with `iloc`) that must be removed.
positions_to_delete = set()

# 1. Drop videos carrying a category that is not among the known categories.
for position, names in enumerate(row_categories):
    if any(name not in all_categories for name in names):
        positions_to_delete.add(position)

# 2. Cap frequent categories: once a category has been counted
#    MAX_VIDEOS_PER_CATEGORY times, later rows containing it are dropped.
# NOTE(review): the counters also include rows already marked for deletion
# (by step 1, or by another category of the same row hitting the cap);
# confirm that this is intended.
category_counts = {}
for position, names in enumerate(row_categories):
    for name in names:
        seen = category_counts.get(name, 0)
        if seen < MAX_VIDEOS_PER_CATEGORY:
            category_counts[name] = seen + 1
        else:
            positions_to_delete.add(position)

# 3. Drop videos that contain a rarely occurring category.
for position, names in enumerate(row_categories):
    if any(category_counts[name] < MIN_VIDEOS_PER_CATEGORY for name in names):
        positions_to_delete.add(position)

# Kept as a list under the original name for the cells that follow.
indices_to_delete = sorted(positions_to_delete)

In [43]:
# `indices_to_delete` holds row *positions*, so the keep-list is built from
# positions as well (not from index labels) and then fed to `iloc`.
# Sorting keeps the rows of the saved file in their original order.
filtered_indices = sorted(set(range(len(metadata))) - set(indices_to_delete))
metadata_filtered = metadata.iloc[filtered_indices]
metadata_filtered.to_csv(f'{BASE}/data/metadata_filtered.csv', index=False)

In [44]:
# Display (rows, columns) of the filtered metadata as a quick sanity check.
metadata_filtered.shape

(1648, 5)

## Categories + Tags

In [49]:
# The stacked variant starts from the merged (unfiltered) metadata file.
stacked_source_csv = f'{BASE}/data/metadata_orig.csv'
metadata_stacked = pd.read_csv(stacked_source_csv)

In [65]:
# Start from the file on disk rather than from the in-memory frame, so that
# re-running this cell cannot append the categories a second time.
metadata_stacked = pd.read_csv(f'{BASE}/data/metadata_orig.csv')

stacked_tags = metadata_stacked['tag']
stacked_categories = metadata_stacked['category']

# "tag, category" where both values exist. If one side is missing, keep the
# other one instead of letting NaN wipe out the whole cell.
metadata_stacked['tag'] = (
    (stacked_tags + ', ' + stacked_categories)
    .fillna(stacked_tags)
    .fillna(stacked_categories)
)

# NOTE(review): unlike the other exports in this notebook, this one does not
# pass index=False, so the index is written as an extra column - confirm
# that the consumer of metadata_stacked.csv expects that.
metadata_stacked.to_csv(f'{BASE}/data/metadata_stacked.csv')

## Try train

In [66]:
from functools import partial
from torch.utils.data import DataLoader
from src.modelling.video_dataset import get_datasets, collate_fn

def get_loaders(path_to_data, val_size, test_size, seed, list_no_include_cat, batch_size, use_text_augmentation,
                use_text_lematization, num_workers, pin_memory):
    """Create the train / validation / test DataLoaders.

    Args:
        path_to_data: Forwarded to `get_datasets` as its first argument.
        val_size: Forwarded to `get_datasets` as `val_size`.
        test_size: Forwarded to `get_datasets` as `test_size`.
        seed: Forwarded to `get_datasets` as `seed`.
        list_no_include_cat: Forwarded to `get_datasets` as `categories`.
        batch_size: Batch size used by all three loaders.
        use_text_augmentation: Bound to `collate_fn` as `use_augmentation`
            for the training loader only.
        use_text_lematization: Bound to `collate_fn` as `use_lemmatization`
            for the training loader only.
        num_workers: `num_workers` of all three loaders.
        pin_memory: `pin_memory` of all three loaders.

    Returns:
        Tuple `(train_loader, valid_loader, test_loader, classes,
        num_classes)`, where `classes` is `train_dataset.all_categories`
        and `num_classes` is its length.
    """
    # Build the three datasets.
    train_dataset, valid_dataset, test_dataset = get_datasets(
        path_to_data,
        val_size=val_size,
        test_size=test_size,
        seed=seed,
        categories=list_no_include_cat,
    )

    # Settings shared by every loader.
    shared_kwargs = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )

    # Training loader: shuffled, and the only one whose collate function
    # receives the augmentation / lemmatization switches.
    train_collate = partial(
        collate_fn,
        use_augmentation=use_text_augmentation,
        use_lemmatization=use_text_lematization,
    )
    train_loader = DataLoader(train_dataset, shuffle=True, collate_fn=train_collate, **shared_kwargs)

    # Validation and test loaders: fixed order, plain collate function.
    valid_loader = DataLoader(valid_dataset, shuffle=False, collate_fn=collate_fn, **shared_kwargs)
    test_loader = DataLoader(test_dataset, shuffle=False, collate_fn=collate_fn, **shared_kwargs)

    # The class list is taken from the training dataset.
    classes = train_dataset.all_categories
    return train_loader, valid_loader, test_loader, classes, len(classes)

  from .autonotebook import tqdm as notebook_tqdm
2024-09-28 10:06:46.456273: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-28 10:06:46.629262: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-28 10:06:46.677619: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-28 10:06:47.556580: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] 

In [70]:
def encode_labels(labels, num_classes, device):
    """Turn per-sample labels into a 0/1 matrix.

    Args:
        labels: Sized sequence with one entry per sample; each entry is used
            as a column index into that sample's row.
        num_classes: Number of columns of the result.
        device: Device the result tensor is moved to.

    Returns:
        Tensor of shape `(len(labels), num_classes)` holding 1.0 at the
        indexed columns of every row and 0.0 elsewhere.
    """
    n_rows = len(labels)
    encoded = torch.zeros((n_rows, num_classes)).to(device)
    for row, columns in zip(range(n_rows), labels):
        # Mark this sample's label column(s).
        encoded[row, columns] = 1.0
    return encoded

def decode_labels(matrix, idx2cat):
    """Map every row of a label matrix back to category names.

    Args:
        matrix: Iterable of 1-D tensors (e.g. a 2-D tensor), one per sample.
        idx2cat: Lookup from column index to category, indexed with `[]`.

    Returns:
        One list per row holding `idx2cat[i]` for each column `i` whose
        value in that row is non-zero, in ascending column order.
    """
    return [
        [idx2cat[column] for column in torch.nonzero(row).flatten().tolist()]
        for row in matrix
    ]

In [67]:
# Build the loaders: 20% validation / 20% test split, no category list passed,
# one sample per batch, text augmentation and lemmatization switched on.
loaders_and_classes = get_loaders(
    path_to_data='/mnt/nfs/data',
    val_size=0.2,
    test_size=0.2,
    seed=17,
    list_no_include_cat=[],
    batch_size=1,
    use_text_augmentation=1,
    use_text_lematization=1,
    num_workers=2,
    pin_memory=1,
)
train_loader, valid_loader, test_loader, classes, num_classes = loaders_and_classes

In [80]:
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn

# `device` is not defined in any earlier cell, so a fresh kernel would fail
# with NameError here. Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained tokenizer and encoder; the model is moved to the chosen device.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2").to(device)



In [98]:
# NOTE(review): `loss` is not assigned in any cell of this notebook, so this
# cell raises NameError on a fresh kernel - restore the cell that computes it
# or remove this one.
loss

[1;35mtensor[0m[1m([0m[1;36m356746.1875[0m, [33mdevice[0m=[32m'cuda:0'[0m, [33mgrad_fn[0m=[1m<[0m[1;95mDivBackward1[0m[1m>[0m[1m)[0m