In [1]:
"""Обучить модель классификации видео с другим подходом и провести сравнение - 5 баллов"""

# Начало блокнота до создания модели скопирую с блокнота 2 и 3 

'Обучить модель классификации видео с другим подходом и провести сравнение - 5 баллов'

Каждое видео сопровождается аудиодорожкой. Возьмем аудиодорожку из каждого видео и попробуем предсказать label по ней. 
Будем брать аудиозаписи длиной 5 секунд на частоте 44,1 кГц (исходная частота аудиодорожек). Если аудиозапись длиннее — будем брать случайный кроп. 
Если короче — будем добавлять паддинг этой же аудиозаписью. Далее аудиозаписи будем переводить в изображение (mel-спектрограмму) и работать с ними как с изображениями. Таким образом, каждая видеозапись будет конвертирована в аудиозапись, а затем в изображение. 
Для данной задачи будем обучать сеть EfficientNet-B2 (tf_efficientnet_b2). 

In [2]:
# Импорт либ
import os
import random
import time
import warnings

import torch
import timm
import pandas as pd
import numpy as np
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
import torch.nn.functional as F
import albumentations
import albumentations as A

from box import Box
from tqdm import tqdm
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchvision.io import read_video
from albumentations.pytorch.transforms import ToTensorV2

warnings.simplefilter("ignore", UserWarning)

In [3]:
# Load the video index (one row per clip) and preview the first rows.
data_csv = "../data.csv"
df = pd.read_csv(data_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,name_video,label
0,0,videos/video_0000.mp4,tap dancing
1,1,videos/video_0001.mp4,tap dancing
2,2,videos/video_0002.mp4,tap dancing
3,3,videos/video_0003.mp4,tap dancing
4,4,videos/video_0004.mp4,tap dancing


In [4]:
# Encode the text labels as integer class ids, numbered in the order
# in which each label first appears in the frame.
unique_labels = df['label'].unique()
label_dict = dict(zip(unique_labels, range(len(unique_labels))))
df['target'] = df['label'].map(label_dict)

df.head()

Unnamed: 0.1,Unnamed: 0,name_video,label,target
0,0,videos/video_0000.mp4,tap dancing,0
1,1,videos/video_0001.mp4,tap dancing,0
2,2,videos/video_0002.mp4,tap dancing,0
3,3,videos/video_0003.mp4,tap dancing,0
4,4,videos/video_0004.mp4,tap dancing,0


In [5]:
# Training configuration shared by the cells below.
config = Box()

config.num_workers = 1
config.batch_size = 24
config.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
config.seed = 1771
config.model_name = 'tf_efficientnet_b2'
# Number of distinct target classes (the attribute name is kept for compatibility).
config.num_features = df.target.nunique()
config.optimizer_lr = 0.001
config.epochs = 20
config.test_size = 0.2

# Seed every random number generator the notebook relies on (random crops,
# DataLoader shuffling, weight initialisation); otherwise config.seed only
# affects the places where it is passed explicitly.
random.seed(config.seed)
np.random.seed(config.seed)
torch.manual_seed(config.seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(config.seed)

In [6]:
def crop_or_pad(y, length, start=None):
    """
    Force a 1-D signal to exactly ``length`` samples.

    A signal shorter than ``length`` is extended by repeating itself (the last
    repetition is truncated), a longer one is cropped to a window of ``length``
    samples, and a signal of exactly ``length`` samples is returned unchanged.
    An empty signal yields ``length`` zeros because there is nothing to repeat.

    :param y: 1-D array-like with the audio samples
    :param length: required number of samples in the result
    :param start: first sample of the crop window for long signals;
                  ``None`` picks a random window, ``0`` is honoured as given
    :return: numpy array with ``length`` samples
    """
    y = np.asarray(y)
    n_samples = len(y)

    if n_samples == 0:
        return np.zeros(length)

    if n_samples < length:
        # Tile the signal itself; zero-padding first would make the
        # repetition a no-op and leave silence at the end of the clip.
        n_repeats, remainder = divmod(length, n_samples)
        y = np.concatenate([y] * n_repeats + [y[:remainder]])

    elif n_samples > length:
        if start is None:
            # +1 so that the very last window is also a possible choice.
            start = np.random.randint(n_samples - length + 1)
        y = y[start:start + length]

    return y

def compute_melspec(y, sr, n_mels, fmin, fmax, n_fft=2048, hop_length=512):
    """
    Compute a mel-spectrogram of a signal and convert it to decibels.

    Arguments:
        y {np array} -- audio signal
        sr {int} -- sampling rate of ``y``
        n_mels {int} -- number of mel bands
        fmin {number} -- lowest frequency of the mel filter bank
        fmax {number or None} -- highest frequency; ``None`` means ``sr // 2``
        n_fft {int} -- FFT window size passed to librosa
        hop_length {int} -- hop between frames passed to librosa
    Returns:
        np array -- mel-spectrogram in dB, scaled relative to its own maximum
    """
    upper_freq = sr // 2 if fmax is None else fmax

    power_spec = lb.feature.melspectrogram(
        y=y,
        sr=sr,
        n_mels=n_mels,
        fmin=fmin,
        fmax=upper_freq,
        n_fft=n_fft,
        hop_length=hop_length,
    )

    # ref=np.max makes the loudest bin of this spectrogram the 0 dB reference.
    return lb.power_to_db(power_spec.astype(np.float32), ref=np.max)

def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """
    Standardise an array and rescale it to 8-bit intensities (0..255).

    :param X: numpy array (e.g. a dB-scaled spectrogram)
    :param eps: added to ``std`` to avoid division by zero; also the minimum
                value range below which the input is treated as constant
    :param mean: value subtracted from ``X``; ``None`` uses ``X.mean()``.
                 Explicit zeros and array-valued statistics are honoured.
    :param std: value ``X`` is divided by; ``None`` uses ``X.std()``
    :return: ``np.uint8`` array of the same shape as ``X``; all zeros when
             the standardised input is (almost) constant
    """
    # Compare with None explicitly: `mean or ...` silently discards 0 and
    # raises for array-valued statistics.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        # Min-max scale to [0, 255]; values are truncated by the uint8 cast.
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        V = np.zeros_like(X, dtype=np.uint8)

    return V

def normalize(image):
    """
    Replicate a single-channel 8-bit image into three channels and apply
    mean/std normalisation with the constants below.

    :param image: numpy array, cast to ``uint8`` before use
                  (assumed to be 2-D, e.g. the output of mono_to_color -- TODO confirm)
    :return: normalised array with all axes reversed by ``.T``, so the three
             replicated channels end up on the first axis
    """
    gray = image.astype(np.uint8)
    rgb = np.stack((gray, gray, gray), axis=-1)

    channel_norm = albumentations.Compose([
        albumentations.Normalize(mean=(0.485, 0.456, 0.406),
                                 std=(0.229, 0.224, 0.225))
    ])
    normalized = channel_norm(image=rgb)['image']

    # .T reverses every axis, not only the channel axis.
    return normalized.T


In [16]:
# Изменим наш датасет 
from moviepy.editor import VideoFileClip
import librosa as lb
from scipy.signal import resample

class DanceAudioSet(Dataset):
    """
    Dataset that turns the audio track of each video into a 3-channel
    mel-spectrogram image and returns it together with the class id.

    ``df`` must provide the columns ``name_video`` (path relative to
    ``self.video_path``) and ``target`` (integer class id).
    """

    # Sample rate the audio is decoded at and the spectrogram is computed with.
    SAMPLE_RATE = 44100
    # Length of the audio excerpt fed to the model, in seconds.
    CLIP_SECONDS = 5
    # Shape of the placeholder returned when a clip cannot be processed.
    # NOTE(review): presumably equals the shape of a real sample
    # (3, frames, n_mels) for a 5 s clip at 44.1 kHz with hop 512 -- confirm.
    FALLBACK_SHAPE = (3, 431, 128)

    def __init__(self, df, is_train = False):
        self.df = df
        self.video_path = ".."
        # Stored for callers; not read anywhere inside this class.
        self.is_train = is_train

    def __len__(self):
        return len(self.df)

    def _load_mono_audio(self, video_path):
        """Decode the audio track of ``video_path`` into a mono numpy array."""
        video = VideoFileClip(video_path)
        try:
            audio = video.audio
            if audio is None:
                raise ValueError(f"no audio track in {video_path}")
            # Request the sample rate explicitly so the spectrogram parameters
            # stay valid even if a file was encoded at a different rate.
            audio_array = audio.to_soundarray(fps=self.SAMPLE_RATE)
        finally:
            # Always release the underlying readers, even on failure.
            video.close()

        if audio_array.ndim > 1:
            return np.mean(audio_array, axis=1)
        return audio_array

    def __getitem__(self, idx):
        """
        Return ``(image, label)`` for the sample at position ``idx``.

        ``image`` is a float32 tensor and ``label`` a scalar int64 tensor.
        If the clip cannot be read or processed, a warning is emitted and an
        all-zero float32 placeholder of ``FALLBACK_SHAPE`` is returned instead,
        so a single broken file does not abort training.
        """
        # Positional lookup: works regardless of how the frame is indexed.
        row = self.df.iloc[idx]
        target = row['target']
        video_path = os.path.join(self.video_path, row['name_video'])

        target_sr = self.SAMPLE_RATE
        try:
            audio_mono = self._load_mono_audio(video_path)
            crop_audio = crop_or_pad(audio_mono, length=target_sr * self.CLIP_SECONDS)

            melspec = compute_melspec(crop_audio,
                                      sr=target_sr,
                                      n_mels=128,
                                      fmin=0,
                                      fmax=target_sr // 2,
                                      n_fft=2048,
                                      hop_length=512)
            image = mono_to_color(melspec)
            image = normalize(image)
            image = torch.tensor(image).float()
        except Exception as err:
            # RuntimeWarning (not UserWarning) so the message is not hidden by
            # a filter that ignores UserWarning.
            warnings.warn(
                f"Could not build a spectrogram for {video_path}: {err!r}; "
                "using an all-zero placeholder",
                RuntimeWarning,
            )
            # Same dtype as real samples so batches collate consistently.
            image = torch.zeros(self.FALLBACK_SHAPE, dtype=torch.float32)

        label = torch.tensor(int(target), dtype=torch.long)
        return image, label

In [17]:
# Stratified train/validation split, then one dataset and one loader per part.
train_df, val_df = train_test_split(
    df,
    test_size=config.test_size,
    random_state=config.seed,
    stratify=df['target'],
)

# reset_index() renumbers the rows 0..N-1 so the datasets can address them
# by the integer positions the loaders hand out.
dataset_train = DanceAudioSet(train_df.reset_index(), is_train=True)
dataset_test = DanceAudioSet(val_df.reset_index())

# num_workers is deliberately not passed, so the DataLoader default is used.
loader_kwargs = dict(batch_size=config.batch_size)
train_loader = DataLoader(dataset_train, shuffle=True, **loader_kwargs)
valid_loader = DataLoader(dataset_test, **loader_kwargs)

In [18]:
# Pretrained backbone with a fresh classification head.
model_name = config.model_name
model = timm.create_model(model_name, pretrained=True)
# The head must have one output per class. It was previously sized with
# config.epochs, which tied the network architecture to the training length.
model.classifier = nn.Sequential(
    nn.Linear(model.classifier.in_features, config.num_features)
)
model.to(config.device)

config.device

device(type='cuda')

In [19]:
# Cross-entropy over the class logits, AdamW on all model parameters,
# and a step schedule that multiplies the learning rate by 0.8 every 3 epochs.
loss_f = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=config.optimizer_lr,
)
scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=3, gamma=0.8)

In [20]:
# Train the model. The full model is saved after every epoch so that an
# interrupted run does not lose the progress made so far.
for epoch_i in range(1, config.epochs + 1):
    start = time.time()

    print(f'---------------------epoch:{epoch_i}/{config.epochs}---------------------')

    # Sample-weighted loss sums are kept as plain Python floats: adding the
    # loss tensors themselves would keep their autograd history alive for
    # the whole epoch.
    avg_train_loss = 0.0
    avg_val_loss = 0.0
    # Number of correctly classified validation samples.
    summa = 0

    ############## Train #############
    model.train()
    train_pbar = tqdm(train_loader, desc="Training")
    for X, y in train_pbar:
        X_batch = X.to(config.device)
        y_batch = y.to(config.device)

        optimizer.zero_grad()
        # Call the module (not .forward) so registered hooks are executed.
        res = model(X_batch)
        loss = loss_f(res, y_batch)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        avg_train_loss += batch_loss * len(y_batch)

        if torch.cuda.is_available():
            train_pbar.set_postfix(gpu_load=f"{torch.cuda.memory_allocated() / 1024 ** 3:.2f}GB",
                                   loss=f"{batch_loss:.4f}")
        else:
            train_pbar.set_postfix(loss=f"{batch_loss:.4f}")

    ########## VALIDATION ###############
    model.eval()
    valid_pbar = tqdm(valid_loader, desc="Testing")
    with torch.no_grad():
        for X, y in valid_pbar:
            X_batch = X.to(config.device)
            y_batch = y.to(config.device)

            res = model(X_batch)

            batch_loss = loss_f(res, y_batch).item()
            avg_val_loss += batch_loss * len(y_batch)
            valid_pbar.set_postfix(loss=f"{batch_loss:.4f}")

            # Softmax is monotonic, so the argmax of the logits is the
            # predicted class.
            preds = res.argmax(dim=1)
            summa += (preds == y_batch).sum().item()

    torch.cuda.empty_cache()

    # Turn the weighted sums into per-sample averages.
    avg_train_loss = avg_train_loss / len(dataset_train)
    avg_val_loss = avg_val_loss / len(dataset_test)

    acc = summa / len(dataset_test)

    print(f'epoch: {epoch_i}, lr_rate {optimizer.param_groups[0]["lr"]}')

    print("loss_train: %0.4f| loss_valid: %0.4f|" % (avg_train_loss, avg_val_loss))
    # The former spec ".<5g" padded short values with dots (e.g. "0.5..").
    print(f"metric {acc:.4f}")

    elapsed_time = time.time() - start
    hours = int(elapsed_time // 3600)
    minutes = int((elapsed_time % 3600) // 60)
    seconds = int(elapsed_time % 60)
    print(f"Elapsed time: {hours:02d}:{minutes:02d}:{seconds:02d}")

    # The learning rate printed above is the one used during this epoch;
    # the scheduler is advanced afterwards.
    scheduler.step()
    # Saves the whole module object (not just a state_dict).
    torch.save(model, f"model_ep_{epoch_i}.pt")

---------------------epoch:1/20---------------------


Training: 100%|██████████| 81/81 [10:43<00:00,  7.95s/it, gpu_load=3.21GB, loss=2.3662]
Testing: 100%|██████████| 21/21 [02:39<00:00,  7.61s/it, loss=2.0172]


epoch: 1, lr_rate 0.001
loss_train: 2.4141| loss_valid: 2.5576|
metric 0.244856
Elapsed time: 00:13:23
---------------------epoch:2/20---------------------


Training: 100%|██████████| 81/81 [11:00<00:00,  8.15s/it, gpu_load=3.21GB, loss=1.7755]
Testing: 100%|██████████| 21/21 [02:52<00:00,  8.22s/it, loss=1.9081]


epoch: 2, lr_rate 0.001
loss_train: 2.0361| loss_valid: 2.2602|
metric 0.323045
Elapsed time: 00:13:53
---------------------epoch:3/20---------------------


Training: 100%|██████████| 81/81 [11:53<00:00,  8.81s/it, gpu_load=3.21GB, loss=1.9143]
Testing: 100%|██████████| 21/21 [03:05<00:00,  8.84s/it, loss=1.8884]


epoch: 3, lr_rate 0.001
loss_train: 1.8171| loss_valid: 2.0315|
metric 0.399177
Elapsed time: 00:14:59
---------------------epoch:4/20---------------------


Training: 100%|██████████| 81/81 [11:53<00:00,  8.81s/it, gpu_load=3.21GB, loss=1.4358]
Testing: 100%|██████████| 21/21 [02:46<00:00,  7.94s/it, loss=1.8757]


epoch: 4, lr_rate 0.0008
loss_train: 1.5516| loss_valid: 2.1806|
metric 0.390947
Elapsed time: 00:14:40
---------------------epoch:5/20---------------------


Training: 100%|██████████| 81/81 [10:41<00:00,  7.92s/it, gpu_load=3.21GB, loss=1.7663]
Testing: 100%|██████████| 21/21 [02:32<00:00,  7.26s/it, loss=1.7021]


epoch: 5, lr_rate 0.0008
loss_train: 1.3342| loss_valid: 2.1246|
metric 0.353909
Elapsed time: 00:13:13
---------------------epoch:6/20---------------------


Training: 100%|██████████| 81/81 [11:32<00:00,  8.54s/it, gpu_load=3.21GB, loss=1.5640]
Testing: 100%|██████████| 21/21 [02:57<00:00,  8.45s/it, loss=1.8947]


epoch: 6, lr_rate 0.0008
loss_train: 1.1703| loss_valid: 2.1359|
metric 0.395062
Elapsed time: 00:14:29
---------------------epoch:7/20---------------------


Training: 100%|██████████| 81/81 [11:24<00:00,  8.45s/it, gpu_load=3.21GB, loss=1.2133]
Testing: 100%|██████████| 21/21 [02:56<00:00,  8.42s/it, loss=2.6311]


epoch: 7, lr_rate 0.00064
loss_train: 0.9020| loss_valid: 2.1918|
metric 0.432099
Elapsed time: 00:14:21
---------------------epoch:8/20---------------------


Training: 100%|██████████| 81/81 [12:11<00:00,  9.03s/it, gpu_load=3.21GB, loss=0.6375]
Testing: 100%|██████████| 21/21 [02:42<00:00,  7.73s/it, loss=2.5433]


epoch: 8, lr_rate 0.00064
loss_train: 0.7078| loss_valid: 2.3737|
metric 0.390947
Elapsed time: 00:14:54
---------------------epoch:9/20---------------------


Training: 100%|██████████| 81/81 [11:30<00:00,  8.52s/it, gpu_load=3.21GB, loss=1.0601]
Testing: 100%|██████████| 21/21 [03:10<00:00,  9.07s/it, loss=3.1093]


epoch: 9, lr_rate 0.00064
loss_train: 0.5995| loss_valid: 2.6258|
metric 0.390947
Elapsed time: 00:14:40
---------------------epoch:10/20---------------------


Training: 100%|██████████| 81/81 [10:56<00:00,  8.10s/it, gpu_load=3.21GB, loss=0.5065]
Testing: 100%|██████████| 21/21 [02:33<00:00,  7.32s/it, loss=2.3375]


epoch: 10, lr_rate 0.0005120000000000001
loss_train: 0.4350| loss_valid: 2.5636|
metric 0.425926
Elapsed time: 00:13:30
---------------------epoch:11/20---------------------


Training: 100%|██████████| 81/81 [10:35<00:00,  7.85s/it, gpu_load=3.21GB, loss=0.5840]
Testing: 100%|██████████| 21/21 [02:29<00:00,  7.12s/it, loss=2.4233]


epoch: 11, lr_rate 0.0005120000000000001
loss_train: 0.3721| loss_valid: 2.6240|
metric 0.41358
Elapsed time: 00:13:05
---------------------epoch:12/20---------------------


Training: 100%|██████████| 81/81 [09:53<00:00,  7.32s/it, gpu_load=3.21GB, loss=0.0979]
Testing: 100%|██████████| 21/21 [02:28<00:00,  7.05s/it, loss=3.5517]


epoch: 12, lr_rate 0.0005120000000000001
loss_train: 0.3004| loss_valid: 2.6800|
metric 0.397119
Elapsed time: 00:12:21
---------------------epoch:13/20---------------------


Training: 100%|██████████| 81/81 [10:02<00:00,  7.44s/it, gpu_load=3.21GB, loss=0.3730]
Testing: 100%|██████████| 21/21 [02:27<00:00,  7.01s/it, loss=3.3222]


epoch: 13, lr_rate 0.0004096000000000001
loss_train: 0.2460| loss_valid: 2.4993|
metric 0.432099
Elapsed time: 00:12:30
---------------------epoch:14/20---------------------


Training: 100%|██████████| 81/81 [09:53<00:00,  7.32s/it, gpu_load=3.21GB, loss=0.0511]
Testing: 100%|██████████| 21/21 [02:27<00:00,  7.04s/it, loss=3.1269]


epoch: 14, lr_rate 0.0004096000000000001
loss_train: 0.2182| loss_valid: 2.6179|
metric 0.41358
Elapsed time: 00:12:21
---------------------epoch:15/20---------------------


Training: 100%|██████████| 81/81 [09:53<00:00,  7.33s/it, gpu_load=3.21GB, loss=0.1508]
Testing: 100%|██████████| 21/21 [02:28<00:00,  7.05s/it, loss=4.1746]


epoch: 15, lr_rate 0.0004096000000000001
loss_train: 0.1501| loss_valid: 3.0075|
metric 0.399177
Elapsed time: 00:12:21
---------------------epoch:16/20---------------------


Training: 100%|██████████| 81/81 [09:56<00:00,  7.36s/it, gpu_load=3.21GB, loss=0.3076]
Testing: 100%|██████████| 21/21 [02:31<00:00,  7.24s/it, loss=3.7400]


epoch: 16, lr_rate 0.0003276800000000001
loss_train: 0.1623| loss_valid: 2.6402|
metric 0.438272
Elapsed time: 00:12:28
---------------------epoch:17/20---------------------


Training: 100%|██████████| 81/81 [09:59<00:00,  7.40s/it, gpu_load=3.21GB, loss=0.4786]
Testing: 100%|██████████| 21/21 [02:32<00:00,  7.25s/it, loss=5.1536]


epoch: 17, lr_rate 0.0003276800000000001
loss_train: 0.1331| loss_valid: 2.6773|
metric 0.444444
Elapsed time: 00:12:31
---------------------epoch:18/20---------------------


Training: 100%|██████████| 81/81 [10:00<00:00,  7.41s/it, gpu_load=3.21GB, loss=0.0939]
Testing: 100%|██████████| 21/21 [02:28<00:00,  7.07s/it, loss=5.6931]


epoch: 18, lr_rate 0.0003276800000000001
loss_train: 0.1214| loss_valid: 2.8313|
metric 0.432099
Elapsed time: 00:12:28
---------------------epoch:19/20---------------------


Training: 100%|██████████| 81/81 [10:00<00:00,  7.41s/it, gpu_load=3.21GB, loss=0.1203]
Testing: 100%|██████████| 21/21 [02:29<00:00,  7.10s/it, loss=2.9988]


epoch: 19, lr_rate 0.0002621440000000001
loss_train: 0.0974| loss_valid: 2.7730|
metric 0.440329
Elapsed time: 00:12:29
---------------------epoch:20/20---------------------


Training: 100%|██████████| 81/81 [10:00<00:00,  7.41s/it, gpu_load=3.21GB, loss=0.0338]
Testing: 100%|██████████| 21/21 [02:30<00:00,  7.16s/it, loss=3.1414]

epoch: 20, lr_rate 0.0002621440000000001
loss_train: 0.0760| loss_valid: 2.8291|
metric 0.45679
Elapsed time: 00:12:30





In [None]:
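# Как мы видим, наилучший результат по accuracy (0.457) был получен только на 20-й эпохе. Возможно, если бы мы обучали 50–60 эпох, результаты были бы еще лучше. Однако loss на валидации растет с 3-й эпохи (2.03 → 2.83), а loss на трейне падает до 0.08 — это признак переобучения, поэтому без регуляризации или аугментаций увеличение числа эпох вряд ли даст заметный прирост. 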