In [1]:
%cd ..

/home/sazerlife/projects/courses/itmo/semester-2/event_detection/lab4-kaggle-audioset


In [2]:
import json
import random
from pathlib import Path
from typing import Dict, List, Set, Tuple

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import soundfile as sf
import torch
import torch.nn as nn
import torch.optim as opt
import torchaudio.transforms as T
from scipy import stats as st
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score
from torch.nn.modules.loss import _Loss
from torch.utils.data import DataLoader, TensorDataset
from torchlibrosa.augmentation import SpecAugmentation
from torchlibrosa.stft import LogmelFilterBank, Spectrogram
from torchmetrics.classification import (
    Accuracy,
    F1Score,
    MultilabelAccuracy,
    MultilabelF1Score,
)
from torchvision.models import resnet34
from torchvision.transforms import Compose
from tqdm import tqdm, tqdm_notebook
from transformers import ASTConfig, AutoFeatureExtractor, ASTForAudioClassification


from src.utils.train_val_split import train_val_split
from train_val_functions import train_epoch, validate
tqdm.pandas()

SEED=12345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

DEVICE = "cuda:0"
SAMPLE_RATE = 16000
DATA_PATH = Path("data/raw/")

train_csv_path = DATA_PATH / "train.csv"
train_audio_path = DATA_PATH / "audio_train"

test_csv_path = DATA_PATH / "test.csv"
test_audio_path = DATA_PATH /  "audio_test"

## Data loading

In [3]:
train_csv = pd.read_csv(train_csv_path)
CLASSES_NAMES = sorted(train_csv['label'].unique())

train_csv, val_csv = train_val_split(train_audio_path, train_csv, val_size=0.1)

In [4]:
for i in sorted(train_csv['label'].unique()):
    print(f"'{i}',", end=" ")

'Acoustic_guitar', 'Applause', 'Bark', 'Bass_drum', 'Burping_or_eructation', 'Bus', 'Cello', 'Chime', 'Clarinet', 'Computer_keyboard', 'Cough', 'Cowbell', 'Double_bass', 'Drawer_open_or_close', 'Electric_piano', 'Fart', 'Finger_snapping', 'Fireworks', 'Flute', 'Glockenspiel', 'Gong', 'Gunshot_or_gunfire', 'Harmonica', 'Hi-hat', 'Keys_jangling', 'Knock', 'Laughter', 'Meow', 'Microwave_oven', 'Oboe', 'Saxophone', 'Scissors', 'Shatter', 'Snare_drum', 'Squeak', 'Tambourine', 'Tearing', 'Telephone', 'Trumpet', 'Violin_or_fiddle', 'Writing', 

In [5]:
pd.concat([train_csv['duration'].describe(), val_csv['duration'].describe()], axis=1)

Unnamed: 0,duration,duration.1
count,5131.0,552.0
mean,6.805609,6.831014
std,7.019172,7.087069
min,0.3,0.32
25%,1.64,1.615
50%,4.1,4.2
75%,9.54,9.22
max,30.0,29.18


In [6]:
pd.DataFrame(train_csv['label'].value_counts()).join(pd.DataFrame(val_csv['label'].value_counts()).rename({"label": "val_label"}, axis=1))

Unnamed: 0,label,val_label
Fart,176,19
Flute,175,19
Double_bass,172,19
Trumpet,170,18
Acoustic_guitar,169,18
Cello,168,18
Shatter,166,18
Applause,165,18
Bass_drum,163,18
Gong,162,18


In [7]:
train_csv.sort_values(by="duration")[0:300]
train_csv.sort_values(by="duration", ignore_index=True).index.values # [300*1:300*2]

array([   0,    1,    2, ..., 5128, 5129, 5130])

In [8]:
feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

## Training objects

In [8]:
def get_pretrained_AST(huggingface_path):
    feature_extractor = AutoFeatureExtractor.from_pretrained(huggingface_path)
    model = ASTForAudioClassification.from_pretrained(huggingface_path)
    
    for param in model.parameters():
        param.requires_grad = False

    # for param in model.audio_spectrogram_transformer.encoder.layer[-1].parameters():
    #     param.requires_grad = True

    # for param in model.audio_spectrogram_transformer.encoder.layer[-1].output.dense.parameters():
    #     param.requires_grad = True
    # for param in model.audio_spectrogram_transformer.encoder.layer[-1].layernorm_before.parameters():
    #     param.requires_grad = True
    # for param in model.audio_spectrogram_transformer.encoder.layer[-1].layernorm_after.parameters():
    #     param.requires_grad = True

    model.audio_spectrogram_transformer.layernorm = nn.LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    model.classifier.layernorm = nn.LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    model.classifier.dense = nn.Linear(in_features=768, out_features=len(CLASSES_NAMES), bias=True)
    
    return feature_extractor, model

class Dataset:
    def __init__(self, csv: pd.DataFrame, audio_dir: Path, sample_rate: int = 16000) -> None:
        self.csv = csv.sort_values(by="duration", ignore_index=True)
        self.audio_dir = audio_dir
        self.sample_rate = sample_rate

        self.audio_paths = {index: path for index, path in enumerate(self.csv['fname'])}
        self.labels = {index: label for index, label in enumerate(self.csv['label'])}

        self.label2id = {label: id_ for id_, label in enumerate(CLASSES_NAMES)}
        self.id2label = {id_: label for id_, label in enumerate(CLASSES_NAMES)}

    def __len__(self):
        return len(self.csv)
    
    def __getitem__(self, index: int) -> Tuple[torch.FloatTensor, torch.LongTensor]:
        fname, label = self.audio_paths[index], self.labels[index]
        audio, sr = sf.read(self.audio_dir / fname, dtype="float32")
        assert sr == self.sample_rate

        target = torch.zeros(len(CLASSES_NAMES))
        target[self.label2id[label]] = 1

        return audio, target


# import torch.utils.data as data


# class DynamicalBatchSampler(data.Sampler):
#     def __init__(self, dataset: Dataset, batch_size=None, shuffle=True, bucket_length=300):
#         self.dataset = dataset

#         self.batch_size = batch_size
#         self.shuffle = shuffle
#         self.bucket_length = bucket_length

#         self.buckets = list()
        

#     def __iter__(self):
#         buckets = self.__get_buckets()

#         batch_lists = []
#         for j, cluster_indices in enumerate(self.data_source.cluster_indices):
#             batches = [
#                 cluster_indices[i : i + self.batch_sizes[j]]
#                 for i in range(0, len(cluster_indices), self.batch_sizes[j])
#             ]
#             # filter our the shorter batches
#             batches = [_ for _ in batches if len(_) == self.batch_sizes[j]]
#             if self.shuffle:
#                 random.shuffle(batches)
#             batch_lists.append(batches)


#         if self.shuffle:
#             random.shuffle(lst)
        
#         return iter(lst)

#     def __len__(self):
#         return len(self.data_source)
    
#     def __get_buckets(self):
#         buckets: List[List[int]] = list()

#         for factor in range((len(self.dataset.csv) // self.bucket_length) + 1):
#             indexes = self.dataset.csv[self.bucket_length * factor : self.bucket_length * (factor + 1)].index.values
#             buckets.append(indexes)
        



class CollateWrapper:
    def __init__(self, feature_extractor) -> None:
        self.sampling_rate = feature_extractor.sampling_rate
        self.feature_extractor = feature_extractor
        # self.feature_extractor.max_length = 2048

    def __call__(self, batch: List[Tuple[torch.FloatTensor, torch.LongTensor]]):
        audios, targets = list(zip(*batch))

        # max_audio_len = max([audio.shape[-1] for audio in audios])
        # max_melspectorgram_len = max_audio_len // 162
        # max_melspectorgram_len = max_melspectorgram_len if max_melspectorgram_len < 2048 else 2048
        # self.feature_extractor.max_length = max_melspectorgram_len
        
        
        melspectorgrams = self.feature_extractor(audios, sampling_rate=self.sampling_rate, return_tensors="pt").input_values
        targets = torch.stack(targets, 0)
        
        return melspectorgrams, targets # .unsqueeze(1)

In [9]:
DEVICE = "cuda:0"
BATCH_SIZE = 64

EPOCHS_NUMBER = 56
EVAL_EPOCH = 3

In [10]:
feature_extractor, model = get_pretrained_AST("MIT/ast-finetuned-audioset-10-10-0.4593")

In [11]:
trainset = Dataset(train_csv, train_audio_path, sample_rate=16000)
valset = Dataset(val_csv, train_audio_path, sample_rate=16000)
collate_wrapper = CollateWrapper(feature_extractor)

trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=collate_wrapper, pin_memory=False, shuffle=False)
valloader = DataLoader(valset, batch_size=BATCH_SIZE, collate_fn=collate_wrapper, pin_memory=False, shuffle=False)

In [12]:
13120 / 80

164.0

In [13]:
23360 / 144

162.22222222222223

In [14]:
55040 / 342

160.93567251461988

In [15]:
# for batch in trainloader:
#     X, _ = batch
#     print(X.shape)
#     # break

In [16]:
# # plt.matshow(X[0].T, origin='lower')
# for x in X:
#     plt.matshow(x.T, origin='lower')
# plt.show()

In [17]:
weights = torch.tensor(
    compute_class_weight("balanced", classes=CLASSES_NAMES, y=train_csv["label"]).astype(np.float32)
)

model = model.to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters())
criterion = torch.nn.CrossEntropyLoss(
    reduction="mean",
    weight=weights.to(DEVICE),
)

In [18]:
macro_accuracy = Accuracy(
    task="multiclass",
    num_classes=len(CLASSES_NAMES),
    multidim_average="global",
    average="macro",
)
weighted_accuracy = Accuracy(
    task="multiclass",
    num_classes=len(CLASSES_NAMES),
    multidim_average="global",
    average="weighted",
)
weighted_f1_score = F1Score(
    task="multiclass",
    num_classes=len(CLASSES_NAMES),
    multidim_average="global",
    average="weighted",
)

In [19]:
EXPERIMENTS_PATH = Path("experiments/AST-exp1-sorted/")
submission_csv_path = EXPERIMENTS_PATH / "submission.csv"

EXPERIMENTS_PATH.mkdir(exist_ok=True, parents=True)
(EXPERIMENTS_PATH / "checkpoints").mkdir(exist_ok=True, parents=True)

In [20]:
train_loss_list = list()
val_loss_list = list()

for epoch in range(EPOCHS_NUMBER):
    print("-" * 80)
    description = f"Training {epoch}/{EPOCHS_NUMBER}"
    trainloader_progress_bar = tqdm_notebook(trainloader, desc=description)
    train_targets, train_predictions = train_epoch(
        trainloader_progress_bar, model, criterion, optimizer, DEVICE
    )
    
    loss_value = criterion(train_predictions.to(DEVICE), train_targets.to(DEVICE)).item()
    train_targets, train_predictions = train_targets.argmax(-1).to(torch.int64), train_predictions.argmax(-1)
    train_log = (
        f"Loss: {loss_value} \n"
        f"Accuracy: {macro_accuracy(train_predictions, train_targets):.4} \n"
        f"Accuracy weighted: {weighted_accuracy(train_predictions, train_targets):.4} \n"
        f"F1 score weighted: {weighted_f1_score(train_predictions, train_targets):.4}"
    )
    print(train_log)

    if epoch % EVAL_EPOCH == 0:
        print("-" * 80)
        description = f"Val data validation {epoch}/{EPOCHS_NUMBER}"
        valloader_progress_bar = tqdm_notebook(valloader, desc=description)
        val_targets, val_predictions = validate(valloader_progress_bar, model, DEVICE)

        loss_value = criterion(val_predictions.to(DEVICE), val_targets.to(DEVICE)).item()
        val_targets, val_predictions = val_targets.argmax(-1).to(torch.int64), val_predictions.argmax(-1)
        val_log = (
            f"Loss: {loss_value} \n"
            f"Accuracy: {macro_accuracy(val_predictions, val_targets):.4} \n"
            f"Accuracy weighted: {weighted_accuracy(val_predictions, val_targets):.4} \n"
            f"F1 score weighted: {weighted_f1_score(val_predictions, val_targets):.4}"
        )
        print(val_log)

        torch.save(
            model.state_dict(), EXPERIMENTS_PATH / "checkpoints" / f"{epoch}.pt",
        )

# The last save if we go out early then EVAL_EPOCH value
torch.save(
    model.state_dict(), EXPERIMENTS_PATH / "checkpoints" / f"{epoch}.pt",
)

--------------------------------------------------------------------------------


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  trainloader_progress_bar = tqdm_notebook(trainloader, desc=description)


Training 0/56:   0%|          | 0/81 [00:00<?, ?it/s]

Loss: 1.342644453048706 
Accuracy: 0.652 
Accuracy weighted: 0.648 
F1 score weighted: 0.6515
--------------------------------------------------------------------------------


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  valloader_progress_bar = tqdm_notebook(valloader, desc=description)


Val data validation 0/56:   0%|          | 0/9 [00:00<?, ?it/s]

Loss: 0.8206866979598999 
Accuracy: 0.7577 
Accuracy weighted: 0.7301 
F1 score weighted: 0.7313
--------------------------------------------------------------------------------


Training 1/56:   0%|          | 0/81 [00:00<?, ?it/s]

Loss: 0.6005007028579712 
Accuracy: 0.822 
Accuracy weighted: 0.8164 
F1 score weighted: 0.818
--------------------------------------------------------------------------------


Training 2/56:   0%|          | 0/81 [00:00<?, ?it/s]

Loss: 0.3868924677371979 
Accuracy: 0.8848 
Accuracy weighted: 0.8764 
F1 score weighted: 0.8769
--------------------------------------------------------------------------------


Training 3/56:   0%|          | 0/81 [00:00<?, ?it/s]

Loss: 0.27362900972366333 
Accuracy: 0.9246 
Accuracy weighted: 0.9174 
F1 score weighted: 0.9174
--------------------------------------------------------------------------------


Val data validation 3/56:   0%|          | 0/9 [00:00<?, ?it/s]

Loss: 0.6642104387283325 
Accuracy: 0.8206 
Accuracy weighted: 0.8207 
F1 score weighted: 0.8229
--------------------------------------------------------------------------------


Training 4/56:   0%|          | 0/81 [00:00<?, ?it/s]

Loss: 0.20014140009880066 
Accuracy: 0.9449 
Accuracy weighted: 0.939 
F1 score weighted: 0.939
--------------------------------------------------------------------------------


Training 5/56:   0%|          | 0/81 [00:00<?, ?it/s]

Loss: 0.16203586757183075 
Accuracy: 0.9558 
Accuracy weighted: 0.9485 
F1 score weighted: 0.9484
--------------------------------------------------------------------------------


Training 6/56:   0%|          | 0/81 [00:00<?, ?it/s]

Loss: 0.12821343541145325 
Accuracy: 0.9652 
Accuracy weighted: 0.9591 
F1 score weighted: 0.959
--------------------------------------------------------------------------------


Val data validation 6/56:   0%|          | 0/9 [00:00<?, ?it/s]

Loss: 0.6887537837028503 
Accuracy: 0.8385 
Accuracy weighted: 0.837 
F1 score weighted: 0.8407
--------------------------------------------------------------------------------


Training 7/56:   0%|          | 0/81 [00:00<?, ?it/s]

Loss: 0.09773273020982742 
Accuracy: 0.9721 
Accuracy weighted: 0.968 
F1 score weighted: 0.968
--------------------------------------------------------------------------------


Training 8/56:   0%|          | 0/81 [00:00<?, ?it/s]

Loss: 0.07868345081806183 
Accuracy: 0.9791 
Accuracy weighted: 0.9747 
F1 score weighted: 0.9746
--------------------------------------------------------------------------------


Training 9/56:   0%|          | 0/81 [00:00<?, ?it/s]

Loss: 0.05968992039561272 
Accuracy: 0.9861 
Accuracy weighted: 0.9834 
F1 score weighted: 0.9835
--------------------------------------------------------------------------------


Val data validation 9/56:   0%|          | 0/9 [00:00<?, ?it/s]

Loss: 0.8024961948394775 
Accuracy: 0.8333 
Accuracy weighted: 0.8243 
F1 score weighted: 0.8262
--------------------------------------------------------------------------------


Training 10/56:   0%|          | 0/81 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
print(EXPERIMENTS_PATH / "checkpoints" / f"{30}.pt")
model.load_state_dict(torch.load(EXPERIMENTS_PATH / "checkpoints" / f"{30}.pt"))

In [None]:
def inference(audio_path, feature_extractor, model):
    waveform, sampling_rate = sf.read(audio_path)

    inputs = feature_extractor(waveform, sampling_rate=sampling_rate, return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    
    predicted_class_ids = torch.argmax(logits, dim=-1).cpu().item()
    predicted_label = trainset.id2label[predicted_class_ids]

    return predicted_label

In [None]:
val_csv.head(2)

In [None]:
waveform, sampling_rate = sf.read(train_audio_path / val_csv["fname"][1])

inputs = feature_extractor(waveform, sampling_rate=sampling_rate, return_tensors="pt")
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_ids = torch.argmax(logits, dim=-1).cpu().item()
predicted_label = trainset.id2label[predicted_class_ids]
predicted_label

In [None]:
predicted_labels = list()

for fname in tqdm(val_csv['fname'].values):
    predicted_label = inference(train_audio_path / fname, feature_extractor, model)
    predicted_labels.append(predicted_label)

In [None]:
print(accuracy_score(val_csv["label"].values, predicted_labels))
print(f1_score(val_csv["label"].values, predicted_labels, average="weighted"))

In [None]:
print(accuracy_score(val_csv["label"].values, predicted_labels))
print(f1_score(val_csv["label"].values, predicted_labels, average="weighted"))

In [None]:
val_csv["predicted_label"] = predicted_labels

In [None]:
for _, label, duration, predicted in val_csv[val_csv['label'] != val_csv['predicted_label']].sort_values(by="duration").values:
    print(f"{label:20s} {duration:.4} {predicted:20s}")

In [None]:
test_csv = pd.read_csv(test_csv_path)

predicted_labels = list()
for fname in tqdm(test_csv['fname'].values):
    predicted_label = inference(test_audio_path / fname, feature_extractor, model)
    predicted_labels.append(predicted_label)

test_csv['label'] = predicted_labels

In [None]:
test_csv

In [None]:
test_csv.to_csv(EXPERIMENTS_PATH / "submission.csv", index=None)