In [None]:
import json
import os
from os import path

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Optimizer
from torch.utils.data import TensorDataset, DataLoader
from yt_dlp import YoutubeDL

# Retrieve the video links from json file
def getVideoLinks() -> list[str]:
    """Return the ``link`` field of every entry in ``data/videos.json``.

    The file is looked up under the parent of the current working directory
    (``<parent of cwd>/data/videos.json``) and must contain a JSON array of
    objects that each carry a ``link`` key.

    Returns:
        The links, in the order they appear in the file.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        KeyError: if an entry has no ``link`` key.
    """
    file_path = path.join(path.dirname(os.getcwd()), 'data/videos.json')
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(file_path, encoding='utf-8') as f:
        data = json.load(f)
    return [video['link'] for video in data]
    

# Retrieve the downloaded file names
def getFileNames() -> list[str]:
    """Return the names of the downloaded audio files in ``data/dl_audios``.

    The directory is looked up under the parent of the current working
    directory. Only visible regular files are returned (hidden entries such
    as ``.DS_Store`` and sub-directories are skipped, since every returned
    name is later handed to the audio loader), and the result is sorted so
    that positions are stable between runs.

    Raises:
        FileNotFoundError: if the directory does not exist.
    """
    audio_dir = path.join(path.dirname(os.getcwd()), 'data/dl_audios')
    return sorted(
        name
        for name in os.listdir(audio_dir)
        if not name.startswith('.') and path.isfile(path.join(audio_dir, name))
    )

In [None]:
# Run when audios aren't downloaded - get the video links and download audios locally to data/dl_audios

URLs = getVideoLinks()

# Download target: <parent of cwd>/data/dl_audios
file_path = path.join(path.dirname(os.getcwd()), 'data/dl_audios')

# yt-dlp options: the format selector 'm4a/bestaudio/best', an
# FFmpegExtractAudio post-processor with preferred codec 'wav', and an output
# template that names each file after the video title.
ydl_config = dict(
    format='m4a/bestaudio/best',
    postprocessors=[
        dict(key='FFmpegExtractAudio', preferredcodec='wav'),
    ],
    outtmpl=path.join(file_path, '%(title)s.%(ext)s'),
)

with YoutubeDL(ydl_config) as ydl:
    # Value returned by yt-dlp for the whole batch of URLs.
    error_code = ydl.download(URLs)

In [None]:
def preprocessAudio(y: np.ndarray) -> np.ndarray:
    """Convert a waveform to mono and peak-normalize it.

    Args:
        y: audio samples as accepted by ``librosa.to_mono``.

    Returns:
        The mono signal divided by its peak absolute amplitude. A signal that
        is empty or entirely silent (peak of 0) is returned un-scaled instead
        of being divided by zero, which would turn every sample into NaN.
    """
    y = librosa.to_mono(y) # set to mono
    # An empty array has no maximum; treat it like silence.
    peak = np.abs(y).max() if y.size else 0.0
    if peak > 0:
        y = y / peak # normalize amplitude
    return y

# Get the log mels of the files
def getLogMels(file_names: list[str]) -> np.ndarray:
    """Compute one log-mel spectrogram per file in ``data/dl_audios``.

    Each file is loaded with ``librosa.load`` at the module-level ``sr_param``
    sampling rate, passed through ``preprocessAudio``, converted to a 128-band
    mel power spectrogram and then to dB relative to its own maximum.

    Args:
        file_names: file names relative to ``<parent of cwd>/data/dl_audios``.

    Returns:
        A 1-D object array with one spectrogram per entry of ``file_names``,
        in the same order.
    """
    audio_dir = path.join(path.dirname(os.getcwd()), 'data/dl_audios')
    # Object array: every slot holds its own spectrogram array.
    spectrograms = np.empty(len(file_names), dtype=object)

    for idx, name in enumerate(file_names):
        samples, rate = librosa.load(path.join(audio_dir, name), sr=sr_param)
        mel_power = librosa.feature.melspectrogram(y=preprocessAudio(samples), sr=rate, n_mels=128)
        spectrograms[idx] = librosa.power_to_db(mel_power, ref=np.max)

    return spectrograms

# Sampling rate (Hz) handed to librosa.load. getLogMels reads this as a global,
# so it has to be assigned before getLogMels is called.
sr_param = 16000
# Downloaded file names and their log-mel spectrograms; log_mels[i] belongs to file_names[i].
file_names = getFileNames()
log_mels = getLogMels(file_names)

In [None]:
# Visualize the first 3 log mel spectrograms (fewer when fewer files were loaded)

# Clamp to the number of available spectrograms so a small download does not
# raise IndexError.
for i in range(min(3, len(log_mels))):
    fig, ax = plt.subplots(figsize=(10, 4))
    mesh = librosa.display.specshow(log_mels[i], sr=sr_param, x_axis='time', y_axis='mel', ax=ax)
    fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
    ax.set_title(f'Log-Mel Spectrogram: {file_names[i]}')
    fig.tight_layout()
    plt.show()

In [None]:
# Load training data (clips of crying/laughing babies)

# Cached tensors; the AudioSet segment lists below are only parsed when at
# least one of these files is missing.
data_files = ['../data/train_features.pt', '../data/test_features.pt', '../data/train_labels.pt', '../data/test_labels.pt']

# label name -> (AudioSet class index, AudioSet class code).
# Defined outside the cache guard on purpose: class_id_to_index and
# process_labels read class_labels unconditionally, so keeping it inside the
# guard raised NameError on a fresh kernel whenever the cached tensors existed.
class_labels = {
    'cry': (23, '/t/dd00002'),
    'laugh': (17, '/t/dd00001'),
    'music': (137, '/m/04rlf'),
    'singing': (27, '/m/015lz1'),
    'child_speech': (3, '/m/0ytgt'),
    'male_speech': (1, '/m/05zppz'),
    'female_speech': (2, '/m/02zsn'),
    'lullaby': (271, '/m/07pkxdp'),
}


def _read_segments(csv_name: str) -> pd.DataFrame:
    """Load one segment list from ``../data/audioset_segments``.

    Lines starting with ``#`` are ignored, there is no header row, and fields
    are split on runs of whitespace.

    NOTE(review): because the split is on whitespace rather than commas, the
    values presumably keep the file's trailing comma (get_features_and_labels
    strips the last character of YTID) - confirm against the CSV files.

    Args:
        csv_name: file name inside ``../data/audioset_segments``.

    Returns:
        A frame with columns ``YTID``, ``start_seconds``, ``end_seconds`` and
        ``labels``.
    """
    return pd.read_csv(
        f'../data/audioset_segments/{csv_name}',
        comment='#',
        header=None,
        names=['YTID', 'start_seconds', 'end_seconds', 'labels'],
        sep = r'\s+',
        engine='python',
    )


if not all(os.path.exists(f) for f in data_files):

    df_train_bal = _read_segments('balanced_train_segments.csv')
    df_train_unbal = _read_segments('unbalanced_train_segments.csv')
    df_test = _read_segments('eval_segments.csv')

    # label name -> rows of the corresponding segment list tagged with that class
    bal = {}
    unbal = {}
    test = {}

    for label, (class_idx, class_code) in class_labels.items():
        # Class codes are literal text, not patterns: match them as plain
        # substrings and treat rows without labels as non-matching.
        bal[label] = df_train_bal[df_train_bal['labels'].str.contains(class_code, regex=False, na=False)]
        unbal[label] = df_train_unbal[df_train_unbal['labels'].str.contains(class_code, regex=False, na=False)]
        test[label] = df_test[df_test['labels'].str.contains(class_code, regex=False, na=False)]

In [None]:
# !! Run script in scripts/audioset_features_downloader.py before running
# Load the VGG-like extracted features from the dataset

# Context (per-example) fields read from each serialized sequence example:
# three scalars plus a variable-length list of integer labels.
context_features = dict(
    video_id=tf.io.FixedLenFeature([], dtype=tf.string),
    start_time_seconds=tf.io.FixedLenFeature([], dtype=tf.float32),
    end_time_seconds=tf.io.FixedLenFeature([], dtype=tf.float32),
    labels=tf.io.VarLenFeature(dtype=tf.int64),
)

# Sequence (per-step) fields: one scalar byte string per step, decoded later
# by parseSquence.
sequence_features = dict(
    audio_embedding=tf.io.FixedLenSequenceFeature([], dtype=tf.string),
)

# Parse the AudioSet sequence
def parseSquence(serialized_example):
    """Parse one serialized AudioSet sequence example into a dict of tensors.

    Args:
        serialized_example: scalar string tensor holding one serialized
            ``SequenceExample`` (e.g. one element of a ``TFRecordDataset``).

    Returns:
        A dict with the keys ``video_id``, ``start_time``, ``end_time``,
        ``labels`` (dense int64 vector) and ``audio_embedding`` (uint8 tensor
        with one row of decoded bytes per sequence step).
    """
    context, sequence = tf.io.parse_single_sequence_example(
        serialized=serialized_example,
        context_features=context_features,
        sequence_features=sequence_features
    )

    # Decode the 128-dimensional 8-bit quantized audio embedding bytes - (1, 128) vector per second.
    # decode_raw works on the whole vector of byte strings at once and appends
    # the byte axis, so no per-element tf.map_fn (whose `dtype` argument is
    # deprecated) is needed. Like the map_fn version, this requires every
    # step's byte string to have the same length.
    audio_embeddings = tf.io.decode_raw(sequence["audio_embedding"], tf.uint8)

    return {
        "video_id": context["video_id"],
        "start_time": context["start_time_seconds"],
        "end_time": context["end_time_seconds"],
        "labels": tf.sparse.to_dense(context["labels"]),
        "audio_embedding": audio_embeddings,
    }


# AudioSet class index -> position of that class in class_labels (insertion order),
# i.e. the slot it occupies in the vectors built by process_labels.
class_id_to_index = {class_id: position for position, (class_id, _) in enumerate(class_labels.values())}

# Function to convert label tensor to binary vector
def process_labels(label_tensor):
    """Turn a tensor of class ids into a multi-hot list over ``class_labels``.

    Args:
        label_tensor: object whose ``.numpy()`` yields an iterable of class ids.

    Returns:
        A list of ``len(class_labels)`` ints; position ``class_id_to_index[c]``
        is 1 for every id ``c`` present in the tensor, all other positions are
        0. Ids that are not keys of ``class_id_to_index`` are ignored.
    """
    multi_hot = [0] * len(class_labels)

    for raw_id in label_tensor.numpy():
        slot = class_id_to_index.get(raw_id)
        if slot is None:
            # Not one of the classes we track.
            continue
        multi_hot[slot] = 1

    return multi_hot


# Retrieve the 128-D features from the sequences - produces a list of PyTorch tensors
def get_features_and_labels(dataset: pd.DataFrame, isTrain: bool, bal = False, max_samples: int = 300):
    """Collect embeddings and label vectors for the clips listed in ``dataset``.

    For every ``YTID`` the TFRecord shard named after the id's first two
    characters is scanned until the record with a matching ``video_id`` is
    found; ids without a matching record are skipped.

    Args:
        dataset: frame with a ``YTID`` column of strings. A trailing comma on
            the id (as left by the whitespace-split segment lists) is removed
            before comparing with the record's ``video_id``.
        isTrain: read from a training split when true, from ``eval`` otherwise.
        bal: with ``isTrain``, choose ``bal_train`` (true) or ``unbal_train``.
        max_samples: stop once this many clips were collected; ``None``
            disables the limit.

    Returns:
        ``(features, labels)``: a list of tensors built from each record's
        ``audio_embedding`` and a parallel list of ``process_labels`` outputs.
    """
    features = []
    labels = []

    # The split depends only on the arguments, so resolve it once.
    if not isTrain:
        split = 'eval'
    elif bal:
        split = 'bal_train'
    else:
        split = 'unbal_train'

    for ytid in dataset['YTID']:
        # Check the limit before opening another shard.
        if max_samples is not None and len(features) >= max_samples:
            break

        wanted_id = ytid.rstrip(',')
        records = tf.data.TFRecordDataset(f"../data/audioset_v1_embeddings/{split}/{ytid[:2]}.tfrecord")

        for sample in records.map(parseSquence):
            if sample['video_id'].numpy().decode("utf-8") == wanted_id:
                features.append(torch.from_numpy(sample["audio_embedding"].numpy()))
                labels.append(process_labels(sample['labels']))
                break

    return features, labels


if not all(os.path.exists(f) for f in data_files):

    train_features, train_labels = [], []
    test_features, test_labels = [], []

    for label in class_labels:
        print(f"---------- {label}")

        train_features_bal, train_labels_bal = get_features_and_labels(bal[label], isTrain=True, bal=True)
        # "music" and "singing" only draw from the balanced training split.
        if label in ("music", "singing"):
            train_features_unbal, train_labels_unbal = [], []
        else:
            train_features_unbal, train_labels_unbal = get_features_and_labels(unbal[label], isTrain=True)

        test_features_class, test_labels_class = get_features_and_labels(test[label], isTrain=False)

        # Per class: at most 300 training clips (balanced first) and 50 test clips.
        train_features_class = (train_features_bal + train_features_unbal)[:300]
        train_labels_class = (train_labels_bal + train_labels_unbal)[:300]

        train_features.extend(train_features_class)
        train_labels.extend(train_labels_class)
        test_features.extend(test_features_class[:50])
        test_labels.extend(test_labels_class[:50])

    # pad_sequence zero-pads every clip to the longest clip of its own list,
    # so train and test are padded independently of each other.
    train_features = pad_sequence(train_features, batch_first=True)
    test_features = pad_sequence(test_features, batch_first=True)

    # Everything is stored as float32.
    for tensor_to_save, target_path in (
        (train_features.float(), '../data/train_features.pt'),
        (test_features.float(), '../data/test_features.pt'),
        (torch.tensor(train_labels, dtype=torch.float32), '../data/train_labels.pt'),
        (torch.tensor(test_labels, dtype=torch.float32), '../data/test_labels.pt'),
    ):
        torch.save(tensor_to_save, target_path)