In [2]:
import os
import random

import torch
from torch.utils.data import Dataset
import torchaudio.functional as F

import numpy as np
import pandas as pd

import tqdm


def binarySearch(data, val):
    highIndex = len(data)-1
    lowIndex = 0
    while highIndex > lowIndex:
            index = (highIndex + lowIndex) // 2
            sub = data[index]
            if data[lowIndex] == val:
                    return [lowIndex, lowIndex]
            elif sub == val:
                    return [index, index]
            elif data[highIndex] == val:
                    return [highIndex, highIndex]
            elif sub > val:
                    if highIndex == index:
                            return sorted([highIndex, lowIndex])
                    highIndex = index
            else:
                    if lowIndex == index:
                            return sorted([highIndex, lowIndex])
                    lowIndex = index
    return sorted([highIndex, lowIndex])


class AudioDataset(Dataset):

    def __init__(self, root_dir, transform=None, num_audios = -1, return_amp = True):
        
        self.root_dir = root_dir
        self.embeddings_dir = os.path.join(self.root_dir, 'embeddings_6144')
        self.spectrograms_dir = os.path.join(self.root_dir, 'spectrograms')
        self.transform = transform
        self.num_audios = num_audios
        self.return_amp = return_amp
        
        self.df = pd.read_csv(os.path.join(root_dir, 'number_of_frames_per_audio.csv'))
        if num_audios > 0 and isinstance(num_audios, int):
            self.df = self.df.head(num_audios)
        self.cumulative_sum = self.df['number_of_frames'].cumsum()
        
                
    def __len__(self):
        return self.df['number_of_frames'].sum()

    def __getitem__(self, idx):
        
        low_index, high_index = binarySearch(self.cumulative_sum, idx+1)
        file_name = self.df.iloc[high_index]['file_name']
        emb_path = os.path.join(self.embeddings_dir, file_name)        
        spec_path = os.path.join(self.spectrograms_dir, file_name)
        
        if low_index == 0 and high_index == 0:
            frame_idx = idx
        else:
            frame_idx = idx - self.cumulative_sum[low_index]
            
        with open(emb_path, 'rb') as f:
            emb = np.load(f)
        with open(spec_path, 'rb') as f:
            spec = np.load(f)
        
        emb_tensor = torch.from_numpy(emb[frame_idx])
        spec_tensor = torch.from_numpy(spec[frame_idx]).permute(2, 0, 1)
                
        if self.return_amp is True:
            spec_tensor_amp = F.DB_to_amplitude(x = spec_tensor, ref = 1, power = 0.5)
            return emb_tensor, spec_tensor_amp, torch.tensor(frame_idx)
        else:
            return emb_tensor, spec_tensor, torch.tensor(frame_idx)



In [3]:
import tqdm
import os
import random
import numpy as np

In [4]:
data_path = "/scratch/prs392/incubator/data/LibriSpeech/"
data_paths = {}
data_paths['train'] = os.path.join(data_path, 'train-clean-360')
data_paths['val'] = os.path.join(data_path, 'dev-clean')
data_paths['test'] = os.path.join(data_path, 'test-clean')
# train_dataset = AudioDataset(root_dir=data_paths['train'], num_audios = -1)

In [5]:
# embs = None

# for i in tqdm.tqdm(range(int(len(train_dataset) * 0.01))):
#     emb, spec, j = train_dataset[i]
#     if embs is None:
#         embs = emb
        
#         embs = embs.reshape((1, embs.shape[0]))
# #         print(embs.shape)
#     else:
# #         print(embs.shape)
#         emb = emb.reshape((1, emb.shape[0]))
# #         print(emb.shape)
#         embs = np.append(embs, emb, axis = 0)
    

In [6]:
# avgs = sums / int(len(train_dataset) * 0.01)

In [7]:
# embs.shape

In [8]:
# 122029 / 3

In [9]:
root_dir = data_paths['train']
list_of_embedding_file_names = []
embeddings_dir = os.path.join(root_dir, 'embeddings_6144')

for root, dirs, files in os.walk(embeddings_dir):
    for file in files:
        if file.endswith(".npy"):
            list_of_embedding_file_names.append(file)

In [10]:
len(list_of_embedding_file_names)

104015

In [11]:
random_files = random.choices(list_of_embedding_file_names, k=500)

In [12]:
list_of_embedding_frames = None
for file_name in tqdm.tqdm(random_files):
    emb_path = os.path.join(embeddings_dir, file_name)
    temp = np.load(emb_path, mmap_mode='r')
    if list_of_embedding_frames is None:
        list_of_embedding_frames = temp
    else:
#         print(list_of_embedding_frames.shape)
#         print(temp.reshape((temp.shape[0], 1, temp.shape[1])).shape)
        list_of_embedding_frames = np.append(list_of_embedding_frames, temp, axis = 0)
            

100%|██████████| 500/500 [02:29<00:00,  3.35it/s]


In [13]:
list_of_embedding_frames.shape

(60515, 6144)

In [None]:
with open(os.path.join(embeddings_dir, 'random_500_audios_embeddings.npy'), 'wb') as f:
    np.save(f, np.array(list_of_embedding_frames))

In [16]:
list_of_embedding_frames.mean(axis=0)

array([ 1.1972656 , -0.04162345,  1.0635653 , ...,  0.58443135,
        1.2051384 ,  2.5238538 ], dtype=float32)

In [17]:
list_of_embedding_frames.std(axis=0)

array([0.6269768 , 0.23699315, 0.4800284 , ..., 0.07191464, 0.05293078,
       0.2366121 ], dtype=float32)

In [19]:
with open(os.path.join(embeddings_dir, 'random_500_audios_embeddings.npy'), 'rb') as f:
    list_of_embedding_frames = np.load(f)

In [23]:
list_of_embedding_frames.mean(axis=0), list_of_embedding_frames.std(axis=0)

((6144,), (6144,))

In [30]:
import torchvision.transforms as transforms

x = torch.randn(6144)
norm = transforms.Normalize(list_of_embedding_frames.mean(axis=0), list_of_embedding_frames.std(axis=0))
(x - torch.tensor(list_of_embedding_frames.mean(axis=0)))/ torch.tensor(list_of_embedding_frames.std(axis=0))

tensor([ -1.8008,   9.3931,  -2.7889,  ..., -48.0184,  19.4374, -19.0974])

In [47]:
root_dir = data_paths['train']
list_of_spec_file_names = []
specs_dir = os.path.join(root_dir, 'spectrograms')

for root, dirs, files in os.walk(specs_dir):
    for file in files:
        if file.endswith(".npy"):
            list_of_spec_file_names.append(file)

In [48]:
random_files = random.choices(list_of_spec_file_names, k=500)

In [49]:
spec_path = os.path.join(specs_dir, random_files[0])
temp = np.load(spec_path, mmap_mode='r')
temp.shape, np.moveaxis(temp, (0,1,2,3),(0, 2, 3, 1)).shape

((54, 128, 199, 1), (54, 1, 128, 199))

In [50]:
# list_of_spec_frames = None
# for file_name in tqdm.tqdm(random_files):
#     spec_path = os.path.join(specs_dir, file_name)
#     temp = np.load(spec_path, mmap_mode='r')
#     if list_of_spec_frames is None:
#         list_of_spec_frames = np.moveaxis(temp, (0,1,2,3),(0, 2, 3, 1))
# #         print(list_of_spec_frames.shape)

#     else:
# #         print(list_of_embedding_frames.shape)
# #         print(temp.reshape((temp.shape[0], 1, temp.shape[1])).shape)
#         list_of_spec_frames = np.concatenate([list_of_spec_frames, np.moveaxis(temp, (0,1,2,3),(0, 2, 3, 1))], axis = 0)
#         print(list_of_spec_frames.shape)
# #         break
            

In [70]:
import torchaudio.functional as F

In [79]:
list_of_spec_frames = None
sums = np.zeros((1, 128, 199))
sq_sums = np.zeros((1, 128, 199))
total = 0
for file_name in tqdm.tqdm(random_files):
    spec_path = os.path.join(specs_dir, file_name)
    temp = np.load(spec_path, mmap_mode='r')
    temp = np.moveaxis(temp, (0,1,2,3),(0, 2, 3, 1))
#     temp2 = temp.copy()
    temp = F.DB_to_amplitude(x = torch.tensor(temp), ref = 1, power = 0.5).numpy()
    
    sums += temp.sum(axis=0);
    sq_sums += np.square(temp).sum(axis = 0)
    total += temp.shape[0]
#     break
  

100%|██████████| 500/500 [01:42<00:00,  4.88it/s]


In [80]:
total

58877

In [81]:
mean = sums / total

In [82]:
mean

array([[[0.17823456, 0.1835684 , 0.17356669, ..., 0.16794217,
         0.17702117, 0.17122662],
        [0.18279658, 0.18558587, 0.1739925 , ..., 0.1683288 ,
         0.17881936, 0.17550689],
        [0.19130822, 0.19350225, 0.17905354, ..., 0.17280251,
         0.18596955, 0.1830777 ],
        ...,
        [0.01497173, 0.01359738, 0.01037071, ..., 0.00978747,
         0.01281919, 0.01410495],
        [0.01493131, 0.01356068, 0.01034271, ..., 0.00976109,
         0.01278464, 0.01406693],
        [0.01490542, 0.01353716, 0.01032478, ..., 0.00974421,
         0.01276252, 0.0140426 ]]])

In [83]:
variance = (sq_sums / total) - np.square(mean);

In [87]:
variance

array([[[2.20204493e-02, 2.56676922e-02, 2.62555049e-02, ...,
         2.49182619e-02, 2.41922285e-02, 2.05171750e-02],
        [1.87460017e-02, 1.99056972e-02, 1.92389903e-02, ...,
         1.77856291e-02, 1.82292705e-02, 1.69931608e-02],
        [1.89357311e-02, 2.03372685e-02, 1.97200366e-02, ...,
         1.70982834e-02, 1.76571735e-02, 1.64636244e-02],
        ...,
        [1.35840740e-04, 1.12044667e-04, 6.51741313e-05, ...,
         5.68429376e-05, 9.75909245e-05, 1.18201701e-04],
        [1.35287603e-04, 1.11588434e-04, 6.49087424e-05, ...,
         5.66135677e-05, 9.71971040e-05, 1.17724670e-04],
        [1.34973623e-04, 1.11329488e-04, 6.47581181e-05, ...,
         5.64834909e-05, 9.69737490e-05, 1.17454143e-04]]])

In [86]:
np.sqrt(variance)

array([[[0.14839289, 0.1602114 , 0.1620355 , ..., 0.15785519,
         0.15553851, 0.14323818],
        [0.13691604, 0.14108755, 0.13870469, ..., 0.13336277,
         0.13501582, 0.13035782],
        [0.13760716, 0.1426088 , 0.14042805, ..., 0.1307604 ,
         0.1328803 , 0.12831066],
        ...,
        [0.01165507, 0.01058512, 0.00807305, ..., 0.00753943,
         0.00987881, 0.01087206],
        [0.01163132, 0.01056354, 0.0080566 , ..., 0.0075242 ,
         0.00985886, 0.0108501 ],
        [0.01161781, 0.01055128, 0.00804724, ..., 0.00751555,
         0.00984753, 0.01083763]]])

In [88]:
with open(os.path.join(specs_dir, 'random_500_audios_specs_mean.npy'), 'wb') as f:
    np.save(f, np.array(mean))

In [89]:
with open(os.path.join(specs_dir, 'random_500_audios_specs_std.npy'), 'wb') as f:
    np.save(f, np.sqrt(variance))