In [20]:
from torch import nn
import torch

class Reshape(nn.Module):
    """Reshape the input tensor to ``(-1, *shape)``.

    The leading dimension is inferred from the number of elements in the input,
    so a batch of flat vectors can be turned into a batch of feature maps.

    Args:
        shape: iterable of ints giving the target size of every dimension
            after the inferred leading one.
    """

    def __init__(self, shape):
        super().__init__()
        self.shape = shape

    def forward(self, x):
        # reshape() returns a view whenever view() would have succeeded, and
        # falls back to a copy for non-contiguous inputs (e.g. a transposed
        # tensor) instead of raising a RuntimeError.
        return x.reshape(-1, *self.shape)
    
class InversionV2(nn.Module):
    """Convolutional up-sampling network with concatenation-based fusion.

    The input is reshaped to ``(N, 512, 4, 3)`` (6144 values per sample) and
    passed through 3x3 convolutions interleaved with four ``nn.Upsample``
    steps (16x24, 32x49, 64x99, 128x199). At four places the outputs of two
    consecutive conv stages are concatenated on the channel axis and merged
    by a ``stacked_conv*`` stage. The final stage is a single bias-free
    convolution producing one channel, so the output is ``(N, 1, 128, 199)``.
    """

    @staticmethod
    def _block(in_channels, out_channels, norm_before_relu=False):
        """Build one 3x3 conv stage: Conv2d -> ReLU -> BatchNorm2d.

        With ``norm_before_relu=True`` the order is Conv2d -> BatchNorm2d ->
        ReLU instead (only ``conv4`` uses this layout).
        """
        conv = nn.Conv2d(in_channels, out_channels, kernel_size=(3, 3),
                         stride=(1, 1), padding=(1, 1), bias=False)
        norm = nn.BatchNorm2d(out_channels, eps=1e-05, momentum=0.1,
                              affine=True, track_running_stats=True)
        if norm_before_relu:
            return nn.Sequential(conv, norm, nn.ReLU())
        return nn.Sequential(conv, nn.ReLU(), norm)

    @staticmethod
    def _resize(x, size):
        """Resize the spatial dimensions of ``x`` to ``size`` with nn.Upsample defaults."""
        return nn.Upsample(size=size)(x)

    @staticmethod
    def _concat_pair(x, first, second):
        """Run ``first`` then ``second`` and concatenate both outputs on dim 1."""
        head = first(x)
        tail = second(head)
        return torch.cat([head, tail], dim=1)

    def __init__(self):
        super().__init__()
        block = self._block

        # Stages are created in the same order as they are named so that
        # parameter initialisation order and state-dict layout stay stable.
        self.conv0 = block(512, 512)
        self.conv1 = block(512, 512)
        self.conv2 = block(512, 512)
        self.stacked_conv1 = block(512 * 2, 512)

        self.conv3 = block(512, 256)
        # NOTE(review): conv4 is the only stage with BatchNorm before ReLU;
        # kept as in the original layout.
        self.conv4 = block(256, 256, norm_before_relu=True)
        self.stacked_conv2 = block(256 * 2, 256)

        self.conv5 = block(256, 256)
        self.conv6 = block(256, 128)

        self.conv7 = block(128, 128)
        self.conv8 = block(128, 128)
        self.stacked_conv3 = block(128 * 2, 128)

        self.conv9 = block(128, 64)

        self.conv10 = block(64, 64)
        self.conv11 = block(64, 64)
        self.stacked_conv4 = block(64 * 2, 64)

        # Output head: plain convolution, no activation or normalisation.
        self.conv12 = nn.Sequential(
            nn.Conv2d(64, 1, kernel_size=(3, 3), stride=(1, 1),
                      padding=(1, 1), bias=False),
        )

    def forward(self, x):
        """Map ``x`` (viewable as ``(-1, 512, 4, 3)``) to a ``(N, 1, 128, 199)`` tensor."""
        out = Reshape((512, 4, 3))(x)
        out = self.conv0(out)

        # 512-channel stage at 16x24.
        out = self._resize(out, (16, 24))
        out = self._concat_pair(out, self.conv1, self.conv2)
        out = self.stacked_conv1(out)

        # 256-channel stage, same resolution.
        out = self._concat_pair(out, self.conv3, self.conv4)
        out = self.stacked_conv2(out)

        # 32x49: reduce to 128 channels.
        out = self._resize(out, (32, 49))
        out = self.conv6(self.conv5(out))

        # 64x99: 128-channel stage, then reduce to 64 channels.
        out = self._resize(out, (64, 99))
        out = self._concat_pair(out, self.conv7, self.conv8)
        out = self.stacked_conv3(out)
        out = self.conv9(out)

        # 128x199: 64-channel stage and single-channel output head.
        out = self._resize(out, (128, 199))
        out = self._concat_pair(out, self.conv10, self.conv11)
        out = self.stacked_conv4(out)

        return self.conv12(out)

In [21]:
# Print a per-layer table (output shape and parameter count) for InversionV2.
from torchsummary import summary
model = InversionV2()
# 1 * 6144 values per sample, which matches the 512 * 4 * 3 reshape done at
# the start of InversionV2.forward.
summary(model, input_size=(1, 6144))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 512, 4, 3]       2,359,296
              ReLU-2            [-1, 512, 4, 3]               0
       BatchNorm2d-3            [-1, 512, 4, 3]           1,024
            Conv2d-4          [-1, 512, 16, 24]       2,359,296
              ReLU-5          [-1, 512, 16, 24]               0
       BatchNorm2d-6          [-1, 512, 16, 24]           1,024
            Conv2d-7          [-1, 512, 16, 24]       2,359,296
              ReLU-8          [-1, 512, 16, 24]               0
       BatchNorm2d-9          [-1, 512, 16, 24]           1,024
           Conv2d-10          [-1, 512, 16, 24]       4,718,592
             ReLU-11          [-1, 512, 16, 24]               0
      BatchNorm2d-12          [-1, 512, 16, 24]           1,024
           Conv2d-13          [-1, 256, 16, 24]       1,179,648
             ReLU-14          [-1, 256,

In [None]:
from torch import nn
import torch

class Reshape(nn.Module):
    """Reshape the input tensor to ``(-1, *shape)``.

    The leading dimension is inferred from the number of elements in the input,
    so a batch of flat vectors can be turned into a batch of feature maps.

    Args:
        shape: iterable of ints giving the target size of every dimension
            after the inferred leading one.
    """

    def __init__(self, shape):
        super().__init__()
        self.shape = shape

    def forward(self, x):
        # reshape() returns a view whenever view() would have succeeded, and
        # falls back to a copy for non-contiguous inputs (e.g. a transposed
        # tensor) instead of raising a RuntimeError.
        return x.reshape(-1, *self.shape)
    
class InversionV3(nn.Module):
    """Convolutional up-sampling network with additive (residual-style) fusion.

    The input is reshaped to ``(N, 512, 4, 3)`` (6144 values per sample),
    up-sampled in four ``nn.Upsample`` steps (16x24, 32x49, 64x99, 128x199)
    and processed by 3x3 convolutions. At four places the outputs of two
    consecutive conv stages are summed element-wise. The final stage is a
    single bias-free convolution producing one channel, so the output is
    ``(N, 1, 128, 199)``.
    """

    @staticmethod
    def _block(in_channels, out_channels, norm_before_relu=False):
        """Build one 3x3 conv stage: Conv2d -> ReLU -> BatchNorm2d.

        With ``norm_before_relu=True`` the order is Conv2d -> BatchNorm2d ->
        ReLU instead (only ``conv4`` uses this layout).
        """
        conv = nn.Conv2d(in_channels, out_channels, kernel_size=(3, 3),
                         stride=(1, 1), padding=(1, 1), bias=False)
        norm = nn.BatchNorm2d(out_channels, eps=1e-05, momentum=0.1,
                              affine=True, track_running_stats=True)
        if norm_before_relu:
            return nn.Sequential(conv, norm, nn.ReLU())
        return nn.Sequential(conv, nn.ReLU(), norm)

    @staticmethod
    def _resize(x, size):
        """Resize the spatial dimensions of ``x`` to ``size`` with nn.Upsample defaults."""
        return nn.Upsample(size=size)(x)

    @staticmethod
    def _sum_pair(x, first, second):
        """Run ``first`` then ``second`` and return the sum of both outputs."""
        head = first(x)
        tail = second(head)
        return head + tail

    def __init__(self):
        super().__init__()
        block = self._block

        # Stages are created in the same order as they are named so that
        # parameter initialisation order and state-dict layout stay stable.
        self.conv1 = block(512, 512)
        self.conv2 = block(512, 512)
        self.conv3 = block(512, 256)
        # NOTE(review): conv4 is the only stage with BatchNorm before ReLU;
        # kept as in the original layout.
        self.conv4 = block(256, 256, norm_before_relu=True)
        self.conv5 = block(256, 256)
        self.conv6 = block(256, 128)
        self.conv7 = block(128, 128)
        self.conv8 = block(128, 128)
        self.conv9 = block(128, 64)
        self.conv10 = block(64, 64)
        self.conv11 = block(64, 64)

        # Output head: plain convolution, no activation or normalisation.
        self.conv12 = nn.Sequential(
            nn.Conv2d(64, 1, kernel_size=(3, 3), stride=(1, 1),
                      padding=(1, 1), bias=False),
        )

    def forward(self, x):
        """Map ``x`` (viewable as ``(-1, 512, 4, 3)``) to a ``(N, 1, 128, 199)`` tensor."""
        out = Reshape((512, 4, 3))(x)

        # 16x24: 512-channel pair, then 256-channel pair.
        out = self._resize(out, (16, 24))
        out = self._sum_pair(out, self.conv1, self.conv2)
        out = self._sum_pair(out, self.conv3, self.conv4)

        # 32x49: reduce to 128 channels.
        out = self._resize(out, (32, 49))
        out = self.conv6(self.conv5(out))

        # 64x99: 128-channel pair, then reduce to 64 channels.
        out = self._resize(out, (64, 99))
        out = self._sum_pair(out, self.conv7, self.conv8)
        out = self.conv9(out)

        # 128x199: 64-channel pair and single-channel output head.
        out = self._resize(out, (128, 199))
        out = self._sum_pair(out, self.conv10, self.conv11)

        return self.conv12(out)

In [17]:
import dataloaders.audio_dataset as dataset
import numpy as np

In [18]:
# Use the AudioDataset class shipped in dataloaders/audio_dataset.py.
AudioDataset = dataset.AudioDataset
# NOTE(review): absolute, machine-specific path; consider moving it to a
# configurable constant so the notebook can run elsewhere.
audio_dataset = AudioDataset('/scratch/prs392/incubator/data/LibriSpeech/train-clean-360')

In [50]:
import os
import random

import torch
from torch.utils.data import Dataset
# import torchaudio.functional as F

import numpy as np
import pandas as pd

# import tqdm


def binarySearch(data, val):
    """Locate ``val`` in the ascending, integer-indexable sequence ``data``.

    Returns a two-element list of indices:

    * ``[i, i]`` when ``data[i] == val`` is found at one of the probed
      positions (low end, midpoint or high end of the current window);
    * ``[i, i + 1]`` when ``val`` falls strictly between two neighbouring
      entries, or lies beyond the last entry;
    * ``[0, 0]`` when ``val`` is smaller than the first entry or ``data`` has
      a single element;
    * ``[-1, 0]`` when ``data`` is empty.
    """
    lo = 0
    hi = len(data) - 1

    while lo < hi:
        mid = (lo + hi) // 2
        mid_value = data[mid]

        # Exact hits are checked in the order: low end, midpoint, high end.
        if data[lo] == val:
            return [lo, lo]
        if mid_value == val:
            return [mid, mid]
        if data[hi] == val:
            return [hi, hi]

        if mid_value > val:
            # mid is always < hi while lo < hi, so the window strictly shrinks.
            hi = mid
            continue

        if mid == lo:
            # Window is two adjacent entries and val lies above the lower one.
            return [lo, hi]
        lo = mid

    return sorted([hi, lo])


class AudioDataset(Dataset):
    """Frame-level dataset over per-audio embedding files.

    ``root_dir`` must contain ``number_of_frames_per_audio.csv`` (columns
    ``file_name`` and ``number_of_frames``) and an ``embeddings_6144``
    directory with one ``.npy`` array per listed file, indexed by frame along
    its first axis. A global sample index is mapped to a (file, frame) pair
    through the cumulative frame counts.

    Spectrogram loading (from a sibling ``spectrograms`` directory, with an
    optional dB-to-amplitude conversion controlled by ``return_amp``) is
    currently disabled; only the embedding and its file name are returned.
    """

    def __init__(self, root_dir, transform=None, num_audios = -1, return_amp = True):
        """
        Args:
            root_dir: directory holding the CSV index and ``embeddings_6144``.
            transform: stored on the instance; not applied by this class.
            num_audios: if a positive int, only the first ``num_audios`` rows
                of the CSV are used; any other value keeps all rows.
            return_amp: stored on the instance; only relevant to the disabled
                spectrogram path.
        """
        self.root_dir = root_dir
        self.embeddings_dir = os.path.join(self.root_dir, 'embeddings_6144')
        self.transform = transform
        self.num_audios = num_audios
        self.return_amp = return_amp

        self.df = pd.read_csv(os.path.join(root_dir, 'number_of_frames_per_audio.csv'))
        # Type check first so a non-int value (e.g. None) is ignored instead
        # of raising on the ``> 0`` comparison.
        if isinstance(num_audios, int) and num_audios > 0:
            self.df = self.df.head(num_audios)
        # cumulative_sum[k] is the number of frames in files 0..k inclusive,
        # i.e. the exclusive end of file k in global sample indices.
        self.cumulative_sum = self.df['number_of_frames'].cumsum()

    def __len__(self):
        """Total number of frames over all selected files."""
        # Plain int rather than a NumPy integer.
        return int(self.df['number_of_frames'].sum())

    def __getitem__(self, idx):
        """Return ``(embedding_frame, file_name)`` for global frame ``idx``.

        Negative indices count from the end. Raises ``IndexError`` when
        ``idx`` is out of range.
        """
        idx = int(idx)
        total = len(self)
        if idx < 0:
            idx += total
        if not 0 <= idx < total:
            raise IndexError('AudioDataset index out of range')

        frame_ends = self.cumulative_sum.to_numpy()
        # First file whose exclusive end lies beyond idx; side='right' also
        # skips over files that contribute zero frames.
        file_pos = int(np.searchsorted(frame_ends, idx, side='right'))
        first_frame = int(frame_ends[file_pos - 1]) if file_pos > 0 else 0
        frame_idx = idx - first_frame

        file_name = self.df.iloc[file_pos]['file_name']
        emb_path = os.path.join(self.embeddings_dir, file_name)

        with open(emb_path, 'rb') as f:
            emb = np.load(f)

        # Index with the position inside the file, not the global index.
        return emb[frame_idx], file_name


In [51]:
# Re-create the dataset with the AudioDataset class defined in the cell above,
# replacing the instance built from the imported module.
# NOTE(review): absolute, machine-specific path.
audio_dataset = AudioDataset('/scratch/prs392/incubator/data/LibriSpeech/train-clean-360')

In [52]:
# Collect the first (at most) 11 samples and print the file each one came from.
list_of_embs = []
for i in range(min(len(audio_dataset), 11)):
    emb, file = audio_dataset[i]
    print(file)
    list_of_embs.append(emb)

6567-53342-0017.npy
6567-53342-0017.npy
6567-53342-0017.npy
6567-53342-0017.npy
6567-53342-0017.npy
6567-53342-0017.npy
6567-53342-0017.npy
6567-53342-0017.npy
6567-53342-0017.npy
6567-53342-0017.npy
6567-53342-0017.npy


In [53]:
# Shape of a single sample returned by the dataset.
list_of_embs[0].shape

(6144,)

In [54]:
# True only if the first two collected samples are element-wise identical.
(list_of_embs[0] == list_of_embs[1]).all()

False

In [55]:
def l1_norm(a,b):
    """Return the sum of absolute element-wise differences between ``a`` and ``b``.

    Elements are paired with ``zip``, so the comparison stops at the end of
    the shorter input; two empty inputs give ``0``.
    """
    return sum(abs(left - right) for left, right in zip(a, b))

In [56]:
# L1 distance between collected samples 0 and 1.
dist = l1_norm(list_of_embs[0], list_of_embs[1])
dist

1047.5116952792741

In [31]:
# L1 distance between collected samples 1 and 2.
dist = l1_norm(list_of_embs[1], list_of_embs[2])
dist

904.6283872451168

In [32]:
# L1 distance between collected samples 3 and 4.
dist = l1_norm(list_of_embs[3], list_of_embs[4])
dist

1238.1193534955382

In [14]:
# Compare the first two collected samples.
# NOTE(review): this cell's execution count (14) is lower than the cells
# above, so the recorded output presumably comes from an earlier kernel
# state; re-run after Restart & Run All to confirm.
list_of_embs[0] == list_of_embs[1]

False

In [6]:
# NOTE(review): `spec` is not defined in any cell visible here and this cell's
# execution count (6) is out of order; it presumably relies on leftover kernel
# state, so confirm where `spec` comes from.
spec.shape

torch.Size([1, 128, 199])