In [1]:
%load_ext autoreload
%autoreload 2

In [23]:
import torch
from lpc import LPCCoefficients

In [10]:
# Parameters
#     * sr            : sample rate of the signal ( 16 kHz )
#     * frame_duration: duration of the window in seconds ( 16 ms )
#     * frame_overlap : frame overlapping factor
#     * K             : number of linear predictive coding coefficients
sr = 16000
frame_duration = 0.016
frame_overlap = 0.5
K = 32

# Initialize the module given all the parameters
lpc_prep = LPCCoefficients(sr, frame_duration, frame_overlap, order=(K - 1))

In [4]:
x = torch.randn(20, 48000)
lpc_prep(x).shape

torch.Size([20, 374, 255])
torch.Size([20, 374, 254])
torch.Size([20, 374, 253])
torch.Size([20, 374, 252])
torch.Size([20, 374, 251])
torch.Size([20, 374, 250])
torch.Size([20, 374, 249])
torch.Size([20, 374, 248])
torch.Size([20, 374, 247])
torch.Size([20, 374, 246])
torch.Size([20, 374, 245])
torch.Size([20, 374, 244])
torch.Size([20, 374, 243])
torch.Size([20, 374, 242])
torch.Size([20, 374, 241])
torch.Size([20, 374, 240])
torch.Size([20, 374, 239])
torch.Size([20, 374, 238])
torch.Size([20, 374, 237])
torch.Size([20, 374, 236])
torch.Size([20, 374, 235])
torch.Size([20, 374, 234])
torch.Size([20, 374, 233])
torch.Size([20, 374, 232])
torch.Size([20, 374, 231])
torch.Size([20, 374, 230])
torch.Size([20, 374, 229])
torch.Size([20, 374, 228])
torch.Size([20, 374, 227])
torch.Size([20, 374, 226])
torch.Size([20, 374, 225])


torch.Size([20, 374, 32])

In [5]:
x = torch.randn(64 * 64, 3000).cuda()

In [25]:
lpc_prep = LPCCoefficients(sr, frame_duration / 16, frame_overlap, order=(K - 1)).cuda()


lpc_prep(x).shape

torch.Size([4096, 3000]) 16 8
torch.Size([4096, 374, 16]) 16 8
torch.Size([4096, 374, 15])
torch.Size([4096, 374, 14])
torch.Size([4096, 374, 13])
torch.Size([4096, 374, 12])
torch.Size([4096, 374, 11])
torch.Size([4096, 374, 10])
torch.Size([4096, 374, 9])
torch.Size([4096, 374, 8])
torch.Size([4096, 374, 7])
torch.Size([4096, 374, 6])
torch.Size([4096, 374, 5])
torch.Size([4096, 374, 4])
torch.Size([4096, 374, 3])
torch.Size([4096, 374, 2])
torch.Size([4096, 374, 1])
torch.Size([4096, 374, 0])


IndexError: index -1 is out of bounds for dimension 2 with size 0

In [77]:
import torchaudio

m = torchaudio.transforms.MFCC(
    sample_rate=16000 // 16,
    n_mfcc=37,
    melkwargs={
        "n_fft": 512 // 4,
        "hop_length": 310 // 4,
        "n_mels": 37,
        "center": False,
    },
)
x = torch.randn(2, 1, 3000)
m(x).shape

torch.Size([2, 1, 37, 38])

In [76]:
import torchaudio

m = torchaudio.transforms.MFCC(
    sample_rate=16000 // 64,
    n_mfcc=19,
    melkwargs={
        "n_fft": 512 // 8,
        "hop_length": 310 // 8,
        "n_mels": 19,
        "center": False,
    },
)
x = torch.randn(2, 1, 3000 // 4)
m(x).shape

torch.Size([2, 1, 19, 19])

In [69]:
m = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000 // 16, n_fft=512 // 4, hop_length=320 // 4, n_mels=37
)
x = torch.randn(2, 1, 3000)
m(x).shape

torch.Size([2, 1, 37, 38])

In [69]:
m = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000 // 16, n_fft=512 // 4, hop_length=320 // 4, n_mels=37
)
x = torch.randn(2, 1, 3000)
m(x).shape

torch.Size([2, 1, 37, 38])

In [81]:
import torch.nn as nn

triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2, eps=1e-7)
anchor = torch.randn(100, 128, requires_grad=True)
positive = torch.randn(100, 128, requires_grad=True)
negative = torch.randn(100, 128, requires_grad=True)
output = triplet_loss(anchor, positive, negative)
output.backward()

### MFCC

In [109]:
n_mels = [65, 33, 17, 9]
MFCC_transforms = nn.ModuleList(
    [
        torchaudio.transforms.MFCC(
            sample_rate=16000 // (16 * (4**i)),
            n_mfcc=n_mels[i],
            melkwargs={
                "n_fft": 512 // ((2**i) * 4),
                "hop_length": 187 // ((2**i) * 4),
                "n_mels": n_mels[i],
                "center": True,
            },
        )
        for i in range(4)
    ]
)

In [110]:
for i in range(4):
    x = torch.randn(2, 1, 3000 // (4**i))
    y = MFCC_transforms[i](x)
    print(y.shape)

torch.Size([2, 1, 65, 66])
torch.Size([2, 1, 33, 33])
torch.Size([2, 1, 17, 18])
torch.Size([2, 1, 9, 10])


### LFCC

In [120]:
n_mels = [65, 33, 17, 9]
transforms = nn.ModuleList(
    [
        torchaudio.transforms.LFCC(
            sample_rate=16000 // (16 * (4**i)),
            n_lfcc=n_mels[i],
            speckwargs={
                "n_fft": 512 // ((2**i) * 4),
                "hop_length": 187 // ((2**i) * 4),
                "center": True,
            },
        )
        for i in range(4)
    ]
)
for i in range(4):
    x = torch.randn(2, 1, 3000 // (4**i))
    y = transforms[i](x)
    print(y.shape)

torch.Size([2, 1, 65, 66])
torch.Size([2, 1, 33, 33])
torch.Size([2, 1, 17, 18])
torch.Size([2, 1, 9, 10])
