<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Conv1d" data-toc-modified-id="Conv1d-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Conv1d</a></span></li><li><span><a href="#Attentions" data-toc-modified-id="Attentions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Attentions</a></span></li><li><span><a href="#STFT" data-toc-modified-id="STFT-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>STFT</a></span></li></ul></div>

In [None]:
#default_exp models.common

In [None]:
#export
from librosa.filters import mel as librosa_mel
from librosa.util import pad_center, tiny
import numpy as np
from scipy.signal import get_window
import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F

from uberduck_ml_dev.utils import *

### Conv1d

In [None]:
# export


class Conv1d(nn.Module):
    """Thin wrapper around ``nn.Conv1d`` with Xavier-uniform weight init.

    When ``padding`` is not given it is derived from ``kernel_size`` and
    ``dilation`` so that, for ``stride=1``, the output has the same length
    as the input.

    PARAMS
    ------
    in_channels: number of input channels
    out_channels: number of output channels
    kernel_size: width of the convolution kernel (must be odd when
        ``padding`` is None)
    stride: convolution stride
    padding: explicit padding; computed automatically when None
    dilation: kernel dilation
    bias: whether the convolution has a learnable bias
    w_init_gain: nonlinearity name passed to ``nn.init.calculate_gain``

    RAISES
    ------
    AssertionError: if ``padding`` is None and ``kernel_size`` is even
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=1,
        stride=1,
        padding=None,
        dilation=1,
        bias=True,
        w_init_gain="linear",
    ):
        # Initialise nn.Module first; submodules cannot be registered before
        # the parent constructor has run.
        super().__init__()
        if padding is None:
            # Symmetric "same" padding only exists for odd kernel sizes.
            assert kernel_size % 2 == 1
            # (kernel_size - 1) is even here, so the division is exact.
            padding = dilation * (kernel_size - 1) // 2
        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        nn.init.xavier_uniform_(
            self.conv.weight, gain=nn.init.calculate_gain(w_init_gain)
        )

    def forward(self, signal):
        """Apply the wrapped convolution to ``signal`` and return the result."""
        return self.conv(signal)

### Attentions

### STFT

In [None]:
#export
class STFT(torch.nn.Module):
    """Short-time Fourier transform built from 1-D convolutions.

    Adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft

    The forward transform convolves the signal with a (windowed) Fourier
    basis; the inverse uses a transposed convolution with the pseudo-inverse
    of that basis.

    PARAMS
    ------
    filter_length: FFT size (length of each analysis frame)
    hop_length: number of samples between successive frames
    win_length: length of the window before it is zero-padded to
        ``filter_length``
    window: window name passed to ``scipy.signal.get_window``, or None to
        leave the bases unwindowed
    """

    def __init__(
        self,
        filter_length=800,
        hop_length=200,
        win_length=800,
        window='hann',
    ):
        super().__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        # Keep only the non-redundant half of the spectrum and stack the
        # real and imaginary rows so a single real convolution yields both.
        cutoff = self.filter_length // 2 + 1
        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
                                   np.imag(fourier_basis[:cutoff, :])])

        forward_basis = torch.from_numpy(
            fourier_basis[:, None, :].astype(np.float32))
        inverse_basis = torch.from_numpy(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :].astype(np.float32))

        if window is not None:
            assert(filter_length >= win_length)
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            # `size` is keyword-only in recent librosa releases; the keyword
            # form is also accepted by older ones.
            fft_window = pad_center(fft_window, size=filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer('forward_basis', forward_basis.float())
        self.register_buffer('inverse_basis', inverse_basis.float())

    def transform(self, input_data):
        """Compute magnitude and phase of ``input_data``.

        PARAMS
        ------
        input_data: tensor viewed as (num_batches, num_samples)

        RETURNS
        -------
        magnitude, phase: tensors of shape
            (num_batches, filter_length // 2 + 1, n_frames); ``phase`` is
            detached from the autograd graph
        """
        num_batches = input_data.size(0)
        num_samples = input_data.size(1)

        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        half = self.filter_length // 2
        input_data = input_data.view(num_batches, 1, num_samples)
        input_data = F.pad(input_data, (half, half), mode='reflect')

        # Buffers never require grad, so no Variable wrapper is needed.
        forward_transform = F.conv1d(
            input_data,
            self.forward_basis,
            stride=self.hop_length,
            padding=0)

        cutoff = half + 1
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        phase = torch.atan2(imag_part.detach(), real_part.detach())

        return magnitude, phase

    def inverse(self, magnitude, phase):
        """Reconstruct a signal from ``magnitude`` and ``phase``.

        RETURNS
        -------
        tensor of shape (num_batches, 1, num_samples) with the reflect
        padding added by ``transform`` trimmed off
        """
        recombine_magnitude_phase = torch.cat(
            [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1,
        )

        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            self.inverse_basis,
            stride=self.hop_length,
            padding=0,
        )

        if self.window is not None:
            window_sum = window_sumsquare(
                self.window, magnitude.size(-1), hop_length=self.hop_length,
                win_length=self.win_length, n_fft=self.filter_length,
                dtype=np.float32)
            # remove modulation effects
            # Use the device of the data rather than the default CUDA device
            # so this also works on CPU and on non-default GPUs.
            device = inverse_transform.device
            approx_nonzero_indices = torch.from_numpy(
                np.where(window_sum > tiny(window_sum))[0]).to(device)
            window_sum = torch.from_numpy(window_sum).to(
                device=device, dtype=inverse_transform.dtype)
            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        # Trim the padding that transform() added on both sides.
        half = self.filter_length // 2
        inverse_transform = inverse_transform[:, :, half:-half]

        return inverse_transform

    def forward(self, input_data):
        """Run ``transform`` then ``inverse`` and return the reconstruction.

        The intermediate magnitude and phase are stored on the module as
        ``self.magnitude`` and ``self.phase``.
        """
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction

In [None]:
#export

class MelSTFT(torch.nn.Module):
    """Mel-spectrogram extractor built on top of ``STFT``.

    PARAMS
    ------
    filter_length: FFT size passed to ``STFT`` and to the mel filterbank
    hop_length: hop size passed to ``STFT``
    win_length: window length passed to ``STFT``
    n_mel_channels: number of mel bands
    sampling_rate: sampling rate used to build the mel filterbank
    mel_fmin: lowest frequency of the mel filterbank
    mel_fmax: highest frequency of the mel filterbank
    """

    def __init__(
        self,
        filter_length=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=80,
        sampling_rate=22050,
        mel_fmin=0.0,
        mel_fmax=8000.0,
    ):
        super().__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # Keyword arguments: recent librosa releases reject positional
        # arguments here, and older releases accept the same keyword names.
        mel_basis = librosa_mel(
            sr=sampling_rate,
            n_fft=filter_length,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)

    def spectral_normalize(self, magnitudes):
        """Return ``dynamic_range_compression(magnitudes)``."""
        # NOTE(review): dynamic_range_compression is not defined in this
        # notebook; presumably it comes from uberduck_ml_dev.utils — confirm.
        output = dynamic_range_compression(magnitudes)
        return output

    def spectral_de_normalize(self, magnitudes):
        """Return ``dynamic_range_decompression(magnitudes)``."""
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y, ref_level_db=20, magnitude_power=1.5):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: torch.FloatTensor with shape (B, T) in range [-1, 1]
        ref_level_db: currently unused; kept for interface compatibility
        magnitude_power: currently unused; kept for interface compatibility

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, n_frames)

        RAISES
        ------
        AssertionError: if any sample of ``y`` lies outside [-1, 1]
        """
        assert torch.min(y.detach()) >= -1
        assert torch.max(y.detach()) <= 1

        # Phase is not needed for the mel spectrogram.
        magnitudes, _ = self.stft_fn.transform(y)
        # Detach so the result carries no autograd history.
        magnitudes = magnitudes.detach()
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = self.spectral_normalize(mel_output)
        return mel_output

In [None]:
# Smoke test: run both modules on a short random waveform and check shapes.
torch.manual_seed(0)  # make the random input reproducible across runs
waveform = torch.randn(1, 1000)

stft = STFT()
reconstruction = stft(waveform)
# The round trip trims the reflect padding, so the sample count is preserved.
assert reconstruction.shape == (1, 1, 1000)

mel_stft = MelSTFT()
# mel_spectrogram asserts its input lies in [-1, 1], hence the clip.
mel = mel_stft.mel_spectrogram(torch.clip(waveform, -1, 1))
# 1000 samples + 2 * 512 padding, frame 1024, hop 256 -> 4 frames.
assert mel.shape == (1, 80, 4)
mel

tensor([[[-1.4012e+00, -9.6032e-01, -6.8401e-01, -2.4548e-01],
         [-6.9331e-01, -1.1485e+00, -1.3439e+00, -7.2029e-01],
         [-2.0700e-01, -4.9946e-01, -1.1252e+00, -6.4045e-01],
         [-1.1032e+00, -1.1514e+00, -1.0081e+00, -3.8287e-01],
         [-1.9763e+00, -1.2883e+00, -1.0279e+00, -3.1775e-01],
         [-2.2755e+00, -1.4329e+00, -6.6981e-01, -4.1260e-01],
         [-1.8025e+00, -4.4743e-01, -6.3598e-02, -4.7892e-01],
         [-1.3262e+00, -2.5980e-01,  2.0563e-01,  1.6634e-01],
         [-9.6420e-01, -2.4060e-01,  6.4932e-03,  7.0793e-02],
         [-5.7281e-01, -1.4511e-01, -1.5999e-04, -3.0025e-01],
         [-1.2433e+00, -8.7362e-01, -1.1272e+00, -1.4528e+00],
         [-2.7609e-01, -6.7848e-01, -6.2364e-01, -1.0757e+00],
         [-2.9298e-01, -1.5836e-01, -7.9322e-01, -1.1299e+00],
         [-1.3421e+00, -2.3078e-01, -6.5121e-02, -7.0137e-01],
         [-2.5563e+00, -5.7537e-01, -1.7676e-01, -9.9839e-01],
         [-1.7800e+00, -1.4022e+00, -9.6675e-01, -1.474

In [None]:
# NOTE(review): leftover interactive help lookup (IPython `?` syntax); consider removing before publishing.
?torch.clip