In [None]:
import cv2
import os
import torch
import random
import librosa
# import torchaudio
import numpy as np
from glob import glob
# from tqdm import tqdm
# from librosa.feature.inverse import mel_to_audio
# from preprocess import n_fft, hop_size, win_size, sampling_rate, fmin, fmax,num_mels,load_audio
from IPython.display import Audio
from meldataset import load_wav,MelDataset
from utils import plot_spectrogram
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:

# Build the dataset over every WAV file under dataset/audio/.
# glob() returns paths in arbitrary (filesystem-dependent) order, so the list
# is sorted: assuming MelDataset indexes the list in order when shuffle=False,
# this makes dataset[0] refer to the same file on every run and machine.
# NOTE(review): the positional values below look like HiFi-GAN's
# (segment_size, n_fft, num_mels, hop_size, win_size, sampling_rate, fmin, fmax)
# -- confirm against meldataset.MelDataset before relying on that reading.
dataset = MelDataset(
    sorted(glob("dataset/audio/*.wav")),
    8192,
    1024,
    80,
    256,
    1024,
    22050,
    0,
    8000,
    shuffle=False,
    n_cache_reuse=0,
    device=None,
    fmax_loss=8000,
    fine_tuning=False,
    split=True,
    base_mels_path="dataset/mel/"
)

In [None]:
# Fetch the first dataset item; the following cells index it as data[0], data[1] and data[3].
data = dataset[0]

In [None]:
# Show the raw item (a bare last expression is rendered as the cell output).
data

In [None]:
# Play data[1], flattened to a single row, as audio.
# NOTE(review): 22050 is the same value passed positionally to MelDataset -- presumably the sampling rate; confirm.
Audio(data[1].view(1, -1), rate=22050)

In [None]:
# Largest value in data[0] (the same element is plotted as a spectrogram in the next cell).
data[0].max()

In [None]:
# Render data[0] as a spectrogram image.
plot_spectrogram(data[0])

In [None]:
# Render data[3] the same way.
# NOTE(review): what element 3 holds is decided by MelDataset.__getitem__ -- confirm there.
plot_spectrogram(data[3])

In [None]:
# Thanks to the author for providing the Canny edge module: https://github.com/DCurro/CannyEdgePytorch
import torch
import numpy as np
import torch.nn as nn
from scipy.signal import gaussian


class CannyEdge(nn.Module):
    """Canny-style edge detector assembled from fixed-kernel convolutions.

    ``forward`` applies, in order: a separable 5-tap Gaussian blur, 3x3 Sobel
    gradients, quantisation of the gradient orientation to 45-degree steps,
    non-maximum suppression along that orientation, and a single magnitude
    threshold.

    Args:
        threshold: Edge magnitudes strictly below this value are set to zero
            in the returned map.
    """

    def __init__(self, threshold=8.0):
        super(CannyEdge, self).__init__()
        # Imported locally so the class does not depend on the removed
        # top-level alias ``scipy.signal.gaussian``; the window functions live
        # in ``scipy.signal.windows``.
        from scipy.signal.windows import gaussian

        self.threshold = threshold

        filter_size = 5
        # Gaussian window with peak value 1 (not normalised to unit sum), as a
        # 1 x filter_size row; its transpose is used for the vertical pass.
        generated_filters = gaussian(filter_size, std=1.0).reshape([1, filter_size])

        self.gaussian_filter_horizontal = nn.Conv2d(
            in_channels=1, out_channels=1, kernel_size=(1, filter_size), padding=(0, filter_size // 2))
        self._set_kernel(self.gaussian_filter_horizontal, generated_filters)
        self.gaussian_filter_vertical = nn.Conv2d(
            in_channels=1, out_channels=1, kernel_size=(filter_size, 1), padding=(filter_size // 2, 0))
        self._set_kernel(self.gaussian_filter_vertical, generated_filters.T)

        sobel_filter = np.array([[1, 0, -1],
                                 [2, 0, -2],
                                 [1, 0, -1]])

        self.sobel_filter_horizontal = nn.Conv2d(in_channels=1, out_channels=1,
                                                 kernel_size=sobel_filter.shape, padding=sobel_filter.shape[0] // 2)
        self._set_kernel(self.sobel_filter_horizontal, sobel_filter)
        self.sobel_filter_vertical = nn.Conv2d(in_channels=1, out_channels=1,
                                               kernel_size=sobel_filter.shape, padding=sobel_filter.shape[0] // 2)
        self._set_kernel(self.sobel_filter_vertical, sobel_filter.T)

        # Eight "centre minus one neighbour" kernels, one per 45-degree step.
        # filters were flipped manually
        filter_0 = np.array([[0, 0, 0],
                             [0, 1, -1],
                             [0, 0, 0]])

        filter_45 = np.array([[0, 0, 0],
                              [0, 1, 0],
                              [0, 0, -1]])

        filter_90 = np.array([[0, 0, 0],
                              [0, 1, 0],
                              [0, -1, 0]])

        filter_135 = np.array([[0, 0, 0],
                               [0, 1, 0],
                               [-1, 0, 0]])

        filter_180 = np.array([[0, 0, 0],
                               [-1, 1, 0],
                               [0, 0, 0]])

        filter_225 = np.array([[-1, 0, 0],
                               [0, 1, 0],
                               [0, 0, 0]])

        filter_270 = np.array([[0, -1, 0],
                               [0, 1, 0],
                               [0, 0, 0]])

        filter_315 = np.array([[0, 0, -1],
                               [0, 1, 0],
                               [0, 0, 0]])

        all_filters = np.stack([filter_0, filter_45, filter_90, filter_135,
                               filter_180, filter_225, filter_270, filter_315])

        self.directional_filter = nn.Conv2d(in_channels=1, out_channels=8,
                                            kernel_size=filter_0.shape, padding=filter_0.shape[-1] // 2)
        self._set_kernel(self.directional_filter, all_filters[:, None, ...])

    @staticmethod
    def _set_kernel(conv, kernel):
        """Overwrite ``conv``'s weights with ``kernel`` (broadcast/cast as needed) and zero its bias."""
        with torch.no_grad():
            conv.weight.copy_(torch.from_numpy(np.asarray(kernel)))
            conv.bias.zero_()

    def forward(self, img):
        """Compute the thresholded, thinned edge-magnitude map of ``img``.

        Args:
            img: 4-D tensor whose channels 0, 1 and 2 along dim 1 are each
                processed independently (dims 2 and 3 are treated as height
                and width).

        Returns:
            Tensor of shape ``(batch, 1, height, width)`` holding the summed
            per-channel gradient magnitude at pixels that survive non-maximum
            suppression and are at least ``self.threshold``; zero elsewhere.
        """
        channels = [img[:, c:c + 1] for c in range(3)]

        # Separable blur (horizontal then vertical), then Sobel gradients,
        # all per colour channel.
        blurred = [self.gaussian_filter_vertical(self.gaussian_filter_horizontal(ch))
                   for ch in channels]
        grads_x = [self.sobel_filter_horizontal(b) for b in blurred]
        grads_y = [self.sobel_filter_vertical(b) for b in blurred]

        # COMPUTE THICK EDGES

        grad_mag = sum(torch.sqrt(gx ** 2 + gy ** 2) for gx, gy in zip(grads_x, grads_y))
        grad_orientation = torch.atan2(sum(grads_y), sum(grads_x)) * (180.0 / np.pi)
        grad_orientation = grad_orientation + 180.0
        # Quantise to multiples of 45 degrees; ``direction`` is in 0..8.
        direction = torch.round(grad_orientation / 45.0)

        # THIN EDGES (NON-MAX SUPPRESSION)

        all_filtered = self.directional_filter(grad_mag)

        # Channel index of the neighbour along the gradient and of the one
        # directly opposite it.
        idx_positive = direction.long() % 8
        idx_negative = (direction.long() + 4) % 8

        # gather() selects the channel per pixel *within each sample*; a flat
        # index without a per-sample offset would read every sample's values
        # from the first item of the batch.
        selected_positive = all_filtered.gather(1, idx_positive)
        selected_negative = all_filtered.gather(1, idx_negative)

        # A pixel is kept only if it exceeds both neighbours along the
        # gradient direction.
        is_max = torch.min(selected_positive, selected_negative) > 0.0

        thin_edges = grad_mag.clone()
        thin_edges[~is_max] = 0.0

        # THRESHOLD

        thresholded = thin_edges.clone()
        thresholded[thin_edges < self.threshold] = 0.0

        assert grad_mag.size() == direction.size() == thin_edges.size() == thresholded.size()

        return thresholded

In [None]:
import cv2
import torch
from torch.autograd import Variable


def canny(raw_img, use_cuda=False):
    """Run ``CannyEdge`` on one image and write the binary edge map to ``final.png``.

    Args:
        raw_img: NumPy array indexed ``(height, width, channel)``; it is
            transposed to channel-first before being fed to the network.
        use_cuda: When true, the network and the input are moved to the GPU.

    Side effects:
        Prints the shape of the network output and writes ``final.png`` in the
        current working directory.
    """
    img = torch.from_numpy(raw_img.transpose((2, 0, 1)))
    # The same image is stacked twice to form a batch of two; the second entry
    # is the one written out below.
    batch = torch.stack([img, img]).float()

    net = CannyEdge(threshold=5)
    net.eval()
    if use_cuda:
        net = net.cuda()
        batch = batch.cuda()

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        thresholded = net(batch)
    print(thresholded.shape)

    # Explicit 8-bit image (0 or 255) instead of handing a float64 array to
    # the PNG encoder.
    edge_map = (thresholded.cpu().numpy()[1, 0] > 0.0).astype(np.uint8) * 255
    cv2.imwrite('final.png', edge_map)


import numpy as np
# Load one saved mel array, view it in dB, and push a 3-channel, [0, 1]-scaled
# copy through the canny() helper (which writes final.png).
mel_power = np.load("dataset/mel/Alto-1#newboy#0000.npy")
mel_db = librosa.power_to_db(mel_power)

fig = plot_spectrogram(mel_db)
fig.show()

# Min-max scale to [0, 1], then tile the single plane into three identical
# channels (height, width, 3).
db_low, db_high = mel_db.min(), mel_db.max()
mel_unit = (mel_db - db_low) / (db_high - db_low)
mel_rgb = np.repeat(np.expand_dims(mel_unit, axis=-1), 3, axis=-1)
cv2.imwrite("img.png", mel_rgb * 255)

canny(mel_rgb, use_cuda=False)

# Leave ``img`` bound to the raw (non-dB) mel array for the cells that follow.
img = mel_power
plot_spectrogram(librosa.power_to_db(img))
img.shape

In [None]:
# Shape check: prepend a leading axis to ``img`` and tile it three times along that axis.
np.expand_dims(img, axis=0).repeat(3, axis=0).shape

In [None]:
import torch.nn as nn


class Sobel(nn.Module):
    """Fixed two-kernel 3x3 gradient filter returning the gradient magnitude.

    The first kernel is multiplied by zero when it is built, so only the
    second kernel (rows ``[4, 8, 4]`` / ``[-4, -8, -4]``) contributes to the
    output. The convolution has no padding and no bias, and its weights are
    frozen (``requires_grad=False``).
    """

    def __init__(self):
        super().__init__()

        # Kernel 0 is deliberately zeroed via the trailing ``mul(0)``.
        kernel_x = torch.tensor([[2.0, 0.0, -2.0],
                                 [4.0, 0.0, -4.0],
                                 [2.0, 0.0, -2.0]]).mul(0)
        kernel_y = torch.tensor([[4.0, 8.0, 4.0],
                                 [0.0, 0.0, 0.0],
                                 [-4.0, -8.0, -4.0]])
        # (2, 3, 3) -> (2, 1, 3, 3): two output channels, one input channel.
        kernels = torch.stack([kernel_x, kernel_y], dim=0).unsqueeze(1)

        self.filter = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=3, stride=1, padding=0, bias=False)
        self.filter.weight = nn.Parameter(kernels, requires_grad=False)

    def forward(self, img):
        """Return ``sqrt(sum_c response_c**2 + 1e-9)`` with the channel dim kept.

        Args:
            img: Input accepted by a 1-input-channel ``nn.Conv2d``.

        Returns:
            Single-channel tensor; each spatial dim is 2 smaller than the
            input's because the 3x3 convolution is unpadded.
        """
        responses = self.filter(img)
        energy = (responses * responses).sum(dim=1, keepdim=True)
        # The small epsilon keeps the square root away from exactly zero.
        return (energy + 1e-9).sqrt()

In [None]:
def plot_spectrogram(spectrogram):
    """Draw ``spectrogram`` as a heat-map with a colour bar and return the figure.

    The image is shown with its first row at the bottom (``origin="lower"``),
    a free aspect ratio and no interpolation. The figure is rendered once and
    then closed in pyplot before being returned.

    Args:
        spectrogram: Array-like accepted by ``Axes.imshow``.

    Returns:
        The ``matplotlib`` figure that was drawn.
    """
    fig, ax = plt.subplots(figsize=(10, 2))
    image = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation='none')
    fig.colorbar(image, ax=ax)

    # Render now so the returned figure already holds the finished drawing,
    # then drop it from pyplot's list of open figures.
    fig.canvas.draw()
    plt.close(fig)

    return fig

In [None]:

def _save_heatmap(image, title, path):
    """Plot ``image`` on a new 5x5 constrained-layout figure titled ``title`` and save it to ``path``."""
    plt.figure(figsize=(5, 5), layout="constrained")
    plt.suptitle(title)
    plt.imshow(
        image,
        aspect="auto",
        origin="lower",
        interpolation='none'
    )
    plt.savefig(path)


# dB-scaled mel array with two leading singleton axes added for the conv layer.
mel_db = librosa.power_to_db(np.load("dataset/mel/Alto-1#newboy#0000.npy"))
img = mel_db[np.newaxis, np.newaxis]
sobel = Sobel()
print(img.shape)

# Filter, then scale so the largest response equals 1.
result = sobel(torch.from_numpy(img))
result = result / result.max()
print(result.shape)

# The filtered map is squared before plotting.
_save_heatmap(torch.pow(result[0, 0], 2), "Sobel-Filtered Mel-Spectrogram", "sobel.png")
_save_heatmap(img[0, 0], "Original Mel-Spectrogram", "spectrogram.png")

In [None]:
# Load the arrays in this cell so it does not depend on a later cell having
# been run first.
y_mel = np.load("y_mel.npy")
y_g_hat_mel = np.load("y_g_hat_mel.npy")

# Only a cell's last expression is rendered, so both ranges are combined into
# one value; each tuple is ordered (y_mel, y_g_hat_mel).
{
    "max": (y_mel.max(), y_g_hat_mel.max()),
    "min": (y_mel.min(), y_g_hat_mel.min()),
}

In [None]:
# Load two saved arrays from the current working directory.
# NOTE(review): presumably a target mel and a generator-output mel -- confirm where these files are written.
y_mel = np.load("y_mel.npy")
y_g_hat_mel = np.load("y_g_hat_mel.npy")