In [None]:
%pip install librosa python-dotenv pydot
%pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio==0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

In [None]:
from torchaudio import models
import torchaudio
import torchaudio.transforms as transforms
import torch
import torch.nn as nn
import torch.nn.functional as F
import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import random

import dataloader

# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
%%time
# Build the dataset from the track list exposed by the local `dataloader` module.
# The selected device is passed through (presumably so samples are placed on
# it — TODO confirm against dataloader.MusicSet).
ms = dataloader.MusicSet(dataloader.TRACKS, device=device)

In [None]:
# Scratch experiment: run a plain RNN over one dataset item, then drive a second
# RNN one step at a time, feeding its own noised output back in next to a fixed
# input frame.
# NOTE(review): assumes ms[idx][0] is a 2-D tensor whose transpose has 1025
# features per row (required by the RNN input size below) — TODO confirm.
rnn = nn.RNN(1025, 10, 1, batch_first=True).to(device)
# Input width 1035 = 1025 frame features + the 10-dim previous output that is
# concatenated onto the frame further down.
rnn2 = nn.RNN(1035, 10, 1, batch_first=True).to(device)
idx = 100
# Transpose the item and add a leading batch axis of size 1 before the RNN call.
out, h = rnn(ms[idx][0].t()[None,])
print(out.shape)
# First row of the transposed item tiled 1291 times with a batch axis added.
# It is not used again in this cell.
# NOTE(review): 1291 is presumably the number of time frames — confirm.
out_r = ms[idx][0].t()[0][None].repeat(1291, 1).unsqueeze(0)

# Rebinds `out`: the RNN output printed above is discarded and replaced by a
# zero vector that seeds the feedback loop.
out = torch.zeros(10).to(device)

# Holds the 10-dim output of each of the 1025 loop steps. Allocated without a
# device argument, so it lives on the CPU even when `device` is a GPU.
finale = torch.zeros(1025, 10)

# The fixed frame (first row of the transposed item) fed to rnn2 at every step.
lol = ms[idx][0].t()[0]

# Initial rnn2 input: the fixed frame followed by the noised previous output.
middle = torch.cat([lol, out + torch.randn(10).to(device)])

for i in range(1025):
    # Add batch and sequence axes of size 1. On the first step `h` is the final
    # hidden state of `rnn`; afterwards it is rnn2's own hidden state.
    out, h = rnn2(middle[None, None, ], h)
    # Strip the batch and sequence axes again, leaving a 10-dim vector.
    out = out[0][0]
    finale[i] = out
    # Next input: same fixed frame, fresh Gaussian noise on the new output.
    middle = torch.cat([lol, out + torch.randn(10).to(device)])

# Sum the 10 output units at each step, giving one value per step.
out_s = torch.sum(finale, 1)
print(out_s.shape, out_s)

In [None]:
class RNN_VAE(nn.Module):
    """GRU sequence autoencoder with a linear bottleneck in the decoder.

    Despite the name, the model is deterministic: there is no latent sampling
    and no KL term. ``encode`` runs a GRU over the input sequence; ``decode``
    squeezes each code vector through ``fc0 -> ReLU -> fc1`` and runs a second
    GRU that maps back to the input feature size.

    All sequence tensors are batch-first: ``(batch, seq_len, features)``.

    Args:
        input_size: Number of features per time step of the input sequence.
        hidden_size: Width of the encoder GRU state / code vectors.
        num_layers: Number of stacked layers in both GRUs.
        encoder_dim: Width of the linear bottleneck between ``fc0`` and ``fc1``.
    """

    def __init__(self, input_size, hidden_size, num_layers, encoder_dim=64):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.input_size = input_size
        # Encoder: input features -> hidden code, one code vector per time step.
        self.rnn1 = nn.GRU(input_size,
                           hidden_size,
                           num_layers,
                           batch_first=True)
        # Decoder: hidden-size vectors -> reconstruction with input_size features.
        self.rnn2 = nn.GRU(hidden_size,
                           input_size,
                           num_layers,
                           batch_first=True)
        # Bottleneck applied to every code vector before the decoder GRU.
        self.fc0 = nn.Linear(hidden_size, encoder_dim)
        self.fc1 = nn.Linear(encoder_dim, hidden_size)

    def _initial_state(self, like, state_size):
        """Return a zero GRU state matching the batch size, dtype and device of ``like``."""
        # Derived from the input tensor rather than a global device and a
        # hard-coded batch of 1, so any batch size and any device work.
        return like.new_zeros(self.num_layers, like.size(0), state_size)

    def encode(self, x):
        """Encode ``x`` of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, hidden_size)``: the ReLU of the
            encoder GRU output at every time step.
        """
        s0 = self._initial_state(x, self.hidden_size)
        middle, _ = self.rnn1(x, s0)
        return F.relu(middle)

    def decode(self, inp_dec, forced_teaching=False):
        """Decode a code sequence of shape ``(batch, seq_len, hidden_size)``.

        Args:
            inp_dec: Code sequence as produced by ``encode``.
            forced_teaching: When true, the decoder sees the code of every time
                step. When false, it only sees the code of the last time step,
                repeated ``seq_len`` times.

        Returns:
            Tensor of shape ``(batch, seq_len, input_size)``.
        """
        s1 = self._initial_state(inp_dec, self.input_size)
        if not forced_teaching:
            L = inp_dec.shape[1]
            # Slice with -1: (not -1) to keep the time axis, so that repeat tiles
            # along time per batch element instead of folding the batch into it.
            inp_dec = inp_dec[:, -1:, :].repeat(1, L, 1)
        middle = self.fc0(inp_dec)
        middle = F.relu(middle)
        middle = self.fc1(middle)
        y, _ = self.rnn2(middle, s1)
        return y

    def forward(self, x):
        """Encode then decode ``x`` without teacher forcing."""
        middle = self.encode(x)
        return self.decode(middle)

In [None]:
# Optimisation hyper-parameters passed to Adam below.
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 0
# NOTE(review): PENALITY is not referenced in any visible cell (and is
# presumably a misspelling of "PENALTY") — confirm before renaming or removing.
PENALITY = 0.5

# 1025 input features per time step, 64 hidden units, 1 GRU layer, 64-dim bottleneck.
model = RNN_VAE(1025, 64, 1, 64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# Multiply the learning rate by 0.1 once the scheduler has been stepped 50 times.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50], gamma=0.1)
# Mean-squared-error loss; `train` uses it to compare the decoder output with its input.
criterion = nn.MSELoss()

In [None]:
# Probability that a training step decodes with teacher forcing (1 = always).
TEACHING_RATE = 1


def train(num_epochs, teaching_rate=TEACHING_RATE, max_batches=1, log_every=1):
    """Train the global ``model`` to reconstruct items of the global dataset ``ms``.

    Relies on the notebook-level ``model``, ``optimizer``, ``scheduler``,
    ``criterion`` and ``ms`` objects. Each item is transposed and given a
    leading batch axis of size 1 before being fed to the model.

    Args:
        num_epochs: Number of passes to run; the scheduler is stepped once per pass.
        teaching_rate: Probability, per step, of decoding with teacher forcing.
        max_batches: Maximum number of dataset items used per epoch. The default
            of 1 keeps the original behaviour of fitting only the first item;
            pass ``None`` to iterate over the whole dataset.
        log_every: Print the loss every this many steps (0 or ``None`` disables it).
    """
    for epoch in range(num_epochs):
        # 1-based step counter so the progress line reads naturally.
        for i, x in enumerate(ms, start=1):
            x_p = x[0].t()[None,]
            optimizer.zero_grad()
            mid = model.encode(x_p)
            forced_teaching = random.random() < teaching_rate
            out = model.decode(mid, forced_teaching)
            loss = criterion(out, x_p)
            loss.backward()
            # Cap the global gradient norm at 25 to keep GRU updates stable.
            nn.utils.clip_grad_norm_(model.parameters(), 25.)
            optimizer.step()
            if log_every and i % log_every == 0:
                # .item() prints the scalar instead of the full tensor repr.
                print(f"{epoch},{i}: {loss.item():.6f}", end='\r')
            # Stop after processing the item so no extra sample is fetched.
            if max_batches is not None and i >= max_batches:
                break
        scheduler.step()

In [None]:
# Run 2000 epochs with the default teaching rate.
train(2000)

In [None]:
# Index of the dataset item inspected in this cell.
SAMPLE = 0

# Reference item straight from the dataset, and its conversion through
# `ms.from_spectro` (presumably back to a waveform, since the result is played
# as audio below — TODO confirm).
test_song_orig = ms[SAMPLE][0]
print(test_song_orig.shape)
res_orig = ms.from_spectro(test_song_orig)
print(res_orig.shape)


# Raw values plus max / min / mean, for comparison with the reconstruction.
print(test_song_orig.cpu().detach().numpy())
print(f"Max: {test_song_orig.max().cpu().detach().numpy()},\
      min: {test_song_orig.min().cpu().detach().numpy()},\
      mean: {test_song_orig.mean().cpu().detach().numpy()}")

# Sum of absolute values along axis 1 of the transposed item, i.e. one value per
# row of the transpose (presumably per time frame — confirm), plotted as a curve.
p = torch.sum(test_song_orig.t().abs(), 1)
plt.plot(p.cpu().detach())
# Last expression of the cell: renders an audio player at a 22050 Hz sample rate.
ipd.Audio(res_orig.cpu(), rate=22050)

In [None]:
# Encode the selected item (transposed, with a batch axis of size 1) and decode
# it both with and without teacher forcing. Each output is squeezed, transposed
# back and clamped to be non-negative with ReLU.
test_song = ms[SAMPLE][0].t()[None,]
code = model.encode(test_song)
spectro_forced = F.relu(model.decode(code, True).squeeze().t())
# NOTE(review): `spectro_` (no teacher forcing) is computed but not used below.
spectro_ = F.relu(model.decode(code, False).squeeze().t())
# print(spectro_.shape)
# Only the teacher-forced reconstruction is inspected from here on.
spectro = spectro_forced

res = ms.from_spectro(spectro)
# print(res.shape)

# Raw values plus max / min / mean of the reconstruction.
print(spectro.abs().cpu().detach().numpy())
print(f"Max: {spectro.max().cpu().detach().numpy()},\
      min: {spectro.min().cpu().detach().numpy()},\
      mean: {spectro.mean().cpu().detach().numpy()}")

# Same per-row sum of absolute values as is plotted for the original item.
p = torch.sum(spectro.t().abs(), 1)
plt.plot(p.cpu().detach())
# Last expression of the cell: renders an audio player at a 22050 Hz sample rate.
ipd.Audio(res.cpu().detach(), rate=22050)

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
import soundfile
# Write the reconstruction to "test.wav" at 22050 Hz, with the samples scaled by 10.
# NOTE(review): the x10 gain is unexplained here and may clip — confirm intent.
soundfile.write("test.wav",10*res.cpu().detach().numpy(), 22050)

In [None]:
# Display the original item as a spectrogram: time on x, log-frequency on y.
# NOTE(review): hop_length=512 and sr=22050 are assumed to match the dataloader settings — confirm.
librosa.display.specshow(test_song_orig.cpu().detach().numpy(), sr=22050, hop_length=512, x_axis='time', y_axis='log')

In [None]:
# Display the teacher-forced reconstruction with the same axes as the original plot.
# NOTE(review): hop_length=512 and sr=22050 are assumed to match the dataloader settings — confirm.
librosa.display.specshow(spectro.cpu().detach().numpy(), sr=22050, hop_length=512, x_axis='time', y_axis='log')