In [1]:
import os
import librosa
import numpy as np
import imageio
from utils.hparams import hparams
from utils import audio_tools as audio
from keras_contrib.layers.normalization.instancenormalization import InstanceNormalization
from keras.engine.saving import load_model
from IPython.display import Audio
from cyclespecgan import CycleSpecGAN



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



Using TensorFlow backend.


# Making this notebook work
Download the preprocessed audio data and generator checkpoints from https://drive.google.com/open?id=1lCDZDeGmNrk1MvJJj4JYM8O5QDGssM25 and put them in the same directory as this notebook.

# Training

To train, please run `train.py`. I removed the training code from this notebook because the logs generated during training make the notebook so large that it becomes unusable.

# Load Preprocessed Data

In [2]:
def spec_2_audio(y, mel_means, mel_stds):
    """Turn a normalized mel spectrogram back into an audio signal.

    Singleton axes are squeezed out of `y`, the result is de-normalized with
    `mel_means` / `mel_stds` via `denormalize`, and the spectrogram is then
    handed to `audio.inv_melspectrogram`, whose return value is passed back
    to the caller unchanged.
    """
    mel = denormalize(np.squeeze(y), mel_means, mel_stds)
    return audio.inv_melspectrogram(mel)

def denormalize(norm_s, mel_means, mel_stds):
    """Undo the mean/standard-deviation normalization of a spectrogram.

    Evaluates ``norm_s * (3.0 * mel_stds) + mel_means`` using the usual
    array broadcasting rules and returns the result.

    Raises:
        AssertionError: if the first axis of `norm_s` and the first axis of
            `mel_means` differ in length.
    """
    assert norm_s.shape[0] == mel_means.shape[0]
    # The normalized values are scaled by three standard deviations.
    scale = 3.0 * mel_stds
    rescaled = norm_s * scale
    return rescaled + mel_means

# Load the preprocessed source-domain spectrograms together with the
# mean/std arrays stored next to them. `np.load` on an .npz archive returns
# an NpzFile that keeps the underlying file open, so it is used as a context
# manager here; each array is read into memory when it is indexed and stays
# usable after the archive has been closed.
with np.load("source.npz") as source_data:
    source_specs = source_data['specs'][:, :, :, None]  # Add channel
    source_mel_means = source_data['mean']
    source_mel_stds = source_data['std']

# Same layout for the target domain.
with np.load("target.npz") as target_data:
    target_specs = target_data['specs'][:, :, :, None]  # Add channel
    target_mel_means = target_data['mean']
    target_mel_stds = target_data['std']



# Construct Spectrogram GIFs

In [None]:
def images_to_gif(src, dest):
    """Combine the ``*batch-0.png`` images found in `src` into one animation.

    Args:
        src: Directory that is scanned (non-recursively) for files whose
            names end with ``batch-0.png``.
        dest: Output path handed to ``imageio.mimsave``.

    Frames are appended in natural (numeric-aware) file-name order, so that
    e.g. ``epoch-2-...`` comes before ``epoch-10-...``. ``os.listdir`` makes
    no ordering guarantee, so without the sort the animation could play its
    frames out of sequence.
    """
    import re  # local import: only needed for the sort key below

    def natural_key(name):
        # re.split with a capture group alternates text / digit runs, digit
        # runs always landing on odd positions; compare those as integers.
        parts = re.split(r'(\d+)', name)
        numeric_aware = [int(p) if i % 2 else p for i, p in enumerate(parts)]
        # The raw name breaks ties such as "a01" vs "a1" deterministically.
        return (numeric_aware, name)

    frame_names = sorted(
        (name for name in os.listdir(src) if name.endswith('batch-0.png')),
        key=natural_key,
    )
    images = [imageio.imread(os.path.join(src, name)) for name in frame_names]
    imageio.mimsave(dest, images)
    
# Make sure the output folder exists, then render one GIF per training run.
os.makedirs("gifs", exist_ok=True)

gif_jobs = [
    ('./outputs/IRMAS-pia-2-gac/identity', 'gifs/pia-2-gac-identity.gif'),
    ('./outputs/IRMAS-pia-2-gac/no-identity', 'gifs/pia-2-gac-no-identity.gif'),
]
for frames_dir, gif_path in gif_jobs:
    images_to_gif(frames_dir, gif_path)


# CycleSpecGAN with Identity Loss
![title](gifs/pia-2-gac-identity.gif)

In [3]:
# Restore the g_AB generator saved by the identity-loss run (epoch 200).
# InstanceNormalization comes from keras_contrib, so it is passed to
# `load_model` through `custom_objects`.
checkpoint = './checkpoints/IRMAS-pia-2-gac-identity/g_AB-epoch-200.h5'
custom_layers = {'InstanceNormalization': InstanceNormalization}
g_AB = load_model(checkpoint, custom_objects=custom_layers)

Instructions for updating:
Colocations handled automatically by placer.




In [4]:
# Pick three source spectrograms and run them through the generator.
source_imgs = source_specs[[3, 10, 16]]
fake_B_imgs = g_AB.predict(source_imgs)

# Invert each (input, generated) pair back to audio. Inputs are de-normalized
# with the source statistics, generated spectrograms with the target ones.
# The pairs are processed in the order orig_1, fake_1, orig_2, fake_2, ...
(orig_1, fake_1), (orig_2, fake_2), (orig_3, fake_3) = [
    (
        spec_2_audio(real_spec, source_mel_means, source_mel_stds),
        spec_2_audio(fake_spec, target_mel_means, target_mel_stds),
    )
    for real_spec, fake_spec in zip(source_imgs, fake_B_imgs)
]


In [5]:
# Source clip 1, inverted from its spectrogram; played at 16 kHz.
Audio(data=orig_1, rate=16000)

In [6]:
# Generator output for clip 1; played at 16 kHz.
Audio(data=fake_1, rate=16000)

In [7]:
# Source clip 2, inverted from its spectrogram; played at 16 kHz.
Audio(data=orig_2, rate=16000)

In [8]:
# Generator output for clip 2; played at 16 kHz.
Audio(data=fake_2, rate=16000)

In [9]:
# Source clip 3, inverted from its spectrogram; played at 16 kHz.
Audio(data=orig_3, rate=16000)

In [10]:
# Generator output for clip 3; played at 16 kHz.
Audio(data=fake_3, rate=16000)

# CycleSpecGAN without Identity Loss
![title](gifs/pia-2-gac-no-identity.gif)

In [11]:
# Restore the g_AB generator saved by the run without identity loss
# (epoch 100). InstanceNormalization comes from keras_contrib, so it is
# passed to `load_model` through `custom_objects`.
checkpoint = './checkpoints/IRMAS-pia-2-gac-no-identity/g_AB-epoch-100.h5'
custom_layers = {'InstanceNormalization': InstanceNormalization}
g_AB_no_id = load_model(checkpoint, custom_objects=custom_layers)

# Reuses `source_imgs` selected in an earlier cell.
fake_B_imgs = g_AB_no_id.predict(source_imgs)

# Invert each (input, generated) pair back to audio. Inputs are de-normalized
# with the source statistics, generated spectrograms with the target ones.
# The pairs are processed in the order orig_1, fake_1, orig_2, fake_2, ...
(orig_1, fake_1), (orig_2, fake_2), (orig_3, fake_3) = [
    (
        spec_2_audio(real_spec, source_mel_means, source_mel_stds),
        spec_2_audio(fake_spec, target_mel_means, target_mel_stds),
    )
    for real_spec, fake_spec in zip(source_imgs, fake_B_imgs)
]



In [12]:
# Source clip 1, inverted from its spectrogram; played at 16 kHz.
Audio(data=orig_1, rate=16000)

In [13]:
# Generator output for clip 1; played at 16 kHz.
Audio(data=fake_1, rate=16000)

In [14]:
# Source clip 2, inverted from its spectrogram; played at 16 kHz.
Audio(data=orig_2, rate=16000)

In [15]:
# Generator output for clip 2; played at 16 kHz.
Audio(data=fake_2, rate=16000)

In [16]:
# Source clip 3, inverted from its spectrogram; played at 16 kHz.
Audio(data=orig_3, rate=16000)

In [17]:
# Generator output for clip 3; played at 16 kHz.
Audio(data=fake_3, rate=16000)

In [20]:
# Write the three generated clips currently held in fake_1..fake_3 to WAV
# files at 16 kHz.
# NOTE(review): `librosa.output` was removed in librosa 0.8.0, so this cell
# needs an older librosa release - confirm the pinned version.
wav_exports = (
    ('fake_1.wav', fake_1),
    ('fake_2.wav', fake_2),
    ('fake_3.wav', fake_3),
)
for wav_path, clip in wav_exports:
    librosa.output.write_wav(wav_path, clip, 16000)