In [49]:
%reset -f
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as kr
import librosa as lb
from IPython.display import Audio

# Summary

**1 – Loading preprocessed data**

**2 – Building the VAE**

- 2.1 – Encoder part
- 2.2 – Decoder part
- 2.3 – VAE

**3 – Training the VAE**

**4 – Sound generation**

# 1 – Loading preprocessed data

First, we load data from the *data* folder. Additionally, we manually set some important variables.

In [50]:
# Fix the global RNG seeds so that weight initialisation, latent-space sampling
# and the generated sounds are repeatable on "Restart & Run All"
# (GPU kernels may still introduce small run-to-run differences).
SEED = 0
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Preprocessed mel spectrograms, plus the min/max statistics saved with them;
# min_values/max_values are used in the generation step to undo the normalization.
x_train = np.load("data/x_train.npy")
x_test = np.load("data/x_test.npy")
min_values = np.load("data/min_values.npy")
max_values = np.load("data/max_values.npy")

# Manually set constants describing the preprocessed data.
width = 50   #number of columns of each mel spectrogram
height = 50  #number of rows of each mel spectrogram
sr = 22050 #sample rate

# Parameter of the audio <-> mel spectrogram transformation (442 for the values above).
# NOTE(review): presumably chosen so that one second of audio gives `width` frames
# -- confirm against the preprocessing notebook.
hop_length = (sr + 1) // width + 1

In [51]:
# One row per split, one column per array axis, to check the loaded shapes at a glance.
df = pd.DataFrame.from_dict(
    {"train data": x_train.shape, "test data": x_test.shape},
    orient="index",
    columns=["number of mel spectrograms", "height", "width"],
)
df

Unnamed: 0,number of mel spectrograms,height,width
train data,22500,50,50
test data,7500,50,50


# 2 – Building the VAE

In order to generate virtual voices, we will build a variational autoencoder (VAE).

### 2.1 – Encoder part

We chose to use 2 convolutional layers, each followed by downsampling (average pooling). Then, we add 2 dense layers before creating the latent space of size $d=20$. The encoder returns the sampled latent vector but also the outputs of 2 dense layers, *dense_mean* and *dense_log_std*, each of size $d$. They respectively contain every $\mu_i$ and $\log (\sigma^2_i)$, where $1 \leq i \leq d$ (note that, despite its name, *dense_log_std* holds the log-variance, not the log of the standard deviation).

In [52]:
def sampling(mean_log_std):
    """Draw a latent vector with the reparameterization trick.

    Used to create the latent space from dense_mean and dense_log_std.

    Parameters
    ----------
    mean_log_std : pair of tensors (mean, log_var)
        `mean` holds the mu_i and the second tensor holds log(sigma_i^2)
        (a log-variance, despite the "log_std" naming): it is halved before
        the exponential so that the noise is scaled by sigma_i.

    Returns
    -------
    Tensor with the shape of `mean`: mean + epsilon * sigma, epsilon ~ N(0, 1).
    """
    mu, log_var = mean_log_std
    epsilon = tf.random.normal(tf.shape(mu))  # standard normal noise, same shape as mu
    sigma = tf.math.exp(log_var / 2)          # exp(log(sigma^2) / 2) = sigma
    return mu + epsilon * sigma

In [53]:
# Encoder: spectrogram -> (mean, log-variance, sampled latent vector).
# The shape comments below assume height = width = 50.
encoder_input = kr.Input(shape=(height, width, 1))

# Two conv + average-pooling stages:
# (50, 50, 1) -> (48, 48, 40) -> (24, 24, 40) -> (22, 22, 25) -> (11, 11, 25)
features = encoder_input
for n_filters in (40, 25):
    features = kr.layers.Conv2D(n_filters, (3, 3), strides=(1, 1), activation="relu")(features)
    features = kr.layers.AveragePooling2D((2, 2))(features)

# Flatten to 11*11*25 = 3025 values, then shrink through two dense layers: 3025 -> 800 -> 60
features = kr.layers.Flatten()(features)
for n_units in (800, 60):
    features = kr.layers.Dense(n_units, activation="relu")(features)

# Two linear heads of size d = 20 (no activation), and the latent vector sampled from them.
# dense_log_std is consumed by `sampling` as a log-variance (it is halved before the exp).
dense_mean = kr.layers.Dense(20, name="dense_mean")(features) #(20)
dense_log_std = kr.layers.Dense(20, name="dense_log_std")(features) #(20)
latent_space = kr.layers.Lambda(sampling, name="latent_space")([dense_mean, dense_log_std]) #(20)

# The output order matters: downstream code indexes the outputs as [mean, log-variance, latent].
encoder = kr.models.Model(inputs=encoder_input, outputs=[dense_mean, dense_log_std, latent_space])

### 2.2 – Decoder part

The decoder is a stack of 3 dense layers followed by 2 transposed convolutional layers (each preceded by an upsampling step). In the VAE structure, the decoder learns how to best reconstruct the input data from a latent vector.

In [54]:
# Decoder: latent vector of size d = 20 -> (50, 50, 1) mel spectrogram.
# `shape` is given as a tuple, as documented by Keras (a bare int is not a shape tuple),
# and the tensor gets its own name instead of shadowing the builtin `input`.
decoder_input = kr.Input(shape=(20,)) #(20)

# Dense part: mirror of the encoder's dense layers, back up to the flattened feature map.
x = kr.layers.Dense(60, activation="relu")(decoder_input) #(60)
x = kr.layers.Dense(800, activation="relu")(x) #(800)
x = kr.layers.Dense(11*11*25, activation="relu")(x) #(11*11*25) = (3025)
x = kr.layers.Reshape((11, 11, 25))(x) #(11, 11, 25)

# Upsampling + transposed convolutions: each 3x3 transposed conv (stride 1) adds 2 pixels per side length.
x = kr.layers.UpSampling2D((2, 2))(x) #(22, 22, 25)
x = kr.layers.Conv2DTranspose(40, (3, 3), strides=(1, 1), activation="relu")(x) #(24, 24, 40)
x = kr.layers.UpSampling2D((2, 2))(x) #(48, 48, 40)
# ReLU on the last layer keeps the reconstructed (normalized) spectrogram non-negative.
decoder_output = kr.layers.Conv2DTranspose(1, (3, 3), strides=(1, 1), activation="relu")(x) #(50, 50, 1)

decoder = kr.models.Model(inputs=decoder_input, outputs=decoder_output)

### 2.3 – VAE

Finally, we connect the encoder and the decoder. Since we want the latent space to follow a normal distribution, our loss function will be
$$\lambda \times MSE + D_{KL}(P||Q)$$
where :
- we assume that $P \sim N(\mu, \Sigma)$ with $\mu = \begin{pmatrix} \mu_1 \\ \vdots \\ \mu_d \end{pmatrix}$ the output of *dense_mean* and $\Sigma = \begin{pmatrix} \sigma^2_1 & 0 & \cdots & 0 \\ 0 & \sigma^2_2 & \cdots & 0 \\ \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & \cdots & \sigma^2_d \end{pmatrix}$, where $\log (\sigma^2_1), \dots, \log (\sigma^2_d)$ is the output of *dense_log_std*
- $Q \sim N(0_d, I_d)$
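- $D_{KL}(P||Q) = \frac{1}{2} \sum_{i=1}^d \bigl[ \mu_i^2 + \sigma_i^2 - 1 - \log (\sigma_i^2) \bigr]$ is the **Kullback-Leibler divergence** from $P$ (the latent space distribution) to $Q$ (the target distribution, a standard normal one here). In the code below, the terms are averaged (rather than summed) over the $d$ latent dimensions and over the batch, which rescales this quantity by a factor $1/d$.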
- $\lambda$ is a hyperparameter that we chose to set to 500.

In [55]:
# End-to-end VAE: spectrogram -> encoder -> sampled latent vector -> decoder -> reconstruction.
# The input tensor gets its own name instead of shadowing the builtin `input`.
vae_input = kr.Input(shape=(height, width, 1))

# The encoder returns, in this order: mean, log-variance, sampled latent vector.
dense_mean, dense_log_std, latent_space = encoder(vae_input)

vae_output = decoder(latent_space)

vae = kr.models.Model(inputs=vae_input, outputs=vae_output)

#building the loss function (with Kullback-Leibler divergence)
# KL = 1/2 * [ mean(mu^2) + mean(sigma^2) - mean(log sigma^2) - 1 ]
# reduce_mean averages over the batch AND the d latent dimensions, so this is the
# batch average of D_KL(P||Q) divided by d (the closed form sums over the dimensions).
KL = (
    tf.reduce_mean(tf.math.square(dense_mean))
    + tf.reduce_mean(tf.math.exp(dense_log_std))
    - tf.reduce_mean(dense_log_std)
    - 1
) / 2

# 500 is the weight (lambda) of the reconstruction term; the evaluation code relies on the same value.
# NOTE(review): kr.losses.mse only averages over the last (channel) axis, so custom_loss is a
# per-pixel tensor to which the scalar KL is broadcast -- confirm that the reduction Keras
# applies to add_loss tensors is the intended one.
custom_loss = 500 * kr.losses.mse(vae_input, vae_output) + KL
vae.add_loss(custom_loss)

#compiling the VAE
# No `loss=` argument: the objective is the one registered with add_loss above.
# `metrics` is passed as a list, which is the documented form.
vae.compile(optimizer="adam", metrics=["mse"])

# 3 – Training the VAE

In [56]:
# Add the trailing channel axis expected by the convolutional layers.
# Guarded on ndim so that re-running this cell does not append a second axis.
if x_train.ndim == 3:
    x_train = np.expand_dims(x_train, axis=-1) #(22500, 50, 50, 1)
if x_test.ndim == 3:
    x_test = np.expand_dims(x_test, axis=-1) #(7500, 50, 50, 1)

#some hyperparameters
epochs = 10
batch_size = 64

# Autoencoder setting: the input is also the target (the target is only used by the "mse" metric,
# the optimized loss being the one attached to the model with add_loss).
vae.fit(x_train, x_train, epochs=epochs, batch_size=batch_size) #training the VAE

#evaluating the model
# evaluate returns [total loss, mse]; results are stored under their own names so that the
# loss tensors defined when building the VAE are not overwritten by floats.
test_loss, test_mse = vae.evaluate(x_test, x_test, verbose=0)
# total loss = 500 * MSE + KL, so the KL part is recovered by subtraction
# (500 must match the MSE weight used when building the loss).
test_kl = test_loss - 500*test_mse
print("MSE on test data :", round(test_mse, 4))
print("KL loss on test data :", round(test_kl, 4))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
MSE on test data : 0.0046
KL loss on test data : 0.6674


# 4 – Sound generation

The VAE is ready to use. To generate a sound, we follow these steps:
- Generate a random sample $\sim N(0_d, I_d)$.
- Apply the decoder, which returns a (normalized) mel spectrogram.
- Transform the mel spectrogram into an audio signal.
- Listen to the result.

In [65]:
# Latent size d is read from the decoder so that it cannot drift from the model definition.
latent_dim = decoder.input_shape[-1]

# One latent vector z ~ N(0_d, I_d), i.e. each component ~N(0, 1), as a batch of size 1.
sample = np.random.randn(1, latent_dim) #(1, d)
mel_normalized = decoder.predict(sample, verbose=0) #(1, 50, 50, 1)
mel_normalized = np.squeeze(mel_normalized) #(50, 50)

#denormalization
# NOTE(review): assumes min_values/max_values broadcast against a (50, 50) array
# and are the statistics used to normalize the training data -- confirm.
mel_db = mel_normalized * (max_values-min_values) + min_values

#mel spectrogram to audio signal
generated_S = lb.db_to_power(mel_db)
# sr is passed explicitly so the inversion uses the notebook's sample rate rather than
# librosa's default (they only coincide while sr == 22050).
# NOTE(review): the remaining parameters (e.g. n_fft) are left at librosa's defaults;
# they should match the ones used to compute the spectrograms -- confirm.
generated_signal = lb.feature.inverse.mel_to_audio(generated_S, sr=sr, hop_length=hop_length)

Audio(data=generated_signal, rate=sr)