<a href="https://colab.research.google.com/github/PratikStar/google-colab/blob/main/2_2_Music_VAE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mount Google Drive

In [1]:
from google.colab import drive
# Attach Google Drive under /content/drive, forcing a fresh mount.
drive.mount(
    "/content/drive",
    force_remount=True,
)

Mounted at /content/drive


### Install tensorflow v2.4.0

In [2]:
!pip uninstall --yes tensorflow
!pip install tensorflow==2.4.0

Found existing installation: tensorflow 2.6.0
Uninstalling tensorflow-2.6.0:
  Successfully uninstalled tensorflow-2.6.0
Collecting tensorflow==2.4.0
  Downloading tensorflow-2.4.0-cp37-cp37m-manylinux2010_x86_64.whl (394.7 MB)
[K     |████████████████████████████████| 394.7 MB 17 kB/s 
Collecting h5py~=2.10.0
  Downloading h5py-2.10.0-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 48.8 MB/s 
[?25hCollecting gast==0.3.3
  Downloading gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
Collecting grpcio~=1.32.0
  Downloading grpcio-1.32.0-cp37-cp37m-manylinux2014_x86_64.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 73.0 MB/s 
[?25hCollecting tensorflow-estimator<2.5.0,>=2.4.0rc0
  Downloading tensorflow_estimator-2.4.0-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 71.5 MB/s 
Installing collected packages: grpcio, tensorflow-estimator, h5py, gast, tensorflow
  Attempting uninstall: grpcio
    Found ex

## Autoencoder Code

In [85]:
"#@title"
import os
import pickle
import csv
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Conv2D, ReLU, BatchNormalization, \
    Flatten, Dense, Reshape, Conv2DTranspose, Activation, Lambda
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
import numpy as np
import tensorflow as tf

# Show which TensorFlow version the kernel imported.
print(tf.__version__)
# Turn off eager execution before any model is built.
# NOTE(review): presumably needed because the VAE loss functions read the
# mu / log_variance tensors stored on the instance — confirm.
tf.compat.v1.disable_eager_execution()


class VAE:
    """
    VAE represents a Deep Convolutional variational autoencoder architecture
    with mirrored encoder and decoder components.

    The constructor builds three Keras models: ``encoder``, ``decoder`` and
    the end-to-end ``model`` (decoder applied to the encoder output).

    Args:
        input_shape: Shape of one input sample, passed to the encoder Input.
        conv_filters: Number of filters for each convolutional block.
        conv_kernels: Kernel size for each convolutional block.
        conv_strides: Stride for each convolutional block.
        latent_space_dim: Number of units of the ``mu`` / ``log_variance``
            dense layers (size of the latent vector).
    """

    def __init__(self,
                 input_shape,
                 conv_filters,
                 conv_kernels,
                 conv_strides,
                 latent_space_dim):
        self.input_shape = input_shape
        self.conv_filters = conv_filters
        self.conv_kernels = conv_kernels
        self.conv_strides = conv_strides
        self.latent_space_dim = latent_space_dim
        # Multiplier applied to the reconstruction term in the combined loss.
        self.reconstruction_loss_weight = 1000000

        self.encoder = None
        self.decoder = None
        self.model = None

        # Outputs of the "mu" and "log_variance" dense layers; assigned in
        # _add_bottleneck and read by _calculate_kl_loss.
        self.mu = None
        self.log_variance = None

        self._num_conv_layers = len(conv_filters)
        self._shape_before_bottleneck = None
        self._model_input = None

        self._build()

    def summary(self):
        """Print the Keras summaries of the encoder, decoder and full model."""
        self.encoder.summary()
        self.decoder.summary()
        self.model.summary()

    def compile(self, learning_rate=0.0001):
        """Compile the full model with Adam and the combined VAE loss.

        The reconstruction and KL terms are also registered as metrics.
        """
        optimizer = Adam(learning_rate=learning_rate)
        self.model.compile(optimizer=optimizer,
                           loss=self._calculate_combined_loss,
                           metrics=[self._calculate_reconstruction_loss,
                                    self._calculate_kl_loss])

    def train(self, x_train, y_train, batch_size, num_epochs):
        """Fit the full model on (x_train, y_train) with shuffling.

        Returns:
            Whatever ``self.model.fit`` returns (the training history).
        """
        return self.model.fit(x_train,
                              y_train,
                              batch_size=batch_size,
                              epochs=num_epochs,
                              shuffle=True)

    def save(self, save_folder="."):
        """Write ``parameters.pkl`` and ``weights.h5`` into ``save_folder``.

        The folder is created when it does not exist yet.
        """
        self._create_folder_if_it_doesnt_exist(save_folder)
        self._save_parameters(save_folder)
        self._save_weights(save_folder)

    def reconstruct(self, images):
        """Encode ``images`` and decode the resulting latent points.

        Returns:
            Tuple ``(reconstructed_images, latent_representations)``.
        """
        latent_representations = self.encoder.predict(images)
        reconstructed_images = self.decoder.predict(latent_representations)
        return reconstructed_images, latent_representations

    def _save_parameters(self, save_folder):
        """Pickle the five constructor arguments to ``parameters.pkl``."""
        parameters = [
            self.input_shape,
            self.conv_filters,
            self.conv_kernels,
            self.conv_strides,
            self.latent_space_dim
        ]
        save_path = os.path.join(save_folder, "parameters.pkl")
        with open(save_path, "wb") as f:
            print(parameters)
            pickle.dump(parameters, f)

    def _save_weights(self, save_folder):
        """Save the full model's weights to ``weights.h5``."""
        save_path = os.path.join(save_folder, "weights.h5")
        self.model.save_weights(save_path)

    @classmethod
    def load(cls, save_folder="."):
        """Rebuild a model from a folder written by ``save``.

        The instance is created through ``cls`` so that subclasses calling
        ``load`` get an instance of their own type.

        Returns:
            A new instance with the stored constructor parameters and the
            stored weights loaded.
        """
        parameters_path = os.path.join(save_folder, "parameters.pkl")
        # SECURITY: pickle.load can execute arbitrary code; only load
        # parameter files from folders you trust.
        with open(parameters_path, "rb") as f:
            parameters = pickle.load(f)
        autoencoder = cls(*parameters)
        weights_path = os.path.join(save_folder, "weights.h5")
        autoencoder.load_weights(weights_path)
        return autoencoder

    def load_weights(self, weights_path):
        """Load weights for the full model from ``weights_path``."""
        self.model.load_weights(weights_path)

    def _calculate_combined_loss(self, y_target, y_predicted):
        """Weighted reconstruction loss plus KL loss."""
        reconstruction_loss = self._calculate_reconstruction_loss(y_target, y_predicted)
        kl_loss = self._calculate_kl_loss(y_target, y_predicted)
        combined_loss = self.reconstruction_loss_weight * reconstruction_loss\
                                                         + kl_loss
        return combined_loss

    def _calculate_reconstruction_loss(self, y_target, y_predicted):
        """Mean squared error over axes 1, 2 and 3 (one value per sample)."""
        error = y_target - y_predicted
        reconstruction_loss = K.mean(K.square(error), axis=[1, 2, 3])
        return reconstruction_loss

    def _calculate_kl_loss(self, y_target, y_predicted):
        """KL term computed from the stored mu / log_variance tensors.

        The arguments are unused; they are present so the method has the
        (y_target, y_predicted) signature used for losses and metrics.
        """
        kl_loss = -0.5 * K.sum(1 + self.log_variance - K.square(self.mu) -
                               K.exp(self.log_variance), axis=1)
        return kl_loss

    def _create_folder_if_it_doesnt_exist(self, folder):
        """Create ``folder`` (and parents); do nothing if it already exists."""
        # exist_ok avoids the check-then-create race of the exists()/makedirs
        # pair.
        os.makedirs(folder, exist_ok=True)

    def _build(self):
        """Build encoder, decoder and the combined model, in that order."""
        self._build_encoder()
        self._build_decoder()
        self._build_autoencoder()

    def _build_autoencoder(self):
        """Chain encoder and decoder into the end-to-end model."""
        model_input = self._model_input
        model_output = self.decoder(self.encoder(model_input))
        self.model = Model(model_input, model_output, name="autoencoder")

    def _build_decoder(self):
        """Assemble the decoder: input, dense, reshape, conv-transpose stack,
        output layer."""
        decoder_input = self._add_decoder_input()
        dense_layer = self._add_dense_layer(decoder_input)
        reshape_layer = self._add_reshape_layer(dense_layer)
        conv_transpose_layers = self._add_conv_transpose_layers(reshape_layer)
        decoder_output = self._add_decoder_output(conv_transpose_layers)
        self.decoder = Model(decoder_input, decoder_output, name="decoder")

    def _add_decoder_input(self):
        """Input layer of the decoder, sized by the latent dimension."""
        return Input(shape=self.latent_space_dim, name="decoder_input")

    def _add_dense_layer(self, decoder_input):
        """Dense layer with as many units as the flattened encoder feature
        map."""
        num_neurons = np.prod(self._shape_before_bottleneck) # [1, 2, 4] -> 8
        dense_layer = Dense(num_neurons, name="decoder_dense")(decoder_input)
        return dense_layer

    def _add_reshape_layer(self, dense_layer):
        """Reshape the dense output to the shape recorded before the
        bottleneck."""
        return Reshape(self._shape_before_bottleneck)(dense_layer)

    def _add_conv_transpose_layers(self, x):
        """Add conv transpose blocks."""
        # loop through all the conv layers in reverse order and stop at the
        # first layer (index 0 is handled by _add_decoder_output)
        for layer_index in reversed(range(1, self._num_conv_layers)):
            x = self._add_conv_transpose_layer(layer_index, x)
        return x

    def _add_conv_transpose_layer(self, layer_index, x):
        """Add one block: conv 2d transpose + ReLU + batch normalization."""
        layer_num = self._num_conv_layers - layer_index
        conv_transpose_layer = Conv2DTranspose(
            filters=self.conv_filters[layer_index],
            kernel_size=self.conv_kernels[layer_index],
            strides=self.conv_strides[layer_index],
            padding="same",
            name=f"decoder_conv_transpose_layer_{layer_num}"
        )
        x = conv_transpose_layer(x)
        x = ReLU(name=f"decoder_relu_{layer_num}")(x)
        x = BatchNormalization(name=f"decoder_bn_{layer_num}")(x)
        return x

    def _add_decoder_output(self, x):
        """Final single-filter conv transpose followed by a sigmoid."""
        conv_transpose_layer = Conv2DTranspose(
            filters=1,
            kernel_size=self.conv_kernels[0],
            strides=self.conv_strides[0],
            padding="same",
            name=f"decoder_conv_transpose_layer_{self._num_conv_layers}"
        )
        x = conv_transpose_layer(x)
        output_layer = Activation("sigmoid", name="sigmoid_layer")(x)
        return output_layer

    def _build_encoder(self):
        """Assemble the encoder and remember its input for the full model."""
        encoder_input = self._add_encoder_input()
        conv_layers = self._add_conv_layers(encoder_input)
        bottleneck = self._add_bottleneck(conv_layers)
        self._model_input = encoder_input
        self.encoder = Model(encoder_input, bottleneck, name="encoder")

    def _add_encoder_input(self):
        """Input layer of the encoder, shaped like one input sample."""
        return Input(shape=self.input_shape, name="encoder_input")

    def _add_conv_layers(self, encoder_input):
        """Create all convolutional blocks in encoder."""
        x = encoder_input
        for layer_index in range(self._num_conv_layers):
            x = self._add_conv_layer(layer_index, x)
        return x

    def _add_conv_layer(self, layer_index, x):
        """Add a convolutional block to a graph of layers, consisting of
        conv 2d + ReLU + batch normalization.
        """
        layer_number = layer_index + 1
        conv_layer = Conv2D(
            filters=self.conv_filters[layer_index],
            kernel_size=self.conv_kernels[layer_index],
            strides=self.conv_strides[layer_index],
            padding="same",
            name=f"encoder_conv_layer_{layer_number}"
        )
        x = conv_layer(x)
        x = ReLU(name=f"encoder_relu_{layer_number}")(x)
        x = BatchNormalization(name=f"encoder_bn_{layer_number}")(x)
        return x

    def _add_bottleneck(self, x):
        """Flatten data and add bottleneck with Gaussian sampling (Dense
        layer).

        Records the pre-flatten shape for the decoder and stores the mu and
        log_variance tensors on the instance for the KL loss.
        """
        self._shape_before_bottleneck = K.int_shape(x)[1:]
        x = Flatten()(x)
        self.mu = Dense(self.latent_space_dim, name="mu")(x)
        self.log_variance = Dense(self.latent_space_dim,
                                  name="log_variance")(x)

        def sample_point_from_normal_distribution(args):
            mu, log_variance = args
            # Size the noise from the tensor passed to the layer instead of
            # reaching out to self.mu through the closure.
            epsilon = K.random_normal(shape=K.shape(mu), mean=0.,
                                      stddev=1.)
            sampled_point = mu + K.exp(log_variance / 2) * epsilon
            return sampled_point

        x = Lambda(sample_point_from_normal_distribution,
                   name="encoder_output")([self.mu, self.log_variance])
        return x


2.4.0


## Music AutoEncoder

In [113]:
# Architecture hyperparameters: one entry per convolutional block.
CONV_FILTERS = (128, 64, 32, 32)
CONV_KERNELS = (3, 3, 3, 3)
CONV_STRIDES = (1, 2, 2, 1)
LATENT_SPACE_DIM = 16

# Build the model for (256, 64, 1) inputs and print its layer summaries.
musicae = VAE(input_shape=(256, 64, 1),
              conv_filters=CONV_FILTERS,
              conv_kernels=CONV_KERNELS,
              conv_strides=CONV_STRIDES,
              latent_space_dim=LATENT_SPACE_DIM)
musicae.summary()

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, 256, 64, 1)] 0                                            
__________________________________________________________________________________________________
encoder_conv_layer_1 (Conv2D)   (None, 256, 64, 128) 1280        encoder_input[0][0]              
__________________________________________________________________________________________________
encoder_relu_1 (ReLU)           (None, 256, 64, 128) 0           encoder_conv_layer_1[0][0]       
__________________________________________________________________________________________________
encoder_bn_1 (BatchNormalizatio (None, 256, 64, 128) 512         encoder_relu_1[0][0]             
____________________________________________________________________________________________

### Load Data


In [112]:
import re
import os
import numpy as np

# Folder holding the saved spectrogram arrays.
SPECTROGRAMS_PATH = "/content/drive/MyDrive/Music/VAE/ICASSP/spectrogram"
# Amp ids 1-80 and clip ids 1-12 (the full ranges accepted by load_music_ds).
AMP_IDS = list(range(1, 81))
CLIP_IDS = list(range(1, 13))

def load_music_ds(spectrograms_path, ampids, clipids):
    """Load (amp spectrogram, DI spectrogram) pairs from a folder tree.

    File names are expected to start with ``<ampid>-<clipid>-<windowid>``.
    Files whose name starts with ``00000`` are the DI recordings; every
    selected amp file is paired with the DI file that has the same clip and
    window id. Files that do not follow the naming scheme are skipped.

    Args:
        spectrograms_path: Root folder walked recursively for saved arrays.
        ampids: Amp ids to include; each must be within 1-80.
        clipids: Clip ids to include; each must be within 1-12.

    Returns:
        Tuple ``(x, y, xfiles, yfiles)``: stacked amp arrays, stacked DI
        arrays (both with a trailing axis of size 1 added), and the matching
        lists of amp and DI file names.

    Raises:
        ValueError: If an amp id or clip id is out of range.
        IndexError: If a selected amp file has no matching DI file.
    """
    # ampid check: 1-80
    invalidampids = [i for i in ampids if i > 80 or i < 1]
    if invalidampids:
        raise ValueError("Invalid ampids: " + str(invalidampids))
    # clipid check: 1-12
    invalidclipids = [i for i in clipids if i > 12 or i < 1]
    if invalidclipids:
        raise ValueError("Invalid clipids: " + str(invalidclipids))

    wanted_amps = set(ampids)
    wanted_clips = set(clipids)
    name_pattern = re.compile(r"^(\d+)-(\d+)-(\d+)")

    # Walk once and keep (folder, filename) pairs so both passes see the
    # same listing in the same order.
    entries = [(root, filename)
               for root, _, filenames in os.walk(spectrograms_path)
               for filename in filenames]

    # Index DI files by (clipid, windowid). The first file found for a key
    # wins, and its own folder is remembered so the DI does not have to sit
    # next to the amp file.
    di_index = {}
    for root, filename in entries:
        if not filename.startswith("00000"):
            continue
        parsed = name_pattern.match(filename)
        if parsed is None:
            continue
        key = (int(parsed.group(2)), int(parsed.group(3)))
        di_index.setdefault(key, (root, filename))

    # Actual DS creation
    x, y = [], []
    xfiles, yfiles = [], []
    for root, filename in entries:
        parsed = name_pattern.match(filename)
        if parsed is None:
            # Stray file that does not follow the naming scheme.
            continue
        ampid, clipid, windowid = (int(part) for part in parsed.groups())
        if ampid not in wanted_amps or clipid not in wanted_clips:
            continue

        if (clipid, windowid) not in di_index:
            raise IndexError(
                "No DI spectrogram found for clip %02d window %02d (needed by %s)"
                % (clipid, windowid, filename))
        di_root, di_filename = di_index[(clipid, windowid)]

        spectrogram = np.load(os.path.join(root, filename))
        di_spectrogram = np.load(os.path.join(di_root, di_filename))

        xfiles.append(filename)
        yfiles.append(di_filename)
        # Add a trailing axis of size 1 to every loaded array.
        x.append(spectrogram[..., np.newaxis])
        y.append(di_spectrogram[..., np.newaxis])
    return np.array(x), np.array(y), xfiles, yfiles

# Load every amp/clip combination configured above and report the input shape.
x, y, xfiles, yfiles = load_music_ds(
    spectrograms_path=SPECTROGRAMS_PATH,
    ampids=AMP_IDS,
    clipids=CLIP_IDS,
)
print(x.shape)

(11360, 256, 64, 1)


### Train Model

In [117]:
# Training hyperparameters.
# NOTE(review): the captured output of this cell shows a
# ResourceExhaustedError for this configuration.
LEARNING_RATE = 0.0001
BATCH_SIZE = 500
EPOCHS = 500

musicae.compile(learning_rate=LEARNING_RATE)
history = musicae.train(
    x_train=x,
    y_train=y,
    batch_size=BATCH_SIZE,
    num_epochs=EPOCHS,
)
# Per-epoch training loss, reused by the "Save Meta" cell.
losses = history.history["loss"]


Train on 11360 samples
Epoch 1/500


ResourceExhaustedError: ignored

### Save Model

In [108]:
import pytz
from datetime import datetime

MUSICAE_SAVE_PATH = "/content/drive/MyDrive/Music/VAE/ICASSP/music-encoder/"
# Ask for the current time directly in the Tokyo zone. localize(datetime.now())
# only stamps the machine's local wall-clock reading as Tokyo time without
# converting it, which is wrong whenever the machine is not running on JST.
ts = datetime.now(pytz.timezone('Asia/Tokyo'))
tsf = ts.strftime("%Y-%m-%d-%H-%M-%S")

# One folder per run, named after the timestamp.
musicae.save(os.path.join(MUSICAE_SAVE_PATH, "model " + tsf))

[(256, 64, 1), (128, 64, 32, 32), (3, 3, 3, 3), (1, 2, 2, 1), 16]


### Save Meta

In [110]:
# Training History
from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials
import pandas as pd


gc = gspread.authorize(GoogleCredentials.get_application_default())
ws = gc.open_by_url('https://docs.google.com/spreadsheets/d/1qbX7Fyv--JskTAWKnpWeQQJRQKzpM8L7sOUxbr6SPtc/edit#gid=1946289387').sheet1

# Existing rows of the training-history sheet (named so the builtin `all`
# is not shadowed).
history_rows = ws.get_all_records()
run_timestamp = ts.strftime("%Y/%m/%d %H:%M:%S")
# An empty sheet has no previous timestamp to compare against.
last_ts = history_rows[-1]['Timestamp'] if history_rows else None
ws.resize(len(history_rows)+1)
if last_ts != run_timestamp:
    ws.append_row([
                  run_timestamp,
                  str(AMP_IDS),
                  str(CLIP_IDS),
                  str(CONV_FILTERS),
                  str(CONV_KERNELS),
                  str(CONV_STRIDES),
                  str(LATENT_SPACE_DIM),
                  str(LEARNING_RATE),
                  str(BATCH_SIZE),
                  str(EPOCHS),
                  str(int(min(losses))),
                  str(int(min(losses[:250]))), # temporary
                  str(losses)
    ])
else:
    print("Already added to Training history!")

# Write the run configuration next to the saved model. The notebook-level
# constants AMP_IDS / CLIP_IDS are used here: `ampids` / `clipids` only exist
# as parameters of load_music_ds and are undefined at this scope.
with open(os.path.join(MUSICAE_SAVE_PATH, "model " + tsf, 'meta.csv'), 'w') as f:
    f.write("\nAmp IDs, " + str(AMP_IDS))
    f.write("\nClip IDs, " + str(CLIP_IDS))
    f.write("\nBatch Size, " + str(BATCH_SIZE))
    f.write("\nEpochs, " + str(EPOCHS))
    f.write("\nLatent space dimension, " + str(LATENT_SPACE_DIM))
    f.write("\nLearning rate, " + str(LEARNING_RATE))
    f.write("\nTimestamp, " + run_timestamp)
print("Metadata Saved!!")



Already added to Training history!
Metadata Saved!!


### Generate the embeddings (Independently executable)

In [111]:
# # https://github.com/musikalkemist/generating-sound-with-neural-networks/blob/49d7db32c43d1a04c596cbbb282a9521be1e7fc8/11%20Implementing%20VAE/code/analysis.py

import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display

# TODO before running this cell independently
# 1. Update the MODEL_NAME
# 2. Run the "Load Data" cell

# UPDATE THE MODEL!!!
MODEL_NAME = "model- 2021-09-30-16-01-23"
SPECTROGRAMS_PATH = "/content/drive/MyDrive/Music/VAE/ICASSP/spectrogram"
MODEL_PATH = "/content/drive/MyDrive/Music/VAE/ICASSP/music-encoder/" + MODEL_NAME

# Comes from the Load Data cell
# Unique DI files first, then every amp file. The DI names are sorted because
# plain set iteration order is not stable between kernel sessions, which would
# make the row order of the exported embeddings differ from run to run.
dsfilenames = sorted(set(yfiles)) + xfiles

# Not tested!!
def plot_reconstructed_images(images, reconstructed_images):
    """Draw each image and its reconstruction as two spectrogram figures.

    Args:
        images: Iterable of arrays; each is squeezed before plotting.
        reconstructed_images: Iterable of arrays paired with ``images``
            position by position.
    """
    for image, reconstructed_image in zip(images, reconstructed_images):

        fig, ax = plt.subplots()
        image = image.squeeze()
        img = librosa.display.specshow(image, y_axis='log', x_axis='time', ax=ax)
        ax.set_title("Original")
        fig.colorbar(img, ax=ax, format="%+2.0f dB")

        fig, ax = plt.subplots()
        reconstructed_image = reconstructed_image.squeeze()
        recon_img = librosa.display.specshow(reconstructed_image, y_axis='log', x_axis='time', ax=ax)
        ax.set_title("Reconstructed")
        fig.colorbar(recon_img, ax=ax, format="%+2.0f dB")
    # Show every figure created above in one go.
    plt.show()

# Not tested!!
def plot_images_encoded_in_latent_space(latent_representations, sample_labels):
    """Scatter columns 0 and 1 of the latent vectors, coloured by label."""
    plt.figure(figsize=(10, 10))
    scatter_options = {
        "cmap": "rainbow",
        "c": sample_labels,
        "alpha": 0.5,
        "s": 2,
    }
    first_axis = latent_representations[:, 0]
    second_axis = latent_representations[:, 1]
    plt.scatter(first_axis, second_axis, **scatter_options)
    plt.colorbar()
    plt.show()

def save_embeddings(musicae, dsfilenames, download_path, spectrograms_path=None):
    """Encode the listed spectrogram files and export the latent vectors.

    Writes two files into ``download_path``: ``embeddings.tsv`` (one
    tab-separated row per file) and ``embedding-filenames.tsv`` (the file
    names, one per line, in the same order). Both files are overwritten.

    Args:
        musicae: Object whose ``encoder.predict`` maps the stacked arrays to
            latent vectors.
        dsfilenames: File names to load, relative to ``spectrograms_path``.
        download_path: Existing folder that receives the two output files.
        spectrograms_path: Folder containing the files; defaults to the
            notebook-level ``SPECTROGRAMS_PATH`` when omitted.

    Returns:
        The value returned by ``musicae.encoder.predict``.
    """
    # Imported here so the cell does not depend on the autoencoder cell
    # having imported csv.
    import csv

    if spectrograms_path is None:
        spectrograms_path = SPECTROGRAMS_PATH

    # Stack all arrays, adding a trailing axis of size 1 to each.
    dsspectrogram = np.array([
        np.load(os.path.join(spectrograms_path, filename))[..., np.newaxis]
        for filename in dsfilenames
    ])
    latent_representations = musicae.encoder.predict(dsspectrogram)

    with open(os.path.join(download_path, 'embeddings.tsv'), 'w', newline='') as f_output:
        tsv_output = csv.writer(f_output, delimiter='\t')
        tsv_output.writerows(latent_representations)
    # Write corresponding filenames ('w' truncates any previous content)
    with open(os.path.join(download_path, 'embedding-filenames.tsv'), 'w') as f_output:
        for data in dsfilenames:
            f_output.write(data)
            f_output.write('\n')
    print("Embeddings saved!!")
    return latent_representations

## Driver code

# When this cell runs without the training cells, `musicae` does not exist
# yet; check the namespace first so the saved model is loaded instead of the
# cell failing with a NameError.
if "musicae" not in globals() or musicae is None:
    musicae = VAE.load(MODEL_PATH)

save_embeddings(musicae, dsfilenames, MODEL_PATH)

# reconstructed_images, _ = autoencoder.reconstruct(np.array(list(dataset.values())))
# plot_reconstructed_images(sample_images, reconstructed_images)



Embeddings saved!!


array([[-0.94976586,  3.589117  , -5.6696005 , ..., -2.1024005 ,
        -0.52547985,  0.53528875],
       [-0.39703095,  0.09528753, -1.1663307 , ...,  3.14496   ,
        -3.7484055 , -0.44704148],
       [-3.3417277 ,  4.7321954 ,  1.7879758 , ..., -2.5602322 ,
         0.777218  ,  2.006522  ],
       ...,
       [ 0.72897494,  4.4406204 ,  3.682107  , ...,  3.967624  ,
         7.013556  ,  1.5005401 ],
       [-3.957868  , -0.02090748, -1.6777387 , ...,  6.1119556 ,
         2.5452788 ,  6.7693944 ],
       [-4.885683  ,  2.5795383 , -1.1673104 , ...,  8.482125  ,
         2.1263878 ,  3.0961082 ]], dtype=float32)