In [1]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
import numpy as np
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
from pydub import AudioSegment
import soundfile as sf

In [2]:
# Ask TensorFlow which GPU devices it can see; an empty list means that
# everything below will run on the CPU.
physical_devices = tf.config.list_physical_devices(device_type='GPU')
print(physical_devices)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Prepare the audio files
Run the constant-Q transform (CQT) over the audio files and save the results as NumPy arrays.

### Make all the clips 3 seconds long


In [None]:
# Scale clips to be about 3 seconds
target_duration = 3

# Source and destination directories
source = "./samples/"
scaled_dir = "scaled_samples"

# sf.write does not create missing directories, so make sure the destination
# exists before the loop starts.
os.makedirs(scaled_dir, exist_ok=True)

# Time-stretch every clip in the source directory to the target duration
for i, sample in enumerate(os.listdir(source)):
    sample_path = source + sample

    # Sub-directories cannot be decoded (or removed with os.remove); skip them.
    if not os.path.isfile(sample_path):
        continue

    try:
        # Decoding is the only step whose failure justifies discarding the
        # file, so nothing else is placed inside this try block.
        song = AudioSegment.from_mp3(sample_path)
        sample_dur = song.duration_seconds
        y, sr = librosa.load(sample_path)
    except Exception as err:
        # `except Exception` (not a bare except) so that Ctrl-C does not end
        # up deleting the clip that happened to be processed at that moment.
        print(f"Removing unreadable sample {sample_path}: {err!r}")
        os.remove(sample_path)
        continue

    if sample_dur <= 0 or len(y) == 0:
        # An empty clip cannot be stretched to a non-zero duration.
        print(f"Removing empty sample {sample_path}")
        os.remove(sample_path)
        continue

    # rate = original / target: a rate above 1 shortens the clip, below 1
    # lengthens it, so the result lasts roughly target_duration seconds.
    y_adjusted = librosa.effects.time_stretch(y, rate=sample_dur / target_duration)

    # Write out audio as 24bit PCM WAV. Errors here propagate instead of
    # deleting the source file.
    sf.write(f'{scaled_dir}/{i}.wav', y_adjusted, sr, subtype='PCM_24')

### Convert the sounds to tensors and save

In [None]:
def convert_audio_to_complex_array(filename, outfilename=None, overwrite=False):
    """Save the constant-Q transform (CQT) magnitude of an audio file as a .npy array.

    Despite the function's name, the saved array holds the *magnitude* of the
    CQT (``np.abs``), not the complex coefficients.

    Arguments:
    filename -- path to the audio file to convert
    outfilename -- path of the output array; defaults to ``filename`` with its
                   extension replaced by ".npy". ".npy" is appended when
                   missing, which is what ``np.save`` would do anyway.
    overwrite -- when False (the default) and the output file already exists,
                 the function returns without loading or converting anything

    Returns -- None
    """
    if outfilename is None:
        # splitext copes with extensions of any length, unlike filename[:-4].
        outfilename = os.path.splitext(filename)[0] + ".npy"
    else:
        outfilename = os.fspath(outfilename)
        if not outfilename.endswith(".npy"):
            # Normalise up front so the existence check below looks at the
            # same path that np.save will actually write.
            outfilename += ".npy"

    if not overwrite and os.path.exists(outfilename):
        return

    audio_data, sr = librosa.load(filename)

    # Get the CQT magnitude, 7 octaves at 36 bins per octave (252 bins).
    # NOTE THERE IS A MIN FREQ SETTING fmin=librosa.note_to_hz('C2'),
    # which is left at librosa's default here.
    C = np.abs(librosa.cqt(y=audio_data, sr=sr, bins_per_octave=36, n_bins=7*36))

    print(np.shape(C))
    np.save(outfilename, C)

In [None]:
# Convert all the audio files to numpy arrays and save.
# List the directory once so the progress total matches what is iterated, and
# leave out .npy files: the converter writes its output next to the input, so
# on a re-run the arrays from the previous run would otherwise be handed to
# the audio loader.
audio_items = sorted(
    item for item in os.listdir("samples/") if not item.endswith(".npy")
)
total = len(audio_items)

for i, item in enumerate(audio_items):
    convert_audio_to_complex_array("samples/" + item)
    print(f"{i+1} out of {total}")

In [None]:
# Remove the source audio (mp3 / wav) from samples/; other files are left alone.
audio_suffixes = (".mp3", ".wav")
for item in os.listdir("samples/"):
    if not item.endswith(audio_suffixes):
        continue
    os.remove("samples/" + item)

# Load all the files into a single tensor

In [None]:
# Create the mega tensor: one (target_height, target_len, 1) slot per sample.

target_len = 128
target_height = 256

# List the directory once (sorted, so sample order is reproducible) and keep
# only the saved arrays; anything else would make np.load fail.
sample_names = sorted(
    name for name in os.listdir("samples/") if name.endswith(".npy")
)
target_samples = len(sample_names)

mega_tensor = np.zeros([target_samples, target_height, target_len, 1], dtype = np.float32)

# Add every sample to the mega tensor
for i, name in enumerate(sample_names):
    item = np.load("samples/" + name)

    # Copy the top-left corner of the array in a single slice assignment.
    # Arrays larger than the target are cropped; where an array is smaller
    # (e.g. fewer rows than target_height) the remainder of the slot keeps
    # the zeros it was initialised with.
    rows = min(item.shape[0], target_height)
    cols = min(item.shape[1], target_len)
    mega_tensor[i, :rows, :cols, 0] = item[:rows, :cols]


# Then save the mega tensor
np.save("data.npy", mega_tensor)

# Define the model

In [3]:
## KERAS MODELS
latent_dim = 128

# Generator: a latent vector is projected to a 32x16x128 feature map, then
# three stride-2 transposed convolutions double both spatial dimensions each
# time (32x16 -> 256x128). A final convolution with tanh activation reduces
# the result to a single channel.
generator_layers = [
    keras.Input(shape=(latent_dim,)),
    layers.Dense(32 * 16 * 128),
    layers.Reshape((32, 16, 128)),
]
for n_filters in (128, 256, 512):
    generator_layers.append(
        layers.Conv2DTranspose(n_filters, kernel_size=4, strides=2, padding="same")
    )
    generator_layers.append(layers.LeakyReLU(alpha=0.2))
generator_layers.append(
    layers.Conv2D(1, kernel_size=5, padding="same", activation="tanh")
)

generator = keras.Sequential(generator_layers, name="generator")
generator.summary()

Model: "generator"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 65536)             8454144   
                                                                 
 reshape (Reshape)           (None, 32, 16, 128)       0         
                                                                 
 conv2d_transpose (Conv2DTra  (None, 64, 32, 128)      262272    
 nspose)                                                         
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 64, 32, 128)       0         
                                                                 
 conv2d_transpose_1 (Conv2DT  (None, 128, 64, 256)     524544    
 ranspose)                                                       
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 128, 64, 256)      0 

In [4]:
# Discriminator: three stride-2 convolutions halve both spatial dimensions
# each time (256x128 -> 32x16), followed by dropout and a single sigmoid unit.
discriminator_layers = [keras.Input(shape=(256, 128, 1))]
for n_filters in (64, 128, 128):
    discriminator_layers.append(
        layers.Conv2D(n_filters, kernel_size=4, strides=2, padding="same")
    )
    discriminator_layers.append(layers.LeakyReLU(alpha=0.2))
discriminator_layers.extend(
    [
        layers.Flatten(),
        layers.Dropout(0.2),
        layers.Dense(1, activation="sigmoid"),
    ]
)

discriminator = keras.Sequential(discriminator_layers, name="discriminator")
discriminator.summary()

Model: "discriminator"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_1 (Conv2D)           (None, 128, 64, 64)       1088      
                                                                 
 leaky_re_lu_3 (LeakyReLU)   (None, 128, 64, 64)       0         
                                                                 
 conv2d_2 (Conv2D)           (None, 64, 32, 128)       131200    
                                                                 
 leaky_re_lu_4 (LeakyReLU)   (None, 64, 32, 128)       0         
                                                                 
 conv2d_3 (Conv2D)           (None, 32, 16, 128)       262272    
                                                                 
 leaky_re_lu_5 (LeakyReLU)   (None, 32, 16, 128)       0         
                                                                 
 flatten (Flatten)           (None, 65536)           

# Create the Dataset

In [5]:
# Load the pre-computed sample tensor saved earlier as data.npy
mega_tensor =  np.load("data.npy")

train_ds = tf.data.Dataset.from_tensor_slices(mega_tensor)

# Shuffle with a buffer covering the whole dataset and reshuffle every epoch,
# so the batches are not the same fixed groups of samples in the same order
# each time. buffer_size must be positive, hence the max() guard.
dataset = train_ds.shuffle(
    buffer_size=max(len(mega_tensor), 1), reshuffle_each_iteration=True
).batch(4)


# Training
https://towardsdatascience.com/generative-adversarial-network-gan-for-dummies-a-step-by-step-tutorial-fdefff170391

1. Select a number of real images from the training set.
2. Generate a number of fake images. This is done by sampling random noise vectors and creating images from them using the generator
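3. Train the discriminator for one or more epochs using both fake and real images. This will update only the discriminator's weights, by labeling all the real images as 1 and the fake images as 0. (Note: the `GAN.train_step` code below uses the opposite but equivalent convention — fake images are labeled 1 and real images 0.)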
4. Generate another number of fake images
5. Train the full GAN model for one or more epochs using only fake images. This will update only the generator's weights by labeling all fake images as 1. 

**SOURCE**: Link above

In [6]:
class GAN(keras.Model):
    """Generator/discriminator pair trained adversarially through ``fit``.

    Label convention used in ``train_step``: generated (fake) images are
    labeled 1 and real images are labeled 0.
    """

    def __init__(self, discriminator, generator, latent_dim):
        """Store the two sub-models and the size of the latent vector.

        Arguments:
        discriminator -- model mapping an image batch to one score per image
        generator -- model mapping latent vectors to an image batch
        latent_dim -- length of the random vectors fed to the generator
        """
        super().__init__()
        self.discriminator = discriminator
        self.generator = generator
        self.latent_dim = latent_dim

    def compile(self, d_optimizer, g_optimizer, loss_fn):
        """Attach one optimizer per sub-model, the loss, and the loss trackers."""
        super().compile()
        self.d_optimizer = d_optimizer
        self.g_optimizer = g_optimizer
        self.loss_fn = loss_fn
        self.d_loss_metric = keras.metrics.Mean(name="d_loss")
        self.g_loss_metric = keras.metrics.Mean(name="g_loss")

    @property
    def metrics(self):
        """The running-mean trackers for the discriminator and generator losses."""
        return [self.d_loss_metric, self.g_loss_metric]

    def train_step(self, real_images):
        """Run one discriminator update followed by one generator update.

        Arguments:
        real_images -- one batch of real samples

        Returns -- dict with the running means of "d_loss" and "g_loss"
        """
        # Sample random points in the latent space
        batch_size = tf.shape(real_images)[0]
        random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))

        # Decode them to fake images.
        # training=True is passed explicitly on every sub-model call in this
        # method: train_step is not reached through this model's __call__, so
        # no training flag is inherited, and layers such as the
        # discriminator's Dropout would otherwise run in inference mode.
        generated_images = self.generator(random_latent_vectors, training=True)

        # Combine them with real images (fakes first, then reals)
        combined_images = tf.concat([generated_images, real_images], axis=0)

        # Assemble labels discriminating real from fake images:
        # ones for the generated half, zeros for the real half.
        labels = tf.concat(
            [tf.ones((batch_size, 1)), tf.zeros((batch_size, 1))], axis=0
        )
        # Add random noise to the labels - important trick!
        labels += 0.05 * tf.random.uniform(tf.shape(labels))

        # Train the discriminator
        with tf.GradientTape() as tape:
            predictions = self.discriminator(combined_images, training=True)
            d_loss = self.loss_fn(labels, predictions)
        grads = tape.gradient(d_loss, self.discriminator.trainable_weights)
        self.d_optimizer.apply_gradients(
            zip(grads, self.discriminator.trainable_weights)
        )

        # Sample random points in the latent space
        random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))

        # Assemble labels that say "all real images" (real == 0 here)
        misleading_labels = tf.zeros((batch_size, 1))

        # Train the generator (note that we should *not* update the weights
        # of the discriminator)! Only the generator's weights are handed to
        # the optimizer below.
        with tf.GradientTape() as tape:
            predictions = self.discriminator(
                self.generator(random_latent_vectors, training=True), training=True
            )
            g_loss = self.loss_fn(misleading_labels, predictions)
        grads = tape.gradient(g_loss, self.generator.trainable_weights)
        self.g_optimizer.apply_gradients(zip(grads, self.generator.trainable_weights))

        # Update metrics
        self.d_loss_metric.update_state(d_loss)
        self.g_loss_metric.update_state(g_loss)
        return {
            "d_loss": self.d_loss_metric.result(),
            "g_loss": self.g_loss_metric.result(),
        }

In [7]:
class GANMonitor(keras.callbacks.Callback):
    """After every epoch, save a few generated samples as arrays and spectrogram images.

    Arrays go to ``output-arrays/`` and images to ``output-specs/``; both
    directories are created when missing.
    """

    def __init__(self, num_img=3, latent_dim=256):
        """
        Arguments:
        num_img -- number of samples to generate after each epoch
        latent_dim -- length of the latent vectors; must match the input size
                      of the generator being monitored
        """
        super().__init__()
        self.num_img = num_img
        self.latent_dim = latent_dim

    def on_epoch_end(self, epoch, logs=None):
        """Generate ``num_img`` samples and write each one to disk.

        Arguments:
        epoch -- zero-based epoch index; file names use ``epoch + 1``
        logs -- unused
        """
        # np.save and savefig do not create directories.
        os.makedirs("output-arrays", exist_ok=True)
        os.makedirs("output-specs", exist_ok=True)

        random_latent_vectors = tf.random.normal(shape=(self.num_img, self.latent_dim))
        # Convert to numpy once and keep the result.
        generated_images = self.model.generator(random_latent_vectors).numpy()
        for i in range(self.num_img):

            # Drop the trailing channel axis to get a 2-D array.
            sample = generated_images[i, ..., 0]

            # Save the numpy array
            np.save(f"output-arrays/epoch_{epoch+1}_sample_{i}.npy", sample)

            # Save a spectrogram. bins_per_octave matches the 36 bins per
            # octave used when the CQT was computed, so the note axis labels
            # line up with the rows.
            fig, ax = plt.subplots()
            img = librosa.display.specshow(
                librosa.amplitude_to_db(sample),
                x_axis='time',
                y_axis='cqt_note',
                bins_per_octave=36,
                ax=ax,
            )
            ax.set_title('Constant-Q power spectrum')
            fig.colorbar(img, ax=ax, format="%+2.0f dB")
            fig.savefig(f"output-specs/epoch_{epoch+1}_sample_{i}.png")
            # Close this specific figure so figures do not pile up over epochs.
            plt.close(fig)


In [8]:
# Number of passes over the dataset
epochs = 100

# latent_dim is the constant the generator was built with; reusing it here
# keeps the sampler in GAN.train_step and in GANMonitor in step with the
# generator's input size.
gan = GAN(discriminator=discriminator, generator=generator, latent_dim=latent_dim)
gan.compile(
    d_optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    g_optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss_fn=keras.losses.BinaryCrossentropy(),
)

gan.fit(
    dataset, epochs=epochs, callbacks=[GANMonitor(num_img=2, latent_dim=latent_dim)]
)

  42/1285 [..............................] - ETA: 3:19 - d_loss: 0.6803 - g_loss: 0.7263

KeyboardInterrupt: 