In [None]:
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers

In [None]:
# List the GPU devices TensorFlow has detected
physical_devices = tf.config.list_physical_devices("GPU")
print(physical_devices)


# Prepare the audio files
Run the short-time Fourier transform over the audio files and save the results as NumPy `.npy` arrays.

In [None]:
def convert_audio_to_complex_array(filename, outfilename=None, overwrite=False):
    """Compute the short-time Fourier transform of an audio file and save it.

    The audio is loaded with librosa at a 4096 Hz sampling rate and transformed
    with a 256-sample FFT/window and ``center=False``. The complex result is
    written with ``np.save``.

    Arguments:
    filename -- filepath to the audio file that you want to convert to an array
    outfilename -- filepath to the output array; defaults to ``filename`` with
        its extension replaced by ".npy"
    overwrite -- whether to overwrite if a file already exists with the given
        outfilename; when False the existing file is kept and nothing is computed

    Returns -- None
    """
    if outfilename is None:
        # splitext copes with extensions of any length (".mp3", ".flac", ...)
        outfilename = os.path.splitext(filename)[0] + ".npy"
    elif not outfilename.endswith(".npy"):
        # np.save appends ".npy" itself; normalise here so the existence
        # check below looks at the file that would really be written
        outfilename = outfilename + ".npy"

    if not overwrite and os.path.exists(outfilename):
        return None

    # sr == sampling rate
    audio_data, _sr = librosa.load(filename, sr=4096)

    vertical_res = 256

    # Apply the short time Fourier transform
    result = librosa.stft(audio_data, center=False, n_fft=vertical_res, win_length=vertical_res)
    np.save(outfilename, result)
    return None

In [None]:
# Convert every audio file in samples/ to a saved STFT array.
# Files that are already .npy arrays are skipped so that re-running this cell
# never hands a saved array to the audio loader.
for item in sorted(os.listdir("samples/")):
    if item.endswith(".npy"):
        continue
    convert_audio_to_complex_array("samples/" + item)

In [None]:
# Remove every mp3 file from samples/
mp3_names = [entry for entry in os.listdir("samples/") if entry.endswith(".mp3")]
for entry in mp3_names:
    os.remove("samples/" + entry)

# Load all the files into a single tensor

In [None]:
# Create the mega tensor: one (height, len, 2) slab per sample, where the last
# axis holds the real and imaginary parts of the spectrogram.

# List the directory once so that the sample count, the reference shape and the
# fill loop all see the same files in the same (sorted) order. Anything that is
# not a saved .npy array is ignored.
sample_names = sorted(n for n in os.listdir("samples/") if n.endswith(".npy"))
if not sample_names:
    raise FileNotFoundError("no .npy spectrograms found in samples/")

target_len = 312
# The last frequency row of each spectrogram is dropped
target_height = len(np.load("samples/" + sample_names[0])) - 1
target_samples = len(sample_names)
channels = 2

print(target_height)

mega_tensor = np.zeros([target_samples, target_height, target_len, channels], dtype=np.float32)

# Add every sample to the mega tensor. Shorter spectrograms stay zero-padded,
# longer ones are cropped to target_len columns.
for i, name in enumerate(sample_names):
    item = np.load("samples/" + name)
    rows = max(0, min(item.shape[0] - 1, target_height))
    cols = min(item.shape[1], target_len)
    mega_tensor[i, :rows, :cols, 0] = item[:rows, :cols].real
    mega_tensor[i, :rows, :cols, 1] = item[:rows, :cols].imag

# Then save the mega tensor
np.save("data.npy", mega_tensor)

# Define the model

In [None]:
## KERAS MODELS
latent_dim = 128

# Dense projection reshaped to a 16x39 map with 128 channels, followed by three
# stride-2 transposed convolutions and a final 2-channel tanh convolution.
generator_layers = [
    keras.Input(shape=(latent_dim,)),
    layers.Dense(16 * 39 * 128),
    layers.Reshape((16, 39, 128)),
]
for n_filters in (128, 256, 512):
    generator_layers.append(
        layers.Conv2DTranspose(n_filters, kernel_size=4, strides=2, padding="same")
    )
    generator_layers.append(layers.LeakyReLU(alpha=0.2))
generator_layers.append(
    layers.Conv2D(2, kernel_size=5, padding="same", activation="tanh")
)

generator = keras.Sequential(generator_layers, name="generator")
generator.summary()

In [None]:
# Three stride-2 convolutions over a (128, 312, 2) input, then a dropout-
# regularised dense layer that emits a single sigmoid score.
discriminator_layers = [keras.Input(shape=(128, 312, 2))]
for n_filters in (64, 128, 128):
    discriminator_layers.append(
        layers.Conv2D(n_filters, kernel_size=4, strides=2, padding="same")
    )
    discriminator_layers.append(layers.LeakyReLU(alpha=0.2))
discriminator_layers.extend(
    [
        layers.Flatten(),
        layers.Dropout(0.2),
        layers.Dense(1, activation="sigmoid"),
    ]
)

discriminator = keras.Sequential(discriminator_layers, name="discriminator")
discriminator.summary()

# Create the Dataset

In [None]:
# Load the stacked spectrogram tensor saved as data.npy
mega_tensor = np.load("data.npy")

# Shuffle over the whole dataset (reshuffled every epoch) so the batches are
# not made of the same files in the same order each time, then batch.
train_ds = tf.data.Dataset.from_tensor_slices(mega_tensor)
dataset = train_ds.shuffle(buffer_size=max(len(mega_tensor), 1)).batch(8)


# Training
https://towardsdatascience.com/generative-adversarial-network-gan-for-dummies-a-step-by-step-tutorial-fdefff170391

1. Select a number of real images from the training set.
2. Generate a number of fake images. This is done by sampling random noise vectors and creating images from them using the generator
3. Train the discriminator for one or more epochs using both fake and real images. This will update only the discriminator's weights, by labeling all the real images as 1 and the fake images as 0.
4. Generate another number of fake images
5. Train the full GAN model for one or more epochs using only fake images. This will update only the generator's weights by labeling all fake images as 1. 

**SOURCE**: Link above

In [None]:
class GAN(keras.Model):
    """Keras model that trains a generator and a discriminator against each other.

    Label convention used in train_step: generated samples are labelled 1 and
    real samples 0 when training the discriminator; the generator is trained
    with all-zero targets.
    """

    def __init__(self, discriminator, generator, latent_dim):
        super().__init__()
        self.discriminator = discriminator
        self.generator = generator
        self.latent_dim = latent_dim

    def compile(self, d_optimizer, g_optimizer, loss_fn):
        """Store the two optimizers, the loss function and the running loss metrics."""
        super().compile()
        self.d_optimizer = d_optimizer
        self.g_optimizer = g_optimizer
        self.loss_fn = loss_fn
        self.d_loss_metric = keras.metrics.Mean(name="d_loss")
        self.g_loss_metric = keras.metrics.Mean(name="g_loss")

    @property
    def metrics(self):
        return [self.d_loss_metric, self.g_loss_metric]

    def _sample_latent(self, count):
        """Draw `count` standard-normal latent vectors."""
        return tf.random.normal(shape=(count, self.latent_dim))

    def train_step(self, real_images):
        """Run one discriminator update followed by one generator update."""
        n_real = tf.shape(real_images)[0]

        # --- Discriminator update ---
        # Generate one fake per real sample and stack fakes ahead of reals
        fakes = self.generator(self._sample_latent(n_real))
        disc_inputs = tf.concat([fakes, real_images], axis=0)

        # Targets follow the same order: ones for the fakes, zeros for the reals
        disc_targets = tf.concat(
            [tf.ones((n_real, 1)), tf.zeros((n_real, 1))], axis=0
        )
        # Add random noise to the labels - important trick!
        disc_targets += 0.05 * tf.random.uniform(tf.shape(disc_targets))

        with tf.GradientTape() as disc_tape:
            disc_scores = self.discriminator(disc_inputs)
            d_loss = self.loss_fn(disc_targets, disc_scores)
        disc_grads = disc_tape.gradient(d_loss, self.discriminator.trainable_weights)
        self.d_optimizer.apply_gradients(
            zip(disc_grads, self.discriminator.trainable_weights)
        )

        # --- Generator update ---
        # Fresh latent points, with targets that claim every sample is real (0)
        fresh_latent = self._sample_latent(n_real)
        misleading_labels = tf.zeros((n_real, 1))

        # Only the generator's weights receive gradients here; the
        # discriminator is used for scoring but is not updated.
        with tf.GradientTape() as gen_tape:
            gen_scores = self.discriminator(self.generator(fresh_latent))
            g_loss = self.loss_fn(misleading_labels, gen_scores)
        gen_grads = gen_tape.gradient(g_loss, self.generator.trainable_weights)
        self.g_optimizer.apply_gradients(zip(gen_grads, self.generator.trainable_weights))

        # Update metrics
        self.d_loss_metric.update_state(d_loss)
        self.g_loss_metric.update_state(g_loss)
        return {
            "d_loss": self.d_loss_metric.result(),
            "g_loss": self.g_loss_metric.result(),
        }

In [None]:
class GANMonitor(keras.callbacks.Callback):
    """Callback that saves generator samples at the end of every epoch.

    For each sample the raw array is written to ``output-arrays/`` and a
    grayscale, log-frequency spectrogram image to ``output-specs/``.

    Arguments:
    num_img -- number of samples to generate per epoch
    latent_dim -- length of the latent vectors fed to the generator
    """

    def __init__(self, num_img=3, latent_dim=128):
        super().__init__()
        self.num_img = num_img
        self.latent_dim = latent_dim

    def on_epoch_end(self, epoch, logs=None):
        """Generate `num_img` samples and save each as an array and as an image."""
        # Make sure the output folders exist before anything is written
        os.makedirs("output-arrays", exist_ok=True)
        os.makedirs("output-specs", exist_ok=True)

        random_latent_vectors = tf.random.normal(shape=(self.num_img, self.latent_dim))
        generated_images = self.model.generator(random_latent_vectors)
        # NOTE(review): the x255 scaling is kept so saved arrays stay comparable
        # with earlier runs; confirm whether it is still wanted for spectrograms.
        generated_images *= 255
        generated = generated_images.numpy()

        # img_idx is used only for the sample index, so the array and image
        # filenames of one sample always carry the same number.
        for img_idx in range(self.num_img):
            sample = generated[img_idx]

            # Save the numpy array
            np.save("output-arrays/generated_%03d_%d.npy" % (epoch, img_idx), sample)

            # Rebuild the complex spectrogram: channel 0 is the real part,
            # channel 1 the imaginary part
            des = (sample[..., 0] + 1j * sample[..., 1]).astype(np.complex64)

            # convert the slices to amplitude
            sgram_db = librosa.amplitude_to_db(np.abs(des))

            # Save a spectrogram image whose axes fill the whole figure
            fig, ax = plt.subplots(figsize=(5, 5))
            librosa.display.specshow(sgram_db, sr=4096, x_axis='time', y_axis='log', ax=ax, cmap='gray')
            fig.subplots_adjust(top=1, bottom=0, right=1, left=0)
            ax.margins(0, 0)
            fig.savefig("output-specs/generated_%03d_%d.png" % (epoch, img_idx))
            plt.close(fig)


In [None]:
epochs = 300  # number of passes over the dataset

# Reuse the latent_dim the generator was built with so the GAN, the monitor
# callback and the generator input cannot drift apart.
gan = GAN(discriminator=discriminator, generator=generator, latent_dim=latent_dim)
gan.compile(
    d_optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    g_optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss_fn=keras.losses.BinaryCrossentropy(),
)

gan.fit(
    dataset,
    epochs=epochs,
    callbacks=[GANMonitor(num_img=1, latent_dim=latent_dim)],
)

In [None]:
# Load a saved array whose last axis holds (real, imaginary) pairs
test = np.load("500.npy")

# Convert back to complex numbers in one vectorised step instead of a
# per-element Python loop
des = (test[..., 0] + 1j * test[..., 1]).astype(np.complex64)

In [None]:
# The forward transform used n_fft=256 with center=False (129 frequency rows),
# and the dataset keeps only the first 128 rows. Pad back up to 129 rows with
# zeros and invert with center=False so the framing matches the forward pass.
res = librosa.istft(np.pad(des, ((0, max(0, 129 - des.shape[0])), (0, 0))), center=False)

In [None]:
# We'll need IPython.display's Audio widget
from IPython.display import Audio

# Play back at 4096 Hz, the sampling rate the audio was loaded with
Audio(data=res, rate=4096)

In [None]:
import os

from PIL import Image
import librosa
import librosa.display
import matplotlib.pyplot as plt

    
# Magnitude of the complex spectrogram, expressed in decibels
sgram_db = librosa.amplitude_to_db(np.abs(des))

fig, ax = plt.subplots(figsize=(5, 5))
librosa.display.specshow(sgram_db, sr=4096, x_axis='time', y_axis='log', ax=ax, cmap='gray')

# Let the axes fill the whole figure, with no margins around the data
fig.subplots_adjust(top=1, bottom=0, right=1, left=0)
ax.margins(0, 0)

                    

In [None]:
import os

from PIL import Image
import librosa
import librosa.display
import matplotlib.pyplot as plt

# Load one of the saved spectrograms
prev = np.load("samples/11.npy")

# Magnitude of the complex spectrogram, expressed in decibels
sgram_db = librosa.amplitude_to_db(np.abs(prev))

fig, ax = plt.subplots(figsize=(5, 5))
librosa.display.specshow(sgram_db, sr=4096, x_axis='time', y_axis='log', ax=ax, cmap='gray')

# Let the axes fill the whole figure, with no margins around the data
fig.subplots_adjust(top=1, bottom=0, right=1, left=0)
ax.margins(0, 0)


In [None]:
# Show the dimensions of the complex spectrogram
print(des.shape)

In [None]:
# The forward transform was computed with center=False, so invert with the
# same framing; the default center=True would trim samples from both ends.
prev_sound = librosa.istft(prev, center=False)


# We'll need IPython.display's Audio widget
from IPython.display import Audio

Audio(data=prev_sound, rate=4096)

In [None]:
# Element type of the loaded spectrogram
type(prev[0, 0])

In [None]:
# Load the stacked dataset saved as data.npy
fullset = np.load("data.npy")

In [None]:
# Pick the entry at index 10 of the dataset
sample = fullset[10]

In [None]:
# Dimensions of the selected sample
sample.shape

In [None]:
# Rebuild the complex spectrogram in one vectorised step:
# channel 0 is the real part, channel 1 the imaginary part
des = (sample[..., 0] + 1j * sample[..., 1]).astype(np.complex64)

In [None]:
# The forward transform used n_fft=256 with center=False (129 frequency rows),
# and the dataset keeps only the first 128 rows. Pad back up to 129 rows with
# zeros and invert with center=False so the framing matches the forward pass.
res = librosa.istft(np.pad(des, ((0, max(0, 129 - des.shape[0])), (0, 0))), center=False)

In [None]:
# Audio playback widget from IPython
from IPython.display import Audio

Audio(res, rate=4096)

In [None]:
sample = fullset[30]

# Rebuild the complex spectrogram in one vectorised step:
# channel 0 is the real part, channel 1 the imaginary part
des = (sample[..., 0] + 1j * sample[..., 1]).astype(np.complex64)

# The forward transform used n_fft=256 with center=False (129 frequency rows),
# and the dataset keeps only the first 128 rows. Pad back up to 129 rows with
# zeros and invert with center=False so the framing matches the forward pass.
res = librosa.istft(np.pad(des, ((0, max(0, 129 - des.shape[0])), (0, 0))), center=False)

# convert the slices to amplitude
sgram_db = librosa.amplitude_to_db(np.abs(des))

fig, ax = plt.subplots(figsize=(5, 5))

librosa.display.specshow(sgram_db, sr=4096, x_axis='time', y_axis='log', ax=ax, cmap='gray')

# Let the axes fill the whole figure, with no margins around the data
fig.subplots_adjust(top=1, bottom=0, right=1, left=0)
ax.margins(0, 0)

# We'll need IPython.display's Audio widget
from IPython.display import Audio

Audio(data=res, rate=4096)
