GAN implementation based on the model from https://www.kaggle.com/code/mrhippo/audio-generation-with-simple-gans/notebook#Bulding-GANs-Model

In [None]:
# data science and mathematical operations
import numpy as np
import pandas as pd 
from tqdm import tqdm
import matplotlib.pyplot as plt
from scipy.io import wavfile
import librosa
import IPython
import IPython.display as ipd 
import glob
import random

# deep learning
from keras.layers import Dense, Dropout, Input, ReLU
from keras.models import Model, Sequential
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from keras import models

import os

In [None]:
# Audio configuration.
# SAMPLE_RATE is the rate every file is resampled to when it is loaded;
# AUDIO_SHAPE is the fixed number of samples each training clip is cropped
# or padded to (DURATION seconds' worth at SAMPLE_RATE).
SAMPLE_RATE = 16000
DURATION = 4
AUDIO_SHAPE = DURATION * SAMPLE_RATE

#MFCC = 40

# Paths
#DATASET_PATH = "/ccdata/*/*.wav"

# Load 
def _fit_to_length(audio, input_length):
    """Randomly crop or zero-pad a 1-D signal to exactly ``input_length`` samples."""
    surplus = len(audio) - input_length
    if surplus > 0:
        # Too long: keep one random window. Every start in [0, surplus] is
        # valid, so the upper bound passed to randint is surplus + 1.
        start = np.random.randint(surplus + 1)
        return audio[start:start + input_length]
    # Too short (or an exact fit): drop the clip at a random position inside
    # a zero-filled frame. Every left pad in [0, deficit] is valid.
    deficit = -surplus
    left = np.random.randint(deficit + 1)
    return np.pad(audio, (left, deficit - left), "constant")


def load_train_data(input_length=AUDIO_SHAPE,
                    pattern="/Users/ryanzrymiak/Downloads/ccdata/*/*.wav"):
    """Load every audio file matching ``pattern`` into one fixed-width array.

    Each file is read with librosa at ``SAMPLE_RATE`` and then randomly
    cropped (if longer) or zero-padded at a random offset (if shorter) so
    that it is exactly ``input_length`` samples long.

    Args:
        input_length: Number of samples per returned row.
        pattern: Glob pattern selecting the audio files to load. Defaults to
            the dataset location this notebook was written against.

    Returns:
        A float array of shape ``(number_of_files, input_length)``, one row
        per file in the order ``glob`` returned them. The array has zero rows
        when nothing matches ``pattern``.
    """
    # Resolve the file list once so the array size and the rows written into
    # it always come from the same listing.
    files = glob.glob(pattern)
    print(files)
    X = np.empty((len(files), input_length))
    for row, file_path in enumerate(files):
        # Read and resample the audio.
        audio, _ = librosa.core.load(file_path, sr=SAMPLE_RATE)
        X[row] = _fit_to_length(audio, input_length)
    print("Data loading complete")
    return X

# Standardize Data
def normalization(sample):
    """Standardize ``sample`` to zero mean and unit standard deviation.

    The mean and standard deviation are taken over every element of
    ``sample`` at once (no axis is given), so a 2-D dataset is standardized
    as a whole rather than row by row.

    Args:
        sample: NumPy array of audio data.

    Returns:
        A new array ``(sample - mean) / std``. If ``sample`` is constant
        (standard deviation of 0) the centered, all-zero array is returned
        instead of dividing by zero.
    """
    centered = sample - sample.mean()
    spread = sample.std()
    # A zero spread would turn every element into NaN (0 / 0), which then
    # propagates silently through everything downstream.
    sample = centered if spread == 0 else centered / spread
    print("Normalization complete\n")
    return sample

# Rescale data to the range [-1, 1] by dividing by the peak absolute value
def rescale(sample):
    """Scale ``sample`` in place so that its peak magnitude along axis 0 is 1.

    The divisor is ``max(abs(sample))`` taken along axis 0. For a 1-D array
    that is a single peak value; for a 2-D array it is one peak per column.

    NOTE(review): with the ``(clips, samples)`` array built by the loader,
    axis 0 scales each sample position across clips, not each clip by its own
    peak - confirm this is the intended behaviour.

    Args:
        sample: Float NumPy array. It is modified in place.

    Returns:
        The same array object, with values in ``[-1, 1]``. Positions whose
        peak is 0 are left at 0 instead of becoming NaN.
    """
    peak = np.max(np.abs(sample), axis=0)
    # A zero peak means every value it covers is already 0; skipping those
    # positions avoids the 0 / 0 = NaN the plain division would produce.
    np.divide(sample, peak, out=sample, where=peak != 0)
    print("Rescaling complete\n")
    return sample

In [None]:
# Load every clip at AUDIO_SHAPE samples, then standardize the result using
# the mean and standard deviation of the whole array (not per clip).
train_data = normalization(load_train_data(AUDIO_SHAPE))

In [None]:
# rescale() divides its argument in place and returns it, so x_train and
# train_data refer to the same (now rescaled) array after this line.
x_train = rescale(train_data)

In [None]:
def create_generator(latent_dim=100, output_dim=AUDIO_SHAPE):
    """Build the generator: a dense network mapping a noise vector to audio.

    Args:
        latent_dim: Length of the input noise vector.
        output_dim: Number of audio samples produced per example. Defaults to
            AUDIO_SHAPE so the output width follows the audio configuration
            instead of a separately hard-coded number.

    Returns:
        A compiled Keras ``Sequential`` model with four ReLU hidden layers
        (512, 512, 1024, 1024 units) and a linear output layer of
        ``output_dim`` units.
    """
    generator = Sequential([
        Dense(units=512, input_dim=latent_dim),
        ReLU(),
        Dense(units=512),
        ReLU(),
        Dense(units=1024),
        ReLU(),
        Dense(units=1024),
        ReLU(),
        # No activation: the output layer is linear.
        Dense(units=output_dim),
    ])

    # NOTE(review): in this file the generator is only ever trained through
    # the stacked model built by create_gan, which compiles with its own
    # optimizer - confirm whether these settings are meant to take effect.
    generator.compile(loss="binary_crossentropy",
                      optimizer=Adam(0.0001, 0.5))

    return generator

#g = create_generator()
#g.summary()

In [None]:
def create_discriminator(input_dim=AUDIO_SHAPE):
    """Build the discriminator: a dense binary classifier over raw audio.

    Args:
        input_dim: Number of audio samples per example. Defaults to
            AUDIO_SHAPE so the input width follows the audio configuration
            instead of a separately hard-coded number.

    Returns:
        A compiled Keras ``Sequential`` model with three ReLU hidden layers
        (1024, 512, 256 units), dropout of 0.4 after the first two, and a
        single sigmoid output unit, trained with binary cross-entropy.
    """
    discriminator = Sequential([
        Dense(units=1024, input_dim=input_dim),
        ReLU(),
        Dropout(0.4),
        Dense(units=512),
        ReLU(),
        Dropout(0.4),
        Dense(units=256),
        ReLU(),
        # Single sigmoid unit; the training loop labels real audio 1 and
        # generated audio 0.
        Dense(units=1, activation="sigmoid"),
    ])

    discriminator.compile(loss="binary_crossentropy",
                          optimizer=Adam(0.0001, 0.5))
    return discriminator

#d = create_discriminator()
#d.summary()

In [None]:
def create_gan(discriminator, generator):
    """Stack generator and discriminator into the model that trains the generator.

    Args:
        discriminator: Discriminator model. Its ``trainable`` attribute is
            set to ``False`` here, so it should already be compiled for its
            own training before being passed in.
        generator: Generator model mapping noise vectors to audio.

    Returns:
        A compiled Keras ``Model`` taking a noise vector and returning the
        discriminator's output for the generated audio.
    """
    # Freeze the discriminator inside the stacked model so that training this
    # model is meant to update only the generator.
    discriminator.trainable = False

    # Take the noise width from the generator itself instead of repeating it.
    latent_dim = generator.input_shape[-1]
    gan_input = Input(shape=(latent_dim,))
    gan_output = discriminator(generator(gan_input))
    gan = Model(inputs=gan_input, outputs=gan_output)

    # NOTE(review): "adam" selects the optimizer with its default settings,
    # not the Adam(0.0001, 0.5) used when compiling the generator and
    # discriminator - confirm the mismatch is intended.
    gan.compile(loss="binary_crossentropy", optimizer="adam")
    return gan

#gan = create_gan(d,g)
#gan.summary()

In [None]:
def show_gen_samples(epochs, samples=3, generator=None):
    """Generate a few clips and display them as playable audio widgets.

    Args:
        epochs: Epoch index the samples belong to. Accepted so existing
            calls keep working; it is not used.
        samples: Number of clips to generate and display.
        generator: Model to sample from. Defaults to the module-level ``g``.
    """
    if generator is None:
        generator = g
    # Draw fresh noise here instead of relying on a ``noise`` variable that
    # happens to exist at module level while the training loop is running.
    latent_dim = generator.input_shape[-1]
    noise = np.random.normal(0, 1, (samples, latent_dim))
    audios = generator.predict(noise)
    for audio in audios:
        ipd.display(ipd.Audio(data=audio, rate=SAMPLE_RATE))

In [None]:
#import time

# Loss histories, one entry per training step.
D_loss = []
G_loss = []
epochs = 40
batch_size = 128
# Width of the noise vector; must match the generator's input size.
latent_dim = 100

g = create_generator()
g.summary()

d = create_discriminator()
d.summary()

gan = create_gan(d, g)
gan.summary()

# Discriminator targets for a stacked batch: the first half is real audio
# (label 1), the second half is generated audio (label 0).
y_dis = np.zeros(batch_size * 2)
y_dis[:batch_size] = 1
# Generator targets: generated audio is presented as real (label 1).
y_gen = np.ones(batch_size)

# NOTE: each "epoch" here is a single batch update for each model.
for e in range(epochs):
    # Discriminator step on a random real batch plus freshly generated audio.
    noise = np.random.normal(0, 1, [batch_size, latent_dim])
    generated_audio = g.predict(noise)

    real_idx = np.random.randint(low=0, high=x_train.shape[0], size=batch_size)
    audio_batch = x_train[real_idx]

    x = np.concatenate([audio_batch, generated_audio])

    d.trainable = True
    d_loss = d.train_on_batch(x, y_dis)

    # Generator step through the stacked model.
    noise = np.random.normal(0, 1, [batch_size, latent_dim])
    d.trainable = False
    g_loss = gan.train_on_batch(noise, y_gen)

    D_loss.append(d_loss)
    G_loss.append(g_loss)

    # Listen to a few generated samples every 10 steps.
    if e % 10 == 0:
        show_gen_samples(e)