# A project to enhance the apparent quality of a microphone in real time

## loading the useful libraries

In [None]:
#this notebook is able to run both locally and in google colab
#if running in google colab, some additional actions need to be performed
#The variable IN_COLAB tells the code whether to perform those actions
import sys
IN_COLAB = 'google.colab' in sys.modules
IN_COLAB

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [None]:
import librosa
import torchaudio
from IPython.display import Audio #play back the signal (original waveform)
import  IPython




In [None]:
#install audio_preprocessing from github - this is necessary if running in Google Colab, otherwise not necessary
if (IN_COLAB):
   #!git clone https://github.com/RomanZhvanskiy/microphone_enhancer_gh.git
   !git -C "microphone_enhancer_gh" pull || git clone https://github.com/RomanZhvanskiy/microphone_enhancer_gh.git "microphone_enhancer_gh"


In [None]:
if (IN_COLAB):  # it is also necessary to change directory in Google Colab to load audio_preprocessing
  %cd /content/microphone_enhancer_gh/

In [None]:
if (IN_COLAB): # switch to the appropriate branch
  !git checkout better_models_and_gridsearch
  !git pull

In [None]:
print(os.getcwd())
print(sys.argv[0])
print(os.path.dirname(os.path.realpath('__file__')))

In [None]:
# Silence TensorFlow's C++ logging; set before TensorFlow is imported further down.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import os.path

# realpath('__file__') resolves the *literal* file name '__file__' against the
# current working directory. Its dirname is therefore the working directory, and
# dropping the last two path components yields the parent of the working directory,
# which this notebook treats as the project root.
_notebook_anchor = os.path.realpath('__file__')
_project_root = "/".join(_notebook_anchor.split ("/")[0:-2])

# Sub-packages of <root>/Back_end that must be importable by bare name.
_back_end_packages = (
    "api",
    "audio_cache",
    "audio_preprocessing",
    "hugging_models",
    "image_metrics",
    "interface",
    "ml_logic",
    "pretrained_models",
)

_search_path_additions = [
    os.path.dirname(_notebook_anchor),  # directory the notebook runs in
    _project_root,                      # root
    _project_root + "/Back_end",
]
_search_path_additions.extend(
    _project_root + "/Back_end/" + _package for _package in _back_end_packages
)
_search_path_additions.append("..")  # Adds higher directory to python modules path.

for _entry in _search_path_additions:
    sys.path.append(_entry)




In [None]:
%load_ext autoreload
%autoreload 2
from audio_preprocessing import preprocessing as pp


In [None]:
!pwd

In [None]:
if (IN_COLAB):  # the training data is loaded in the google drive for the purpose of being used in google colab
  from google.colab import drive
  drive.mount('/content/gdrive')




In [None]:
#!ls -la /content/gdrive/MyDrive/'Colab Notebooks'/data_audio/VCTK-Corpus/wav48


In [None]:
#os.listdir("/content/gdrive/MyDrive/Colab Notebooks/data_audio/VCTK-Corpus/wav48")

### Degrade quality

In [None]:
def degrade_quaity(spectrogram, sr, upper_limit=3000.0, lower_limit=100.0, insensitive_level = 0.5,relative_noise_level=0.1, debug=0):
    """Simulate a poor microphone by chaining three `pp` spectrogram transforms.

    The steps run in this order, each one feeding the next:
      1. pp.mel_spectrogram_remove_frequency  (remove_above=upper_limit, remove_below=lower_limit)
      2. pp.mel_spectrogram_remove_quiet_sounds (remove_below=insensitive_level)
      3. pp.mel_spectrogram_add_noise (relative_noise_level, add_above=lower_limit, add_below=upper_limit)

    Args:
        spectrogram: mel spectrogram handed to the first transform.
        sr: sampling rate, forwarded to every transform.
        upper_limit: upper frequency bound (presumably Hz - confirm against pp).
        lower_limit: lower frequency bound (presumably Hz - confirm against pp).
        insensitive_level: threshold below which sounds are removed in step 2.
        relative_noise_level: noise amount forwarded to step 3.
        debug: forwarded unchanged to every transform.

    Returns:
        The output of the final (noise-adding) transform.
    """
    # the simulated bad microphone only hears a limited frequency band
    band_limited = pp.mel_spectrogram_remove_frequency(
        spectrogram,
        sr,
        remove_above=upper_limit,
        remove_below=lower_limit,
        debug=debug,
    )

    # the simulated bad microphone cannot capture quiet sounds
    without_quiet_sounds = pp.mel_spectrogram_remove_quiet_sounds(
        band_limited,
        sr,
        remove_below=insensitive_level,
        debug=debug,
    )

    # the simulated bad microphone also captures noise, inside the band it kept
    return pp.mel_spectrogram_add_noise(
        without_quiet_sounds,
        sr,
        relative_noise_level=relative_noise_level,
        add_above=lower_limit,
        add_below=upper_limit,
        debug=debug,
    )

### Convert MEL spectrogram to waveform

In [None]:
# NOTE(review): neither `spectrogram` nor `sr` is assigned in any earlier cell of this
# notebook (`sr` is first assigned in the "data preparation" section below), so this
# cell fails on Restart & Run All. It presumably relied on variables from a deleted cell.
pp.plot_mel_spectrogram(spectrogram,sr)

In [None]:
# NOTE(review): `degraded_x` is never assigned anywhere in this notebook and `sr` is
# only assigned later, so this cell fails on a fresh kernel - confirm what should be plotted.
pp.plot_mel_spectrogram(degraded_x,sr)

In [None]:
#https://datasciencedojo.com/blog/python-libraries-for-generative-ai/#
#https://huggingface.co/docs/diffusers/tutorials/basic_training
#library for distortions
#https://github.com/iver56/audiomentations?tab=readme-ov-file

## training a simple model

### data preparation

In [None]:
where_to_get_training_data = "/".join(os.path.realpath('__file__').split ("/")[0:-2]) + "/Data/raw_data/VCTK-Corpus/wav48"

In [None]:
print (f"where_to_get_training_data = {where_to_get_training_data}")

In [None]:
large_data, sr = pp.get_all_speech_as_one_mel(where_to_get_training_data= where_to_get_training_data , num_spectrograms=100, num_speaker =0, debug = 1,working_in_google_colab = IN_COLAB)

In [None]:
train_sg, test_sg = pp.split_spectrogram_in_train_and_test(large_data,0.2, debug=1)

In [None]:
#degrade quality of both train and test
degraded_train_sg =pp.degrade_quaity(train_sg, sr )
degraded_test_sg =pp.degrade_quaity(test_sg, sr )




In [None]:

reconstructed_test = pp.spectrogram_2_waveform (test_sg, sr=sr)
reconstructed_degraded_test = pp.spectrogram_2_waveform (degraded_test_sg, sr=sr)

In [None]:
reconstructed_train = pp.spectrogram_2_waveform (train_sg, sr=sr)
reconstructed_degraded_train = pp.spectrogram_2_waveform (degraded_train_sg, sr=sr)

In [None]:
if (IN_COLAB):
    where_to_get_preprocessed_training_data = "/content/gdrive/MyDrive/Colab Notebooks/data_audio"
else:
    where_to_get_preprocessed_training_data = "/".join(os.path.realpath('__file__').split ("/")[0:-2]) + "/Data/postprocessed_training_data"

In [None]:
#the above can take a long time on large datasets, so I'll save the results to file
# np.savetxt does not create missing folders, so make sure the target directory exists.
os.makedirs(where_to_get_preprocessed_training_data, exist_ok=True)

# One text file per spectrogram: <directory>/<name>.sg
for _sg_name, _sg_array in (
    ("train_sg", train_sg),
    ("test_sg", test_sg),
    ("degraded_train_sg", degraded_train_sg),
    ("degraded_test_sg", degraded_test_sg),
):
    np.savetxt(fname=where_to_get_preprocessed_training_data + "/" + _sg_name + ".sg", X=_sg_array)

In [None]:

# Original (clean) test audio and its spectrogram.
print ("audio reconstructed_test")
IPython.display.display(IPython.display.Audio(reconstructed_test,  rate=sr))
pp.plot_mel_spectrogram(test_sg,sr, figsize=(2,2))

# Degraded test audio and the spectrogram it was reconstructed from.
# (The plot previously showed degraded_train_sg, which does not match the audio played.)
print ("audio degraded_test_sg")
IPython.display.display(IPython.display.Audio(reconstructed_degraded_test,  rate=sr))
pp.plot_mel_spectrogram(degraded_test_sg,sr, figsize=(2,2))


## https://keras.io/examples/vision/autoencoder/

This model works on 2D images, so we need to split the input into a number of 256x256 images. In addition, we take the logarithm of the input data.

In [None]:
inp_long = np.log( degraded_train_sg + 0.0000001)
re_long = np.log( train_sg + 0.0000001)

In [None]:
print (f"inp_long.shape = {inp_long.shape}, re_long.shape = {re_long.shape}")
from matplotlib import pyplot as plt

plt.figure()
plt.imshow(inp_long )
plt.figure()
plt.imshow(re_long )

In [None]:
from tensorflow.keras.backend import expand_dims

def spectrogram_2_series_of_images(sg, debug=0):
    """Cut a spectrogram into a batch of 256x256 single-channel images.

    Args:
        sg: 2-D array; the reshape below requires exactly 256 rows. Trailing columns
            that do not fill a whole group of 256 are dropped.
        debug: when truthy, print the intermediate and final shapes.

    Returns:
        The result of tensorflow.keras.backend.expand_dims, with shape
        (n_images, 256, 256, 1) where n_images = sg.shape[1] // 256.

    NOTE(review): because the image index is the *last* axis of the reshape, image k
    holds columns k, k + n_images, k + 2*n_images, ... of `sg` (time-interleaved),
    not a contiguous window of 256 columns. Confirm this is intended and that
    pp.series_of_images_2_spectrogram inverts the same layout before changing it.
    """
    # how many complete 256-column groups fit into the spectrogram
    n_images = sg.shape[1] // 256

    # drop the remainder so the data reshapes cleanly
    trimmed = sg[:, :n_images * 256]
    if debug:
        print(f"inp_long_for_reshape.shape={trimmed.shape}")

    # (256, 256, n_images) -> (n_images, 256, 256): image index first, as TensorFlow expects
    image_stack = trimmed.reshape(256, 256, n_images).transpose(2, 0, 1)

    # ConvNets want (NUMBER_OF_IMAGES, HEIGHT, WIDTH, CHANNELS): append one channel axis
    images_with_channel = expand_dims(image_stack, axis=-1)
    if debug:
        print(f"sg_reshaped_out.shape={images_with_channel.shape}")

    return images_with_channel

In [None]:
import tensorflow as tf

inp_long = np.log( degraded_train_sg + 0.0000001)
re_long = np.log( train_sg + 0.0000001)
test_inp_long = np.log( degraded_test_sg + 0.0000001)
test_re_long = np.log( test_sg + 0.0000001)

data_inp_series_of_images = tf.convert_to_tensor(spectrogram_2_series_of_images(inp_long, debug=1))
data_re_series_of_images = tf.convert_to_tensor(spectrogram_2_series_of_images(re_long))
data_test_inp_series_of_images = tf.convert_to_tensor(spectrogram_2_series_of_images(test_inp_long, debug=1))
data_test_re_series_of_images = tf.convert_to_tensor(spectrogram_2_series_of_images(test_re_long))


print(f"data_inp_series_of_images.shape={data_inp_series_of_images.shape}, data_re_series_of_images.shape={data_re_series_of_images.shape}, ")


In [None]:
plt.figure()
plt.imshow(data_inp_series_of_images[56, :,:, 0] )
plt.figure()
plt.imshow(data_re_series_of_images[56, :,:, 0] )

In [None]:
# Use tensorflow.keras throughout (the rest of the notebook does) instead of mixing
# it with the standalone `keras` package.
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Named `autoencoder_input` rather than `input` so the builtin input() is not shadowed.
autoencoder_input = layers.Input(shape=(256, 256, 1))

# Encoder: two conv + 2x2 max-pool stages, 256x256 -> 64x64
x = layers.Conv2D(32, (3, 3), activation="relu", padding="same")(autoencoder_input)
x = layers.MaxPooling2D((2, 2), padding="same")(x)
x = layers.Conv2D(32, (3, 3), activation="relu", padding="same")(x)
x = layers.MaxPooling2D((2, 2), padding="same")(x)

# Decoder: two stride-2 transposed convs, 64x64 -> 256x256
x = layers.Conv2DTranspose(32, (3, 3), strides=2, activation="relu", padding="same")(x)
x = layers.Conv2DTranspose(32, (3, 3), strides=2, activation="relu", padding="same")(x)
# Linear output: the targets are log(spectrogram + 1e-7), which is unbounded and
# mostly outside (0, 1), so the sigmoid used in the Keras image example cannot fit them.
x = layers.Conv2D(1, (3, 3), activation="linear", padding="same")(x)

# Autoencoder
# `learning_rate` is the supported argument name; `lr` is deprecated and rejected by
# recent Keras versions.
optimizer = Adam(learning_rate=0.01)
autoencoder = Model(autoencoder_input, x)
autoencoder.compile(optimizer=optimizer, loss="mae")
autoencoder.summary()

In [None]:
history=autoencoder.fit(
    x=data_inp_series_of_images,
    y=data_re_series_of_images,
    epochs=25,
    batch_size=128,
    shuffle=True,
    validation_data=(data_test_inp_series_of_images, data_test_re_series_of_images),
)

#data_test_inp_series_of_images = tf.convert_to_tensor(spectrogram_2_series_of_images(test_inp_long, debug=1))
#data_test_re_series_of_images = tf.convert_to_tensor(spectrogram_2_series_of_images(test_re_long))


In [None]:
hist_df = pd.DataFrame(history.history)
headers = list(hist_df.columns.values)
#plot = hist_df[[headers[0], headers[2]]].plot(title=f"{headers[0]}, {headers[2]}", logy=True)
plot = hist_df[headers[0]].plot(title=f"{headers[0]}")

In [None]:
predictions = autoencoder.predict(data_test_inp_series_of_images)


In [None]:
print(f"predictions.shape={predictions.shape} ")


In [None]:
# `predictions` were computed from data_test_inp_series_of_images, so compare image 0
# with the matching *test* target (previously the train target was shown).
plt.figure()
plt.title("prediction")
plt.imshow(predictions[0, :,:, 0] )
plt.figure()
plt.title("target (test)")
plt.imshow(data_test_re_series_of_images[0, :,:, 0] )
plt.figure()
plt.title("degraded input (test)")
plt.imshow(data_test_inp_series_of_images[0, :,:, 0] )

#    validation_data=(data_test_inp_series_of_images, data_test_re_series_of_images),


In [None]:
data_train_X_series_of_images, data_train_Y_series_of_images, data_test_X_series_of_images,data_test_Y_series_of_images = pp.get_all_speech_as_series_of_images(num_spectrograms=20)

In [None]:
print(f"data_train_X_series_of_images.shape={data_train_X_series_of_images.shape}")
print(f"data_train_Y_series_of_images.shape={data_train_Y_series_of_images.shape}")
print(f" data_test_X_series_of_images.shape={ data_test_X_series_of_images.shape}")
print(f" data_test_Y_series_of_images.shape={ data_test_Y_series_of_images.shape}")





In [None]:
for i in range (0, data_test_X_series_of_images.shape[0]):
    print (f"i = {i}")
    

In [None]:
predictions.shape

In [None]:
predictions_sg = pp.series_of_images_2_spectrogram(predictions, debug=1)

In [None]:
print ("audio predictions_sg")
IPython.display.display(IPython.display.Audio(pp.spectrogram_2_waveform (predictions_sg,sr),  rate=sr))  
pp.plot_mel_spectrogram(predictions_sg,sr, figsize=(2,2))

In [None]:
test_Y_sg = pp.series_of_images_2_spectrogram(data_test_Y_series_of_images, debug=1)

In [None]:
# This cell plays the reference test target, not the predictions, so label it accordingly.
print ("audio test_Y_sg")
IPython.display.display(IPython.display.Audio(pp.spectrogram_2_waveform (test_Y_sg,sr),  rate=sr))  
pp.plot_mel_spectrogram(test_Y_sg,sr, figsize=(2,2))

## https://keras.io/examples/vision/mirnet/

## https://huggingface.co/keras-io/lowlight-enhance-mirnet/tree/main

In [None]:
from tensorflow.keras import models

# Build the path from the project root (same convention as the data paths above)
# instead of a machine-specific absolute path, and point at the SavedModel
# *directory*: load_model expects the folder that contains saved_model.pb, not the
# .pb file itself.
fpath = "/".join(os.path.realpath('__file__').split ("/")[0:-2]) + "/Data/pretrained_models/lowlight-enhance-mirnet"
model_mirnet = models.load_model(fpath)

### model - pix2pix


In [None]:
import tensorflow as tf
def downsample(filters, size, apply_batchnorm=True):
    """Build a pix2pix encoder block: stride-2 Conv2D -> (BatchNorm) -> LeakyReLU.

    Args:
        filters: number of convolution filters.
        size: convolution kernel size.
        apply_batchnorm: insert a BatchNormalization layer after the convolution.

    Returns:
        A tf.keras.Sequential; the stride-2 'same' convolution halves height and width.
    """
    weight_init = tf.random_normal_initializer(0., 0.02)

    block = tf.keras.Sequential()
    conv = tf.keras.layers.Conv2D(
        filters,
        size,
        strides=2,
        padding='same',
        kernel_initializer=weight_init,
        use_bias=False,
    )
    block.add(conv)

    # batch norm is optional
    if apply_batchnorm:
        block.add(tf.keras.layers.BatchNormalization())

    block.add(tf.keras.layers.LeakyReLU())
    return block

In [None]:
def upsample(filters, size, apply_dropout=False):
    """Build a pix2pix decoder block: stride-2 Conv2DTranspose -> BatchNorm -> (Dropout) -> ReLU.

    Args:
        filters: number of transposed-convolution filters.
        size: kernel size.
        apply_dropout: insert Dropout(0.5) after batch normalisation.

    Returns:
        A tf.keras.Sequential; the stride-2 'same' transposed convolution doubles
        height and width.
    """
    weight_init = tf.random_normal_initializer(0., 0.02)

    block = tf.keras.Sequential()
    transposed_conv = tf.keras.layers.Conv2DTranspose(
        filters,
        size,
        strides=2,
        padding='same',
        kernel_initializer=weight_init,
        use_bias=False,
    )
    block.add(transposed_conv)
    block.add(tf.keras.layers.BatchNormalization())

    # dropout is optional
    if apply_dropout:
        block.add(tf.keras.layers.Dropout(0.5))

    block.add(tf.keras.layers.ReLU())
    return block

In [None]:
inp_long = degraded_train_sg

In [None]:
re_long = train_sg

In [None]:
print (f"inp_long.shape = {inp_long.shape}, re_long.shape = {re_long.shape}")

In [None]:
from matplotlib import pyplot as plt

plt.figure()
plt.imshow(inp_long / 255.0)


In [None]:
inp = np.log(inp_long[:, 0:256] + 0.0000001) # inp_long[:, 0:255] added  + 0.0000001 to  prevent log 0
re = np.log(re_long[:, 0:256] + 0.0000001)
print (f"inp.shape = {inp.shape}, re.shape = {re.shape}")
plt.figure()
plt.imshow(inp / 255.0)
plt.figure()
plt.imshow(re / 255.0)

## https://keras.io/examples/vision/zero_dce/

In [None]:
def build_dce_net():
    """Build the Zero-DCE curve-estimation network from the Keras example.

    Six 3x3/32-filter ReLU convolutions with symmetric skip concatenations
    (conv4+conv3, conv5+conv2, conv6+conv1), followed by a 24-channel tanh convolution.

    Returns:
        A Keras Model mapping a (None, None, 3) image to a 24-channel map of the same
        spatial size.
    """
    # The bare name `keras` is never bound in this notebook (only `tf` and `layers`
    # are), so go through tf.keras to avoid a NameError.
    def _conv_block(tensor):
        """Apply one 3x3, 32-filter, stride-1, same-padded ReLU convolution."""
        return layers.Conv2D(
            32, (3, 3), strides=(1, 1), activation="relu", padding="same"
        )(tensor)

    input_img = tf.keras.Input(shape=[None, None, 3])
    conv1 = _conv_block(input_img)
    conv2 = _conv_block(conv1)
    conv3 = _conv_block(conv2)
    conv4 = _conv_block(conv3)
    int_con1 = layers.Concatenate(axis=-1)([conv4, conv3])
    conv5 = _conv_block(int_con1)
    int_con2 = layers.Concatenate(axis=-1)([conv5, conv2])
    conv6 = _conv_block(int_con2)
    int_con3 = layers.Concatenate(axis=-1)([conv6, conv1])
    x_r = layers.Conv2D(24, (3, 3), strides=(1, 1), activation="tanh", padding="same")(
        int_con3
    )
    return tf.keras.Model(inputs=input_img, outputs=x_r)

In [None]:
def color_constancy_loss(x):
    """Penalise differences between the spatial means of the first three channels.

    Args:
        x: 4-D tensor; axes 1 and 2 are averaged, axis 3 is indexed as channels 0..2.

    Returns:
        sqrt(d_rg^2 + d_rb^2 + d_gb^2), where each d is the *squared* difference of
        two channel means (so the differences enter at the fourth power, as in the
        Keras Zero-DCE example this was taken from).
    """
    channel_means = tf.reduce_mean(x, axis=(1, 2), keepdims=True)
    mean_r = channel_means[:, :, :, 0]
    mean_g = channel_means[:, :, :, 1]
    mean_b = channel_means[:, :, :, 2]

    sq_diff_rg = tf.square(mean_r - mean_g)
    sq_diff_rb = tf.square(mean_r - mean_b)
    sq_diff_gb = tf.square(mean_b - mean_g)

    sum_of_squares = tf.square(sq_diff_rg) + tf.square(sq_diff_rb) + tf.square(sq_diff_gb)
    return tf.sqrt(sum_of_squares)

In [None]:
def exposure_loss(x, mean_val=0.6):
    """Mean squared deviation of 16x16 patch averages from a target level.

    Args:
        x: 4-D tensor; axis 3 is averaged away (kept as a size-1 axis) first.
        mean_val: target value every patch average is compared against.

    Returns:
        Scalar tensor: mean of (patch_average - mean_val)^2 over all patches.
    """
    channel_averaged = tf.reduce_mean(x, axis=3, keepdims=True)
    # non-overlapping 16x16 windows
    patch_averages = tf.nn.avg_pool2d(channel_averaged, ksize=16, strides=16, padding="VALID")
    squared_deviation = tf.square(patch_averages - mean_val)
    return tf.reduce_mean(squared_deviation)

In [None]:
# NOTE(review): this is a second, identical definition of `downsample`; the same
# function is already defined in the "model - pix2pix" section above. Re-running this
# cell merely rebinds the name - consider keeping a single definition.
def downsample(filters, size, apply_batchnorm=True):
  """Build an encoder block: stride-2 Conv2D -> optional BatchNorm -> LeakyReLU.

  Args:
    filters: number of convolution filters.
    size: convolution kernel size.
    apply_batchnorm: when True, add BatchNormalization after the convolution.

  Returns:
    A tf.keras.Sequential containing the layers above.
  """
  initializer = tf.random_normal_initializer(0., 0.02)

  result = tf.keras.Sequential()
  result.add(
      tf.keras.layers.Conv2D(filters, size, strides=2, padding='same',
                             kernel_initializer=initializer, use_bias=False))

  if apply_batchnorm:
    result.add(tf.keras.layers.BatchNormalization())

  result.add(tf.keras.layers.LeakyReLU())

  return result

In [None]:
#as the documentation says: https://www.tensorflow.org/api_docs/python/tf/keras/layers/Conv2D
#you need a 4 dimensional input for Conv2d layer. you have to a add a channel either after or before 2 main dimensions of the image:

#train_images = train_images.reshape(train_size, height, width, 1)

#or

#train_images = train_images.reshape(train_size, 1, height, width)

#in both cases you have to define the art of input in every layer in the network with data_format="channels_first" or data_format="channels_last".
#for example:
#ncoder_output = Conv2D(64, (3,3), activation='relu', padding='same', strides=2, data_format="channels_last")(encoder_input)


down_model = downsample(3, 4)

#image is a 256 x 256 array
#add a dimension on the end to account for channels
inp = tf.expand_dims(inp, -1) # expand dimensions as "channels last"
#add a dimension on the front to account for batch_size
inp = tf.expand_dims(inp, 0)
print (f"inp.shape = {inp.shape}")

In [None]:

down_model = downsample(3, 4)
down_result = down_model(tf.expand_dims(inp, 0))
print (down_result.shape)

In [None]:
up_model = upsample(3, 4)
up_result = up_model(tf.squeeze(down_result,axis=0))
print (up_result.shape)

In [None]:
def Generator():
    """Build the pix2pix U-Net generator for single-channel 256x256 images.

    Eight `downsample` blocks shrink the input to 1x1, seven `upsample` blocks grow it
    back to 128x128 with a skip connection concatenated after each one, and a final
    stride-2 transposed convolution with tanh activation restores 256x256.

    Returns:
        tf.keras.Model mapping (batch, 256, 256, 1) to (batch, 256, 256, OUTPUT_CHANNELS).
    """
  #inputs = tf.keras.layers.Input(shape=[256, 256, 3])

    # single channel: spectrograms are treated as greyscale images
    inputs = tf.keras.layers.Input(shape=[256, 256, 1])
    down_stack = [
        downsample(64, 4, apply_batchnorm=False),  # (batch_size, 128, 128, 64)
        downsample(128, 4),  # (batch_size, 64, 64, 128)
        downsample(256, 4),  # (batch_size, 32, 32, 256)
        downsample(512, 4),  # (batch_size, 16, 16, 512)
        downsample(512, 4),  # (batch_size, 8, 8, 512)
        downsample(512, 4),  # (batch_size, 4, 4, 512)
        downsample(512, 4),  # (batch_size, 2, 2, 512)
        downsample(512, 4),  # (batch_size, 1, 1, 512)
        ]

    # shapes below are after concatenation with the matching skip connection
    up_stack = [
        upsample(512, 4, apply_dropout=True),  # (batch_size, 2, 2, 1024)
        upsample(512, 4, apply_dropout=True),  # (batch_size, 4, 4, 1024)
        upsample(512, 4, apply_dropout=True),  # (batch_size, 8, 8, 1024)
        upsample(512, 4),  # (batch_size, 16, 16, 1024)
        upsample(256, 4),  # (batch_size, 32, 32, 512)
        upsample(128, 4),  # (batch_size, 64, 64, 256)
        upsample(64, 4),  # (batch_size, 128, 128, 128)
        ]

    OUTPUT_CHANNELS = 1

    initializer = tf.random_normal_initializer(0., 0.02)
    last = tf.keras.layers.Conv2DTranspose(OUTPUT_CHANNELS, 4,
                                         strides=2,
                                         padding='same',
                                         kernel_initializer=initializer,
                                         activation='tanh')  # (batch_size, 256, 256, OUTPUT_CHANNELS)

    x = inputs

    # Downsampling through the model
    skips = []
    for down in down_stack:
        x = down(x)
        skips.append(x)

    # drop the bottleneck output (it is the decoder's input, not a skip) and walk
    # the remaining encoder outputs from deepest to shallowest
    skips = reversed(skips[:-1])

    # Upsampling and establishing the skip connections
    for up, skip in zip(up_stack, skips):
        x = up(x)
        x = tf.keras.layers.Concatenate()([x, skip])

    x = last(x)

    return tf.keras.Model(inputs=inputs, outputs=x)

In [None]:
#add a dimension on the end to account for channels
inp = tf.expand_dims(inp, -1) # expand dimensions as "channels last"
#add a dimension on the front to account for batch_size
print (f"inp.shape = {inp.shape}")

In [None]:
# NOTE(review): `generator` is only created further down (`generator = Generator()`),
# so this cell fails on Restart & Run All. Also, `inp` has had extra axes appended by
# the earlier tf.expand_dims cells, so adding tf.newaxis here presumably no longer
# yields the (1, 256, 256, 1) shape the generator expects - confirm.
gen_output = generator(inp[tf.newaxis, ...], training=False)
plt.imshow(gen_output[0, ...])

## Define the generator loss

In [None]:
#GANs learn a loss that adapts to the data, while cGANs learn a structured loss that penalizes
# a possible structure that differs from the network output and the target image, as described 
# in the pix2pix paper.
#The generator loss is a sigmoid cross-entropy loss of the generated images and an array of
# ones.
#The pix2pix paper also mentions the L1 loss, which is a MAE (mean absolute error) between the
# generated image and the target image.
#This allows the generated image to become structurally similar to the target image.
#The formula to calculate the total generator loss is gan_loss + LAMBDA * l1_loss, where
# LAMBDA = 100. This value was decided by the authors of the paper.

In [None]:
LAMBDA = 100
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
def generator_loss(disc_generated_output, gen_output, target):
    """pix2pix generator loss: adversarial term plus LAMBDA-weighted L1 term.

    Args:
        disc_generated_output: discriminator output for the generated image.
        gen_output: image produced by the generator.
        target: ground-truth image.

    Returns:
        Tuple (total_gen_loss, gan_loss, l1_loss) where
        total_gen_loss = gan_loss + LAMBDA * l1_loss.
    """
    # the generator wants the discriminator to answer "real" (all ones)
    all_real = tf.ones_like(disc_generated_output)
    gan_loss = loss_object(all_real, disc_generated_output)

    # Mean absolute error between target and generated image
    absolute_error = tf.abs(target - gen_output)
    l1_loss = tf.reduce_mean(absolute_error)

    return gan_loss + (LAMBDA * l1_loss), gan_loss, l1_loss

## Build the discriminator

In [None]:
#The discriminator in the pix2pix cGAN is a convolutional PatchGAN classifier—it tries to
# classify if each image patch is real or not real, as described in the pix2pix paper.
#
#Each block in the discriminator is: Convolution -> Batch normalization -> Leaky ReLU.
#The shape of the output after the last layer is (batch_size, 30, 30, 1).
#Each 30 x 30 image patch of the output classifies a 70 x 70 portion of the input image.
#The discriminator receives 2 inputs:
#       (1) The input image and the target image, which it should classify as real.
#       (2) The input image and the generated image (the output of the generator), which it should
#              classify as fake.
#Use tf.concat([inp, tar], axis=-1) to concatenate these 2 inputs together.
#Let's define the discriminator:

In [None]:
def Discriminator():
  """Build the pix2pix PatchGAN discriminator for single-channel 256x256 images.

  Takes two inputs (`input_image`, `target_image`), concatenates them along the
  channel axis, applies three `downsample` blocks, then two zero-padded 4x4
  stride-1 convolutions.

  Returns:
    tf.keras.Model with inputs [inp, tar] and a (batch_size, 30, 30, 1) output;
    the last convolution has no activation.
  """
  initializer = tf.random_normal_initializer(0., 0.02)

  #inp = tf.keras.layers.Input(shape=[256, 256, 3], name='input_image') - B&W image, hence reduced to 1 channel
  #tar = tf.keras.layers.Input(shape=[256, 256, 3], name='target_image')
  inp = tf.keras.layers.Input(shape=[256, 256, 1], name='input_image')
  tar = tf.keras.layers.Input(shape=[256, 256, 1], name='target_image')    

  x = tf.keras.layers.concatenate([inp, tar])  # (batch_size, 256, 256, channels*2)

  down1 = downsample(64, 4, False)(x)  # (batch_size, 128, 128, 64)
  down2 = downsample(128, 4)(down1)  # (batch_size, 64, 64, 128)
  down3 = downsample(256, 4)(down2)  # (batch_size, 32, 32, 256)

  # pad by 1 on each side, then a 4x4 convolution without padding: 32 -> 34 -> 31
  zero_pad1 = tf.keras.layers.ZeroPadding2D()(down3)  # (batch_size, 34, 34, 256)
  conv = tf.keras.layers.Conv2D(512, 4, strides=1,
                                kernel_initializer=initializer,
                                use_bias=False)(zero_pad1)  # (batch_size, 31, 31, 512)

  batchnorm1 = tf.keras.layers.BatchNormalization()(conv)

  leaky_relu = tf.keras.layers.LeakyReLU()(batchnorm1)

  # same pad-then-convolve pattern: 31 -> 33 -> 30
  zero_pad2 = tf.keras.layers.ZeroPadding2D()(leaky_relu)  # (batch_size, 33, 33, 512)

  last = tf.keras.layers.Conv2D(1, 4, strides=1,
                                kernel_initializer=initializer)(zero_pad2)  # (batch_size, 30, 30, 1)

  return tf.keras.Model(inputs=[inp, tar], outputs=last)

In [None]:
# NOTE(review): `discriminator` is only created in a later cell
# (`discriminator = Discriminator()`), so this cell fails on Restart & Run All.
disc_out = discriminator([inp[tf.newaxis, ...], gen_output], training=False)
#plt.imshow(disc_out[0, ..., -1], vmin=-20, vmax=20, cmap='RdBu_r')
#plt.colorbar()

In [None]:
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

In [None]:
discriminator = Discriminator()
tf.keras.utils.plot_model(discriminator, show_shapes=True, dpi=64)

In [None]:
generator = Generator()
tf.keras.utils.plot_model(generator, show_shapes=True, dpi=64)


### model - simple autoencoder

Let us start with the simplest possible model: restoring one column of the MEL spectrogram (256 entries) at a time.
This discards the information in the previous time snapshots, but should be simple to train.
Accordingly, instead of Conv2d, we will have Conv1d (note: the cells below currently use Dense layers).

In [None]:
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
#from tensorflow.keras.layers import Conv2D, Conv1D, MaxPooling2D, Flatten, Dense
#from tensorflow.keras.layers import MaxPooling1D



In [None]:

def build_encoder():
    '''Return the encoder: one Dense layer mapping 256 inputs to 100 tanh units.'''
    bottleneck = layers.Dense(100, input_dim=256, activation='tanh')
    return models.Sequential([bottleneck])


In [None]:
encoder = build_encoder()
encoder.summary()

In [None]:

def build_decoder():
    '''Return the decoder: one Dense layer mapping 100 inputs to 256 relu units.'''
    expansion = layers.Dense(256, input_dim=100, activation='relu')
    return models.Sequential([expansion])


In [None]:
decoder = build_decoder()
decoder.summary()

In [None]:
from tensorflow.keras.optimizers import Adam

autoencoder = models.Sequential([encoder, decoder])
#autoencoder.compile(loss="mse", optimizer=Adam(learning_rate=0.1))
optimizer = Adam()

autoencoder.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mse'])


In [None]:
autoencoder.summary()

In [None]:
train_sg_t =  np.transpose(train_sg)

In [None]:
degraded_train_sg_t =  np.transpose(degraded_train_sg)

### train the model

In [None]:
import tensorflow as tf
def reinitialize(model):
    """Re-draw the weights of every layer in `model` from that layer's own initializers.

    Nested models (any layer that is itself a tf.keras.Model) are handled recursively.
    For each layer the kernel, bias and recurrent kernel are re-initialised when the
    layer exposes both the weight and the matching initializer.

    Weights that are missing or None are skipped instead of raising - for example
    `bias` is None for layers built with use_bias=False, which previously caused an
    AttributeError on `None.assign`.

    Args:
        model: object with a `.layers` iterable (e.g. a Keras Sequential/Model).

    Returns:
        None; weights are modified in place via `.assign`.
    """
    # (weight attribute, initializer attribute) pairs to reset on each layer
    weight_specs = (
        ("kernel", "kernel_initializer"),
        ("bias", "bias_initializer"),
        ("recurrent_kernel", "recurrent_initializer"),
    )
    for layer in model.layers:
        if isinstance(layer, tf.keras.Model):
            reinitialize(layer)
            continue
        for weight_name, initializer_name in weight_specs:
            initializer = getattr(layer, initializer_name, None)
            weight = getattr(layer, weight_name, None)
            if initializer is None or weight is None:
                # layer has no such weight (or was created without it): nothing to reset
                continue
            weight.assign(initializer(tf.shape(weight)))

In [None]:
reinitialize(autoencoder)

In [None]:


# Identity training: the clean spectrogram columns are both input and target.
# `workers` / `use_multiprocessing` were dropped: they only affect generator /
# Sequence inputs (not in-memory arrays) and are no longer accepted by Keras 3.
history = autoencoder.fit(train_sg_t, train_sg_t ,
                           validation_split = 0.2,
                           epochs=100,
                           batch_size=32,
                           verbose=1)

In [None]:
hist_df = pd.DataFrame(history.history)

headers = list(hist_df.columns.values)


plot = hist_df[[headers[0], headers[2]]].plot(title=f"{headers[0]}, {headers[2]}", logy=True)
#plot = hist_df[[headers[1], headers[3]]].plot(title=f"{headers[1]}, {headers[3]}", logy=False)

### save the model

In [None]:
models.save_model(autoencoder, 'autoencoder_001a')


### how does the model sound?

In [None]:
restored_test_sg_t = autoencoder.predict(np.transpose(degraded_test_sg))

In [None]:
reconstructed_restored_test = pp.spectrogram_2_waveform (np.transpose(restored_test_sg_t), sr=sr)

In [None]:
print ("reconstructed_test")
IPython.display.display(IPython.display.Audio(data=reconstructed_test,  rate=sr))
pp.plot_mel_spectrogram(test_sg,sr, figsize=(2,2))


print ("reconstructed_restored_degraded_test")
IPython.display.display(IPython.display.Audio(data=reconstructed_restored_test,  rate=sr))
pp.plot_mel_spectrogram(np.transpose(restored_test_sg_t),sr, figsize=(2,2))

# even simpler (10, 256) model training


In [None]:
### simpler model
from tensorflow.keras.optimizers import Adam

def build_the_simplest_model_possible():
    """Return an uncompiled 256 -> 10 -> 256 dense autoencoder (relu on both layers)."""
    simplest_model = models.Sequential()
    simplest_model.add(layers.Dense(10, input_dim=256, activation='relu'))
    # No input_dim here: the input size is inferred from the previous layer (10 units).
    # The former `input_dim=100` contradicted that and was at best ignored.
    simplest_model.add(layers.Dense(256, activation='relu'))
    return simplest_model
simplest_model = build_the_simplest_model_possible()
optimizer = Adam()

simplest_model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mse'])


In [None]:
# Cache to RAM to speed up the training. This requires arrays to be converted to Datasets
from tensorflow import data
#from tensorflow.data.Dataset import cache



train_dataset = data.Dataset.from_tensor_slices((np.transpose(train_sg), np.transpose(train_sg))).batch(10000)
validation_dataset = data.Dataset.from_tensor_slices((np.transpose(test_sg), np.transpose(test_sg))).batch(3000)


AUTOTUNE = data.experimental.AUTOTUNE
train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)
validation_dataset = validation_dataset.cache().prefetch(buffer_size=AUTOTUNE)

#val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
print (f"train_sg.shape={train_sg.shape}")
print (f"test_sg.shape={test_sg.shape}")

In [None]:
from tensorflow.keras import callbacks
from keras.callbacks import EarlyStopping
from keras.callbacks import BackupAndRestore


# Stop when val_loss has not improved for 10 epochs and restore the best weights.
es = EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
    verbose=0
)

# Back up training state every epoch so an interrupted run can resume.
br = BackupAndRestore(
    backup_dir="training_backup",
    save_freq="epoch",
    delete_checkpoint=True
)
reinitialize(simplest_model)
# `train_dataset` is a tf.data.Dataset that is already batched where it is built, so
# `batch_size` must not be passed to fit (the former batch_size=4 was misleading).
# `workers` / `use_multiprocessing` were dropped as well: they do not apply to
# tf.data input and are no longer accepted by Keras 3.
history = simplest_model.fit( x=train_dataset,
                           validation_data=validation_dataset,
                           epochs=500,
                           verbose=1,
                           callbacks=[es,br])

In [None]:
hist_df = pd.DataFrame(history.history)
headers = list(hist_df.columns.values)
plot = hist_df[[headers[0], headers[2]]].plot(title=f"{headers[0]}, {headers[2]}", logy=True)

In [None]:
models.save_model(simplest_model, 'autoencoder_baseline_10-256-trained-on-good')

restored_test_sg_t = simplest_model.predict(np.transpose(degraded_test_sg))

print ("reconstructed_test")
IPython.display.display(IPython.display.Audio(data=reconstructed_test,  rate=sr))
pp.plot_mel_spectrogram(test_sg,sr, figsize=(2,2))


print ("reconstructed_restored_degraded_test")
IPython.display.display(IPython.display.Audio(data=pp.spectrogram_2_waveform(np.transpose(restored_test_sg_t), sr=sr),  rate=sr))
pp.plot_mel_spectrogram(np.transpose(restored_test_sg_t),sr, figsize=(2,2))

## preprocessing  - add log on the input to the model to reduce the variance of features

In [None]:
train_df=pd.DataFrame(np.transpose(train_sg))


In [None]:
train_df.describe()

In [None]:
#inp = np.log(inp_long[:, 0:255]) # inp_long[:, 0:255]
#note - adding 0.00000001 to all entries to prevent log(0)
log_train_dataset = data.Dataset.from_tensor_slices(
    (
        np.log(
            0.00000001 + np.transpose(
                train_sg
            )
        ), 
        np.log(
            0.00000001 + np.transpose(
                train_sg
            )
        )
    )
).batch(10000)
log_validation_dataset = data.Dataset.from_tensor_slices(
    (
        np.log(
            0.00000001 + np.transpose(
                test_sg
            )
        ), 
        np.log(
            0.00000001 + np.transpose(
                test_sg
            )
        )
    )
).batch(3000)


In [None]:
AUTOTUNE = data.experimental.AUTOTUNE
log_train_dataset = log_train_dataset.cache().prefetch(buffer_size=AUTOTUNE)
log_validation_dataset = log_validation_dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
simplest_model_on_log = build_the_simplest_model_possible()
optimizer = Adam()

simplest_model_on_log.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mse'])


In [None]:
from tensorflow.keras import callbacks
from keras.callbacks import EarlyStopping
from keras.callbacks import BackupAndRestore


# Stop when val_loss has not improved for 10 epochs and restore the best weights.
es = EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
    verbose=0
)

# Back up training state every epoch so an interrupted run can resume.
br = BackupAndRestore(
    backup_dir="training_backup",
    save_freq="epoch",
    delete_checkpoint=True
)
reinitialize(simplest_model_on_log)
# `log_train_dataset` is a tf.data.Dataset that is already batched where it is built,
# so `batch_size` must not be passed to fit (the former batch_size=4 was misleading).
# `workers` / `use_multiprocessing` were dropped as well: they do not apply to
# tf.data input and are no longer accepted by Keras 3.
history = simplest_model_on_log.fit( x=log_train_dataset,
                           validation_data=log_validation_dataset,
                           epochs=500,
                           verbose=1,
                           callbacks=[es,br])

In [None]:
hist_df = pd.DataFrame(history.history)
headers = list(hist_df.columns.values)
plot = hist_df[[headers[0], headers[2]]].plot(title=f"{headers[0]}, {headers[2]}", logy=True)

In [None]:
#models.save_model(simplest_model, 'autoencoder_baseline_10-256-trained-on-good')

# The model works in the log domain: feed it log(eps + x) of the degraded test
# spectrogram (transposed, as during training).
log_restored_test_sg_t = simplest_model_on_log.predict(np.log(0.00000001 + np.transpose(degraded_test_sg)))

# Invert the transform with exp(y) - eps. This is slightly negative whenever the
# model predicts below log(eps); clamp to 0 because a negative spectrogram value
# is not meaningful for the synthesis and plotting steps below.
restored_test_sg_t = np.maximum(np.exp(log_restored_test_sg_t) - 0.00000001, 0.0)

# Reference: the test audio and its spectrogram.
print ("reconstructed_test")
IPython.display.display(IPython.display.Audio(data=reconstructed_test,  rate=sr))
pp.plot_mel_spectrogram(test_sg,sr, figsize=(2,2))


# Model output: audio synthesized from the restored spectrogram, and its spectrogram.
print ("reconstructed_restored_degraded_test")
IPython.display.display(IPython.display.Audio(data=pp.spectrogram_2_waveform(np.transpose(restored_test_sg_t), sr=sr),  rate=sr))
pp.plot_mel_spectrogram(np.transpose(restored_test_sg_t),sr, figsize=(2,2))

# How about training the simpler model on degraded rather than perfect audio?

In [None]:
# Cache to RAM to speed up the training. This requires arrays to be converted to Datasets
from tensorflow import data
#from tensorflow.data.Dataset import cache

AUTOTUNE = data.experimental.AUTOTUNE

# (degraded input, clean target) pairs built from the transposed spectrograms,
# batched, cached after the first pass and prefetched.
train_dataset = (
    data.Dataset.from_tensor_slices(
        (np.transpose(degraded_train_sg), np.transpose(train_sg))
    )
    .batch(10000)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)
validation_dataset = (
    data.Dataset.from_tensor_slices(
        (np.transpose(degraded_test_sg), np.transpose(test_sg))
    )
    .batch(3000)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

#val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
# Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
es = EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
    verbose=0
)

# Checkpoint every epoch so an interrupted run can be resumed. The directory is
# specific to this model: Keras requires that a BackupAndRestore directory is
# not reused by another training, otherwise the leftovers of an interrupted run
# of a different model would be restored into this one.
br = BackupAndRestore(
    backup_dir="training_backup_simplest_model",
    save_freq="epoch",
    delete_checkpoint=True
)

# train_dataset is a tf.data.Dataset that is already batched (10000 per batch),
# so batch_size must not be passed to fit(). workers and use_multiprocessing
# only apply to generator / keras.utils.Sequence inputs, so they are omitted.
history = simplest_model.fit(
    x=train_dataset,
    validation_data=validation_dataset,
    epochs=1000,
    verbose=1,
    callbacks=[es, br],
)

In [None]:
# Plot the 1st and 3rd recorded history series on a log-scaled y axis
# (presumably loss and val_loss - confirm against how the model was compiled).
hist_df = pd.DataFrame(history.history)
headers = hist_df.columns.values.tolist()
first_col, third_col = headers[0], headers[2]
plot = hist_df[[first_col, third_col]].plot(
    title=f"{first_col}, {third_col}",
    logy=True,
)

In [None]:
# Persist the model trained on degraded -> clean pairs.
models.save_model(simplest_model, 'autoencoder_baseline_10-256-trained-on_degraded')

# Restore the degraded test spectrogram (transposed, as during training).
restored_test_sg_t = simplest_model.predict(np.transpose(degraded_test_sg))

# Reference: the test audio and its spectrogram.
print ("reconstructed_test")
IPython.display.display(IPython.display.Audio(data=reconstructed_test,  rate=sr))
pp.plot_mel_spectrogram(test_sg,sr, figsize=(2,2))


# Model output: synthesize the audio from the prediction made in this cell
# instead of replaying a waveform left over from an earlier cell.
print ("reconstructed_restored_degraded_test")
IPython.display.display(IPython.display.Audio(data=pp.spectrogram_2_waveform(np.transpose(restored_test_sg_t), sr=sr),  rate=sr))
pp.plot_mel_spectrogram(np.transpose(restored_test_sg_t),sr, figsize=(2,2))

# Try a convolutional autoencoder


In [None]:
def build_convolutional_autoencoder():
    """Build an (uncompiled) 1-D convolutional autoencoder for 256-value inputs.

    Encoder: three Conv1D layers (16, 32, 64 filters), each followed by a
    MaxPool1D with pool size 4, shrinking the length 256 -> 64 -> 16 -> 4.
    Decoder: three Conv1DTranspose layers with stride 4, growing the length
    4 -> 16 -> 64 -> 256 and ending with a single ReLU-activated filter.

    Returns:
        A Sequential model taking input of shape (batch, 256) and producing
        output of shape (batch, 256, 1); note the trailing channel axis.
    """
    conv_ac = models.Sequential()
    # Conv1D expects a channel axis, so add one: (256,) -> (256, 1).
    conv_ac.add(layers.Reshape([256,1], input_shape=[256]))
    # The input shape is already fixed by the Reshape layer above, so the
    # convolution does not need (and newer Keras versions reject) input_dim.
    conv_ac.add(layers.Conv1D(16, kernel_size=6, padding="same", activation='selu'))
    conv_ac.add(layers.MaxPool1D(pool_size=4))
    conv_ac.add(layers.Conv1D(32, kernel_size=6, padding="same", activation='selu'))
    conv_ac.add(layers.MaxPool1D(pool_size=4))
    conv_ac.add(layers.Conv1D(64, kernel_size=6, padding="same", activation='selu'))
    conv_ac.add(layers.MaxPool1D(pool_size=4))

    # Each transposed convolution undoes one pooling step (stride 4).
    conv_ac.add(layers.Conv1DTranspose(32, kernel_size=6, strides=4, padding="same", activation='selu'))
    conv_ac.add(layers.Conv1DTranspose(16, kernel_size=6, strides=4, padding="same", activation='selu'))
    conv_ac.add(layers.Conv1DTranspose(1, kernel_size=6, strides=4, padding="same", activation='relu'))


    return conv_ac
# Instantiate the convolutional autoencoder and compile it with Adam (default
# settings); MSE is used both as the loss and as the reported metric.
conv_ac = build_convolutional_autoencoder()
optimizer = Adam()
conv_ac.compile(optimizer=optimizer, loss='mse', metrics=['mse'])

In [None]:
# Print the layer-by-layer summary of the convolutional autoencoder.
conv_ac.summary()

In [None]:
# Cache to RAM to speed up the training. This requires arrays to be converted to Datasets
from tensorflow import data
#from tensorflow.data.Dataset import cache

AUTOTUNE = data.experimental.AUTOTUNE

# (degraded input, clean target) pairs built from the transposed spectrograms,
# batched, cached after the first pass and prefetched.
train_dataset = (
    data.Dataset.from_tensor_slices(
        (np.transpose(degraded_train_sg), np.transpose(train_sg))
    )
    .batch(10000)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)
validation_dataset = (
    data.Dataset.from_tensor_slices(
        (np.transpose(degraded_test_sg), np.transpose(test_sg))
    )
    .batch(3000)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

#val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
# Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
es = EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
    verbose=0
)

# Checkpoint every epoch so an interrupted run can be resumed. The directory is
# specific to this model: Keras requires that a BackupAndRestore directory is
# not reused by another training, otherwise the leftovers of an interrupted run
# of a different model would be restored into this one.
br = BackupAndRestore(
    backup_dir="training_backup_conv_ac",
    save_freq="epoch",
    delete_checkpoint=True
)

# train_dataset is a tf.data.Dataset that is already batched (10000 per batch),
# so batch_size must not be passed to fit(). workers and use_multiprocessing
# only apply to generator / keras.utils.Sequence inputs, so they are omitted.
history = conv_ac.fit(
    x=train_dataset,
    validation_data=validation_dataset,
    epochs=100,
    verbose=1,
    callbacks=[es, br],
)

In [None]:
# Plot the 1st and 3rd recorded history series on a log-scaled y axis
# (presumably loss and val_loss for a model compiled with one extra metric).
hist_df = pd.DataFrame(history.history)
headers = hist_df.columns.values.tolist()
first_col, third_col = headers[0], headers[2]
plot = hist_df[[first_col, third_col]].plot(
    title=f"{first_col}, {third_col}",
    logy=True,
)

In [None]:
# Run the convolutional autoencoder on the transposed degraded test spectrogram.
# The model's last layer has a single filter, so the raw prediction keeps a
# trailing channel axis of size 1.
restored_test_sg_t = conv_ac.predict(np.transpose(degraded_test_sg))


In [None]:
# Inspect the shape of the raw conv_ac prediction.
restored_test_sg_t.shape

In [None]:
# Inspect the shape of reconstructed_test, the reference audio played in the comparison cells.
reconstructed_test.shape

In [None]:
# Persist the trained convolutional autoencoder.
models.save_model(conv_ac, 'conv_16-32-64-32-16-1')

# Drop the trailing single-channel axis so the prediction is 2-D again.
restored_test_sg_t = conv_ac.predict(np.transpose(degraded_test_sg))[:, :, 0]

# Reference: the test audio and its spectrogram.
print ("reconstructed_test")
IPython.display.display(IPython.display.Audio(data=reconstructed_test,  rate=sr))
pp.plot_mel_spectrogram(test_sg,sr, figsize=(2,2))


# Model output: the prediction is a spectrogram, not a waveform, so it has to be
# converted to audio before playback (passing the matrix straight to Audio
# would treat every row as a separate audio channel).
print ("reconstructed_restored_degraded_test")
IPython.display.display(IPython.display.Audio(data=pp.spectrogram_2_waveform(np.transpose(restored_test_sg_t), sr=sr),  rate=sr))
pp.plot_mel_spectrogram(np.transpose(restored_test_sg_t),sr, figsize=(2,2))

# Simple autoencoder (100, 256 dense) model training


In [None]:
# NOTE(review): reinitialize() is defined elsewhere in the notebook; presumably
# it resets the model's weights so training starts from scratch - confirm.
reinitialize(autoencoder)

In [None]:
# Train for the full 10000 epochs (no early-stopping callback is used here).
# train_dataset is a tf.data.Dataset that is already batched, so batch_size
# must not be passed to fit(). workers and use_multiprocessing only apply to
# generator / keras.utils.Sequence inputs, so they are omitted as well.
history = autoencoder.fit(
    x=train_dataset,
    validation_data=validation_dataset,
    epochs=10000,
    verbose=1,
)

In [None]:
# Plot the 1st and 3rd recorded history series on a log-scaled y axis
# (presumably loss and val_loss - confirm against how the model was compiled).
hist_df = pd.DataFrame(history.history)
headers = hist_df.columns.values.tolist()
first_col, third_col = headers[0], headers[2]
plot = hist_df[[first_col, third_col]].plot(
    title=f"{first_col}, {third_col}",
    logy=True,
)

In [None]:
# Persist the autoencoder after the 10000-epoch run.
models.save_model(autoencoder, 'autoencoder_baseline_10000')

In [None]:
# Run the autoencoder on the transposed degraded test spectrogram.
restored_test_sg_t = autoencoder.predict(np.transpose(degraded_test_sg))

In [None]:
# Reference: the test audio and its spectrogram.
print ("reconstructed_test")
IPython.display.display(IPython.display.Audio(data=reconstructed_test,  rate=sr))
pp.plot_mel_spectrogram(test_sg,sr, figsize=(2,2))


# Model output: synthesize the audio from the current restored_test_sg_t
# instead of replaying a waveform left over from an earlier cell.
print ("reconstructed_restored_degraded_test")
IPython.display.display(IPython.display.Audio(data=pp.spectrogram_2_waveform(np.transpose(restored_test_sg_t), sr=sr),  rate=sr))
pp.plot_mel_spectrogram(np.transpose(restored_test_sg_t),sr, figsize=(2,2))

In [None]:
from tensorflow.keras import callbacks
from keras.callbacks import EarlyStopping
from keras.callbacks import BackupAndRestore



## Same model with regularization

## Same model with dropout

## Same model with dropout & regularization

In [None]:
# Persist autoencoder_do_reg (presumably the dropout + regularization variant,
# defined elsewhere in the notebook) as 'autoencoder_002'.
models.save_model(autoencoder_do_reg, 'autoencoder_002')

In [None]:
# Persist autoencoder_do_reg_selu (presumably the SELU-activated dropout +
# regularization variant, defined elsewhere in the notebook) as 'autoencoder_003'.
models.save_model(autoencoder_do_reg_selu, 'autoencoder_003')