In [1]:
import librosa
import numpy as np
import IPython.display as ipd
import scipy.io.wavfile as wavfile

from tensorflow import keras
import tensorflow as tf

In [2]:
class AudioStuff:
    """Holds the path of an audio file and converts that file to a mel spectrogram."""

    def __init__(self, filename):
        """
        Remember which file to read; nothing is loaded here.

        :param filename: path handed to ``librosa.load`` later on
        """
        self.filename = filename

    def audio_to_spectrogram(self):
        """
        Load ``self.filename`` and compute its mel spectrogram.

        The waveform is read with ``librosa.load`` (library defaults), the
        squared magnitude of its STFT is computed, and both are handed to
        ``librosa.feature.melspectrogram`` (waveform as ``y``, squared
        magnitude as ``S``).

        :return: tuple ``(mel_spectrogram, sample_rate)``
        """
        waveform, sample_rate = librosa.load(self.filename)
        power_spectrum = np.abs(librosa.stft(waveform)) ** 2
        mel_spectrogram = librosa.feature.melspectrogram(
            y=waveform, sr=sample_rate, S=power_spectrum
        )
        return mel_spectrogram, sample_rate

In [3]:
def MergeSpectrogram(spectrometer1, spectrometer2):
    """
    Average two spectrograms element-wise.

    The parameter names (spelled "spectrometer") are kept unchanged so that
    keyword callers keep working.

    :param spectrometer1: first spectrogram (anything supporting ``+`` and ``/``)
    :param spectrometer2: second spectrogram; must be addable to the first
    :return: ``(spectrometer1 + spectrometer2) / 2``
    """
    # Use the function's own arguments: this is a plain function, so there is
    # no `self`, and no other spectrogram names are in scope.
    return (spectrometer1 + spectrometer2) / 2

In [4]:
def gram_matrix(tensor):
    """
    Multiply the squeezed input by its own transpose.

    Every size-1 dimension is removed with ``tf.squeeze`` and the result is
    matrix-multiplied with ``tf.transpose`` of itself. For a 2-D matrix ``A``
    this is the Gram matrix ``A @ A^T``.

    Adapted from
    https://www.datacamp.com/community/tutorials/implementing-neural-style-transfer-using-tensorflow

    :param tensor: input tensor; presumably expected to be 2-D once squeezed
        (TODO confirm for multi-channel feature maps)
    :return: product of the squeezed tensor and its transpose
    """
    squeezed = tf.squeeze(tensor)
    return tf.matmul(squeezed, tf.transpose(squeezed))

In [5]:
def get_content_loss(mel_targets, mel_outputs):
    """
    Content loss between a target and an output.

    :param mel_targets: reference values
    :param mel_outputs: values being compared against the reference
    :return: ``tf.reduce_mean`` of ``tf.losses.mean_squared_error(mel_targets, mel_outputs)``
    """
    squared_error = tf.losses.mean_squared_error(mel_targets, mel_outputs)
    return tf.reduce_mean(squared_error)
        
def get_style_loss(mel_targets, mel_outputs):
    """
    Style loss: mean squared error between the two inputs' ``gram_matrix`` results.

    :param mel_targets: reference values, passed to ``gram_matrix``
    :param mel_outputs: values being compared, passed to ``gram_matrix``
    :return: ``tf.reduce_mean`` of the MSE between both Gram matrices
    """
    target_gram = gram_matrix(mel_targets)
    output_gram = gram_matrix(mel_outputs)
    squared_error = tf.losses.mean_squared_error(target_gram, output_gram)
    return tf.reduce_mean(squared_error)

In [6]:
def spectrogram_to_audio(spectrogram):
    """
    Invert a mel spectrogram back to a waveform.

    Thin wrapper around ``librosa.feature.inverse.mel_to_audio`` called with
    the library's default settings.

    :param spectrogram: mel spectrogram to invert
    :return: whatever ``mel_to_audio`` returns for that spectrogram
    """
    return librosa.feature.inverse.mel_to_audio(spectrogram)

In [7]:
# TODO: check the paper to see whether the model needs dense layers; it is not clear yet.
def model_audio_tranfer(combination_spectrogram):
    """
    Create the feature extractor used for the transfer: a single Conv2D layer.

    Design notes carried over from the original author:
    - per the Keras docs a Sequential model is not beneficial here, and VGG19
      targets images so it cannot be used for audio;
    - for better results, dense layers could be added as three "block"
      functions containing softmax and dense layers (a Flatten would then be
      needed).

    The shape of the argument is printed as a side effect.

    :param combination_spectrogram: 4-D tensor whose shape is unpacked as
        ``(batch, time, frequency, channel)``
    :return: a ``tf.keras.layers.Conv2D`` layer (64 filters, kernel size 3,
        ReLU activation, stride 1)
    """
    _, co_time, co_frequency, co_channel = tuple(combination_spectrogram.shape)
    print(combination_spectrogram.shape)

    # Keras' `input_shape` describes a single sample, so the batch axis must
    # not be part of it.
    # Filter count was decreased because of memory issues.
    model = tf.keras.layers.Conv2D(
        64,
        3,
        activation="relu",
        strides=(1, 1),
        input_shape=(co_time, co_frequency, co_channel),
    )
    return model

In [8]:
def _channel_major_matrix(features):
    """Flatten a channels-last feature map to a 2-D ``(channels, positions)`` matrix."""
    channels = features.shape[-1]
    return tf.transpose(tf.reshape(features, [-1, channels]))


def compute_loss(content_features, style_features, combination_features):
    """
    Weighted sum of the content loss and the style loss.

    Only the three arguments are used; no notebook-level variables are read.

    :param content_features: features of the content clip
    :param style_features: features of the style clip
    :param combination_features: features of the spectrogram being optimised
    :return: ``content_weight * content_loss + (style_weight / len(style_features)) * style_loss``
    """
    style_weight = 1e-6
    content_weight = 2.5e-8

    # Content loss.
    loss = content_weight * get_content_loss(content_features, combination_features)

    # Style loss, computed from the feature maps passed in. Each map is
    # flattened to (channels, positions) so that gram_matrix produces a
    # channels x channels matrix even when the two clips differ in length.
    # Assumes channels-last features with more than one channel - TODO confirm.
    style_loss = get_style_loss(
        _channel_major_matrix(style_features),
        _channel_major_matrix(combination_features),
    )
    loss += (style_weight / len(style_features)) * style_loss

    # Total variation loss is currently disabled:
    # loss += total_variation_weight * total_variation_loss(combination_image)
    return loss

In [9]:
def train_step(model, optimizer, content_features, style_features, combination_spectrogram):
    """
    Perform one optimisation step on the combination spectrogram.

    :param model: callable applied to ``combination_spectrogram`` to obtain its features
    :param optimizer: object whose ``apply_gradients`` receives ``[(grads, combination_spectrogram)]``
    :param content_features: forwarded to ``compute_loss``
    :param style_features: forwarded to ``compute_loss``
    :param combination_spectrogram: the quantity being optimised; presumably a
        ``tf.Variable`` (TODO confirm against the caller)
    :return: tuple ``(loss, grads)``
    """
    with tf.GradientTape() as tape:
        # The model is applied under the tape, so the loss can be
        # differentiated back to combination_spectrogram through the model.
        combination_features = model(combination_spectrogram)
        loss = compute_loss(content_features, style_features, combination_features)

    grads = tape.gradient(loss, combination_spectrogram)
    optimizer.apply_gradients([(grads, combination_spectrogram)])
    return loss, grads

In [10]:
# Wrap both clips, then convert each one to a (mel spectrogram, sample rate) pair.
content_object = AudioStuff("content.wav")
style_object = AudioStuff("style.wav")

content_spectrogram, content_rate = content_object.audio_to_spectrogram()
print("original content_spectrogram shape", content_spectrogram.shape, sep="\n")

style_spectrogram, style_rate = style_object.audio_to_spectrogram()





FileNotFoundError: ignored

In [None]:
# Optimiser that updates the combination spectrogram inside train_step.
optimizer = tf.optimizers.Adam(learning_rate=0.02, beta_1=0.99, epsilon=1e-1)

# Unpack the two axes of each 2-D spectrogram.
# NOTE(review): the names say (time, frequency), but librosa documents the
# melspectrogram output as (n_mels, frames) - confirm the axis order.
s_time, s_frequency = style_spectrogram.shape
c_time, c_frequency = content_spectrogram.shape

# Add a leading batch axis and a trailing channel axis, both of size 1.
# Change the leading 1 to the batch size if more than one example is used.
content_spectrogram = tf.reshape(content_spectrogram, [1, c_time , c_frequency, 1])
style_spectrogram = tf.reshape(style_spectrogram, [1, s_time, s_frequency, 1])
# The spectrogram being optimised starts as random normal noise with the
# content spectrogram's shape; it is a tf.Variable and is the object passed
# to train_step for updating.
combination_spectrogram  =   tf.Variable(tf.random.normal([1, c_time, c_frequency, 1]))
#print(type(combination_spectrogram))
model = model_audio_tranfer(combination_spectrogram)

print(content_spectrogram.shape)

# Content and style features are computed once, before the loop.
content_features = model(content_spectrogram)
print(content_features.shape)
style_features = model(style_spectrogram)
print(style_features.shape)
# Only used here to print its shape; train_step recomputes the combination
# features on every iteration.
combination_features = model(combination_spectrogram)
print(combination_features.shape)

# Number of optimisation steps.
iterations = 100
for i in range(iterations):
    #print(content_spectrogram)
    loss, grads = train_step(
        model, optimizer, content_features, style_features, combination_spectrogram
    )
# Printed after the loop, so this is the loss of the last iteration only.
print(loss)

In [None]:
# The optimised spectrogram has shape (1, x, y, 1) and was originally (x, y),
# so squeezing the size-1 axes should restore its original shape.
final_combination_spectrogram = tf.squeeze(combination_spectrogram)
print('pre conversion spectrogram dims', final_combination_spectrogram.shape, sep='\n')

# Convert back to a waveform and save it using the style clip's sample rate.
merge_audio = spectrogram_to_audio(final_combination_spectrogram.numpy())
wavfile.write("output.wav", style_rate, merge_audio)
print("wrote file")

# Note: https://stackoverflow.com/questions/60365904/reconstructing-audio-from-a-melspectrogram-has-some-clipping-with-librosa
# reports that the reconstructed audio is much better when mel spectrograms are not used at all.

In [None]:
if __name__ == '__main__':
    # Baseline without optimisation: average the two mel spectrograms and
    # convert the average back to audio.
    content_object = AudioStuff("vocals.wav")
    style_object = AudioStuff("accompaniment.wav")
    content_spectrogram, content_rate = content_object.audio_to_spectrogram()
    style_spectrogram, style_rate = style_object.audio_to_spectrogram()

    # MergeSpectrogram is a plain function that takes both spectrograms.
    # Assumes the two spectrograms have matching shapes - TODO confirm.
    merge_spectrogram = MergeSpectrogram(content_spectrogram, style_spectrogram)
    print(merge_spectrogram.shape)

    merge_audio = spectrogram_to_audio(merge_spectrogram)
    # The OUTPUT directory is not created here; it must already exist.
    wavfile.write("OUTPUT/outp3.wav", style_rate, merge_audio)