## Step 1: Load and Preprocess Audio Data

In [40]:
import librosa
import numpy as np
import tensorflow as tf

def load_audio(path, sr=22050, duration=30):
    audio, _ = librosa.load(path, sr=sr, duration=duration)
    return audio

def audio_to_spectrogram(audio, n_fft=2048, hop_length=512):
    spectrogram = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
    spectrogram = np.abs(spectrogram)
    # Add a channel dimension (3 channels for RGB)
    spectrogram = np.stack([spectrogram, spectrogram, spectrogram], axis=-1)
    return spectrogram

content_audio = load_audio('../audio/fade.mp3')
style_audio = load_audio('../audio/dont.mp3')

content_spectrogram = audio_to_spectrogram(content_audio)
style_spectrogram = audio_to_spectrogram(style_audio)

# Convert to TensorFlow tensor and add batch dimension
content_spectrogram = tf.convert_to_tensor(content_spectrogram, dtype=tf.float32)
style_spectrogram = tf.convert_to_tensor(style_spectrogram, dtype=tf.float32)

# content_spectrogram = tf.expand_dims(content_spectrogram, axis=0)
# style_spectrogram = tf.expand_dims(style_spectrogram, axis=0)

In [49]:
print(content_spectrogram.shape)
print(style_spectrogram.shape)

(1025, 434, 3)
(1025, 437, 3)


## Step 2: Define Content and Style Representations

In [43]:
from tensorflow.keras.applications import VGG19

def vgg_layers(layer_names):
    vgg = VGG19(include_top=False, weights='imagenet')
    vgg.trainable = False
    outputs = [vgg.get_layer(name).output for name in layer_names]
    model = tf.keras.Model([vgg.input], outputs)
    return model

content_layers = ['block5_conv2']
style_layers = ['block1_conv1', 'block2_conv1', 'block3_conv1', 'block4_conv1', 'block5_conv1']

num_content_layers = len(content_layers)
num_style_layers = len(style_layers)

extractor = vgg_layers(style_layers + content_layers)

## Step 3: Build the Model

In [44]:
class StyleContentModel(tf.keras.models.Model):
    def __init__(self, style_layers, content_layers):
        super(StyleContentModel, self).__init__()
        self.vgg = vgg_layers(style_layers + content_layers)
        self.style_layers = style_layers
        self.content_layers = content_layers
        self.num_style_layers = len(style_layers)
        self.vgg.trainable = False

    def call(self, inputs):
        inputs = inputs * 255.0
        preprocessed_input = tf.keras.applications.vgg19.preprocess_input(inputs)
        outputs = self.vgg(preprocessed_input)
        style_outputs, content_outputs = (outputs[:self.num_style_layers], outputs[self.num_style_layers:])

        style_outputs = [gram_matrix(style_output) for style_output in style_outputs]

        content_dict = {content_name: value for content_name, value in zip(self.content_layers, content_outputs)}
        style_dict = {style_name: value for style_name, value in zip(self.style_layers, style_outputs)}

        return {'content': content_dict, 'style': style_dict}

def gram_matrix(input_tensor):
    result = tf.linalg.einsum('bijc,bijd->bcd', input_tensor, input_tensor)
    input_shape = tf.shape(input_tensor)
    num_locations = tf.cast(input_shape[1] * input_shape[2], tf.float32)
    return result / num_locations

In [47]:
extractor = StyleContentModel(style_layers, content_layers)

## Step 4: Calculate Style and Content Loss 

In [45]:
style_weight = 1e-2
content_weight = 1e4

def style_content_loss(outputs):
    style_outputs = outputs['style']
    content_outputs = outputs['content']
    style_loss = tf.add_n([tf.reduce_mean((style_outputs[name] - style_targets[name]) ** 2) for name in style_outputs.keys()])
    style_loss *= style_weight / num_style_layers

    content_loss = tf.add_n([tf.reduce_mean((content_outputs[name] - content_targets[name]) ** 2) for name in content_outputs.keys()])
    content_loss *= content_weight / num_content_layers
    loss = style_loss + content_loss
    return loss

## Step 5: Run Gradient Descent

In [46]:
opt = tf.keras.optimizers.Adam(learning_rate=0.02, beta_1=0.99, epsilon=1e-1)
image = tf.Variable(content_spectrogram)

@tf.function()
def train_step(image):
    with tf.GradientTape() as tape:
        # Ensure the image has the correct shape for VGG19
        image_expanded = tf.expand_dims(image, axis=0)
        outputs = extractor(image_expanded)
        loss = style_content_loss(outputs)

    grad = tape.gradient(loss, image)
    opt.apply_gradients([(grad, image)])
    image.assign(clip_0_1(image))

def clip_0_1(image):
    return tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=1.0)

import time
start = time.time()

epochs = 10
steps_per_epoch = 100

step = 0
for n in range(epochs):
    for m in range(steps_per_epoch):
        step += 1
        train_step(image)
        print(".", end='', flush=True)
    print("Train step: {}".format(step))

end = time.time()
print("Total time: {:.1f}".format(end - start))



TypeError: in user code:

    File "C:\Users\gupta\AppData\Local\Temp\ipykernel_2036\4054949617.py", line 10, in train_step  *
        loss = style_content_loss(outputs)
    File "C:\Users\gupta\AppData\Local\Temp\ipykernel_2036\785713401.py", line 5, in style_content_loss  *
        style_outputs = outputs['style']

    TypeError: list indices must be integers or slices, not str
