<a href="https://colab.research.google.com/github/SamuelW669/DeBaussyAI/blob/main/TheREALDeal1_1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install tensorflow magenta librosa soundfile

Collecting magenta
  Using cached magenta-2.1.4-py3-none-any.whl.metadata (2.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-1.2.0-py3-none-any.whl.metadata (2.3 kB)
Collecting dm-sonnet==2.0.0 (from magenta)
  Using cached dm_sonnet-2.0.0-py3-none-any.whl.metadata (12 kB)
Collecting imageio==2.20.0 (from magenta)
  Using cached imageio-2.20.0-py3-none-any.whl.metadata (4.9 kB)
Collecting librosa
  Using cached librosa-0.7.2.tar.gz (1.6 MB)
  Preparing metadata (setup.py) ... done
Collecting matplotlib==3.5.2 (from magenta)
  Using cached matplotlib-3.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting mido==1.2.6 (from magenta)
  Using cached mido-1.2.6-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting mir-eval==0.7 (from magenta)
  Using cached mir_eval-0.7.tar.gz (90 kB)
  Preparing metadata (setup.py) ... done
Collecting note-seq==0.0.3 (from magenta)
  Using cached note_seq-0.0.3-py3-none-any.

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import librosa

# Load the NSynth dataset
def load_nsynth_dataset(batch_size=32, split='train'):
    """Load NSynth through TensorFlow Datasets as batched ``(audio, pitch)`` pairs.

    Args:
        batch_size: Number of examples per batch.
        split: Split name forwarded to ``tfds.load``.

    Returns:
        A dataset whose elements are ``(audio, pitch)`` tuples, both cast to
        float32 and grouped into batches of ``batch_size``.
    """
    # The dataset metadata was never used, so it is no longer requested.
    dataset = tfds.load('nsynth', split=split)

    def preprocess_fn(x):
        # TFDS already serves NSynth audio as float32 samples in [-1, 1].
        # Dividing by 32768 is only right for raw int16 PCM; applied here it
        # would shrink the signal to ~1e-5 and starve the model of dynamics.
        audio = tf.cast(x['audio'], tf.float32)
        pitch = tf.cast(x['pitch'], tf.float32)
        return audio, pitch

    return dataset.map(preprocess_fn).batch(batch_size)

# Build the training pipeline with the loader's defaults
# (batch_size=32, split='train').
train_dataset = load_nsynth_dataset()




Downloading and preparing dataset 73.07 GiB (download: 73.07 GiB, generated: 73.09 GiB, total: 146.16 GiB) to /root/tensorflow_datasets/nsynth/full/2.3.3...


Dl Completed...:   0%|          | 0/1069 [00:00<?, ? file/s]

Dataset nsynth downloaded and prepared to /root/tensorflow_datasets/nsynth/full/2.3.3. Subsequent calls will reuse this data.


In [None]:
!pip install tensorflow



In [None]:
from tensorflow.keras import layers
import tensorflow as tf

def build_generator(input_dim):
    """Assemble the generator: latent vector in, single-channel 128x128 map out.

    A dense projection is reshaped to a 16x16x128 feature map, which three
    stride-2 transposed convolutions upsample to 32x32, 64x64 and finally
    128x128. The last layer uses ``tanh``, so outputs lie in [-1, 1].

    Args:
        input_dim: Length of the latent (noise) vector fed to the model.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    # Settings shared by every upsampling stage (each one doubles height/width).
    upsample = dict(kernel_size=4, strides=2, padding='same')

    net = tf.keras.Sequential()
    net.add(layers.InputLayer(input_shape=(input_dim,)))

    # Project the latent vector and fold it into a 16x16x128 feature map.
    net.add(layers.Dense(16 * 16 * 128, activation='relu'))
    net.add(layers.Reshape((16, 16, 128)))

    # 16x16 -> 32x32 (128 filters) -> 64x64 (64 filters), each batch-normalised.
    for filters in (128, 64):
        net.add(layers.Conv2DTranspose(filters, activation='relu', **upsample))
        net.add(layers.BatchNormalization())

    # 64x64 -> 128x128 with a single output channel.
    net.add(layers.Conv2DTranspose(1, activation='tanh', **upsample))
    return net

# Instantiate the generator with a 100-dimensional latent input.
generator = build_generator(input_dim=100)



In [None]:
from tensorflow.keras import layers
import tensorflow as tf

def build_discriminator(input_shape=(64, 64, 1)):
    """Assemble the discriminator: image in, real/fake probability out.

    Two stride-2 convolutions (64 then 128 filters) downsample the input,
    the result is flattened, passed through a 128-unit dense layer and
    reduced to one sigmoid unit.

    Args:
        input_shape: ``(height, width, channels)`` of a single input image.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    # Settings shared by both downsampling convolutions.
    downsample = dict(kernel_size=4, strides=2, padding='same', activation='relu')

    net = tf.keras.Sequential()
    net.add(layers.InputLayer(input_shape=input_shape))
    for filters in (64, 128):
        net.add(layers.Conv2D(filters, **downsample))
    net.add(layers.Flatten())
    net.add(layers.Dense(128, activation='relu'))
    # Single sigmoid unit: binary classification (real or fake).
    net.add(layers.Dense(1, activation='sigmoid'))
    return net

# Instantiate the discriminator with its default (64, 64, 1) input shape.
discriminator = build_discriminator()


In [None]:
# The discriminator's last layer is a sigmoid, so it emits probabilities, not
# logits. `from_logits=True` would make the loss apply a second sigmoid to
# those probabilities and distort both losses and gradients.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=False)

def generator_loss(fake_output):
    """Loss that rewards the generator when its samples are judged real.

    Args:
        fake_output: Discriminator scores for generated samples.

    Returns:
        The module-level ``cross_entropy`` between an all-ones target and
        ``fake_output``.
    """
    wanted_labels = tf.ones_like(fake_output)
    return cross_entropy(wanted_labels, fake_output)

def discriminator_loss(real_output, fake_output):
    """Loss that rewards the discriminator for telling real from generated.

    Args:
        real_output: Discriminator scores for real samples (target: ones).
        fake_output: Discriminator scores for generated samples (target: zeros).

    Returns:
        Sum of the two ``cross_entropy`` terms, real first, then fake.
    """
    # Pair each score batch with the factory that builds its target labels.
    targets = ((tf.ones_like, real_output), (tf.zeros_like, fake_output))
    real_loss, fake_loss = (
        cross_entropy(make_labels(scores), scores)
        for make_labels, scores in targets
    )
    return real_loss + fake_loss

# One Adam optimizer per network, both with a learning rate of 1e-4.
generator_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

In [None]:
import os
import time

# Batch size handed to `train_step`. The dataset loader has its own
# `batch_size` default, which is also 32.
BATCH_SIZE = 32

# Checkpoint files go under `checkpoint_dir`, named with the "ckpt" prefix.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

# Track both networks and both optimizers in a single checkpoint object.
checkpoint = tf.train.Checkpoint(
    generator_optimizer=generator_optimizer,
    discriminator_optimizer=discriminator_optimizer,
    generator=generator,
    discriminator=discriminator,
)

def _audio_to_spectrogram(audio, target_hw):
    """Turn raw waveforms into fixed-size spectrogram images in [-1, 1].

    Args:
        audio: Float tensor of shape (batch, samples).
        target_hw: ``(height, width)`` of the images the discriminator expects.

    Returns:
        Float tensor of shape (batch, height, width, 1).
    """
    # Log-magnitude STFT, shaped (batch, frames, frequency_bins).
    stft = tf.signal.stft(audio, frame_length=1024, frame_step=256, pad_end=True)
    log_magnitude = tf.math.log1p(tf.abs(stft))

    # Per-example min-max scaling to [-1, 1] so real inputs share the value
    # range of the generator's tanh output. The epsilon guards silent clips.
    lowest = tf.reduce_min(log_magnitude, axis=[1, 2], keepdims=True)
    highest = tf.reduce_max(log_magnitude, axis=[1, 2], keepdims=True)
    scaled = 2.0 * (log_magnitude - lowest) / tf.maximum(highest - lowest, 1e-8) - 1.0

    # Frequency on the vertical axis, time on the horizontal axis, one channel.
    images = tf.transpose(scaled, perm=[0, 2, 1])[..., tf.newaxis]
    return tf.image.resize(images, target_hw)


@tf.function
def train_step(audio, pitch, batch_size):
    """Run one adversarial update of the generator and the discriminator.

    The raw waveform cannot simply be reshaped into an image (64000 samples
    do not fit a 64x64 grid), so it is first converted to a spectrogram sized
    for the discriminator. Generated samples are resized to the same size
    when the generator's output resolution differs from it.

    Args:
        audio: Batch of raw waveforms, shape (batch, samples).
        pitch: Pitch values for the batch. Accepted for interface
            compatibility but not used by this step.
        batch_size: Nominal batch size. Kept for backward compatibility; the
            actual batch size is read from ``audio`` so that a final, smaller
            batch does not break the step.

    Returns:
        ``(gen_loss, disc_loss)`` for this batch.
    """
    actual_batch_size = tf.shape(audio)[0]
    # Derive sizes from the models instead of hard-coding them.
    latent_dim = generator.input_shape[-1]
    target_hw = discriminator.input_shape[1:3]

    noise = tf.random.normal([actual_batch_size, latent_dim])

    # Real inputs need no gradient, so they are prepared outside the tapes.
    real_spectrograms = _audio_to_spectrogram(audio, target_hw)

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        # Generate spectrograms from the noise input
        generated_spectrograms = generator(noise, training=True)

        # Bring generated samples to the discriminator's input resolution.
        # tf.image.resize is differentiable, so generator gradients still flow.
        if tuple(generated_spectrograms.shape[1:3]) != tuple(target_hw):
            generated_spectrograms = tf.image.resize(generated_spectrograms, target_hw)

        # Discriminator output for real and generated spectrograms
        real_output = discriminator(real_spectrograms, training=True)
        fake_output = discriminator(generated_spectrograms, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    # Calculate gradients and apply them
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

    return gen_loss, disc_loss

def train(dataset, epochs):
    """Train the GAN for ``epochs`` full passes over ``dataset``.

    After every epoch the losses of the last processed batch and the epoch's
    wall-clock time are printed; every tenth epoch a checkpoint is saved.

    Args:
        dataset: Iterable yielding ``(audio, pitch)`` batches.
        epochs: Number of passes over ``dataset``.

    Raises:
        ValueError: If ``dataset`` yields no batches (previously this
            surfaced as an unbound-variable error in the print statement).
    """
    for epoch in range(epochs):
        start = time.time()

        # Sentinels make an empty dataset detectable after the loop.
        gen_loss = disc_loss = None
        for audio, pitch in dataset:
            gen_loss, disc_loss = train_step(audio, pitch, BATCH_SIZE)  # Pass BATCH_SIZE here

        if gen_loss is None:
            raise ValueError('dataset yielded no batches; nothing to train on')

        print(f'Epoch {epoch+1}, Generator Loss: {gen_loss}, Discriminator Loss: {disc_loss}')
        print(f'Time for epoch {epoch+1}: {time.time() - start:.2f} sec')
        if (epoch + 1) % 10 == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)

train(train_dataset, epochs=50)


ValueError: in user code:

    File "<ipython-input-20-a47937a80b43>", line 22, in train_step  *
        real_audio = tf.reshape(audio, (tf.shape(audio)[0], 64, 64, 1))  # Adjust dimensions as needed

    ValueError: Cannot reshape a tensor with 2048000 elements to shape [32,64,64,1] (131072 elements) for '{{node Reshape}} = Reshape[T=DT_FLOAT, Tshape=DT_INT32](audio, Reshape/shape)' with input shapes: [32,64000], [4] and with input tensors computed as partial shapes: input[1] = [32,64,64,1].


In [None]:
import librosa
import librosa.display

# Example: Inverse transformation of a Mel-spectrogram back to audio
def spectrogram_to_audio(spectrogram, sr=22050, **kwargs):
    """Invert a Mel-spectrogram back to a waveform with librosa.

    Args:
        spectrogram: Mel-spectrogram to invert.
        sr: Sample rate of the audio the spectrogram was computed from. The
            default equals librosa's own default, so existing calls behave
            as before. NOTE(review): NSynth-derived spectrograms presumably
            need a different rate — confirm against the data.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``librosa.feature.inverse.mel_to_audio`` (e.g. ``n_fft``,
            ``hop_length``, ``n_iter``).

    Returns:
        Whatever ``librosa.feature.inverse.mel_to_audio`` returns for the
        given spectrogram.
    """
    return librosa.feature.inverse.mel_to_audio(spectrogram, sr=sr, **kwargs)

In [None]:
# Persist the current state of both networks and their optimizers.
checkpoint.save(file_prefix=checkpoint_prefix)

# Reload the most recent checkpoint found in `checkpoint_dir`.
checkpoint.restore(
    tf.train.latest_checkpoint(checkpoint_dir)
)