In [None]:
!pip install pretty_midi
!pip install o



In [None]:
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras import layers, Model
import soundfile as sf
from scipy.signal import resample


#      Load & Preprocess Audio  #


In [None]:
def load_audio(file_path, target_sr=22050):
    """Read an audio file from disk via ``librosa.load``.

    Args:
        file_path: Path of the audio file to read.
        target_sr: Sample rate handed to ``librosa.load`` as ``sr``
            (defaults to 22050).

    Returns:
        A ``(waveform, sample_rate)`` tuple exactly as produced by
        ``librosa.load``.
    """
    waveform, sample_rate = librosa.load(file_path, sr=target_sr)
    return waveform, sample_rate

#         Encoder Model         #

In [None]:
def build_encoder(latent_dim, input_shape):
    """Assemble the convolutional encoder half of the VAE.

    Args:
        latent_dim: Size of the latent vector.
        input_shape: Shape of one input example (without the batch axis),
            passed straight to ``layers.Input``.

    Returns:
        A Keras ``Model`` named ``'encoder'`` whose outputs are
        ``[z_mean, z_log_var, z]``, where ``z`` is a sample drawn with the
        reparameterization trick.
    """
    inputs = layers.Input(shape=input_shape)

    # Two Conv1D -> MaxPooling1D stages (64 filters, then 32); each pooling
    # step uses pool size 2 with 'same' padding.
    features = inputs
    for n_filters in (64, 32):
        features = layers.Conv1D(n_filters, 3, activation='relu', padding='same')(features)
        features = layers.MaxPooling1D(2, padding='same')(features)
    flat = layers.Flatten()(features)

    # Two parallel linear heads parameterize the approximate posterior.
    z_mean = layers.Dense(latent_dim, name='z_mean')(flat)
    z_log_var = layers.Dense(latent_dim, name='z_log_var')(flat)

    def sampling(args):
        # z = mean + sigma * noise, with sigma = exp(0.5 * log_var).
        mean, log_var = args
        batch_size = tf.shape(mean)[0]
        noise = tf.keras.backend.random_normal(shape=(batch_size, latent_dim))
        return mean + tf.exp(0.5 * log_var) * noise

    z = layers.Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])
    return Model(inputs, [z_mean, z_log_var, z], name='encoder')

#         Decoder Model         #

In [None]:
def build_decoder(latent_dim, output_shape):
    """Build the VAE decoder mapping a latent vector to an ``output_shape`` tensor.

    The network projects the latent vector to a short sequence, then applies
    two Conv1D + UpSampling1D(2) stages (x4 total along the time axis) and a
    final Conv1D that produces ``output_shape[1]`` channels.

    The previous version reshaped to the full ``output_shape`` *before* the two
    upsampling stages, so the model emitted ``4 * output_shape[0]`` timesteps
    and could never match the encoder input for a reconstruction loss. The
    sequence now starts at ``ceil(output_shape[0] / 4)`` steps and any surplus
    steps are cropped, so the output is exactly ``output_shape``.

    Args:
        latent_dim: Size of the latent input vector.
        output_shape: ``(timesteps, channels)`` of one decoded example
            (without the batch axis).

    Returns:
        A Keras ``Model`` named ``'decoder'`` with input shape
        ``(latent_dim,)`` and output shape ``output_shape``.
    """
    timesteps, channels = output_shape
    upsample_factor = 2 * 2  # two UpSampling1D(2) stages below

    # Ceiling division so that reduced_steps * upsample_factor >= timesteps
    # even when timesteps is not a multiple of 4 (e.g. 22050 -> 5513).
    reduced_steps = -(-timesteps // upsample_factor)

    latent_inputs = layers.Input(shape=(latent_dim,))
    x = layers.Dense(reduced_steps * channels, activation='relu')(latent_inputs)
    x = layers.Reshape((reduced_steps, channels))(x)

    x = layers.Conv1D(32, 3, activation='relu', padding='same')(x)
    x = layers.UpSampling1D(2)(x)
    x = layers.Conv1D(64, 3, activation='relu', padding='same')(x)
    x = layers.UpSampling1D(2)(x)

    # Trim the (at most 3) extra trailing steps introduced by rounding up.
    excess = reduced_steps * upsample_factor - timesteps
    if excess:
        x = layers.Cropping1D((0, excess))(x)

    # NOTE(review): sigmoid bounds the output to (0, 1). If the training
    # targets are raw waveforms with negative samples, the targets must be
    # rescaled or this activation reconsidered — confirm against the
    # training code.
    outputs = layers.Conv1D(channels, 3, activation='sigmoid', padding='same')(x)

    decoder = Model(latent_inputs, outputs, name='decoder')
    return decoder

#       VAE Class        #

In [None]:
class VAE(Model):
    """Variational autoencoder that chains an encoder and a decoder.

    The KL-divergence regularizer is registered through ``add_loss`` on every
    forward pass; a reconstruction loss must be supplied separately (e.g. via
    ``compile(loss=...)`` or a custom training loop).

    Args:
        encoder: Model returning ``[z_mean, z_log_var, z]`` for a batch.
        decoder: Model mapping a batch of ``z`` to reconstructions.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def call(self, inputs):
        """Encode ``inputs``, decode the sampled latent, and add the KL loss.

        Returns:
            The decoder output for the sampled latent vectors.
        """
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)

        # KL(q(z|x) || N(0, I)) for a diagonal Gaussian is a SUM over latent
        # dimensions; only the batch axis is averaged. Averaging over the
        # latent axis as well (the previous behaviour) silently scaled the
        # regularizer down by a factor of latent_dim.
        kl_per_sample = -0.5 * tf.reduce_sum(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1
        )
        self.add_loss(tf.reduce_mean(kl_per_sample))
        return reconstructed


#       Mix & Manipulate Audio  #

In [None]:
def mix_audio(audio1, audio2, alpha=0.5):
    """Blend two signals sample-by-sample as ``alpha*a + (1-alpha)*b``.

    Both inputs are truncated to the length of the shorter one before mixing.

    Args:
        audio1: First signal; weighted by ``alpha``.
        audio2: Second signal; weighted by ``1 - alpha``.
        alpha: Mixing weight of ``audio1`` (defaults to 0.5).

    Returns:
        The weighted sum, with length ``min(len(audio1), len(audio2))``.
    """
    common_len = min(len(audio1), len(audio2))
    first = audio1[:common_len]
    second = audio2[:common_len]
    return alpha * first + (1 - alpha) * second

def change_speed(audio, factor):
    """Time-stretch ``audio`` by resampling it to ``int(len(audio) * factor)`` samples.

    ``factor`` scales the number of samples (i.e. the duration when played
    back at the original sample rate): ``factor > 1`` yields a longer, slower
    signal and ``factor < 1`` a shorter, faster one. Because this is plain
    resampling, pitch shifts together with speed.

    Args:
        audio: 1-D signal to resample.
        factor: Positive, finite length multiplier.

    Returns:
        The resampled signal produced by ``scipy.signal.resample``.

    Raises:
        ValueError: If ``factor`` is not a positive finite number, or if the
            resulting length would be zero (empty input or ``factor`` too
            small for the input length).
    """
    # The chained comparison also rejects NaN, since every comparison with
    # NaN is False.
    if not 0 < factor < float("inf"):
        raise ValueError(f"factor must be a positive finite number, got {factor!r}")

    target_len = int(len(audio) * factor)
    if target_len < 1:
        # Fail here with a clear message instead of deep inside the FFT code.
        raise ValueError(
            f"resampled length would be {target_len} "
            f"(len(audio)={len(audio)}, factor={factor!r})"
        )
    return resample(audio, target_len)

#       Execution & Training    #

In [None]:
# --- Configuration ---------------------------------------------------------
SAMPLE_RATE = 22050  # Hz; shared by the audio loader and the model input length
SEED = 42            # fixes weight initialisation and the sampled latent vector

# Seed Python, NumPy and TensorFlow so a fresh "Restart & Run All" reproduces
# the same generated file.
tf.keras.utils.set_random_seed(SEED)

latent_dim = 16
input_shape = (SAMPLE_RATE, 1)  # SAMPLE_RATE timesteps (one second), one channel

# --- Build the models ------------------------------------------------------
encoder = build_encoder(latent_dim, input_shape)
decoder = build_decoder(latent_dim, input_shape)
vae = VAE(encoder, decoder)

# --- Load two different MP3 files -----------------------------------------
audio1, sr1 = load_audio('/content/Piano sa re ja ma pa.mp3', target_sr=SAMPLE_RATE)
audio2, sr2 = load_audio('/content/Sa re ja ma pa.mp3', target_sr=SAMPLE_RATE)

# --- Mix & Manipulate ------------------------------------------------------
audio_mixed = mix_audio(audio1, audio2, alpha=0.5)
# change_speed scales the sample count: 1.2 -> longer (slower), 0.8 -> shorter (faster).
audio_slow = change_speed(audio_mixed, 1.2)
audio_fast = change_speed(audio_mixed, 0.8)

# --- Save processed audio --------------------------------------------------
sf.write('mixed_audio.wav', audio_mixed, sr1)
sf.write('slow_audio.wav', audio_slow, sr1)
sf.write('fast_audio.wav', audio_fast, sr1)

# --- Train the VAE ---------------------------------------------------------
# (This is a placeholder; you need to define your training process)
# train_vae([audio1, audio2], vae, epochs=10)

# --- Generate music --------------------------------------------------------
# NOTE: training above is still a placeholder, so the decoder has only its
# randomly initialised weights here; the written file is not learned music.
random_latent_vector = np.random.normal(size=(1, latent_dim))
generated_audio = decoder.predict(random_latent_vector)
sf.write('generated_music.wav', generated_audio[0], sr1)

print("✅ Music Style Transfer Complete!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310ms/step
✅ Music Style Transfer Complete!
