In [1]:
import os
import librosa
import librosa.display
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt


In [2]:
# Paths to data
# Training directories. The untrimmed variants are kept below, commented out;
# the active assignments point at the "*_trim" directories instead.
# train_blended_paths = '../data/audios/english/train/blended'
# train_clean_paths = '../data/audios/english/train/clean'
train_blended_paths = '../data/audios/english/train/blended_trim'
train_clean_paths = '../data/audios/english/train/clean_trim'


# Validation directories (same trimmed/untrimmed split as above).
# NOTE: val_blended / val_clean hold directory paths only until the loading
# cell runs; that cell rebinds the same names to the loaded spectrogram lists.
# val_blended = '../data/audios/english/validation/blended'
# val_clean = '../data/audios/english/validation/clean'
val_blended = '../data/audios/english/validation/blended_trim'
val_clean = '../data/audios/english/validation/clean_trim'


# Test directories (untrimmed). Not read by any of the cells shown here.
test_blended = '../data/audios/english/test/blended'
test_clean = '../data/audios/english/test/clean'

In [3]:
# Function to load audio and create spectrogram
def load_and_preprocess_audio(filepath, sr=16000, fixed_length=300, n_fft=1024, hop_length=512):
    """Load an audio file and return a dB-scaled magnitude spectrogram of fixed width.

    Args:
        filepath: Path of the audio file; passed straight to ``librosa.load``.
        sr: Sample rate the audio is resampled to while loading.
        fixed_length: Number of time frames (columns) in the returned array.
        n_fft: FFT size passed to ``librosa.stft``.
        hop_length: Hop between successive STFT frames, passed to ``librosa.stft``.

    Returns:
        2-D array of dB values with exactly ``fixed_length`` columns. Longer
        recordings are truncated; shorter ones are right-padded.
    """
    y, _ = librosa.load(filepath, sr=sr)
    stft = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
    spectrogram_db = librosa.amplitude_to_db(np.abs(stft))

    n_frames = spectrogram_db.shape[1]
    if n_frames < fixed_length:
        # Pad with the quietest value already present rather than 0: in the dB
        # domain 0 corresponds to amplitude 1.0 (the reference level), so zero
        # padding would append energy instead of silence.
        pad_value = spectrogram_db.min() if spectrogram_db.size else 0.0
        spectrogram_db = np.pad(
            spectrogram_db,
            ((0, 0), (0, fixed_length - n_frames)),
            mode='constant',
            constant_values=pad_value,
        )
    else:
        # Keep only the first `fixed_length` frames.
        spectrogram_db = spectrogram_db[:, :fixed_length]

    return spectrogram_db


# Loading data pairs (blended and clean)
def load_data_pairs(blended_path, clean_path):
    """Load the spectrograms of every blended/clean recording pair.

    Blended files are the ``.mp3`` entries of ``blended_path`` and clean files
    the ``.flac`` entries of ``clean_path``. Both listings are sorted
    independently and paired by position, so the two directories must contain
    the same number of files with names that sort in the same order.

    Args:
        blended_path: Directory containing the blended ``.mp3`` files.
        clean_path: Directory containing the clean ``.flac`` files.

    Returns:
        ``(blended_spectrograms, clean_spectrograms)``: two lists of equal
        length, where index ``i`` of each list belongs to the same pair.

    Raises:
        ValueError: If the two directories hold a different number of files;
            positional pairing would otherwise be silently misaligned.
    """
    blended_files = sorted(
        os.path.join(blended_path, name)
        for name in os.listdir(blended_path)
        if name.endswith('.mp3')
    )
    clean_files = sorted(
        os.path.join(clean_path, name)
        for name in os.listdir(clean_path)
        if name.endswith('.flac')
    )

    # Check before decoding anything: loading audio is the expensive part.
    if len(blended_files) != len(clean_files):
        raise ValueError(
            f"Cannot pair files: found {len(blended_files)} blended '.mp3' files in "
            f"{blended_path!r} but {len(clean_files)} clean '.flac' files in {clean_path!r}"
        )

    blended_spectrograms = [load_and_preprocess_audio(path) for path in blended_files]
    clean_spectrograms = [load_and_preprocess_audio(path) for path in clean_files]

    return blended_spectrograms, clean_spectrograms


In [4]:
# Load training and validation data
train_blended, train_clean = load_data_pairs(train_blended_paths, train_clean_paths)

# val_blended / val_clean start out as directory paths and are rebound here to
# the loaded spectrogram lists. Only load while they are still paths, so that
# re-running this cell does not hand the already-loaded lists to os.listdir.
if isinstance(val_blended, str):
    val_blended, val_clean = load_data_pairs(val_blended, val_clean)


In [5]:
from tensorflow.keras.layers import Layer

# Registering the class lets Keras resolve it by name when a saved model that
# contains this layer is loaded, without passing `custom_objects` by hand.
@tf.keras.utils.register_keras_serializable(package="Custom")
class ResizeLayer(Layer):
    """Layer that resizes its input to a fixed height and width.

    The resize is done by ``tf.image.resize`` with its default settings.

    Args:
        target_height: Output height passed to ``tf.image.resize``.
        target_width: Output width passed to ``tf.image.resize``.
        **kwargs: Forwarded unchanged to the base ``Layer`` constructor.
    """

    def __init__(self, target_height, target_width, **kwargs):
        super().__init__(**kwargs)
        self.target_height = target_height
        self.target_width = target_width

    def call(self, inputs):
        """Return ``inputs`` resized to ``(target_height, target_width)``."""
        return tf.image.resize(inputs, [self.target_height, self.target_width])

    def get_config(self):
        """Return the base layer config extended with the two target sizes."""
        config = super().get_config()
        config.update({
            "target_height": self.target_height,
            "target_width": self.target_width,
        })
        return config


In [6]:
# Model Architecture (U-Net for Denoising)
def build_unet_model(input_shape):
    """Build the convolutional encoder-decoder used for spectrogram denoising.

    Structure: three Conv2D + MaxPooling2D stages (64, 128, 256 filters), a
    512-filter bottleneck convolution, three UpSampling2D + Conv2D stages
    (256, 128, 64 filters), a 1x1 linear convolution producing one channel,
    and a final ResizeLayer back to the input height and width.

    Note: despite the name, no skip connections link the downsampling and
    upsampling stages.

    Args:
        input_shape: Shape of one input sample; its first two entries are used
            as the target height and width of the final resize.

    Returns:
        An uncompiled Keras ``Model`` mapping the input to the resized output.
    """
    relu_same = dict(activation='relu', padding='same')

    inputs = layers.Input(shape=input_shape)
    features = inputs

    # Downsampling path: convolve, then pool by 2 in both dimensions.
    for n_filters in (64, 128, 256):
        features = layers.Conv2D(n_filters, (3, 3), **relu_same)(features)
        features = layers.MaxPooling2D((2, 2), padding='same')(features)

    # Bottleneck
    features = layers.Conv2D(512, (3, 3), **relu_same)(features)

    # Upsampling path: upsample by 2, then convolve.
    for n_filters in (256, 128, 64):
        features = layers.UpSampling2D((2, 2))(features)
        features = layers.Conv2D(n_filters, (3, 3), **relu_same)(features)

    # Project to a single channel, then resize to the exact input height/width.
    projected = layers.Conv2D(1, (1, 1), activation='linear', padding='same')(features)
    height, width = input_shape[0], input_shape[1]
    outputs = ResizeLayer(target_height=height, target_width=width)(projected)

    return models.Model(inputs, outputs)


In [7]:
# Fixed spectrogram shape fed to the network: (frequency_bins, time_frames, 1).
# The 300 frames match the default fixed_length of load_and_preprocess_audio.
input_shape = (513, 300, 1)

# Build the network for that shape and compile it with Adam and an MSE loss.
model = build_unet_model(input_shape)
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001))





In [8]:
# Prepare data in batches (generator)
def data_generator(blended, clean, batch_size=8, fixed_length=300):
    """Return an endless generator of ``(x_batch, y_batch)`` array pairs.

    The data is walked in order, ``batch_size`` items at a time, and the walk
    restarts from the beginning after the last batch. The final batch of each
    pass may be smaller than ``batch_size``. Every item gets a trailing axis of
    size 1 added before the items are stacked into a batch.

    Args:
        blended: Sequence of input arrays.
        clean: Sequence of target arrays, aligned index-by-index with ``blended``.
        batch_size: Maximum number of items per batch; must be at least 1.
        fixed_length: Unused; accepted so existing calls keep working.

    Returns:
        A generator yielding ``(x_batch, y_batch)`` NumPy arrays indefinitely.

    Raises:
        ValueError: If ``batch_size`` is less than 1, if ``blended`` is empty
            (the generator would loop forever without yielding), or if the two
            sequences differ in length.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(blended) == 0:
        raise ValueError("data_generator needs at least one sample; got an empty dataset")
    if len(blended) != len(clean):
        raise ValueError(
            f"blended and clean must have the same length, got {len(blended)} and {len(clean)}"
        )

    def _batches():
        # The validation above runs when data_generator() is called, not on the
        # first next(); only the endless loop lives in this inner generator.
        while True:
            for start in range(0, len(blended), batch_size):
                stop = start + batch_size
                x_batch = np.array([np.expand_dims(b, -1) for b in blended[start:stop]])
                y_batch = np.array([np.expand_dims(c, -1) for c in clean[start:stop]])
                yield x_batch, y_batch

    return _batches()


In [9]:
# Create the batch generators that feed training and validation
batch_size = 8
train_gen = data_generator(train_blended, train_clean, batch_size=batch_size)
val_gen = data_generator(val_blended, val_clean, batch_size=batch_size)


In [10]:
# data_generator also yields the final, smaller batch of each pass, so one full
# pass over n samples is ceil(n / batch_size) batches. Using the ceiling keeps
# every epoch aligned with exactly one pass and avoids 0 steps when a set has
# fewer samples than batch_size (floor division did neither).
train_steps = (len(train_blended) + batch_size - 1) // batch_size
val_steps = (len(val_blended) + batch_size - 1) // batch_size

history = model.fit(
    train_gen,
    epochs=50,
    steps_per_epoch=train_steps,
    validation_data=val_gen,
    validation_steps=val_steps,
)


Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 62s/step - loss: 856.0461 - val_loss: 2982.1938
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10s/step - loss: 1809.9778 - val_loss: 390.7227
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 42s/step - loss: 623.2143 - val_loss: 555.6476
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step - loss: 612.6620 - val_loss: 511.6505
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 37s/step - loss: 838.0853 - val_loss: 579.9418
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step - loss: 631.6924 - val_loss: 512.3835
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 37s/step - loss: 839.4728 - val_loss: 569.4586
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step - loss: 621.7940 - val_loss: 492.6734
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━

In [11]:
# Save the trained model to an HDF5 (.h5) file.
# (No evaluation is performed in this cell.)
model.save("audio_denoising_unet.h5")



In [12]:
# Example inference function
def denoise_audio(model, blended_audio):
    """Denoise one audio file with the trained model and return a waveform.

    The model works on dB-scaled magnitude spectrograms, which carry no phase.
    To get back to audio, the predicted dB values are converted to linear
    amplitude and combined with the phase of the noisy input's STFT before the
    inverse STFT is taken.

    Args:
        model: Trained Keras model taking input of shape ``(1, freq, frames, 1)``.
        blended_audio: Path of the noisy audio file.

    Returns:
        1-D array with the reconstructed time-domain signal. Frames that were
        only padding in the model input are dropped, so the result is no longer
        than the part of the recording the model actually saw.
    """
    # These must match the values load_and_preprocess_audio uses for its STFT.
    sr, n_fft, hop_length = 16000, 1024, 512

    spectrogram = load_and_preprocess_audio(blended_audio)
    input_spec = spectrogram[np.newaxis, ..., np.newaxis]
    denoised_db = model.predict(input_spec)[0, ..., 0]

    # Recompute the complex STFT of the noisy input to recover its phase.
    y, _ = librosa.load(blended_audio, sr=sr)
    noisy_stft = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)

    # Use only frames present in both: the model output has a fixed width,
    # while the recording may be shorter (padded) or longer (truncated).
    n_valid = min(noisy_stft.shape[1], denoised_db.shape[1])
    magnitude = librosa.db_to_amplitude(denoised_db[:, :n_valid])
    phase = np.angle(noisy_stft[:, :n_valid])

    # istft needs a complex STFT and the same hop length as the forward
    # transform; its default hop would differ from the 512 used above.
    denoised_audio = librosa.istft(magnitude * np.exp(1j * phase), hop_length=hop_length)
    return denoised_audio

# Usage example (replace with an actual audio file path):
# denoised_audio = denoise_audio(model, '../data/audios/english/test/blended/sample.mp3')

In [13]:
# Save the full model again, this time to a .keras file.
model.save("audio_denoising_unet.keras")

In [15]:
# Save only the model weights to their own file.
model.save_weights('model_weights.weights.h5')