<a href="https://colab.research.google.com/github/SanyaShresta25/Speech-Enhancement-Using-UNet-Architecture/blob/main/SpeechEnhancementUsingUNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Clone the Repo**

In [1]:
# Fetch the upstream speech-enhancement project and make the checkout the
# working directory: the later cells refer to requirements.txt, main.py,
# ./Train and ./weights with relative paths.
!git clone https://github.com/dathu/Speech-enhancement-deeplearn-vbelz.git
%cd Speech-enhancement-deeplearn-vbelz

Cloning into 'Speech-enhancement-deeplearn-vbelz'...
remote: Enumerating objects: 157, done.
remote: Counting objects: 100% (40/40), done.
remote: Compressing objects: 100% (9/9), done.
remote: Total 157 (delta 35), reused 31 (delta 31), pack-reused 117 (from 1)
Receiving objects: 100% (157/157), 47.19 MiB | 15.66 MiB/s, done.
Resolving deltas: 100% (53/53), done.
/content/Speech-enhancement-deeplearn-vbelz


# **Install Requirements**

In [2]:
# Update all requirements to more recent versions
!sed -i 's/tensorflow==1.15.2/tensorflow>=2.10.0/g' requirements.txt
!sed -i 's/scipy==1.3.1/scipy>=1.7.0/g' requirements.txt
!sed -i 's/matplotlib==3.1.1/matplotlib>=3.5.0/g' requirements.txt
!sed -i 's/numpy==1.17.2/numpy>=1.20.0/g' requirements.txt
!sed -i 's/librosa==0.7.0/librosa>=0.9.0/g' requirements.txt
!sed -i 's/sklearn/scikit-learn/g' requirements.txt

# Install the updated requirements
!pip install -r requirements.txt



# **Speech Data Preparation**

In [3]:
## 🧩 Step 1: Install Required Libraries
!pip install librosa soundfile wget

## 📁 Step 2: Setup Directory Structure
import os

# Both dataset splits share the same set of working folders.
base_dirs = ['Train', 'Test']
sub_dirs = ['clean_voice', 'noise', 'sound', 'spectrogram', 'time_serie']

# Walk every <split>/<folder> combination in order and create it;
# folders that already exist are left untouched.
for base, sub in ((split, folder) for split in base_dirs for folder in sub_dirs):
    os.makedirs(os.path.join(base, sub), exist_ok=True)

## 📦 Step 3: Download Datasets (LibriSpeech dev-clean & ESC-50)
import os
import wget

# Dataset archive URLs
librispeech_url = 'http://www.openslr.org/resources/12/dev-clean.tar.gz'
esc50_url = 'https://github.com/karoldvl/ESC-50/archive/master.zip'

# Fetch each archive only when it is not already on disk, so re-running this
# cell does not download the datasets a second time.
for archive_url, archive_name in [
    (librispeech_url, 'dev-clean.tar.gz'),
    (esc50_url, 'ESC-50.zip'),
]:
    if not os.path.exists(archive_name):
        wget.download(archive_url, archive_name)

## 📂 Step 4: Extract Datasets
import tarfile
import zipfile

# Speech corpus: gzip-compressed tarball, unpacked into the working directory.
with tarfile.open('dev-clean.tar.gz', 'r:gz') as speech_archive:
    speech_archive.extractall('.')

# Noise corpus: zip archive, unpacked into the working directory.
with zipfile.ZipFile('ESC-50.zip', 'r') as noise_archive:
    noise_archive.extractall('.')

## 🎧 Step 5: Process and Convert Audio Files
import librosa
import soundfile as sf
import numpy as np
from glob import glob
import random

# Paths
clean_path = './LibriSpeech/dev-clean'
noise_path = './ESC-50-master/audio'

# Collect files. glob returns matches in arbitrary, filesystem-dependent order,
# so sort them: the positional slices below then pick the same files on every
# run and every machine.
clean_files = sorted(glob(f'{clean_path}/**/*.flac', recursive=True))
noise_files = sorted(glob(f'{noise_path}/*.wav'))

# Seed the RNGs; process_set draws the noise clip and the SNR from np.random.
random.seed(42)
np.random.seed(42)

# Split into Train and Test (disjoint, contiguous slices of the sorted lists)
train_clean = clean_files[:100]
test_clean = clean_files[100:150]

train_noise = noise_files[:50]
test_noise = noise_files[50:75]

# Utility: Create noisy mixture
def mix_audio(clean, noise, snr):
    """Add ``noise`` to ``clean`` at the requested signal-to-noise ratio.

    The noise is scaled so that the ratio of the clean signal's mean power to
    the scaled noise's mean power equals ``snr`` decibels.

    Args:
        clean: NumPy array holding the clean signal.
        noise: NumPy array of the same length holding the noise.
        snr: Target signal-to-noise ratio in dB.

    Returns:
        A new array ``clean + scale * noise``. When the noise has zero power
        (a silent clip) there is nothing to add and a copy of ``clean`` is
        returned.
    """
    clean_power = np.mean(clean**2)
    noise_power = np.mean(noise**2)
    # A silent noise clip would make the scale infinite and the product
    # `inf * 0` NaN, corrupting every sample of the mixture.
    if noise_power == 0:
        return clean.copy()
    scale = np.sqrt(clean_power / (10**(snr / 10) * noise_power))
    return clean + scale * noise

# Processing function
def process_set(clean_list, noise_list, set_type):
    """Build one dataset split from clean speech files and noise files.

    For each clean file, a noise file is drawn at random from ``noise_list``,
    looped/trimmed to the length of the speech, and mixed in at an SNR drawn
    uniformly from [0, 10) dB. Three WAV files are written per utterance:
    ``{set_type}/clean_voice/clean_NNNN.wav``, ``{set_type}/noise/noise_NNNN.wav``
    and ``{set_type}/sound/noisy_NNNN.wav``.

    Args:
        clean_list: Paths of the clean speech files to process.
        noise_list: Paths of the noise files to draw from.
        set_type: Name of the output root folder (e.g. 'Train' or 'Test').
    """
    for i, clean_file in enumerate(clean_list):
        # Both signals are resampled to 16 kHz on load.
        clean, sr = librosa.load(clean_file, sr=16000)

        noise_file = np.random.choice(noise_list)
        noise, _ = librosa.load(noise_file, sr=16000)

        # Loop the noise clip until it covers the utterance, then trim to match.
        if len(noise) < len(clean):
            noise = np.tile(noise, int(np.ceil(len(clean)/len(noise))))
        noise = noise[:len(clean)]

        snr = np.random.uniform(0, 10)
        noisy = mix_audio(clean, noise, snr)

        # The sum of speech and noise can exceed full scale (|x| > 1), which
        # cannot be represented in an integer-PCM WAV file and is distorted on
        # write. Apply one common gain to all three signals so the written
        # files stay consistent with each other and the SNR is unchanged.
        peak = np.max(np.abs(noisy)) if len(noisy) else 0.0
        if peak > 1.0:
            gain = 1.0 / peak
            clean = clean * gain
            noise = noise * gain
            noisy = noisy * gain

        clean_out = f"{set_type}/clean_voice/clean_{i:04d}.wav"
        noise_out = f"{set_type}/noise/noise_{i:04d}.wav"
        noisy_out = f"{set_type}/sound/noisy_{i:04d}.wav"

        sf.write(clean_out, clean, sr)
        sf.write(noise_out, noise, sr)
        sf.write(noisy_out, noisy, sr)

        if i % 10 == 0:
            print(f"[{set_type}] Processed {i} files")

# Each entry: banner to print, clean files, noise files, output root folder.
split_jobs = [
    ("\n🔧 Processing Training Set...", train_clean, train_noise, 'Train'),
    ("\n🔧 Processing Test Set...", test_clean, test_noise, 'Test'),
]

# Announce and build the splits one after the other, training first.
for banner, split_clean, split_noise, split_name in split_jobs:
    print(banner)
    process_set(split_clean, split_noise, split_name)

print("\n✅ Data preparation complete!")


Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... done
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=96a8c480de74e36f876f1044b9c331276fe371e485650a80ab2b1c0b3671cc26
  Stored in directory: /root/.cache/pip/wheels/40/b3/0f/a40dbd1c6861731779f62cc4babcb234387e11d697df70ee97
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2

🔧 Processing Training Set...
[Train] Processed 0 files
[Train] Processed 10 files
[Train] Processed 20 files
[Train] Processed 30 files
[Train] Processed 40 files
[Train] Processed 50 files
[Train] Processed 60 files
[Train] Processed 70 files
[Train] Processed 80 files
[Train] Processed 90 files

🔧 Processing Test Set...
[Test] Processed 0 files
[Test] Processed 10 files
[Test] Processed 20 files
[Test] Processed 30 files
[Test] Processed 40 files

In [4]:
# Run the repository's own prepare_data.py script from the checkout root.
# NOTE(review): what the script reads and writes is not visible in this
# notebook — confirm it does not overwrite the Train/ and Test/ folders that
# the previous cell populated.
!python prepare_data.py

# **Data Creation**

In [5]:
!pip install soundfile



In [9]:
# Run main.py in `data_creation` mode. Inputs are the Train noise and clean-voice
# folders; the three --path_save_* options point at the Train spectrogram,
# time_serie and sound folders.
# NOTE(review): --nb_samples 200 is presumably the number of examples to
# generate — confirm against args.py.
!python main.py \
  --mode data_creation \
  --noise_dir ./Train/noise \
  --voice_dir ./Train/clean_voice \
  --path_save_spectrogram ./Train/spectrogram/ \
  --path_save_time_serie ./Train/time_serie/ \
  --path_save_sound ./Train/sound/ \
  --nb_samples 200


2025-04-23 12:01:33.450641: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745409693.485980    4810 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745409693.497016    4810 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-23 12:01:33.534857: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# **Model Training**

In [10]:
# Run main.py in `training` mode for 30 epochs with batch size 16, using
# ./weights as the weights folder and `model_unet` as the model name.
# NOTE(review): --training_from_scratch is passed the string "True"; confirm in
# args.py how that option is parsed (and that "False" would really disable it).
!python main.py \
  --mode training \
  --weights_folder ./weights \
  --training_from_scratch True \
  --epochs 30 \
  --batch_size 16 \
  --name_model model_unet


2025-04-23 12:03:15.108352: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745409795.140187    5258 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745409795.150062    5258 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-23 12:03:15.180477: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
DescribeResult(nobs=3276800, minmax=(array([-80.]), array([0.])), mean=array([-41.43137509]), variance=array([256.882

# **Prediction**

In [None]:
# Run main.py in `prediction` mode with the `model_unet` weights from ./weights:
# read the input from ./demo_data/test and write the result to
# ./demo_data/save_predictions as denoised_sample.wav.
# NOTE(review): the --audio_input_prediction value is not quoted for the shell,
# which strips the double quotes (main.py receives `[noisy_sample.wav]`) and
# treats `[...]` as a glob pattern. Confirm the format args.py expects and
# quote or reshape the argument accordingly.
# NOTE(review): confirm noisy_sample.wav actually exists in ./demo_data/test.
!python main.py \
  --mode prediction \
  --weights_folder ./weights \
  --name_model model_unet \
  --audio_dir_prediction ./demo_data/test \
  --dir_save_prediction ./demo_data/save_predictions \
  --audio_input_prediction ["noisy_sample.wav"] \
  --audio_output_prediction denoised_sample.wav


In [29]:
!ls /content/Speech-enhancement-deeplearn-vbelz/


args.py		 dev-clean.tar.gz  main.py		  requirements.txt
AUTHORS.rst	 ESC-50-master	   model_unet.py	  Test
colab		 ESC-50.zip	   prediction_denoise.py  tests
data_display.py  img		   prepare_data.py	  Train
data_tools.py	 LibriSpeech	   __pycache__		  train_model.py
demo_data	 LICENSE	   README.md		  weights


In [None]:
import tensorflow as tf
print(tf.__version__)
import os
# Check the weights folder relative to the working directory (the repository
# checkout) rather than through a hardcoded absolute Colab path; this is the
# same `./weights` location passed as --weights_folder to main.py.
print(os.path.exists('./weights'))


2.18.0
True


In [18]:
import os

# Folder and model name that together identify the saved weights file.
weights_path = '/content/Speech-enhancement-deeplearn-vbelz/weights'
name_model = 'model_unet'
weights_file = "{}/{}.h5".format(weights_path, name_model)

# Report whether the file is present, then the exact path that was checked.
weights_found = os.path.exists(weights_file)
print("Exists:", weights_found)
print("Expected file path:", weights_file)


Exists: True
Expected file path: /content/Speech-enhancement-deeplearn-vbelz/weights/model_unet.h5
