<a href="https://colab.research.google.com/github/Stefano-Previti/Diffiner/blob/main/Diffiner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**▶LOAD** **THE** **DATASET**

⚓Loading the clean and noisy datasets, each composed of 28 speakers.

⏰**Citation**: Valentini-Botinhao, Cassia. (2017). Noisy speech database for training speech enhancement algorithms and TTS models, 2016 [sound]. University of Edinburgh. School of Informatics. Centre for Speech Technology Research (CSTR). https://doi.org/10.7488/ds/2117.

In [2]:
# Mounting Google Drive at /content/drive; the dataset ZIP paths defined below live under this mount point
from google.colab import drive
drive.mount('/content/drive')

import os
import zipfile

# Locations of the dataset archives on the mounted Google Drive
_drive_root = '/content/drive/My Drive'
zip_clean_train = f'{_drive_root}/clean_trainset_28spk_wav.zip'
zip_noisy_train = f'{_drive_root}/noisy_trainset_28spk_wav.zip'
zip_clean_test = f'{_drive_root}/clean_testset_wav.zip'
zip_noisy_test = f'{_drive_root}/noisy_testset_wav.zip'

# Local directories the archives are extracted into (trailing slash kept)
_data_root = '/content/data'
clean_dir = f'{_data_root}/clean_trainset/'
noisy_dir = f'{_data_root}/noisy_trainset/'
clean_test_dir = f'{_data_root}/clean_testset/'
noisy_test_dir = f'{_data_root}/noisy_testset/'

# Function to create a directory (and any missing parents) if it doesn't exist
def create_directory(directory):
    """
    Create `directory`, including missing parent directories.

    Calling it on a directory that already exists is a no-op. `exist_ok=True`
    replaces the separate existence check, so there is no window between the
    check and the creation in which another process could create the directory
    and make `os.makedirs` fail.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

# Make sure every extraction target exists before unpacking
for _target_dir in (clean_dir, noisy_dir, clean_test_dir, noisy_test_dir):
    create_directory(_target_dir)

# Helper that unpacks a ZIP archive
def extract_zip(zip_path, extract_to):
    """Extract every member of the archive at `zip_path` into `extract_to`."""
    archive = zipfile.ZipFile(zip_path, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Always release the file handle, even if extraction fails
        archive.close()

# Unpack each archive into its matching directory (same order as listed)
for _archive_path, _destination in (
    (zip_clean_train, clean_dir),
    (zip_noisy_train, noisy_dir),
    (zip_clean_test, clean_test_dir),
    (zip_noisy_test, noisy_test_dir),
):
    extract_zip(_archive_path, _destination)




Mounted at /content/drive


In [None]:
# Sanity check of the extraction: print a few files from every subdirectory
def verify_extraction(directory, num_files_to_check=6):
    """
    Walk `directory` and print each visited folder followed by at most
    `num_files_to_check` of the files it directly contains.
    """
    for current_dir, _subdirs, names in os.walk(directory):
        print(f'Checking directory: {current_dir}')

        # Only a preview of the folder content is listed, numbered from 1
        for position, name in enumerate(names[:num_files_to_check], 1):
            print(f'File {position}: {os.path.join(current_dir, name)}')

# Preview the content of each extracted dataset directory
for _extracted_dir in (clean_dir, noisy_dir, clean_test_dir, noisy_test_dir):
    verify_extraction(_extracted_dir)


⚓Creation of a custom dataset object.

⏰The choice is to load only 1/4 of each dataset, after a random shuffle, because of the limited RAM available in the Colab environment.

In [None]:
# Installing PyTorch, torchvision and torchaudio with pip (notebook shell command)
!pip install torch torchvision torchaudio


In [None]:
import torch
import torchaudio
import random
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader

class AudioDataset(Dataset):
    """
    Dataset over a reproducible one-quarter subset of the `.wav` files found
    under a directory (searched recursively).

    The subset is drawn with a private, seeded random generator applied to the
    files in a canonical (sorted) order. The selection therefore depends only
    on the seed and on the number of files: two datasets built with the same
    seed over directories that hold the same file names in the same layout
    (e.g. a clean set and its noisy counterpart) select matching files in the
    same order, and re-running the notebook selects the same files again.
    """

    def __init__(self, data_dir, seed=0):
        """
        Initialization of the dataset with audio files from the specified directory and its subdirectories.

        Args:
            data_dir: Root directory searched recursively for `.wav` files.
            seed: Seed of the subset selection. Use the same value for
                datasets whose items must correspond to each other.
        """
        self.data_dir = data_dir
        self.seed = seed
        self.files = self._get_subset_of_audio_files(data_dir)

    def _get_subset_of_audio_files(self, directory):
        """
        Recursively collecting a subset (1/4) of all audio files in the directory and its subdirectories.

        Returns:
            Sorted list of the selected file paths. Each path already includes
            `directory`, because it is built from the roots yielded by `os.walk`.
        """
        audio_files = [
            os.path.join(root, name)
            for root, _, names in os.walk(directory)
            for name in names
            if name.lower().endswith('.wav')
        ]

        def relative_key(path):
            return os.path.relpath(path, directory)

        # os.walk gives no ordering guarantee: sort first so the seeded shuffle
        # always starts from the same arrangement.
        audio_files.sort(key=relative_key)

        # A dedicated generator keeps the selection independent from (and
        # without side effects on) the global `random` state.
        random.Random(self.seed).shuffle(audio_files)

        index = len(audio_files) // 4
        return sorted(audio_files[:index], key=relative_key)

    def __len__(self):
        """Returning the number of audio files."""
        return len(self.files)

    def __getitem__(self, idx):
        """
        Loading and returning the waveform and sample rate for a specific index.

        Returns:
            dict with keys 'waveform' and 'sample_rate', as returned by `torchaudio.load`.

        Raises:
            FileNotFoundError: if the stored path is not an existing file.
        """
        # The stored paths already contain the data directory; joining it again
        # would duplicate the prefix whenever `data_dir` is a relative path.
        file_path = self.files[idx]
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"{file_path} is not a file")

        waveform, sample_rate = torchaudio.load(file_path)
        return {'waveform': waveform, 'sample_rate': sample_rate}

    def compute_stft(self, waveform):
        """
        Computing the STFT of the waveform using a Hann window and returning a
        256x256 tensor with real and imaginary parts as separate channels.

        Args:
            waveform: Audio tensor. Assumed to be 2-D (channels, samples), as
                returned by `torchaudio.load` - the slicing below requires a
                3-D spectrogram.

        Returns:
            Tensor of shape (channels, 2, 256, 256): index 0 of the second
            dimension is the real part, index 1 the imaginary part. Clips that
            produce fewer than 256 time frames are zero-padded at the end;
            longer clips are truncated to the first 256 frames.
        """
        # Defining the STFT parameters
        n_fft = 512
        hop_length = 256
        win_length = 512
        window_fn = torch.hann_window
        target_size = 256

        # Performing STFT (power=None keeps the complex-valued result)
        stft_transform = T.Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window_fn=window_fn,
            power=None,
        )
        stft_spectrogram = stft_transform(waveform)

        # Truncation of the DC component (first frequency bin)
        stft_spectrogram = stft_spectrogram[:, 1:, :]

        # Stacking real and imaginary parts into a tensor with two channels
        two_channel_tensor = torch.stack(
            (stft_spectrogram.real, stft_spectrogram.imag), dim=1
        )

        # Keeping at most 256 frequency bins and 256 time frames
        two_channel_tensor = two_channel_tensor[:, :, :target_size, :target_size]

        # Slicing alone cannot lengthen a short clip: pad the time axis with
        # zeros so every returned tensor has the documented size.
        missing_frames = target_size - two_channel_tensor.size(-1)
        if missing_frames > 0:
            two_channel_tensor = torch.nn.functional.pad(
                two_channel_tensor, (0, missing_frames), 'constant', 0
            )

        return two_channel_tensor

# Parameters
batch_size = 8

# One dataset per extracted directory, built in the order listed
clean_train_dataset, noisy_train_dataset, clean_test_dataset, noisy_test_dataset = (
    AudioDataset(path)
    for path in (clean_dir, noisy_dir, clean_test_dir, noisy_test_dir)
)

# Report how many audio files each dataset kept
for _label, _dataset in (
    ("clean_train_dataset", clean_train_dataset),
    ("noisy_train_dataset", noisy_train_dataset),
    ("clean_test_dataset", clean_test_dataset),
    ("noisy_test_dataset", noisy_test_dataset),
):
    print(f"Number of samples in {_label}:", len(_dataset))

# Custom collate function: audio clips have different lengths, so the default
# DataLoader collation (which stacks tensors) would raise at runtime
def collate_fn(batch):
    """
    Group a batch of dataset items into a dict of plain lists, one list per
    field ('waveform', 'sample_rate'), keeping the items in batch order.
    """
    return {
        field: [item[field] for item in batch]
        for field in ('waveform', 'sample_rate')
    }

# Creation of DataLoader instances
# NOTE(review): the two training loaders shuffle, so every pass over them yields
# the files in a different order. The preprocessing functions in this notebook
# name their output files by iteration position, which means files with the same
# name in the clean/noisy training output folders do not correspond to the same
# utterance - confirm whether that correspondence is needed downstream.
clean_train_loader = DataLoader(clean_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
noisy_train_loader = DataLoader(noisy_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
clean_test_loader = DataLoader(clean_test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
noisy_test_loader = DataLoader(noisy_test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Debug: Iteration over the first few batches of the clean_train_loader to see what's loaded
for batch_idx, batch in enumerate(clean_train_loader):
    print(f"Batch {batch_idx} Loaded")
    for idx in range(len(batch['waveform'])):
        waveform = batch['waveform'][idx]
        # Each sample carries its own sample rate: read the one at `idx`, not
        # the rate of the first sample of the batch
        sample_rate = batch['sample_rate'][idx]

        # Print details for each waveform
        print(f"Sample {idx} - Waveform shape: {waveform.shape}, Sample rate: {sample_rate}")

        if idx >= 1:  # Limit to first few samples for brevity
            break
    if batch_idx >= 1:  # Limit to first few batches for brevity
        break



▶**PREPROCESSING OF THE DATA**

⚓Here the data is pre-processed by computing
the STFT of each waveform.

In [None]:
import os
import torch

def STFT_preprocess_and_save(dataset,data_loader, output_dir):
    """
    Compute the STFT of every waveform served by a loader and store each result in its own file.

    Files are written to `output_dir` as `stft_tensor_<k>.pt`, where `k` counts
    the waveforms in the order the loader yields them, starting at 0.

    Parameters:
    - dataset: Object exposing `compute_stft(waveform)`; it is used only for that method.
    - data_loader: Iterable of batches; each batch is a dictionary whose 'waveform'
      entry is a sequence of waveforms. Its `dataset` attribute is read only for
      the progress message.
    - output_dir: The directory where the individual STFT tensor files will be saved
      (created if missing).
    """
    os.makedirs(output_dir, exist_ok=True)

    # Number of tensors written so far; doubles as the index of the next file
    saved = 0

    for batch in data_loader:
        for clip in batch['waveform']:
            target = os.path.join(output_dir, f"stft_tensor_{saved}.pt")
            torch.save(dataset.compute_stft(clip), target)
            saved += 1

            # Progress is reported once every 10 saved files
            if saved % 10:
                continue
            print(f"File {saved}/{len(data_loader.dataset)} processed and saved to {target}")

    print(f"All tensors have been processed and saved to {output_dir}")

# Output folders for the STFT tensors, all on the mounted Google Drive
_stft_drive_root = '/content/drive/My Drive'
STFT_clean_preprocessed_dir = f'{_stft_drive_root}/STFT_clean_preprocessed/'
STFT_clean_test_preprocessed_dir = f'{_stft_drive_root}/STFT_clean_test_preprocessed/'
STFT_noisy_preprocessed_dir = f'{_stft_drive_root}/STFT_noisy_preprocessed/'
STFT_noisy_test_preprocessed_dir = f'{_stft_drive_root}/STFT_noisy_test_preprocessed/'

# Run the STFT preprocessing for each dataset/loader pair, in the order listed
for _stft_dataset, _stft_loader, _stft_out_dir in (
    (clean_train_dataset, clean_train_loader, STFT_clean_preprocessed_dir),
    (clean_test_dataset, clean_test_loader, STFT_clean_test_preprocessed_dir),
    (noisy_train_dataset, noisy_train_loader, STFT_noisy_preprocessed_dir),
    (noisy_test_dataset, noisy_test_loader, STFT_noisy_test_preprocessed_dir),
):
    STFT_preprocess_and_save(_stft_dataset, _stft_loader, _stft_out_dir)


⚓Here the waveform of each audio file is preprocessed into a segment of 16384 samples.

In [None]:
def waveform_preprocess_and_save(dataloader, output_dir, segment_length=16384, target_sample_rate=16000):
    """
    Preprocess audio data from a DataLoader and save the preprocessed files.

    Each waveform is resampled to `target_sample_rate` when needed, scaled so
    that its peak absolute value is 1, and reduced to exactly `segment_length`
    samples: longer clips are cropped at a random position, shorter clips are
    zero-padded at the end. The result is saved to
    `output_dir/sample_<batch index>_<index in batch>.pt`.

    Args:
        dataloader (DataLoader): Iterable of batches; each batch is a dict with
            parallel sequences under 'waveform' and 'sample_rate'. Waveforms
            are assumed to be 2-D (channels, samples) - the length is read
            from dimension 1.
        output_dir (str): Directory to save the preprocessed data (created if missing).
        segment_length (int): Length of audio segments in samples.
        target_sample_rate (int): Sample rate every waveform is converted to.
    """
    os.makedirs(output_dir, exist_ok=True)

    # One resampler per source rate, built on first use and then reused
    resamplers = {}

    for batch_idx, batch in enumerate(dataloader):
        for idx, waveform in enumerate(batch['waveform']):
            # Every item has its own sample rate: use the entry at `idx`, not
            # the rate of the first item of the batch
            sample_rate = batch['sample_rate'][idx]

            # Resampling if necessary
            if sample_rate != target_sample_rate:
                if sample_rate not in resamplers:
                    resamplers[sample_rate] = T.Resample(sample_rate, target_sample_rate)
                waveform = resamplers[sample_rate](waveform)

            # Normalizing waveform; an empty or all-zero clip is left as is,
            # because dividing by a zero peak would fill it with NaN
            if waveform.numel() > 0:
                peak = waveform.abs().max()
                if peak > 0:
                    waveform = waveform / peak

            # Segmenting and padding the audio
            length = waveform.size(1)
            if length >= segment_length:
                start = random.randint(0, length - segment_length)
                segment = waveform[:, start:start + segment_length]
            else:
                padding = segment_length - length
                segment = torch.nn.functional.pad(waveform, (0, padding), 'constant', 0)

            # Saving the preprocessed data
            file_name = f"sample_{batch_idx}_{idx}.pt"
            save_path = os.path.join(output_dir, file_name)
            torch.save(segment, save_path)

            print(f"Saved preprocessed data: {save_path}")


# Output folders for the waveform segments, all on the mounted Google Drive
_waveform_drive_root = '/content/drive/My Drive'
waveform_clean_preprocessed_dir = f'{_waveform_drive_root}/waveform_clean_preprocessed/'
waveform_clean_test_preprocessed_dir = f'{_waveform_drive_root}/waveform_clean_test_preprocessed/'
waveform_noisy_preprocessed_dir = f'{_waveform_drive_root}/waveform_noisy_preprocessed/'
waveform_noisy_test_preprocessed_dir = f'{_waveform_drive_root}/waveform_noisy_test_preprocessed/'

# Run the waveform preprocessing for each loader, in the order listed
for _waveform_loader, _waveform_out_dir in (
    (clean_train_loader, waveform_clean_preprocessed_dir),
    (clean_test_loader, waveform_clean_test_preprocessed_dir),
    (noisy_train_loader, waveform_noisy_preprocessed_dir),
    (noisy_test_loader, waveform_noisy_test_preprocessed_dir),
):
    waveform_preprocess_and_save(_waveform_loader, _waveform_out_dir)
