In [None]:
# !pip install speechbrain

In [1]:
%%capture

import glob
import json
from concurrent.futures import ThreadPoolExecutor
import itertools
from tqdm.auto import tqdm
import os
from speechbrain.utils.data_utils import get_all_files
import random
import torchaudio
import torch
from speechbrain.augment.time_domain import AddNoise, AddReverb
from speechbrain.dataio.dataio import read_audio, write_audio
from speechbrain.augment.preparation import write_csv

In [2]:
#Windows
# Project root. Every later cell resolves its inputs/outputs under
# `base_path/content/...`, so the directories below are anchored on it as well;
# otherwise the segments would be written relative to the current working
# directory and the downstream cells would not find them.
base_path = r'D:\Winter 24\COMP 691 X- Conversational AI\project'
libri_root_path = os.path.join(base_path, 'content', 'miniLibriSpeech')
miniLibriSpeechSegments_path = os.path.join(base_path, 'content', 'miniLibriSpeechSegments')
output_directory = os.path.join(base_path, 'content', 'miniLibriSpeechOverlappedSegments')

# Create the output folders up front; exist_ok keeps the cell re-runnable.
os.makedirs(miniLibriSpeechSegments_path, exist_ok=True)
os.makedirs(output_directory, exist_ok=True)

In [None]:
# %%capture

# from google.colab import drive
# drive.mount('/content/drive/')

# # !tar -xvzf /content/drive/MyDrive/ConvAI/Project/train-clean-100.tar.gz # Complete Dataset

# !tar -xvzf /content/drive/MyDrive/ConvAI/Project/miniLibriSpeech.tar.gz # Mini Dataset

In [8]:
# ## Colab
# libri_root_path = "content/miniLibriSpeech/"
# librispeech_csv = "/content/librispeech.csv"
# 
# miniLibriSpeechSegments_path = "/content/miniLibriSpeechSegments/"
# 
# output_folder_corruptednoises = "/content/corruptednoises/"
# corruptednoises_csv = '/content/corruptednoises.csv'
# 
# openrir_folder = "/content/"
# reverb_csv = "reverb.csv"
# noise_csv = "noise.csv"
# max_noise_len = 10.0
# 
# output_directory = "/content/miniLibriSpeechOverlappedSegments"  # Update this path
# 


In [3]:
def segment_audio(audio_path, output_folder, segment_info_list, start_offset=4.0,
                  min_duration=1.5, max_duration=2.0):
    """Cut one utterance into consecutive, randomly sized segments saved as FLAC.

    The first `start_offset` seconds are skipped. From there the waveform is
    consumed back to back in pieces whose duration is drawn uniformly from
    [`min_duration`, `max_duration`] seconds; the final piece is truncated at
    the end of the file and may therefore be shorter than `min_duration`.

    Args:
        audio_path: Path of the source audio file. Its stem must look like
            `<speaker>-<chapter>-<audio>` (split on '-').
        output_folder: Root folder; segments are written to
            `<output_folder>/<speaker>/<chapter>/`.
        segment_info_list: List that receives one metadata dict per saved
            segment (mutated in place).
        start_offset: Seconds to skip at the beginning of the file.
        min_duration: Lower bound, in seconds, of the random segment duration.
        max_duration: Upper bound, in seconds, of the random segment duration.

    Returns:
        None. Results are the files on disk and the entries appended to
        `segment_info_list`.

    Raises:
        IndexError: If the file stem has fewer than three '-'-separated parts
            (only when at least one segment would be produced).
    """
    # Use the sample rate stored in the file instead of assuming 16 kHz, so the
    # time stamps and the saved segments stay correct for any input.
    waveform, sample_rate = torchaudio.load(audio_path)
    total_samples = waveform.size(1)

    # Position (in samples) right after the skipped lead-in.
    current_position = int(start_offset * sample_rate)
    if current_position >= total_samples:
        # Nothing left after the offset: produce no files and no folders.
        return

    # File-level information does not change between segments, so compute it once.
    file_name, _ = os.path.splitext(os.path.basename(audio_path))
    name_parts = file_name.split('-')
    speaker_id = name_parts[0]
    chapter_id = name_parts[1]
    audio_id = name_parts[2]

    output_subfolder = os.path.join(output_folder, speaker_id, chapter_id)
    os.makedirs(output_subfolder, exist_ok=True)

    while current_position < total_samples:
        requested_length = int(random.uniform(min_duration, max_duration) * sample_rate)

        # Clamp the last segment to the end of the waveform.
        end_position = min(current_position + requested_length, total_samples)
        segment = waveform[:, current_position:end_position]

        # The file name encodes the segment's start/end time in seconds.
        start_time = current_position / sample_rate
        end_time = end_position / sample_rate
        output_filename = f"{file_name}-{start_time:.2f}-{end_time:.2f}.flac"
        output_path = os.path.join(output_subfolder, output_filename)

        torchaudio.save(output_path, segment, sample_rate)

        segment_info_list.append({
            "speakerID": speaker_id,
            "chapter_id": chapter_id,
            "audio_id": audio_id,
            "start_time": start_time,
            "end_time": end_time,
            # Actual number of samples written (the last segment may be clamped).
            "segment_length": end_position - current_position,
            "file_path": output_path.replace("\\", "/")  # Ensure consistent path format
        })

        current_position = end_position

# Segment every FLAC file found (recursively) under `libri_root_path`.
# The pieces go to `miniLibriSpeechSegments_path`; one metadata record per
# piece is collected in `all_segment_info`.
all_segment_info = []

flac_pattern = os.path.join(libri_root_path, "**/*.flac")
flac_files = glob.glob(flac_pattern, recursive=True)

for audio_file in flac_files:
    segment_audio(audio_file, miniLibriSpeechSegments_path, all_segment_info)

# Persist the metadata of all segments in a single JSON file.
json_output_path = os.path.join(miniLibriSpeechSegments_path, "train-clean-100_segments.json")
with open(json_output_path, "w") as json_file:
    json_file.write(json.dumps(all_segment_info, indent=4))



In [ ]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Input: segment metadata written by the segmentation cell.
# Output: folder that receives the mixed (overlapped) audio files.
json_file_path = os.path.join(base_path, 'content', 'miniLibriSpeechSegments', 'train-clean-100_segments.json')
output_directory = os.path.join(base_path, 'content', 'miniLibriSpeechOverlappedSegments')

# exist_ok=True replaces the check-then-create pattern (no race, re-runnable).
os.makedirs(output_directory, exist_ok=True)

def load_segment_info(json_file_path):
    """Read the segment metadata JSON file and return its parsed content.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        Whatever `json.load` produces for the file (the segmentation cell
        writes a list of per-segment dicts).
    """
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

all_segment_info = load_segment_info(json_file_path)

def mix_audio_segments(segments):
    """Sum several waveforms into one single-row tensor.

    Each waveform is added sample by sample starting at index 0; shorter
    waveforms simply leave the tail of the result untouched, so the output is
    as long as the longest input.

    Args:
        segments: Non-empty iterable of 2-D tensors indexed as [row, sample].

    Returns:
        Tensor of shape (1, longest_input_length) holding the sum.
    """
    longest = max(wave.size(1) for wave in segments)
    mixture = torch.zeros(1, longest)
    for wave in segments:
        n_samples = wave.size(1)
        mixture[:, :n_samples] += wave[:, :n_samples]
    return mixture

def process_combination(combo_info, segments_by_audio, output_directory):
    """Mix the segments of one combination, save the result and describe it.

    Args:
        combo_info: Pair `(combo, num_combinations)` where `combo` is a
            sequence of audio keys and `num_combinations` the number of
            speakers being mixed.
        segments_by_audio: Mapping from audio key to its segment metadata dict
            (needs 'file_path', 'start_time' and 'end_time').
        output_directory: Folder in which the mixed `.wav` file is written.

    Returns:
        Dict with the number of speakers, the output path, the mixture length
        in seconds (at 16 kHz) and one `Speaker<id>` entry per mixed audio.
    """
    combo, num_combinations = combo_info

    # Load the waveform of every audio taking part in the mixture.
    waveforms = []
    for audio_key in combo:
        waveform, _ = torchaudio.load(segments_by_audio[audio_key]['file_path'])
        waveforms.append(waveform)
    mixed_waveform = mix_audio_segments(waveforms)

    combo_id = "_".join(combo) + f"_mixed_{num_combinations}"
    output_filename = os.path.join(output_directory, f"{combo_id}.wav")
    torchaudio.save(output_filename, mixed_waveform, 16000)

    metadata = {
        "num_speakers": num_combinations,
        "path": output_filename,
        "total_length": mixed_waveform.size(1) / 16000  # Assuming a sample rate of 16000
    }
    # One entry per speaker; the speaker id is the first field of the audio key.
    for audio_key in combo:
        segment = segments_by_audio[audio_key]
        speaker_id = audio_key.split('_')[0]
        metadata[f"Speaker{speaker_id}"] = {
            "start_time": segment['start_time'],
            "end_time": segment['end_time']
        }

    return metadata

def prepare_combinations_data(all_segment_info, num_combinations, max_combinations_per_audio):
    """Pick combinations of audios from distinct speakers under a usage cap.

    Audio keys are shuffled first so that the greedy selection below does not
    always favour the audios that happen to come first in `all_segment_info`.

    Args:
        all_segment_info: Iterable of segment dicts with the keys 'speakerID',
            'chapter_id' and 'audio_id'.
        num_combinations: Number of audios (= distinct speakers) per combination.
        max_combinations_per_audio: Maximum number of combinations any single
            audio key may take part in.

    Returns:
        Tuple `(valid_combinations, segments_by_audio)`: the list of selected
        key tuples and the mapping from audio key to segment dict.
    """
    segments_by_audio = {}
    for seg_info in all_segment_info:
        audio_key = f"{seg_info['speakerID']}_{seg_info['chapter_id']}_{seg_info['audio_id']}"
        # NOTE(review): all segments of one utterance share this key, so only
        # the last segment per utterance is kept — confirm this is intended.
        segments_by_audio[audio_key] = seg_info
    audio_usage_counter = dict.fromkeys(segments_by_audio, 0)

    # Shuffle the keys and actually iterate over the shuffled order.
    audio_keys = list(segments_by_audio)
    random.shuffle(audio_keys)

    # Number of audios that can still be used; once fewer than
    # `num_combinations` remain, no further combination can be valid and the
    # (potentially huge) enumeration can stop early.
    available = len(audio_keys) if max_combinations_per_audio > 0 else 0

    valid_combinations = []
    for combo in itertools.combinations(audio_keys, num_combinations):
        if available < num_combinations:
            break
        if any(audio_usage_counter[key] >= max_combinations_per_audio for key in combo):
            continue
        # Every member of the combination must come from a different speaker.
        speakers = {segments_by_audio[key]['speakerID'] for key in combo}
        if len(speakers) != num_combinations:
            continue
        valid_combinations.append(combo)
        for key in combo:
            audio_usage_counter[key] += 1
            if audio_usage_counter[key] >= max_combinations_per_audio:
                available -= 1

    return valid_combinations, segments_by_audio

def main(all_segment_info, max_combinations_per_audio=20, max_workers=10):
    """Create 2-, 3- and 4-speaker mixtures and write their metadata to JSON.

    Mixing jobs are submitted to a thread pool; the metadata returned by each
    job is collected as the jobs finish and finally written to
    `all_combos_metadata.json` inside the module-level `output_directory`.

    Args:
        all_segment_info: List of segment dicts (keys 'speakerID',
            'chapter_id', 'audio_id', ...).
        max_combinations_per_audio: Usage cap forwarded to
            `prepare_combinations_data`.
        max_workers: Size of the thread pool.
    """
    # Only ThreadPoolExecutor is imported at the top of the notebook, so the
    # bare name `concurrent` is undefined; import as_completed explicitly.
    from concurrent.futures import as_completed

    segments_by_audio = {f"{seg_info['speakerID']}_{seg_info['chapter_id']}_{seg_info['audio_id']}": seg_info for seg_info in all_segment_info}
    all_metadata = {"combos": []}  # Initialize the metadata collection

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for num_combinations in range(2, 5):
            valid_combinations, _ = prepare_combinations_data(all_segment_info, num_combinations, max_combinations_per_audio)
            for combo in valid_combinations:
                futures.append(executor.submit(process_combination, (combo, num_combinations), segments_by_audio, output_directory))

        # Collect metadata in completion order; result() re-raises worker errors.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing audio combos"):
            all_metadata["combos"].append(future.result())

    # Write collected metadata to a single JSON file
    metadata_path = os.path.join(output_directory, "all_combos_metadata.json")
    with open(metadata_path, 'w') as json_file:
        json.dump(all_metadata, json_file, indent=4)

if __name__ == "__main__":
    main(all_segment_info)

In [None]:
### Add Noise and reverb


# # Define the paths
# noises_path = os.path.join(base_path, 'content', 'rirs_noises', 'noises')

# os.makedirs(noises_path, exist_ok=True)

# pointsource_noises_path = os.path.join(base_path, 'content', 'rirs_noises', 'pointsource_noises')
# real_rirs_isotropic_noises_path = os.path.join(base_path, 'content', 'rirs_noises', 'real_rirs_isotropic_noises')

# shutil.move(pointsource_noises_path, noises_path)
# shutil.move(real_rirs_isotropic_noises_path, noises_path)


# Everything used by the augmentation step lives under `<base_path>/content`.
content_dir = os.path.join(base_path, 'content')
rirs_noises_dir = os.path.join(content_dir, 'rirs_noises')

# Collect the .wav files: mixtures to augment, room impulse responses, noises.
Overlapped_audios = get_all_files(os.path.join(content_dir, 'miniLibriSpeechOverlappedSegments'), match_and=['.wav'])
rir_audios = get_all_files(os.path.join(rirs_noises_dir, 'simulated_rirs'), match_and=['.wav'])
noise_audios = get_all_files(os.path.join(rirs_noises_dir, 'noises'), match_and=['.wav'])

# Write one CSV per file list.
# NOTE(review): the value returned by write_csv is kept as in the original,
# but nothing visible here shows what it contains — confirm before relying on it.
Overlapped_audios_csv = write_csv(Overlapped_audios, os.path.join(content_dir, 'Overlapped_audios.csv'))
rir_audios_csv = write_csv(rir_audios, os.path.join(content_dir, 'rir_audios.csv'))
noise_audios_csv = write_csv(noise_audios, os.path.join(content_dir, 'noise_audios.csv'))

In [None]:
# Quick look at the file name (text after the last backslash) of the second mixture.
Overlapped_audios[1].rsplit("\\", 1)[-1]

In [None]:
from tqdm import tqdm
import torch
import os
import torchaudio
from speechbrain.dataio.dataio import read_audio

# Initialize your noise and reverb processors
noisifier = AddNoise(csv_file=os.path.join(base_path, 'content', 'noise_audios.csv'), num_workers=8)
reverb = AddReverb(csv_file=os.path.join(base_path, 'content', 'rir_audios.csv'), num_workers=8)

batch_size = 10
total_batches = len(Overlapped_audios) // batch_size + (1 if len(Overlapped_audios) % batch_size > 0 else 0)

for i in tqdm(range(0, len(Overlapped_audios), batch_size), desc='Processing Batches', total=total_batches):
    batch_paths = Overlapped_audios[i:i+batch_size]
    
    # Load and process each audio file in the batch
    processed_audios = []
    for audio_path in batch_paths:
        audio_path = audio_path.replace("\\", "/")
        signal = read_audio(audio_path)
        clean = signal.unsqueeze(0) # if signal.dim() == 1 else signal.transpose(0, 1).unsqueeze(0)

        # Apply noise
        noisy = noisifier(clean, torch.ones(clean.size(0)))

        # Apply reverb
        if noisy.dim() == 2:
            noisy = noisy.unsqueeze(-1)  # Adding channel dimension for mono signals
        reverbed = reverb(noisy)

        # Ensure the output is in the correct shape for saving [time, channels]
        processed_audio = reverbed.squeeze(0).transpose(0, 1)
        processed_audios.append(processed_audio)

    # Save processed audio
    for j, processed_audio in enumerate(processed_audios):
        output_path = batch_paths[j]
        torchaudio.save(output_path, processed_audio, 16000)
# noisifier = AddNoise(csv_file=os.path.join(base_path, 'content', 'noise_audios.csv'), num_workers=8)
# reverb = AddReverb(csv_file=os.path.join(base_path, 'content', 'rir_audios.csv'), num_workers=8)
# for audio in Overlapped_audios:
#     audio = audio.replace("\\", "/")
#     signal  = read_audio(audio)
#     clean = signal.unsqueeze(0)
#     
#     noisy = noisifier(clean, torch.ones(1))
#     
#     reberbed = reverb(noisy.unsqueeze(0)) 
#     
#     torchaudio.save(audio, reberbed, 16000) 

In [None]:
#### Without saving json data

# Input: segment metadata written by the segmentation cell.
# Output: folder that receives the mixed (overlapped) audio files.
json_file_path = os.path.join(base_path, 'content', 'miniLibriSpeechSegments', 'train-clean-100_segments.json')
output_directory = os.path.join(base_path, 'content', 'miniLibriSpeechOverlappedSegments')

# exist_ok=True replaces the check-then-create pattern (no race, re-runnable).
os.makedirs(output_directory, exist_ok=True)

def load_segment_info(json_file_path):
    """Read the segment metadata JSON file and return its parsed content.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        Whatever `json.load` produces for the file (the segmentation cell
        writes a list of per-segment dicts).
    """
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

all_segment_info = load_segment_info(json_file_path)

def mix_audio_segments(segments):
    """Add up several waveforms into a single-row tensor.

    All inputs are aligned at sample 0. The result has the length of the
    longest input; shorter inputs contribute nothing beyond their own end.

    Args:
        segments: Non-empty iterable of 2-D tensors indexed as [row, sample].

    Returns:
        Tensor of shape (1, longest_input_length) with the summed samples.
    """
    target_len = max(seg.size(1) for seg in segments)
    total = torch.zeros(1, target_len)
    for seg in segments:
        seg_len = seg.size(1)
        total[:, :seg_len] += seg[:, :seg_len]
    return total

def process_combination(combo_info):
    """Mix the audios of one combination and save the result as a `.wav` file.

    Args:
        combo_info: Tuple `(combo, segments_by_audio, output_directory,
            num_combinations)`: the audio keys to mix, the key -> segment dict
            mapping (needs 'file_path'), the destination folder and the number
            of speakers in the mixture.

    Returns:
        The identifier of the mixture, which is also the output file stem.
    """
    combo, segments_by_audio, output_directory, num_combinations = combo_info

    # Load every waveform that takes part in the mixture.
    waveforms = []
    for audio_key in combo:
        waveform, _ = torchaudio.load(segments_by_audio[audio_key]['file_path'])
        waveforms.append(waveform)
    mixed_waveform = mix_audio_segments(waveforms)

    combo_id = "_".join(combo) + f"_mixed_{num_combinations}"
    destination = os.path.join(output_directory, f"{combo_id}.wav")
    torchaudio.save(destination, mixed_waveform, 16000)
    return combo_id

def prepare_combinations_data(all_segment_info, num_combinations, max_combinations_per_audio):
    """Pick combinations of audios from distinct speakers under a usage cap.

    Audio keys are shuffled first so that the greedy selection below does not
    always favour the audios that happen to come first in `all_segment_info`.

    Args:
        all_segment_info: Iterable of segment dicts with the keys 'speakerID',
            'chapter_id' and 'audio_id'.
        num_combinations: Number of audios (= distinct speakers) per combination.
        max_combinations_per_audio: Maximum number of combinations any single
            audio key may take part in.

    Returns:
        Tuple `(valid_combinations, segments_by_audio)`: the list of selected
        key tuples and the mapping from audio key to segment dict.
    """
    segments_by_audio = {}
    for seg_info in all_segment_info:
        audio_key = f"{seg_info['speakerID']}_{seg_info['chapter_id']}_{seg_info['audio_id']}"
        # NOTE(review): all segments of one utterance share this key, so only
        # the last segment per utterance is kept — confirm this is intended.
        segments_by_audio[audio_key] = seg_info
    audio_usage_counter = dict.fromkeys(segments_by_audio, 0)

    # Shuffle the keys and actually iterate over the shuffled order.
    audio_keys = list(segments_by_audio)
    random.shuffle(audio_keys)

    # Number of audios that can still be used; once fewer than
    # `num_combinations` remain, no further combination can be valid and the
    # (potentially huge) enumeration can stop early.
    available = len(audio_keys) if max_combinations_per_audio > 0 else 0

    valid_combinations = []
    for combo in itertools.combinations(audio_keys, num_combinations):
        if available < num_combinations:
            break
        if any(audio_usage_counter[key] >= max_combinations_per_audio for key in combo):
            continue
        # Every member of the combination must come from a different speaker.
        speakers = {segments_by_audio[key]['speakerID'] for key in combo}
        if len(speakers) != num_combinations:
            continue
        valid_combinations.append(combo)
        for key in combo:
            audio_usage_counter[key] += 1
            if audio_usage_counter[key] >= max_combinations_per_audio:
                available -= 1

    return valid_combinations, segments_by_audio

def main(all_segment_info, max_combinations_per_audio=50, max_workers=10):
    """Create 2-, 3- and 4-speaker mixtures without writing a metadata file.

    For each speaker count the valid combinations are computed and mixed in a
    thread pool. The progress bar wraps the result iterator, so it advances as
    mixtures are actually produced.

    Args:
        all_segment_info: List of segment dicts (keys 'speakerID',
            'chapter_id', 'audio_id', ...).
        max_combinations_per_audio: Usage cap forwarded to
            `prepare_combinations_data`.
        max_workers: Size of the thread pool.

    Returns:
        List with the identifier of every mixture that was written.
    """
    segments_by_audio = {f"{seg_info['speakerID']}_{seg_info['chapter_id']}_{seg_info['audio_id']}": seg_info for seg_info in all_segment_info}

    combo_ids = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for num_combinations in tqdm(range(2, 5), desc="Generating combinations"):
            valid_combinations, _ = prepare_combinations_data(all_segment_info, num_combinations, max_combinations_per_audio)
            tasks = [(combo, segments_by_audio, output_directory, num_combinations) for combo in valid_combinations]
            # executor.map yields results in submission order; iterating it
            # through tqdm shows real progress and re-raises worker errors.
            results = executor.map(process_combination, tasks)
            combo_ids.extend(tqdm(results, total=len(tasks), desc="Processing audio combos"))

    return combo_ids

if __name__ == "__main__":
    main(all_segment_info)