# Installations

In [None]:
# Uncomment and run once on a fresh runtime where these packages are not installed yet.
# !pip install soundfile
# !pip install torchaudio

# Parameters and Imports

In [None]:
# Length of every saved clip, in seconds.
duration = 2

# Sample rates in Hz: the rate the source songs are assumed to be stored at,
# and the rate all processing in this notebook is done at.
original_songs_sr = 44100
working_sr = 16000

In [None]:
import os
import soundfile as sf
import librosa
import numpy as np
import IPython.display as ipd
import torchaudio
import torch
import torch.nn as nn
from tqdm import tqdm

In [None]:
# Sample rate used whenever processed audio is sliced, saved, or played back.
sr = working_sr

# Shared transform that converts audio from the assumed song rate to the working rate.
resampler = torchaudio.transforms.Resample(
    orig_freq=original_songs_sr,
    new_freq=working_sr,
)

# Load Data

In [None]:
# Project root on the mounted Google Drive; every other path is derived from it.
base_path = '/content/drive/MyDrive/Ofir/SoundProcessingFinalProject'

# Folder holding the datasets, and the sub-folder that receives the organized output.
data_path = os.path.join(base_path, 'Data')
output_data = os.path.join(base_path, 'Data', 'dataset_organized')

In [None]:
# Colab only: mount Google Drive so that the '/content/drive/...' paths used in this
# notebook resolve to real files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Audio Songs - load and save organized

In [None]:
# Input folders: one folder per stem type, both under the same songs directory.
songs_path = os.path.join(data_path, 'split_data_Billaboard')
vocal_path = os.path.join(songs_path, 'vocal_of_songs')
accompaniment_path = os.path.join(songs_path, 'accompaniment_of_songs')

# Output folder for the sliced clips.
# Earlier alternative kept for reference: os.path.join(output_data, 'songs')
songs_output_data = os.path.join(output_data, 'songs_2s')

In [None]:
import numpy as np
def slice_and_save_audio(song_name, voc, acc, output_dir, slice_duration=duration, overlap=1):
    """
    Slice a vocal/accompaniment pair into overlapping windows and save every window
    in which both signals carry enough energy.

    Windows are written to ``<output_dir>/<song_name>/voc_<i>.wav`` and ``acc_<i>.wav``,
    where ``i`` is the running index of the window (0 for the first window, 1 for the
    next hop, ...). Skipped windows keep their index, so ``i`` always identifies the
    window position. The module-level ``sr`` is used as the sample rate.

    Args:
        song_name: Name of the sub-folder of ``output_dir`` that receives the slices.
        voc: Vocal waveform tensor indexed as ``[channels, samples]``.
        acc: Accompaniment waveform tensor, sliced with the same sample indices as
            ``voc`` (only ``voc``'s length is used to place the windows).
        output_dir: Root directory of the sliced dataset.
        slice_duration: Window length in seconds.
        overlap: Overlap between consecutive windows in seconds; must be smaller
            than ``slice_duration``.

    Returns:
        The number of slice pairs written to disk.

    Raises:
        ValueError: If the window length is not positive, or the overlap is not
            smaller than the window length.
    """
    samples_per_slice = int(slice_duration * sr)
    samples_per_overlap = int(overlap * sr)
    hop = samples_per_slice - samples_per_overlap
    if samples_per_slice <= 0:
        raise ValueError(f"slice_duration must be positive, got {slice_duration}")
    if hop <= 0:
        # A zero step makes range() fail and a negative one silently yields no slices.
        raise ValueError(
            f"overlap ({overlap}s) must be smaller than slice_duration ({slice_duration}s)"
        )

    num_samples = voc.shape[1]
    print(f"samples_per_overlap: {samples_per_overlap}, samples_per_slice: {samples_per_slice}, audio_length: {num_samples}")
    if num_samples < samples_per_slice:
        # Not even one full window; also avoids torch.max on an empty tensor.
        return 0

    # A window is kept when its summed magnitude is at least what a constant signal
    # at 5% of the track's peak amplitude would produce over one window.
    energy_fraction = 0.05
    voc_threshold = energy_fraction * torch.max(torch.abs(voc)) * samples_per_slice
    acc_threshold = energy_fraction * torch.max(torch.abs(acc)) * samples_per_slice
    print(f"voc_threshold: {voc_threshold}, acc_threshold: {acc_threshold}")

    song_folder = os.path.join(output_dir, song_name)
    saved_count = 0
    window_starts = range(0, num_samples - samples_per_slice + 1, hop)
    for slice_index, start_sample in enumerate(window_starts):
        end_sample = start_sample + samples_per_slice
        voc_slice = voc[:, start_sample:end_sample]
        acc_slice = acc[:, start_sample:end_sample]

        # Both signals must carry enough energy for the pair to be kept.
        voc_energy = torch.sum(torch.abs(voc_slice))
        acc_energy = torch.sum(torch.abs(acc_slice))
        if voc_energy < voc_threshold or acc_energy < acc_threshold:
            continue

        # The folder is created lazily so songs without any usable window leave no
        # empty directory behind.
        os.makedirs(song_folder, exist_ok=True)

        # Index by window position (hop count). Dividing the start sample by the
        # window length would give overlapping windows the same index and make
        # them overwrite each other on disk.
        voc_filename = os.path.join(song_folder, f"voc_{slice_index}.wav")
        acc_filename = os.path.join(song_folder, f"acc_{slice_index}.wav")
        torchaudio.save(voc_filename, voc_slice, sr)
        torchaudio.save(acc_filename, acc_slice, sr)
        saved_count += 1

    return saved_count



In [None]:
# Pair every accompaniment file with the vocal file of the same name, convert both to
# mono at the working sample rate, and write the sliced clips to songs_output_data.
# songs_pairs collects the (accompaniment_file, vocal_file) paths that were processed.
songs_pairs = []
# Sorted so that the processing order (and the log) is the same on every run.
for filename in sorted(os.listdir(accompaniment_path)):
    if not filename.endswith(".wav"):
        continue

    accompaniment_file = os.path.join(accompaniment_path, filename)
    vocal_file = os.path.join(vocal_path, filename)
    if not os.path.exists(vocal_file):
        continue

    try:
        # Load accompaniment and vocal files with torchaudio
        y_acc, sr_acc = torchaudio.load(accompaniment_file)  # [channels, samples]
        y_voc, sr_voc = torchaudio.load(vocal_file)

        # Both files must share one sample rate, otherwise equal sample counts would
        # not mean equal durations.
        if sr_acc != sr_voc:
            print(f"Skipping {filename}: Different sample rates.")
            continue

        # Ensure mono (average the channels if there is more than one)
        if y_acc.shape[0] > 1:
            y_acc = torch.mean(y_acc, dim=0, keepdim=True)
        if y_voc.shape[0] > 1:
            y_voc = torch.mean(y_voc, dim=0, keepdim=True)

        # The slicer cuts both signals with the same sample indices.
        if y_acc.shape[1] != y_voc.shape[1]:
            print(f"Skipping {filename}: Different lengths.")
            continue

        # The shared resampler is built for original_songs_sr; a file stored at any
        # other rate needs its own transform or it would be resampled by the wrong ratio.
        if sr_acc == original_songs_sr:
            file_resampler = resampler
        else:
            file_resampler = torchaudio.transforms.Resample(orig_freq=sr_acc, new_freq=working_sr)
        y_acc = file_resampler(y_acc)
        y_voc = file_resampler(y_voc)

        song_name = os.path.splitext(filename)[0]
        slice_and_save_audio(song_name, y_voc, y_acc, songs_output_data)
        songs_pairs.append((accompaniment_file, vocal_file))
        print(f"Done processing {filename}")
    except Exception as e:
        # Best effort: one bad file must not stop the run, but the cause is reported
        # so failures can actually be diagnosed.
        print(f"Error processing {filename}: {type(e).__name__}: {e}")

print(f"Found {len(songs_pairs)} matching pairs.")

Error processing 0003+I Don_t Mind+James Brown.wav
Error processing 0012+Lookin_ For Love+Johnny Lee.wav
Error processing 0006+The Rose+Bette Midler.wav
samples_per_overlap: 16000, samples_per_slice: 32000, audio_length: 2757045
voc_threshold: 1504.97119140625, acc_threshold: 1413.278076171875
Done processing 0025+Chicago+Graham Nash.wav
Error processing 0023+And She Was+Talking Heads.wav
Error processing 0019+Here_s Some Love+Tanya Tucker.wav
samples_per_overlap: 16000, samples_per_slice: 32000, audio_length: 2333513
voc_threshold: 1567.469482421875, acc_threshold: 1510.3973388671875
Done processing 0026+Sweet Talkin_ Guy+The Chiffons.wav
samples_per_overlap: 16000, samples_per_slice: 32000, audio_length: 3233333
voc_threshold: 1163.5552978515625, acc_threshold: 1532.547607421875
Done processing 0043+Two Hearts+Phil Collins.wav
samples_per_overlap: 16000, samples_per_slice: 32000, audio_length: 4272843
voc_threshold: 1019.9617309570312, acc_threshold: 1066.502685546875
Done processing

In [None]:
# Play back whatever the song-processing loop most recently assigned to `y_voc`.
# NOTE(review): this depends on leftover loop state; it raises NameError when the loop
# never loaded a file, and `sr` must still be the rate `y_voc` was resampled to.
ipd.display(ipd.Audio(y_voc, rate=sr))

# LibriSpeech - Load and organize

In [None]:
# Destination for the fixed-length LibriSpeech clips.
libri_speech_output_data = os.path.join(output_data, 'Librispeech_2s/dev-clean')

# Source LibriSpeech subset, read from the data folder.
libri_speech_path = os.path.join(data_path, 'LibriSpeech/dev-clean')

In [None]:
# Build (saved_audio_path, transcript) pairs from LibriSpeech.
# Every '.txt' file found under libri_speech_path is read as a transcript file whose rows
# are formatted as '<file_name> <transcript>', for example:
#   1272-141231-0003 HIS INSTANT OF PANIC WAS FOLLOWED BY A SMALL SHARP BLOW HIGH ON HIS CHEST
# For each row, '<file_name>.flac' from the same folder is cropped or zero-padded to
# `duration` seconds and saved as '<file_name>.wav' under libri_speech_output_data,
# keeping the last two folder levels of the source path.
libri_speech_pairs = []
for root, _, files in os.walk(libri_speech_path):
    for file in files:
        if not file.endswith(".txt"):
            continue

        transcript_path = os.path.join(root, file)
        try:
            with open(transcript_path, 'r', encoding='utf-8') as f:
                transcript_lines = f.read().strip().split('\n')
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading transcript file {transcript_path}: {e}")
            continue

        # Same for every row of this transcript file, so computed once.
        folders_path = root.split(os.sep)[-2:]

        for line in transcript_lines:
            if not line.strip():
                continue
            # Errors are handled per row so one bad row or audio file does not drop
            # the rest of the transcript file.
            try:
                audio_file_name, transcript = line.split(' ', 1)
                audio_file_path = os.path.join(root, audio_file_name) + '.flac'
                save_file_path = os.path.join(libri_speech_output_data, folders_path[0], folders_path[1], audio_file_name + '.wav')

                if os.path.exists(save_file_path):
                    # Already converted on an earlier run: skip the work but still
                    # record the pair, so a re-run yields the complete list.
                    libri_speech_pairs.append((save_file_path, transcript))
                    continue

                # Load into a local name: rebinding the notebook-wide `sr` here would
                # change the rate other cells slice, save and play audio with.
                audio_file, file_sr = torchaudio.load(audio_file_path)

                # Make the clip exactly `duration` seconds long (crop or zero-pad).
                target_length = int(duration * file_sr)
                current_length = audio_file.shape[1]
                if current_length > target_length:
                    audio_file = audio_file[:, :target_length]
                elif current_length < target_length:
                    print(f"file {audio_file_name} with duration smaller than {duration} seconds")
                    audio_file = torch.nn.functional.pad(audio_file, (0, target_length - current_length))

                os.makedirs(os.path.dirname(save_file_path), exist_ok=True)
                torchaudio.save(save_file_path, audio_file, file_sr)
                libri_speech_pairs.append((save_file_path, transcript))
                print(f"saved {audio_file_name}")
            except Exception as e:
                print(f"Error reading transcript for {line}: {e}")

print(f"Found {len(libri_speech_pairs)} LibriSpeech pairs.")

saved 5338-24615-0008
saved 5338-24615-0009
saved 5338-24615-0010
saved 5338-24615-0011
saved 5338-24615-0012
saved 5338-24615-0013
saved 5338-24615-0014
saved 6345-93306-0000
saved 6345-93306-0001
saved 6345-93306-0002
saved 6345-93306-0003
saved 6345-93306-0004
saved 6345-93306-0005
saved 6345-93306-0006
saved 6345-93306-0007
saved 6345-93306-0008
saved 6345-93306-0009
saved 6345-93306-0010
saved 6345-93306-0011
saved 6345-93306-0012
saved 6345-93306-0013
saved 6345-93306-0014
saved 6345-93306-0015
saved 6345-93306-0016
saved 6345-93306-0017
saved 6345-93306-0018
saved 6345-93306-0019
saved 6345-93306-0020
saved 6345-93306-0021
saved 6345-93306-0022
saved 6345-93306-0023
saved 6345-93306-0024
saved 6345-93306-0025
saved 6345-64257-0000
saved 6345-64257-0001
saved 6345-64257-0002
saved 6345-64257-0003
saved 6345-64257-0004
saved 6345-64257-0005
saved 6345-64257-0006
saved 6345-64257-0007
saved 6345-64257-0008
saved 6345-64257-0009
saved 6345-64257-0010
saved 6345-64257-0011
saved 6345