# Installations

In [2]:
!pip install soundfile
!pip install torchaudio



# Parameters and Imports

In [3]:
original_songs_sr = 44100
working_sr = 16000

In [4]:
import os
import soundfile as sf
import librosa
import numpy as np
import IPython.display as ipd
import torchaudio
import torch
import torch.nn as nn
from tqdm import tqdm

In [5]:
resampler = torchaudio.transforms.Resample(orig_freq=original_songs_sr, new_freq=working_sr)
sr = working_sr

# Load Data

In [6]:
base_path = '/content/drive/MyDrive/Ofir/SoundProcessingFinalProject'
data_path = os.path.join(base_path, 'Data')
output_data = os.path.join(data_path, 'dataset_organized')

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Audio Songs- load and save organaized

In [13]:
songs_path = os.path.join(data_path, 'split_data_Billaboard')
accompaniment_path = os.path.join(songs_path, 'accompaniment_of_songs')
vocal_path = os.path.join(songs_path, 'vocal_of_songs')
songs_output_data = os.path.join(output_data, 'songs')

In [14]:
# prompt: create a function that takes 2 wav files(voc, acc), slice them into 5 seconds and save them only if voc and acc have sound for at least 3 out of the 5 seconds. please create overlap between the sliced parts

import numpy as np
def slice_and_save_audio(song_name, voc, acc, output_dir, slice_duration=2, overlap=1):
    """
    Slices vocal and accompaniment audio files into segments and saves them if both have sufficient sound energy.

    """
    samples_per_slice = int(slice_duration * sr)
    samples_per_overlap = int(overlap * sr)
    print(f"samples_per_overlap: {samples_per_overlap}, samples_per_slice: {samples_per_slice}, audio_length: {voc.shape[1]}")
    # samples_per_overlap: 40000, samples_per_slice: 80000, audio_length: 2757045
    voc_threshold = 0.03 * torch.max(torch.abs(voc)) * samples_per_slice
    acc_threshold = 0.03 * torch.max(torch.abs(acc)) * samples_per_slice
    print(f"voc_threshold: {voc_threshold}, acc_threshold: {acc_threshold}")
    for start_sample in range(0, voc.shape[1] - samples_per_slice + 1, samples_per_slice - samples_per_overlap):
        end_sample = start_sample + samples_per_slice
        # slice torch voc and acc
        voc_slice = voc[:, start_sample:end_sample]
        acc_slice = acc[:, start_sample:end_sample]

        # Check for sufficient sound energy in both slices
        voc_energy = torch.sum(torch.abs(voc_slice))
        acc_energy = torch.sum(torch.abs(acc_slice))

        if voc_energy >= voc_threshold and acc_energy >= acc_threshold :
          song_folder = os.path.join(output_dir, song_name)
          if not os.path.exists(song_folder):
              os.makedirs(song_folder)
          voc_filename = os.path.join(song_folder, f"voc_{start_sample // samples_per_slice}.wav")
          acc_filename = os.path.join(song_folder, f"acc_{start_sample // samples_per_slice}.wav")
          # print(f"Saving {voc_filename} and {acc_filename}")

          # # play voc and acc sliced audio:
          # print("Voc slice:")
          # ipd.display(ipd.Audio(voc_slice, rate=sr))
          # print("Acc slice:")
          # ipd.display(ipd.Audio(acc_slice, rate=sr))
          torchaudio.save(voc_filename, voc_slice, sr)
          torchaudio.save(acc_filename, acc_slice, sr)
        else:
          pass
          # print(f"Skipping {song_name} slice {start_sample // samples_per_slice} due to insufficient sound energy. voc_energy: {voc_energy}, acc_energy: {acc_energy}")
          # print("Skipped Voc slice:")
          # ipd.display(ipd.Audio(voc_slice, rate=sr))
          # print("Skipped Acc slice:")
          # ipd.display(ipd.Audio(acc_slice, rate=sr))
          # print('-----------------------------------------------')



In [15]:
songs_pairs = []
for filename in os.listdir(accompaniment_path):
    if filename.endswith(".wav"):
        accompaniment_file = os.path.join(accompaniment_path, filename)
        vocal_file = os.path.join(vocal_path, filename)

        if os.path.exists(vocal_file):
            try:
                # Load accompaniment and vocal files with torchaudio
                y_acc, sr_acc = torchaudio.load(accompaniment_file)  # [channels, samples]
                y_voc, sr_voc = torchaudio.load(vocal_file)
                # Check if sample rates are the same
                if sr_acc != sr_voc:
                    print(f"Skipping {filename}: Different sample rates.")
                    continue
                print(sr_acc)
                # Ensure mono (convert stereo to mono if necessary)
                if y_acc.shape[0] > 1:
                    y_acc = torch.mean(y_acc, dim=0, keepdim=True)
                if y_voc.shape[0] > 1:
                    y_voc = torch.mean(y_voc, dim=0, keepdim=True)

                # Check if lengths are the same
                if y_acc.shape[1] != y_voc.shape[1]:
                    print(f"Skipping {filename}: Different lengths.")
                    continue

                y_acc = resampler(y_acc)
                y_voc = resampler(y_voc)

                slice_and_save_audio(filename[:-4], y_voc, y_acc, songs_output_data)
                songs_pairs.append((accompaniment_file, vocal_file))
                print(f"Done processing {filename}")
            except Exception as e:
                print(f"Error processing {filename}")

print(f"Found {len(songs_pairs)} matching pairs.")

Error processing 0003+I Don_t Mind+James Brown.wav
Error processing 0012+Lookin_ For Love+Johnny Lee.wav
Error processing 0006+The Rose+Bette Midler.wav
44100
samples_per_overlap: 16000, samples_per_slice: 32000, audio_length: 2757045
voc_threshold: 902.982666015625, acc_threshold: 847.9668579101562


KeyboardInterrupt: 

In [69]:
ipd.display(ipd.Audio(y_voc, rate=sr))

# LibriSpeech- Load and organize

In [8]:
libri_speech_path = os.path.join(data_path, 'LibriSpeech/dev-clean')
libri_speech_output_data = os.path.join(output_data, 'Librispeech/dev-clean')

In [None]:
# prompt: Load LibriSpeech as pairs (audio-file, and transcript) from drive. structure: folders that contains folders that contains text file with the transcript and audio files

libri_speech_pairs = []
for root, _, files in os.walk(libri_speech_path):
  #get the file that ends with .txt
  for file in files:
      if file.endswith(".txt"):
        try:
          with open(os.path.join(root, file), 'r') as f:
            all_transcripts = f.read().strip()
          # each row formated as '<file_name> <transcript>'
          # example:
          # 1272-141231-0003 HIS INSTANT OF PANIC WAS FOLLOWED BY A SMALL SHARP BLOW HIGH ON HIS CHEST while
          # <file_name> = 1272-141231-0003 ; <transcript> = HIS INSTANT OF PANIC WAS FOLLOWED BY A SMALL SHARP BLOW HIGH ON HIS CHEST
          # Parse all_transcripts and then Load file_name and save it as pair with its transcript (the one in the same line)
          for line in all_transcripts.split('\n'):
            audio_file_name, transcript = line.split(' ', 1)
            audio_file_path = os.path.join(root, audio_file_name) + '.flac'
            # Load the audio file
            audio_file, sr = torchaudio.load(audio_file_path)
            # make audio length- 5 seconds (pad with zero or slice according to input)
            if audio_file.shape[1] > 5 * sr:
              audio_file = audio_file[:, :5 * sr]
            elif audio_file.shape[1] <= 5 * sr:
              audio_file = torch.nn.functional.pad(audio_file, (0, 5 * sr - audio_file.shape[1]))
            else:
              print("Something wrong")
            # ipd.display(ipd.Audio(audio_file, rate=sr))
            #save new audio file
            folders_path = root.split(os.sep)[-2:]
            save_file_path = os.path.join(libri_speech_output_data, folders_path[0],folders_path[1], audio_file_name + '.wav')
            os.makedirs(os.path.dirname(save_file_path), exist_ok=True)

            torchaudio.save(save_file_path, audio_file, sr)
            libri_speech_pairs.append((save_file_path, transcript))
            print(f"saved {audio_file_name}")
        except Exception as e:
          print(f"Error reading transcript for {line}: {e}")

print(f"Found {len(libri_speech_pairs)} LibriSpeech pairs.")

saved 1988-148538-0000
saved 1988-148538-0001
saved 1988-148538-0002
saved 1988-148538-0003
saved 1988-148538-0004
saved 1988-148538-0005
saved 1988-148538-0006
saved 1988-148538-0007
saved 1988-148538-0008
saved 1988-148538-0009
saved 1988-148538-0010
saved 1988-148538-0011
saved 1988-148538-0012
saved 1988-148538-0013
saved 1988-148538-0014
saved 1988-148538-0015
saved 1988-147956-0000
saved 1988-147956-0001
saved 1988-147956-0002
saved 1988-147956-0003
saved 1988-147956-0004
saved 1988-147956-0005
saved 1988-147956-0006
saved 1988-147956-0007
saved 1988-147956-0008
saved 1988-147956-0009
saved 1988-147956-0010
saved 1988-147956-0011
saved 1988-147956-0012
saved 1988-147956-0013
saved 1988-147956-0014
saved 1988-147956-0015
saved 1988-147956-0016
saved 1988-147956-0017
saved 1988-147956-0018
saved 1988-147956-0019
saved 1988-147956-0020
saved 1988-147956-0021
saved 1988-147956-0022
saved 1988-147956-0023
saved 1988-147956-0024
saved 1988-147956-0025
saved 1988-147956-0026
saved 1988-