In [1]:
# Libraries
import torch
import os
import re
from pydub import AudioSegment
from demucs.pretrained import get_model
from demucs.apply import apply_model
from demucs.separate import load_track
import librosa
import soundfile

In [3]:
# --- Notebook configuration -------------------------------------------------
# Root of the dataset; the cells below read its 'lrc' and 'songs' sub-folders.
# The default is a machine-specific path, so it can be overridden without
# editing the notebook by setting the LYRICS_DATASET_PATH environment variable.
dataset_path = os.environ.get(
    'LYRICS_DATASET_PATH',
    '/home/kangyi/Lyrics-audio-Alignment/dataset/songs_en',
)
# Language tag of the lyrics (not referenced in the cells shown here).
lang = "en-US"

# NOTE(review): not referenced in the cells shown here -- confirm it is still needed.
PRE_PROCESSED = True
# Sample rate (Hz) assumed for the source audio before down-sampling.
ORIGINAL_SR = 44100
# Sample rate (Hz) of the vocal WAV files written by this notebook.
TARGET_SR = 16000

In [4]:
def select_device(mps_enable = True):
    """Return the preferred PyTorch device for this machine.

    Preference order: CUDA, then Apple Silicon (MPS) when ``mps_enable`` is
    truthy, then CPU.

    Args:
        mps_enable: Set to a falsy value to never select the MPS backend.

    Returns:
        torch.device: ``cuda``, ``mps`` or ``cpu``.
    """
    if torch.cuda.is_available():
        return torch.device('cuda')     # CUDA

    # `torch.backends.mps` is missing on some PyTorch builds; treat that as
    # "not available" instead of raising AttributeError.
    mps_backend = getattr(torch.backends, 'mps', None)
    # Logical `and` (not bitwise `&`) so the check short-circuits and accepts
    # any truthy/falsy flag.
    if mps_enable and mps_backend is not None and mps_backend.is_available():
        return torch.device('mps')      # Apple Silicon

    return torch.device("cpu")          # CPU

In [5]:
def remove_special_characters(filename):
    """Return ``filename`` with every disallowed character removed.

    Only ASCII letters, digits, underscores, dots and hyphens are kept;
    everything else (spaces, brackets, quotes, non-ASCII letters, ...) is
    dropped.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9_.-]')
    return disallowed.sub('', filename)

# Pair lyric files with song files.
# A lyric file and a song file belong together when their base names (extension
# stripped, special characters removed) are identical.
file_names_lrc = os.listdir(os.path.join(dataset_path, 'lrc'))
file_names_songs = os.listdir(os.path.join(dataset_path, 'songs'))
file_names_lrc_processed = [remove_special_characters(os.path.splitext(file)[0]) for file in file_names_lrc]
file_names_songs_processed = [remove_special_characters(os.path.splitext(file)[0]) for file in file_names_songs]

# Index the songs by sanitized base name so each lyric file needs one dict
# lookup instead of a scan over every song. macOS '.DS_Store' entries are
# ignored. A key may map to several songs (same base name, different
# extension); all of them are kept, in directory-listing order.
songs_by_name = {}
for song_file, song_key in zip(file_names_songs, file_names_songs_processed):
    if song_file != '.DS_Store':
        songs_by_name.setdefault(song_key, []).append(song_file)

# Parallel lists: file_names_lrc_idx[k] is the lyric file for file_names_songs_idx[k].
file_names_lrc_idx = []
file_names_songs_idx = []
for lrc_file, lrc_key in zip(file_names_lrc, file_names_lrc_processed):
    if lrc_file == '.DS_Store':
        continue
    for song_file in songs_by_name.get(lrc_key, []):
        file_names_lrc_idx.append(lrc_file)
        file_names_songs_idx.append(song_file)
len(file_names_lrc_idx)

302

In [7]:
# Make sure the vocal output folder exists. `makedirs` also creates the parent
# 'output-en', and `exist_ok=True` covers the case where 'output-en' is already
# there but its 'vocal' sub-folder is not.
os.makedirs('output-en' + os.sep + 'vocal', exist_ok=True)

# Extract Vocals
# Select Device
# The device chosen here is used both to host the model and for inference below.
device = select_device()
print("Device Selected:", device)

# Choose Demucs Model for Vocals Extraction
# NOTE(review): repo=None presumably selects Demucs' default pretrained-model
# location -- confirm against the installed demucs version.
demucs_model = get_model(name="htdemucs", repo=None)
demucs_model.to(device)
# Inference only: put the model in evaluation mode.
demucs_model.eval()
# Position of the "vocals" entry in the model's list of sources; used to pick
# the vocal stem out of the separated output.
vocals_source_idx = demucs_model.sources.index("vocals")
# Sample rate reported by the model; tracks are loaded at this rate.
sample_rate = demucs_model.samplerate

# Separate the vocal stem of every song that has matching lyrics and save it as
# a TARGET_SR WAV file. Iterate over the *matched* songs (file_names_songs_idx),
# not the raw directory listing, so only songs with lyrics are processed.
for song_file in file_names_songs_idx:
    song_name, song_ext = os.path.splitext(song_file)
    song_path_abs = os.path.join(dataset_path, 'songs', song_file)
    vocal_path = os.path.join('output-en', 'vocal', song_name + "_vocal.wav")
    # Skip songs that were already converted and anything that is not an .ogg file.
    if os.path.exists(vocal_path) or song_ext != '.ogg':
        continue

    # Load Audio Track (2 channels, at the model's sample rate)
    audio_track = load_track(song_path_abs, 2, sample_rate)

    # Extract Vocal
    # Normalize with the mean/std of the channel-averaged signal before separation.
    ref = audio_track.mean(0)
    ref_mean, ref_std = ref.mean(), ref.std()
    audio_track_nor = (audio_track - ref_mean) / ref_std
    with torch.no_grad():
        sources = apply_model(demucs_model, audio_track_nor[None], device=device, shifts=1, split=True, overlap=0.25, progress=False)
    # Undo the normalization so the stem is back at the original signal scale;
    # otherwise the written WAV has the wrong level and may clip.
    vocals = sources[0][vocals_source_idx].cpu() * ref_std + ref_mean
    # Keep only the first channel of the separated stem.
    track_vocal = vocals.numpy()[0, ...]

    # Post-processing
    # Resample from the rate the audio was actually loaded at (sample_rate).
    track_vocal = librosa.resample(track_vocal, orig_sr=sample_rate, target_sr=TARGET_SR)

    # Write to Output
    soundfile.write(vocal_path, track_vocal, TARGET_SR)
    print(f"Converted {song_file} to {vocal_path}")


Device Selected: cuda
Converted Jonas Blue _ JP Cooper - Perfect Strangers.ogg to output-en/vocal/Jonas Blue _ JP Cooper - Perfect Strangers_vocal.wav
Converted Ellie Goulding - Burn.ogg to output-en/vocal/Ellie Goulding - Burn_vocal.wav
Converted Skrillex - Scary Monsters and Nice Sprites.ogg to output-en/vocal/Skrillex - Scary Monsters and Nice Sprites_vocal.wav
Converted Taylor Swift - Mine.ogg to output-en/vocal/Taylor Swift - Mine_vocal.wav
Converted Damian _Jr Gong_ Marley _ Skrillex - Make It Bun Dem.ogg to output-en/vocal/Damian _Jr Gong_ Marley _ Skrillex - Make It Bun Dem_vocal.wav
Converted Bruno Mars - Grenade.ogg to output-en/vocal/Bruno Mars - Grenade_vocal.wav
Converted Bloodhound Gang - The Bad Touch.ogg to output-en/vocal/Bloodhound Gang - The Bad Touch_vocal.wav
Converted Akon - Right Now(Na Na Na).ogg to output-en/vocal/Akon - Right Now(Na Na Na)_vocal.wav
Converted Michael Jackson - They Don't Care About Us.ogg to output-en/vocal/Michael Jackson - They Don't Care Ab