In [1]:
# Libararies
import torch
import os
import re
from pydub import AudioSegment
from demucs.pretrained import get_model
from demucs.apply import apply_model
from demucs.separate import load_track
import librosa
import soundfile

In [2]:
dataset_path = '/home/kangyi/Lyrics-audio-Alignment/dataset/songs_en'
lang = "en-US"

PRE_PROCESSED = True
ORIGINAL_SR = 44100
TARGET_SR = 16000

In [3]:
# Return pytorch device
def select_device(mps_enable = True):
    if torch.cuda.is_available():
        return torch.device('cuda')     # CUDA
    elif torch.backends.mps.is_available() & mps_enable: 
        return torch.device('mps')    # Apple Sillicon
    else:
        return torch.device("cpu")    # CPU

In [4]:
def remove_special_characters(filename):
    # Replace all non-alphanumeric characters (except for dots and underscores) with an empty string
    return re.sub(r'[^a-zA-Z0-9_.-]', '', filename)

# Get filenames
# Get a list of file names in the directory
file_names_lrc = os.listdir(dataset_path + os.sep + 'lrc_processed')
file_names_songs = os.listdir(dataset_path + os.sep + 'songs')
file_names_lrc_processed = [remove_special_characters(os.path.splitext(file)[0]) for file in file_names_lrc]
file_names_songs_processed = [remove_special_characters(os.path.splitext(file)[0]) for file in file_names_songs]

file_names_lrc_idx = []
file_names_songs_idx = []
for i in range(len(file_names_lrc_processed)):
    for j in range(len(file_names_songs_processed)):
        if file_names_lrc_processed[i] == file_names_songs_processed[j]:
            if file_names_lrc_processed[i] != '':
                file_names_lrc_idx.append(file_names_lrc[i])
                file_names_songs_idx.append(file_names_songs[j])
            continue
len(file_names_lrc_idx)

281

In [None]:
if not os.path.exists('dataset/output-en'):
    os.makedirs('dataset/output-en')
    os.makedirs('dataset/output-en' + os.sep + 'vocal')

# Extract Vocals
# Select Device
device = select_device()
print("Device Selected:", device)

output_path = "/home/kangyi/Lyrics-audio-Alignment/dataset/output-en"

# Choose Demucs Model for Vocals Extraction
demucs_model = get_model(name="htdemucs", repo=None)
demucs_model.to(device)
demucs_model.eval()
vocals_source_idx = demucs_model.sources.index("vocals")
sample_rate = demucs_model.samplerate

for i in range(len(file_names_songs_idx)):
    song_path_abs = dataset_path + os.sep + 'songs' + os.sep + file_names_songs_idx[i]
    vocal_path = output_path + os.sep + 'vocal' + os.sep + os.path.splitext(file_names_songs_idx[i])[0] + "_vocal.wav"
    if os.path.exists(vocal_path) or os.path.splitext(file_names_songs_idx[i])[-1] != '.ogg':
        continue
    # Load Aduio Track
    print(f"Loaded {song_path_abs}")
    audio_track = load_track(song_path_abs, 2, sample_rate)

    # Extract Vocal
    ref = audio_track.mean(0)
    audio_track_nor = (audio_track - ref.mean()) / ref.std() # Normalization
    with torch.no_grad():
        sources = apply_model(demucs_model, audio_track_nor[None], device=device, shifts=1, split=True, overlap=0.25, progress=False)
    track_vocal = sources[0][vocals_source_idx].cpu().numpy()[0, ...]

    # Post-processing
    track_vocal = librosa.resample(track_vocal, orig_sr=ORIGINAL_SR, target_sr=TARGET_SR)

    # Write to Output
    try:
        soundfile.write(vocal_path, track_vocal, TARGET_SR)
    except Exception as e:
        print(f"An error occurred: {e}")
        print(f"Converted {file_names_songs_idx[i]} to {vocal_path}")
    print(f"{i+1}/{len(file_names_songs_idx)}")


Device Selected: cuda


In [9]:
# Helper Function to convert time
def timestamp_to_seconds(ts_str):
    # ts_str format: "MM:SS.xxx"
    minutes, rest = ts_str.split(":")
    seconds = float(rest)
    total_seconds = int(minutes)*60 + seconds
    return total_seconds

files = []
lines = []

# Create folders if not exits
if not os.path.exists('dataset/output-en' + os.sep + 'splits'):
    os.makedirs('dataset/output-en' + os.sep + 'splits')

# Go through all of them
for idx in range(len(file_names_lrc_idx)):
    # Make sure it is lrc file
    if os.path.splitext(file_names_songs_idx[idx])[-1] != '.ogg':
        continue

    # Parse Lrc Files
    pattern = r'\[([0-9]{2}:[0-9]{2}\.[0-9]{3})\]([^[]*)'
    parsed_data = []
    lrc_path_abs = dataset_path + os.sep + 'lrc_processed' + os.sep + file_names_lrc_idx[idx]
    print(file_names_lrc_idx[idx])
    with open(lrc_path_abs, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            matches = re.findall(pattern, line)
            lyrics_line = ''
            for i in range(len(matches)):
                lyrics_line = lyrics_line + matches[i][1].strip() + ' '
            parsed_data.append((matches[0][0], matches[len(matches)-1][0], lyrics_line.strip()))

    line_segments = [(timestamp_to_seconds(ts1), timestamp_to_seconds(ts2), txt) for ts1, ts2, txt in parsed_data]

    # Export all the segments
    vocal_path = output_path + os.sep + 'vocal' + os.sep + os.path.splitext(file_names_songs_idx[idx])[0] + "_vocal.wav"
    audio = AudioSegment.from_wav(vocal_path)
    audio_duration = len(audio) / 1000.0

    for idy, (start, end, line) in enumerate(line_segments):
        segment_audio = audio[start*1000 : end*1000]  # pydub uses milliseconds
        segment_filename = output_path + os.sep + 'splits' + os.sep + os.path.splitext(file_names_songs_idx[idx])[0] + f"_vocal_{idy:05d}.wav"
        files.append(segment_filename)
        lines.append(line)
        segment_audio.export(segment_filename, format="wav")



Taylor Swift - Blank Space.lrc
Evanescence - Bring Me To Life.lrc
Christina Perri - A Thousand Years.lrc
Meghan Trainor - Me Too.lrc
Selena Gomez - Come & Get It.lrc
Linkin Park - In The End.lrc
Carly Rae Jepsen - Call Me Maybe.lrc
Zara Larsson - Lush Life.lrc
Beyoncé - Love On Top.lrc
Taylor Swift - I Knew You Were Trouble.lrc
Eminem - Like Toy Soldiers.lrc
One Direction - History.lrc
Bastille - Pompeii.lrc
Rihanna - Don't Stop The Music (Jody den Broeder).lrc
Mariah Carey - We Belong Together.lrc
Katy Perry - This Is How We Do.lrc
One Direction - What Makes You Beautiful.lrc
Little Mix - Black Magic.lrc
Israel Kamakawiwo Ole - Over The Rainbow.lrc
AC⁄DC - Back In Black.lrc
French Montana - Feeling Myself.lrc
Avril Lavigne - What The Hell.lrc
Britney Spears - I Wanna Go.lrc
Tove Lo - Habits (Stay High) (Oliver Nelson Remix).lrc
Sean Kingston - Beautiful Girls.lrc
Justin Bieber - Sorry.lrc
Justin Timberlake - Mirrors.lrc
Post Malone - White Iverson (Clean).lrc
Shawn Mendes - Stitches.

In [10]:
import csv

with open(output_path + os.sep + "metadata.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # Write header
    writer.writerow(["file_name", "text"])
    # Write each segment's data
    for i in range(len(files)):
        writer.writerow([files[i], lines[i]])