In [1]:
# Libararies
import torch
import pandas as pd
import os
import librosa
import soundfile
from pydub import AudioSegment
# Extract Vocals from Audio File
from demucs.pretrained import get_model
from demucs.apply import apply_model
from demucs.separate import load_track

In [2]:
dataset_path = '/home/kangyi/jamendolyrics/'
lang = "en-US"

PRE_PROCESSED = True
ORIGINAL_SR = 44100
TARGET_SR = 16000

# Dataset

JamendoLyrics MultiLang dataset for lyrics research
- Similarity-based Audio-Lyrics Alignment of Multiple Languages ICASSP 2023
- https://github.com/f90/jamendolyrics

In [3]:
# Load Dataset
# Load the CSV file into a DataFrame
df = pd.read_csv(dataset_path + 'JamendoLyrics.csv')

if lang == "en-US":
    df = df[df['Language'] == "English"]
# Display the first few rows
df

Unnamed: 0,URL,Filepath,Artist,Title,Genre,LicenseType,Language,LyricOverlap,Polyphonic,NonLexical
0,https://www.jamendo.com/track/1559261/give-me-...,HILA_-_Give_Me_the_Same.mp3,HILA,Give Me The Same,Pop,BY-ND,English,False,False,False
1,https://www.jamendo.com/track/1552064/keep-on,Quentin_Hannappe_-_Keep_On.mp3,Quentin Hannappe,Keep On,Pop,BY-NC-ND,English,False,False,False
2,https://www.jamendo.com/track/1537288/back-in-...,Songwriterz_-_Back_In_Time.mp3,Songwriterz,Back In Time,Pop,BY-ND,English,False,False,True
3,https://www.jamendo.com/track/1442030/peyote,Kinematic_-_Peyote.mp3,KINEMATIC,Peyote,Rock,BY-ND,English,False,True,False
4,https://www.jamendo.com/track/1465148/embers,Avercage_-_Embers.mp3,Avercage,Embers,Rock,BY-NC-SA,English,False,False,False
5,https://www.jamendo.com/track/1344500/falling-...,Color_Out_-_Falling_Star.mp3,Colour Out,Falling Star,Rock,BY-NC-ND,English,False,False,False
6,https://www.jamendo.com/track/1530532/one-way-...,The.madpix.project_-_One_Way_Street.mp3,The.Madpix.Project,One Way Street,Electronic,BY-NC-SA,English,False,False,False
7,https://www.jamendo.com/track/1468287/the-stat...,Wordsmith_-_The_Statement.mp3,Wordsmith,The statement,Hip-Hop,BY-ND,English,False,True,False
8,https://www.jamendo.com/track/1144143/crowd-pl...,JASON_MILLER_-_CROWD_PLEASER.mp3,Jason Miller,Crowdpleaser,Hip-Hop,BY-NC-SA,English,False,False,False
9,https://www.jamendo.com/track/1559614/is-it-right,Lower_Loveday_-_Is_It_Right_.mp3,Lower Loveday,Is It Right?,Indie,BY-SA,English,False,False,False


# Helper Functions

In [4]:
# Return pytorch device
def select_device(mps_enable = True):
    if torch.cuda.is_available():
        return torch.device('cuda')     # CUDA
    elif torch.backends.mps.is_available() & mps_enable: 
        return torch.device('mps')    # Apple Sillicon
    else:
        return torch.device("cpu")    # CPU

# Preprocessing

In [5]:
# Preprocess audio file if not processed
if not PRE_PROCESSED:
    # a directory to save the output
    if not os.path.exists('output'):
        os.makedirs('output')
        os.makedirs('output' + os.sep + 'wav')
        os.makedirs('output' + os.sep + 'vocal')


    for song_path in df['Filepath']:
        song_path_abs = dataset_path + 'mp3' + os.sep + song_path
        wav_path = 'output' + os.sep + 'wav' + os.sep + os.path.splitext(song_path)[0] + ".wav"
        vocal_path = 'output' + os.sep + 'vocal' + os.sep + os.path.splitext(song_path)[0] + "_vocal.wav"
        
        # Load audio
        song_audio = AudioSegment.from_mp3(song_path_abs)

        # Convert audio to wav
        song_audio.export(wav_path, format="wav")
        print(f"Converted {song_path} to {wav_path}")

        # Extract Vocals
        # Select Device
        device = select_device()
        print("Device Selected:", device)

        # Choose Demucs Model for Vocals Extraction
        demucs_model = get_model(name="htdemucs", repo=None)
        demucs_model.to(device)
        demucs_model.eval()
        vocals_source_idx = demucs_model.sources.index("vocals")
        sample_rate = demucs_model.samplerate

        # Load Aduio Track
        audio_track = load_track(wav_path, 2, sample_rate)

        # Extract Vocal
        ref = audio_track.mean(0)
        audio_track_nor = (audio_track - ref.mean()) / ref.std() # Normalization
        with torch.no_grad():
            sources = apply_model(demucs_model, audio_track_nor[None], device=device, shifts=1, split=True, overlap=0.25, progress=False)
        track_vocal = sources[0][vocals_source_idx].cpu().numpy()[0, ...]

        # Post-processing
        track_vocal = librosa.resample(track_vocal, orig_sr=ORIGINAL_SR, target_sr=TARGET_SR)

        # Write to Output
        soundfile.write(vocal_path, track_vocal, TARGET_SR)
        print(f"Converted {wav_path} to {vocal_path}")

In [6]:
# Get all the paths
songs_paths = []
wav_paths = []
vocals_paths = []
lyrics_paths = []
lyrics_words_paths = []
lyrics_words_results_paths = []

for song_path in df['Filepath']:
    songs_paths.append(dataset_path + 'mp3' + os.sep + song_path)
    wav_paths.append('output' + os.sep + 'wav' + os.sep + os.path.splitext(song_path)[0] + ".wav")
    vocals_paths.append('output' + os.sep + 'vocal' + os.sep + os.path.splitext(song_path)[0] + "_vocal.wav")
    lyrics_paths.append(dataset_path + 'lyrics' + os.sep + os.path.splitext(song_path)[0] + ".txt")
    lyrics_words_paths.append(dataset_path + 'lyrics' + os.sep + os.path.splitext(song_path)[0] + ".words.txt")
    lyrics_words_results_paths.append(dataset_path + 'annotations' + os.sep + 'words' + os.sep + os.path.splitext(song_path)[0] + ".csv")

# Evaluation on ASR Model

In [7]:
# Evaluation
from ARS_base import ASR_sync
from Lyrics import Lyrics_WS
from MIREXEvaluate import MIREXevalute
import statistics
from transformers import logging
logging.set_verbosity_error()

AAE = []
AAM = []
PCS = []
PCE = []
songs_num = len(song_path)
for i in range(20):
    synced_lyrics = ASR_sync(vocals_paths[i], lyrics_words_paths[i])
    synced_lyrics_ref = Lyrics_WS()
    df_temp = pd.read_csv(lyrics_words_results_paths[i])
    for index, row in df_temp.iterrows():
        synced_lyrics_ref.add_segment(label=synced_lyrics.lyrics[index].label, start=row['word_start'] , end=row['word_end'])

    AAE_temp, AAM_temp, PCS_temp, PCE_temp = MIREXevalute(synced_lyrics.lyrics, synced_lyrics_ref.lyrics)
    print('[' + str(i) + ']' + ': ' + str(AAE_temp) + ', ' + str(AAM_temp) + ', ' + str(PCS_temp) + ', ' + str(PCE_temp))
    AAE.append(AAE_temp), AAM.append(AAM_temp), PCS.append(PCS_temp), PCE.append(PCE_temp)



  from .autonotebook import tqdm as notebook_tqdm


[0]: 0.2662686564520169, 0.121456274949999, 0.5190048018807284, 0.8618012422360248
[1]: 0.27834355177457076, 0.0961135292499975, 0.31041739950800634, 0.9171428571428571
[2]: 0.34201555042205745, 0.0962103404499981, 0.5675368085879693, 0.8949579831932774
[3]: 0.1994144376768703, 0.07492778894999752, 0.4973082131338369, 0.9183673469387755
[4]: 0.6609603728735448, 0.12718223510000115, 0.4646336349905964, 0.7116402116402116
[5]: 0.5698693264796868, 0.06982785669999902, 0.49175207825451644, 0.8638392857142857
[6]: 1.7291134845877036, 0.08696105524999354, 0.33311511733554844, 0.8497267759562842
[7]: 0.1091301077806352, 0.08536061885000024, 0.4913318678721988, 0.9741824440619621
[8]: 0.11912554256900033, 0.10925520944999789, 0.46185507160503714, 0.9846449136276392
[9]: 0.21365569243679114, 0.09968076834999096, 0.5093700432660321, 0.9198113207547169
[10]: 4.248625191272806, 0.1438569229999942, 0.3374584678880772, 0.6622807017543859
[11]: 0.8968664532421046, 0.1281264054499971, 0.30083909155061

In [8]:
print('Results' + ': ' + str(statistics.mean(AAE)) + ', ' + str(statistics.mean(AAM)) + ', ' + str(statistics.mean(PCS)) + ', ' + str(statistics.mean(PCE)))
songs_num

Results: 0.6848422197282376, 0.10969465147499977, 0.45872478317200516, 0.8715226161298093


28

# Evaluation on Massively Multilingual Speech (MMS) Model
- torchaudio.pipelines.MMS_FA

In [None]:
# Evaluation
from MMS_pytorch_base import MMS_pytorch_sync
from Lyrics import Lyrics_WS
from MIREXEvaluate import MIREXevalute
import statistics


AAE = []
AAM = []
PCS = []
PCE = []
songs_num = len(song_path)
for i in range(20):
    synced_lyrics = MMS_pytorch_sync(vocals_paths[i], lyrics_words_paths[i])
    synced_lyrics_ref = Lyrics_WS()
    df_temp = pd.read_csv(lyrics_words_results_paths[i])
    for index, row in df_temp.iterrows():
        synced_lyrics_ref.add_segment(label=synced_lyrics.lyrics[index].label, start=row['word_start'] , end=row['word_end'])
    AAE_temp, AAM_temp, PCS_temp, PCE_temp = MIREXevalute(synced_lyrics.lyrics, synced_lyrics_ref.lyrics)
    print('[' + str(i) + ']' + ': ' + str(AAE_temp) + ', ' + str(AAM_temp) + ', ' + str(PCS_temp) + ', ' + str(PCE_temp))
    AAE.append(AAE_temp), AAM.append(AAM_temp), PCS.append(PCS_temp), PCE.append(PCE_temp)



In [None]:
print('Results' + ': ' + str(statistics.mean(AAE)) + ', ' + str(statistics.mean(AAM)) + ', ' + str(statistics.mean(PCS)) + ', ' + str(statistics.mean(PCE)))

Results: 1.4599451146740066, 0.2973756713449999, 0.36791806113769115, 0.7671859069892095
