# Filter and Rename

This notebook removes duplicate songs from the dataset. Two files are treated as duplicates when they share the same combination of artist and title in the file metadata of their JAMS annotation.

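The remaining files are then copied under the new name '{artist}_{title}', where artist and title are lowercased and have spaces, slashes and periods removed. If no metadata is available (~150 files), the original obfuscated file name is kept.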

The notebook currently performs this operation on the audio (`.mp3`) and chord annotation (`.jams`) files, copying them from `data/raw/` into `data/processed/`.

In [1]:
import autorootcwd
import jams
import os

with open('./data/raw/audio/filelist.txt', 'r') as f:
    # The file list holds one path per line; keep only each path's base name
    # with the directory and extension removed.
    filenames = [
        os.path.splitext(os.path.basename(listed_path))[0]
        for listed_path in f.read().splitlines()
    ]

In [2]:
# Look up the JAMS annotation for one file and derive its cleaned-up name.
def get_metadata(filename):
    """Read artist, title and duration for `filename` and build its new name.

    Loads ``./data/raw/references_v2/<filename>.jams`` and normalises the
    artist and title found in its file metadata: lowercased, stripped of
    surrounding whitespace, and with spaces, slashes and periods removed.

    Returns a dict with the keys ``new_filename``, ``artist``, ``title``,
    ``filename`` and ``duration``. ``new_filename`` is ``'{artist}_{title}'``,
    or the unchanged ``filename`` when both artist and title are empty after
    normalisation.
    """
    def _normalise(text):
        # Lowercase and trim first, then drop the characters we do not want
        # to end up in a file name.
        cleaned = text.lower().strip()
        for unwanted in (' ', '/', '.'):
            cleaned = cleaned.replace(unwanted, '')
        return cleaned

    jams_path = './data/raw/references_v2/' + filename + '.jams'
    file_meta = jams.load(jams_path).file_metadata

    artist = _normalise(file_meta.artist)
    title = _normalise(file_meta.title)

    # Fall back to the original name only when there is nothing to build from.
    new_filename = f'{artist}_{title}' if (artist or title) else filename

    return {
        'new_filename': new_filename,
        'artist': artist,
        'title': title,
        'filename': filename,
        'duration': file_meta.duration,
    }

# One metadata dict per file, in the same order as `filenames`
metadata = list(map(get_metadata, filenames))

In [3]:
# Keep only the first instance of each new_filename.
# A dict comprehension would keep the *last* instance, because later entries
# overwrite earlier ones with the same key; setdefault only stores a key the
# first time it is seen. The result gets its own name so that `metadata`
# stays a list and this cell can be re-run without restarting the kernel.
unique_metadata = {}
for m in metadata:
    unique_metadata.setdefault(m['new_filename'], m)

# Create new dict of old_filename -> new_filename
filename_map = {m['filename']: m['new_filename'] for m in unique_metadata.values()}

In [12]:
# Copy files to new directory
import shutil
from tqdm import tqdm

def rename_copy_files(old_dir: str, new_dir: str, extension: str) -> list:
    """Copy every file listed in ``filename_map`` into ``new_dir`` under its new name.

    For each ``old_filename -> new_filename`` pair in the module-level
    ``filename_map``, copies ``{old_dir}/{old_filename}.{extension}`` to
    ``{new_dir}/{new_filename}.{extension}``.

    Parameters
    ----------
    old_dir : str
        Directory holding the source files.
    new_dir : str
        Destination directory; created if it does not exist yet.
    extension : str
        File extension without the leading dot (e.g. ``'mp3'``).

    Returns
    -------
    list
        The old filenames (without extension) whose copy failed. Failures are
        printed and collected rather than raised, so one bad file does not
        abort the whole run.
    """
    # Without the destination directory every single copy would fail.
    os.makedirs(new_dir, exist_ok=True)

    errored_files = []
    for old_filename, new_filename in tqdm(filename_map.items()):
        source_path = f'{old_dir}/{old_filename}.{extension}'
        target_path = f'{new_dir}/{new_filename}.{extension}'
        try:
            shutil.copy(source_path, target_path)
        except Exception as e:
            # Best effort: report the problem and carry on with the next file.
            print(e)
            errored_files.append(old_filename)

    return errored_files

In [13]:
# Copy the audio files under their new names
errored_files = rename_copy_files(
    old_dir='./data/raw/audio',
    new_dir='./data/processed/audio',
    extension='mp3',
)

100%|██████████| 1213/1213 [00:11<00:00, 102.02it/s]


In [14]:
# Copy the JAMS chord annotations under their new names
errored_chords = rename_copy_files(
    old_dir='./data/raw/references_v2',
    new_dir='./data/processed/chords',
    extension='jams',
)

100%|██████████| 1213/1213 [00:00<00:00, 1399.36it/s]


In [15]:
# Number of failed copies: (audio, chords)
tuple(len(failed) for failed in (errored_files, errored_chords))

(0, 0)