In [16]:
import os
import jams
import shutil
from tqdm import tqdm

SOURCE_DIR = './data/jams_all'
TARGET_DIR = './data/jams'

errored_files = []   # paths that failed to load, inspect, or copy
copied_files = []    # paths whose sandbox type is 'audio' and that were copied
skipped_files = []   # paths that loaded fine but whose sandbox type is not 'audio'
error_reasons = {}   # path -> repr of the exception, so failures can be diagnosed later

# Make sure the destination is a directory. If it is missing, shutil.copy2 would
# treat TARGET_DIR as a *file name* and every copy would overwrite that one file.
os.makedirs(TARGET_DIR, exist_ok=True)

# Walk every file under SOURCE_DIR, load it as a JAMS object, and copy the ones whose
# jam.sandbox['type'] is 'audio' into TARGET_DIR. All files are processed (no limit).
for root, dirs, files in os.walk(SOURCE_DIR):
    for file in tqdm(files):
        path = os.path.join(root, file)
        try:
            jam = jams.load(path)
            if jam.sandbox['type'] == 'audio':
                shutil.copy2(path, TARGET_DIR)
                copied_files.append(path)
            else:
                skipped_files.append(path)
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C / kernel interrupt still
            # stops the loop; keep the reason instead of silently discarding it.
            errored_files.append(path)
            error_reasons[path] = repr(exc)
                    

100%|██████████| 20086/20086 [01:22<00:00, 242.96it/s]


In [17]:
# Report how many paths ended up in each list filled by the copy loop.
n_copied = len(copied_files)
print('Copied {} files'.format(n_copied))
n_errored = len(errored_files)
print('Errored on {} files'.format(n_errored))

Copied 1853 files
Errored on 18131 files


In [18]:
# Sum the copied and errored lists, then compare against the number of
# entries that os.listdir reports for the source directory.
accounted_for = (copied_files, errored_files)
total_files = sum(len(bucket) for bucket in accounted_for)
print('Total files: {}'.format(total_files))

source_entries = os.listdir('./data/jams_all')
files_in_dir = len(source_entries)
print('Files in dir: {}'.format(files_in_dir))

Total files: 19984
Files in dir: 20086


In [20]:
# Print the paths under ./data/jams_all that appear in neither copied_files nor
# errored_files.
#
# copied_files / errored_files store os.path.join(root, file) paths, so the
# comparison must use the joined path as well, not the bare file name.
# The result list has its own name: reusing `files` would be rebound by the
# os.walk tuple and then appended to while being iterated, which never terminates.
accounted_paths = set(copied_files) | set(errored_files)  # set gives O(1) membership tests

unaccounted_files = []
for root, dirs, filenames in os.walk('./data/jams_all'):
    for filename in filenames:
        path = os.path.join(root, filename)
        if path not in accounted_paths:
            unaccounted_files.append(path)

print(unaccounted_files)

In [7]:
# Header line first, then every errored path on its own line.
print('Errored files:')
if errored_files:
    # errored_files holds path strings, so a single joined print emits one path per line.
    print('\n'.join(errored_files))

Errored files:
./data/jams_all/real-book_1299.jams
./data/jams_all/wikifonia_5485.jams
./data/jams_all/biab-internet-corpus_960.jams
./data/jams_all/wikifonia_2340.jams
./data/jams_all/isophonics_222.jams
./data/jams_all/real-book_903.jams
./data/jams_all/nottingham_507.jams
./data/jams_all/biab-internet-corpus_2328.jams
./data/jams_all/weimar_228.jams
./data/jams_all/wikifonia_1995.jams
./data/jams_all/real-book_1763.jams
./data/jams_all/weimar_382.jams
./data/jams_all/ireal-pro_1091.jams
./data/jams_all/wikifonia_3802.jams
./data/jams_all/wikifonia_2710.jams
./data/jams_all/biab-internet-corpus_2282.jams
./data/jams_all/nottingham_157.jams
./data/jams_all/biab-internet-corpus_2778.jams
./data/jams_all/biab-internet-corpus_1857.jams
./data/jams_all/real-book_1333.jams
./data/jams_all/rock-corpus_115.jams
./data/jams_all/wikifonia_3551.jams
./data/jams_all/wikifonia_4294.jams
./data/jams_all/when-in-rome_311.jams
./data/jams_all/biab-internet-corpus_4706.jams
./data/jams_all/schubert-w

In [4]:
# Check whether one specific file was recorded in errored_files.
probe_path = './data/jams_all/isophonics_170.jams'
print(probe_path in errored_files)

True


In [2]:
# Count the entries that os.listdir reports for ./data/jams
import os
jams_dir_entries = os.listdir('./data/jams')
print(len(jams_dir_entries))

1853


In [80]:
import jams
import pandas as pd

JAMS_ALL_DIR = "./data/jams_all"

tracks = []            # one metadata record (dict) per JAMS file that loads
unreadable_files = []  # file names that jams.load could not read, kept for inspection

# Load every entry of JAMS_ALL_DIR and flatten the metadata of interest into a record.
for file in os.listdir(JAMS_ALL_DIR):
    try:
        jam = jams.load(os.path.join(JAMS_ALL_DIR, file))
    except Exception:
        # Skip files that fail to load, but remember them instead of dropping them
        # silently; catching Exception (not a bare except) keeps Ctrl-C working.
        unreadable_files.append(file)
        continue

    metadata = jam.file_metadata
    identifiers = dict(metadata.identifiers)

    # The corpus name is read from the first annotation; a file without any
    # annotation gets None rather than aborting the whole loop with IndexError.
    annotations = jam['annotations']
    if len(annotations) > 0:
        corpus = annotations[0]['annotation_metadata']['corpus']
    else:
        corpus = None

    tracks.append(
        {
            "title": metadata.title,
            "artist": metadata.artist,
            "release": metadata.release,
            "duration": metadata.duration,
            "corpus": corpus,
            # Identifier entries that are absent from a file become None.
            "musicbrainz": identifiers.get("musicbrainz", None),
            "youtube": identifiers.get("youtube", None),
            "dataid_billboard": identifiers.get("dataid_billboard", None),
        }
    )

# Build the table in one go from the list of records.
df = pd.DataFrame(tracks)

In [81]:
# Count how many rows of df fall under each value of the 'corpus' column.
df['corpus'].value_counts()

corpus
McGill Billboard                           889
isophonics                                 225
Schubert Winterreise Dataset               216
Uspop 2002                                 195
Real World Computing Music Database        100
Audio-aligned jazz harmony dataset          89
Real Book                                   62
Robbie Williams dataset                     61
Chordify Annotator Subjectivity Dataset     50
BiaB Internet Corpus                        40
Weimar Jazz Database                        28
Name: count, dtype: int64

In [79]:
# Count how many rows of df fall under each value of the 'corpus' column.
# NOTE(review): same expression as the In [81] cell, with a lower execution count;
# the output below presumably reflects an earlier state of df — confirm before relying on it.
df['corpus'].value_counts()

corpus
McGill Billboard                           889
isophonics                                 225
Schubert Winterreise Dataset               216
Uspop 2002                                 195
Real World Computing Music Database        100
Audio-aligned jazz harmony dataset          89
Robbie Williams dataset                     61
Chordify Annotator Subjectivity Dataset     50
Weimar Jazz Database                        28
Name: count, dtype: int64

In [47]:
# Gather every key found in any track record's 'identifiers' mapping.
# NOTE(review): the loop that builds `tracks` in this notebook stores flattened
# fields (musicbrainz, youtube, dataid_billboard) and no 'identifiers' entry, so
# this lookup presumably relies on an older shape of `tracks` — confirm before re-running.
all_keys = set()

for track in tracks:
    all_keys.update(track['identifiers'].keys())

print(all_keys)

{'youtube', 'musicbrainz', 'dataid_billboard'}


In [75]:
# Load only the first entry that os.listdir returns for ./data/jams and print
# the corpus recorded in its first annotation's metadata.
for file in os.listdir("./data/jams")[:1]:
    jam = jams.load("./data/jams/" + file)
    first_annotation = jam['annotations'][0]
    print(first_annotation['annotation_metadata']['corpus'])

isophonics


In [55]:
# Show the first record in the tracks list to inspect its fields.
print(tracks[0])

{'title': 'Eleanor Rigby', 'artist': '', 'release': 'Revolver', 'duration': 127.667, 'musicbrainz': None, 'youtube': None, 'dataid_billboard': None}


In [54]:
# Preview the first rows of df.
df.head()

Unnamed: 0,title,artist,release,duration,musicbrainz,youtube,dataid_billboard
0,Eleanor Rigby,,Revolver,127.667,,,
1,Gefrorne Tränen,,,140.97,https://musicbrainz.org/release/f10034f7-5ebc-...,,
2,Freeze-Frame,,,239.830204,,,
3,Bye Bye Bye,,No Strings Attached,200.0,,,
4,Baby I'm Burnin',,,158.484898,,,


In [58]:
# Write the complete table to disk, leaving out the index column.
metadata_csv = './data/tracks_with_metadata.csv'
df.to_csv(metadata_csv, index=False)

# Write a slimmer file that keeps only the title, artist and release columns.
basic_columns = ['title', 'artist', 'release']
df.loc[:, basic_columns].to_csv('./data/tracks.csv', index=False)