# SSCS dataset conversion to HDF5

In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
import zipfile
import h5py
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
import psutil
import ray

# Use the number of physical cores (logical=False) as the CPU count given to Ray.
num_cpus = psutil.cpu_count(logical=False)
# CPU-only Ray instance; ignore_reinit_error=True lets this cell be re-run in the same kernel.
ray.init(num_cpus=num_cpus, num_gpus=0, ignore_reinit_error=True)

2023-05-29 22:46:26,353	INFO worker.py:1625 -- Started a local Ray instance.


0,1
Python version:,3.10.11
Ray version:,2.4.0


In [3]:
# Zip archive containing the dataset CSV files (members are read under the "sscs/" prefix).
zipname = "Datasets/SynthSalienceChoralSet_v1.zip"
# Output directory for the split-metadata JSON and the converted HDF5 files.
h5_pathname = "Datasets/HDF5/"

In [4]:
def name_conformity(name):
    r"""Return ``name`` with characters that are unsafe in file names removed.

    The following characters are stripped (every occurrence):
    ~ " # % & * : < > ? / \ { | }

    The pattern is a raw string so that ``\\`` reaches the regex engine as an
    escaped backslash. With a non-raw literal the backslash was consumed as an
    escape for ``{`` and literal backslashes were left in the name.

    Parameters
    ----------
    name : str
        Original song name.

    Returns
    -------
    str
        The name without any of the characters listed above.
    """
    return re.sub(r'[~"#%&*:<>?/\\{|}]', "", name)

In [5]:
def sscs_get_split(split='train'):
    """Load one split from the dataset split JSON file.

    Parameters
    ----------
    split : str, optional
        'train', 'validate' or 'test' (case-insensitive). Defaults to 'train'.

    Returns
    -------
    The value stored in the JSON file under the lower-cased split name.

    Raises
    ------
    NameError
        If ``split`` is not one of the three accepted names.
    """
    splitname = "Datasets/SynthSalienceChoralSet_dataSplits.json"
    key = split.lower()
    if key not in ('train', 'validate', 'test'):
        raise NameError("Split should be 'train', 'validate' or 'test'.")
    # Context manager so the file handle is closed as soon as the JSON is parsed.
    with open(splitname, 'r') as split_file:
        return json.load(split_file)[key]


In [6]:
def checkIntegrity(songlist, zip_path=None):
    """Return the songs whose five CSV files are all present in the zip archive.

    A song is kept only when ``sscs/<song>_mix.csv``, ``_S.csv``, ``_A.csv``,
    ``_T.csv`` and ``_B.csv`` are all members of the archive. The number of
    dropped songs is printed.

    The input list is not modified. The earlier version removed items from
    the very list it was iterating over, which skipped the element following
    every removed song and left unchecked songs in the result.

    Parameters
    ----------
    songlist : list of str
        Song names to check.
    zip_path : str, optional
        Archive to inspect. Defaults to the module-level ``zipname``.

    Returns
    -------
    list of str
        New list with the songs that passed the check, in the original order.
    """
    if zip_path is None:
        zip_path = zipname

    with zipfile.ZipFile(zip_path, "r") as zf:
        # A set makes each membership test constant-time instead of a list scan.
        members = set(zf.namelist())

    suffixes = ("_mix.csv", "_S.csv", "_A.csv", "_T.csv", "_B.csv")
    filtered_list = [
        song for song in songlist
        if all("sscs/" + song + suffix in members for suffix in suffixes)
    ]
    count = len(songlist) - len(filtered_list)

    print(f"{count} songs not present and removed from scanlist.")
    return filtered_list

In [7]:
# Keep, for each split, only the songs that pass the archive integrity check.
train = checkIntegrity(sscs_get_split())
validate = checkIntegrity(sscs_get_split('validate'))
test = checkIntegrity(sscs_get_split('test'))

# Sanitised names, used for the HDF5 file names and stored in the metadata.
train_conformity = [name_conformity(name) for name in train]
validate_conformity = [name_conformity(name) for name in validate]
test_conformity = [name_conformity(name) for name in test]

hdf5_metadata = {
    'train': train_conformity,
    'validate': validate_conformity,
    'test': test_conformity,
}

# Create the output directory first so the write below does not fail with
# FileNotFoundError when the directory is missing.
os.makedirs(h5_pathname, exist_ok=True)
metadata_filename = h5_pathname + "SynthSalienceChoralSet_hdf5_dataSplits.json"
with open(metadata_filename, "w") as metadata_file:
    json.dump(hdf5_metadata, metadata_file, indent=4)

12 songs not present and removed from scanlist.
2 songs not present and removed from scanlist.
3 songs not present and removed from scanlist.


In [8]:
# Value passed as Ray's `memory` option for each conversion task: 2 * 1024**3 (2 GiB in bytes).
MAX_MEM = 2 * 1024 * 1024 * 1024

def csv_to_df(songname):
    """Read one CSV member of the dataset archive and return it transposed.

    Parameters
    ----------
    songname : str
        Member name relative to the ``sscs/`` folder inside ``zipname``.

    Returns
    -------
    pandas.DataFrame
        The header-less CSV content, transposed.
    """
    member = "sscs/" + songname
    with zipfile.ZipFile(zipname) as archive, archive.open(member) as handle:
        frame = pd.read_csv(handle, header=None, engine='pyarrow')
    return frame.T
    
def df_to_hdf5(df, savename, keyname):
    """Store ``df`` in the HDF5 file ``savename`` under the key ``keyname``.

    The file is opened in append mode, so several keys can be written to the
    same file by successive calls. Data is stored in 'table' format with
    blosc compression at level 9.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to store.
    savename : str
        Path of the HDF5 file.
    keyname : str
        Key (group name) inside the HDF5 file.
    """
    # `key` is passed by keyword: positional use is deprecated in recent
    # pandas releases, where everything after the path becomes keyword-only.
    df.to_hdf(savename, key=keyname, mode='a',
              format='table', complevel=9, complib='blosc')

@ray.remote
def csv_to_hdf5(songname, songname_conformity, split='train'):
    """Convert the five CSV files of one song into a single HDF5 file.

    The mix, soprano, alto, tenor and bass CSVs are read from the archive and
    written to ``<h5_pathname>Files/<songname_conformity>.h5`` under the keys
    'mix', 'soprano', 'alto', 'tenor' and 'bass'. This is a best-effort
    conversion: if any step fails, the error is reported, the partially
    written file is deleted and the task returns normally.

    Parameters
    ----------
    songname : str
        Song name as used for the CSV members in the archive.
    songname_conformity : str
        Sanitised song name used for the output file name.
    split : str, optional
        Currently unused; kept for interface compatibility.
    """
    sources = (
        ("_mix.csv", 'mix'),
        ("_S.csv", 'soprano'),
        ("_A.csv", 'alto'),
        ("_T.csv", 'tenor'),
        ("_B.csv", 'bass'),
    )
    output_dir = h5_pathname + "Files/"
    savename = output_dir + songname_conformity + ".h5"
    try:
        # exist_ok=True makes this safe when several tasks create it at once.
        os.makedirs(output_dir, exist_ok=True)
        for suffix, keyname in sources:
            df = csv_to_df(songname + suffix)
            df_to_hdf5(df, savename, keyname)
    except Exception as exc:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate; report the cause instead of hiding it.
        print(f"Problematic file: {songname} ({type(exc).__name__}: {exc})")
        # Do not leave a partially written file behind.
        if os.path.exists(savename):
            os.remove(savename)

In [9]:
def sscs_plot(dataframe):
    """Display ``dataframe`` as an image with the 'BuPu' colormap.

    The aspect ratio is derived from the data shape, and the y axis is
    inverted after drawing.

    Parameters
    ----------
    dataframe : 2-D array-like with a ``shape`` attribute
        Data to display.
    """
    n_rows = dataframe.shape[0]
    n_cols = dataframe.shape[1]
    aspect_ratio = (3/8)*n_cols/n_rows

    _, axes = plt.subplots(figsize=(13, 7))
    axes.imshow(
        dataframe,
        interpolation='nearest',
        aspect=aspect_ratio,
        cmap=mpl.colormaps['BuPu'],
    )
    axes.invert_yaxis()
    plt.show()

In [10]:
# Disabled: the train-split conversion is kept inside a string literal, so this
# cell only echoes the text and does not launch any Ray task.
'''
conv_train = [csv_to_hdf5.options(memory=MAX_MEM).remote(train[i],
    train_conformity[i]) for i in range(len(train))]

ray.get(conv_train)
'''

'\nconv_train = [csv_to_hdf5.options(memory=MAX_MEM).remote(train[i],\n    train_conformity[i]) for i in range(len(train))]\n\nray.get(conv_train)\n'

In [11]:
# Launch one Ray task per validation song, each with the MAX_MEM memory option.
conv_val = [
    csv_to_hdf5.options(memory=MAX_MEM).remote(song, validate_conformity[idx])
    for idx, song in enumerate(validate)
]

# Block until every conversion task has finished.
conv_val_get = ray.get(conv_val)
print("Done.")

[2m[36m(csv_to_hdf5 pid=4692)[0m Problematic file:
[2m[36m(csv_to_hdf5 pid=4692)[0m Non ti contristi (Benedetto Marcello)
[2m[36m(csv_to_hdf5 pid=4692)[0m 
[2m[36m(csv_to_hdf5 pid=4692)[0m 
[2m[36m(csv_to_hdf5 pid=14424)[0m Gloria, RV 589 (Antonio Vivaldi)
[2m[36m(csv_to_hdf5 pid=14424)[0m 
[2m[36m(csv_to_hdf5 pid=14424)[0m [32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[2m[36m(csv_to_hdf5 pid=4692)[0m Problematic file:
[2m[36m(csv_to_hdf5 pid=4692)[0m Missa Ancor che col partire (Philippe de Monte)
[2m[36m(csv_to_hdf5 pid=4692)[0m [32m [repeated 2x across cluster][0m
[2m[36m(csv_to_hdf5 pid=12744)[0m Missa Iste Confessor (Giovanni Pierluigi da Palestrina)
[2m[36m(csv_to_hdf5 pid=12744)[0m 
[2m[36m(csv_to_hdf5 pid=12744)[0m [32m [repeated 2x across cluster][0m

KeyboardInterrupt: 

In [None]:
# Disabled: the test-split conversion is kept inside a string literal, so this
# cell only echoes the text and does not launch any Ray task.
'''
conv_test = [csv_to_hdf5.options(memory=MAX_MEM).remote(test[i],
    test_conformity[i]) for i in range(len(test))]

ray.get(conv_test)
'''