# SSCS dataset conversion to HDF5

In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
import zipfile
import h5py
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
import psutil
import ray

# One Ray CPU slot per physical core (logical=False excludes hyper-threads);
# no GPUs are requested for this conversion.
num_cpus = psutil.cpu_count(logical=False)

# ignore_reinit_error=True lets this cell be re-run while Ray is already up.
ray.init(
    ignore_reinit_error=True,
    num_gpus=0,
    num_cpus=num_cpus,
)

2023-05-31 21:58:00,589	INFO worker.py:1625 -- Started a local Ray instance.


0,1
Python version:,3.10.11
Ray version:,2.4.0


In [3]:
# Zip archive that the functions below open to list and read the song CSVs.
zipname = "Datasets/SynthSalienceChoralSet_v1.zip"
# Output location prefix. It is concatenated directly with file and folder
# names elsewhere in this notebook, so the trailing slash must be kept.
h5_pathname = "Datasets/HDF5/"

In [4]:
def name_conformity(name):
    r"""Return ``name`` with characters that are unsafe in file names removed.

    The characters stripped are: ``~ " # % & * : < > ? / \ { | }``.

    The pattern is written as a raw string so that the backslash is part of
    the character class. With a non-raw ``"\\{"`` the regex engine sees
    ``\{`` (an escaped brace) and a literal backslash is never removed.

    Parameters
    ----------
    name : str
        Original song name.

    Returns
    -------
    str
        ``name`` without any of the characters listed above.
    """
    return re.sub(r'[~"#%&*:<>?/\\{|}]', "", name)

In [5]:
def sscs_get_split(split='train',
                   splitname="Datasets/SynthSalienceChoralSet_dataSplits.json"):
    """Return the list of song names stored for one dataset split.

    Parameters
    ----------
    split : str
        ``'train'``, ``'validate'`` or ``'test'`` (case-insensitive).
    splitname : str
        Path of the JSON file holding the splits. It is expected to map each
        lower-case split name to its list of songs.

    Returns
    -------
    The value stored under the lower-cased split name in the JSON file.

    Raises
    ------
    NameError
        If ``split`` is not one of the three accepted names. The check is
        done before the file is opened.
    """
    key = split.lower()
    if key not in ('train', 'validate', 'test'):
        raise NameError("Split should be 'train', 'validate' or 'test'.")

    # Context manager so the file handle is closed deterministically instead
    # of being left to the garbage collector.
    with open(splitname, 'r') as split_file:
        return json.load(split_file)[key]


In [6]:
def checkIntegrity(songlist, zip_path=None, problem_log=None):
    """Keep only the songs whose five CSV files are all present in the archive.

    A song ``X`` is kept when ``sscs/X_mix.csv``, ``sscs/X_S.csv``,
    ``sscs/X_A.csv``, ``sscs/X_T.csv`` and ``sscs/X_B.csv`` are all members
    of the zip file. Every other song is appended (one per line) to the
    problem log.

    Parameters
    ----------
    songlist : list of str
        Song names to check. The list is NOT modified; a new list is
        returned. Mutating it during iteration would make the loop skip the
        song following every removed one.
    zip_path : str, optional
        Archive to inspect. Defaults to the notebook-level ``zipname``.
    problem_log : str, optional
        Text file that receives the missing song names. Defaults to
        ``h5_pathname + "Problematic_Songs.txt"``.

    Returns
    -------
    list of str
        The songs of ``songlist`` that passed the check, in their original
        order.
    """
    if zip_path is None:
        zip_path = zipname
    if problem_log is None:
        problem_log = h5_pathname + "Problematic_Songs.txt"

    with zipfile.ZipFile(zip_path, "r") as zf:
        # A set gives O(1) membership tests; the archive listing is large.
        members = set(zf.namelist())

    suffixes = ("_mix.csv", "_S.csv", "_A.csv", "_T.csv", "_B.csv")
    filtered_list = []
    missing = []
    for song in songlist:
        fname = "sscs/" + song
        if all(fname + suffix in members for suffix in suffixes):
            filtered_list.append(song)
        else:
            missing.append(song)

    if missing:
        # Create the log folder on demand so a fresh checkout does not fail.
        log_dir = os.path.dirname(problem_log)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(problem_log, "a") as f:
            f.writelines(song + "\n" for song in missing)

    print(f"{len(missing)} songs not present and removed from scanlist.")
    return filtered_list

In [24]:
# Per split: load the song names and drop those with CSVs missing in the zip.
train = checkIntegrity(sscs_get_split('train'))
validate = checkIntegrity(sscs_get_split('validate'))
test = checkIntegrity(sscs_get_split('test'))

# File-name-safe counterpart of every surviving song, index-aligned with the
# lists above.
train_conformity = list(map(name_conformity, train))
validate_conformity = list(map(name_conformity, validate))
test_conformity = list(map(name_conformity, test))

12 songs not present and removed from scanlist.
2 songs not present and removed from scanlist.
3 songs not present and removed from scanlist.


In [8]:
# 2 GiB expressed in bytes.
MAX_MEM = 2 * 1024 ** 3

def csv_to_df(songname):
    """Read the five CSV files of one song from the zip archive.

    The members are read in the order mix, S, A, T, B. Each CSV is parsed
    without a header row using the pyarrow engine, transposed, and cast to
    float32.

    Parameters
    ----------
    songname : str
        Song name as it appears inside the archive (without the ``sscs/``
        prefix and the ``_<part>.csv`` suffix).

    Returns
    -------
    list of pandas.DataFrame
        Five frames, in the order listed above.
    """
    suffixes = ("_mix.csv", "_S.csv", "_A.csv", "_T.csv", "_B.csv")
    frames = []
    with zipfile.ZipFile(zipname, mode='r') as archive:
        for suffix in suffixes:
            member = "sscs/" + songname + suffix
            with archive.open(member) as handle:
                table = pd.read_csv(handle, header=None, engine='pyarrow')
                frames.append(table.T.astype('float32'))
    return frames
    
def df_to_hdf5(df, savename, keyname):
    """Append ``df`` to the HDF5 file ``savename`` under the key ``keyname``.

    The data is stored in 'table' format with blosc compression at level 9.
    The file is opened in append mode, so several keys can be written to the
    same file by successive calls.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to store.
    savename : str
        Path of the HDF5 file.
    keyname : str
        Group identifier inside the file.
    """
    # `key` is passed by keyword: pandas has deprecated positional arguments
    # of to_hdf other than the path.
    df.to_hdf(savename, key=keyname, mode='a',
              format='table', complevel=9, complib='blosc')

@ray.remote(max_retries=-1)
def csv_to_hdf5(songname, songname_conformity, split='train'):
    """Convert one song from the zip archive into a single HDF5 file.

    The five frames returned by ``csv_to_df`` are stored under the keys
    'mix', 'soprano', 'alto', 'tenor' and 'bass' in
    ``h5_pathname + "Files/" + songname_conformity + ".h5"``. A song whose
    output file already exists is skipped.

    The data is first written to a ``.part`` file and renamed only once all
    keys are stored. Because the task is retried without limit
    (``max_retries=-1``), a worker dying mid-write would otherwise leave a
    truncated ``.h5`` that the "already exists" check would then skip.

    On failure the song name is appended to ``Problematic_Songs.txt`` and
    any partial output is deleted. The driver's split lists are not touched:
    a Ray task runs in a worker process and would only mutate its own copy.

    Parameters
    ----------
    songname : str
        Song name as stored inside the archive.
    songname_conformity : str
        File-name-safe version of ``songname`` used for the output file.
    split : str
        Not used by the conversion; kept so existing calls stay valid.

    Returns
    -------
    bool
        True if the HDF5 file exists when the task ends, False if the
        conversion failed.
    """
    keynames = ['mix', 'soprano', 'alto',
                'tenor', 'bass']
    out_dir = h5_pathname + "Files/"
    savename = out_dir + songname_conformity + ".h5"
    partial = savename + ".part"

    if os.path.exists(savename):
        return True

    try:
        os.makedirs(out_dir, exist_ok=True)
        # Leftover from an interrupted attempt: start from a clean file
        # because the writer opens it in append mode.
        if os.path.exists(partial):
            os.remove(partial)
        frames = csv_to_df(songname)
        for frame, keyname in zip(frames, keynames):
            df_to_hdf5(frame, partial, keyname)
        os.replace(partial, savename)
        return True
    except Exception:
        # Best effort: record the failure and carry on with other songs.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        with open(h5_pathname + "Problematic_Songs.txt", "a") as f:
            f.write(songname + "\n")
        if os.path.exists(partial):
            os.remove(partial)
        return False

In [9]:
def _convert_split(songs, songs_conformity, split):
    """Launch one conversion task per song of a split and wait for them all.

    Returns a tuple ``(object_refs, results)``.
    """
    pending = [csv_to_hdf5.remote(song, conformed, split)
               for song, conformed in zip(songs, songs_conformity)]
    return pending, ray.get(pending)


def _keep_converted(songs, songs_conformity):
    """Return both lists restricted to songs whose .h5 file exists on disk."""
    kept = [(song, conformed)
            for song, conformed in zip(songs, songs_conformity)
            if os.path.exists(h5_pathname + "Files/" + conformed + ".h5")]
    return [song for song, _ in kept], [conformed for _, conformed in kept]


# Every split is submitted from its first song. csv_to_hdf5 skips songs whose
# output file already exists, so an interrupted run resumes without a
# hard-coded start index (which would skip songs on a fresh run).
conv_train, conv_train_get = _convert_split(train, train_conformity, "train")
conv_val, conv_val_get = _convert_split(validate, validate_conformity, "validate")
conv_test, conv_test_get = _convert_split(test, test_conformity, "test")

# Songs that failed to convert have no output file. Drop them here, in the
# driver process, so the split lists only reference files that really exist.
train, train_conformity = _keep_converted(train, train_conformity)
validate, validate_conformity = _keep_converted(validate, validate_conformity)
test, test_conformity = _keep_converted(test, test_conformity)

print("Done.")

Done.


In [61]:
# Base names (extension stripped) of everything in the HDF5 output folder.
hdf5_converted = [
    os.path.splitext(entry)[0]
    for entry in os.listdir(h5_pathname + "Files/")
]

# File-name-safe names of every song listed in the original split file,
# in the order train, validate, test.
original_files = [
    name_conformity(title)
    for split_name in ('train', 'validate', 'test')
    for title in sscs_get_split(split_name)
]

# Start from the full list and strike out one entry per converted file;
# what remains are the songs without an HDF5 counterpart.
problematic_files = list(original_files)
for song in hdf5_converted:
    try:
        problematic_files.remove(song)
    except ValueError:
        # File in the folder that does not correspond to a listed song.
        pass

# Overwrite (not append) the log with the recomputed list.
with open(h5_pathname + "Problematic_Songs.txt", "w") as f:
    f.writelines(title + "\n" for title in problematic_files)


# Unfiltered split metadata rebuilt from the original split file.
hdf5_metadata = {
    split_name: [name_conformity(title) for title in sscs_get_split(split_name)]
    for split_name in ('train', 'validate', 'test')
}
'''
for song in hdf5_metadata['train']:
    if not (song in hdf5_converted):
        hdf5_metadata['train'].remove(song)

for song in hdf5_metadata['validate']:
    if not (song in hdf5_converted):
        hdf5_metadata['validate'].remove(song)

for song in hdf5_metadata['test']:
    if not (song in hdf5_converted):
        hdf5_metadata['test'].remove(song)

metadata_filename = h5_pathname + "SynthSalienceChoralSet_hdf5_dataSplits.json"
with open(metadata_filename, "w") as metadata_file:
    json.dump(hdf5_metadata, metadata_file, indent=4)

#len(original_files) - len(problematic_files)
#len(hdf5_metadata['train']) + len(hdf5_metadata['validate']) + len(hdf5_metadata['test'])
'''

'\nfor song in hdf5_metadata[\'train\']:\n    if not (song in hdf5_converted):\n        hdf5_metadata[\'train\'].remove(song)\n\nfor song in hdf5_metadata[\'validate\']:\n    if not (song in hdf5_converted):\n        hdf5_metadata[\'validate\'].remove(song)\n\nfor song in hdf5_metadata[\'test\']:\n    if not (song in hdf5_converted):\n        hdf5_metadata[\'test\'].remove(song)\n\nmetadata_filename = h5_pathname + "SynthSalienceChoralSet_hdf5_dataSplits.json"\nwith open(metadata_filename, "w") as metadata_file:\n    json.dump(hdf5_metadata, metadata_file, indent=4)\n\n#len(original_files) - len(problematic_files)\n#len(hdf5_metadata[\'train\']) + len(hdf5_metadata[\'validate\']) + len(hdf5_metadata[\'test\'])\n'

In [10]:
# Split name -> list of file-name-safe song names, as kept in the
# *_conformity lists.
hdf5_metadata = {
    'train': train_conformity,
    'validate': validate_conformity,
    'test': test_conformity,
}

# Persist the split description next to the converted files.
metadata_filename = h5_pathname + "SynthSalienceChoralSet_hdf5_dataSplits.json"
with open(metadata_filename, "w") as metadata_file:
    json.dump(hdf5_metadata, metadata_file, indent=4)