# SSCS dataset conversion to HDF5

In [None]:
import os
import re
import numpy as np
import pandas as pd
import json
import zipfile
import h5py
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
import psutil
import ray

# Size the Ray runtime by physical cores only (logical=False) and request no
# GPUs; ignore_reinit_error is set so that a repeated init call is tolerated.
num_cpus = psutil.cpu_count(logical=False)
ray.init(
    num_cpus=num_cpus,
    num_gpus=0,
    ignore_reinit_error=True,
)

In [None]:
# Archive that holds the dataset's CSV files.
zipname = 'Datasets/SynthSalienceChoralSet_v1.zip'
# Base directory (with trailing slash) for everything written by this notebook.
h5_pathname = 'Datasets/HDF5/'

In [None]:
def name_conformity(name):
    """Return ``name`` with characters that are unsafe in file names removed.

    The following characters are deleted: tilde, double quote, ``#``, ``%``,
    ``&``, ``*``, ``:``, ``<``, ``>``, ``?``, forward slash, backslash,
    ``{``, ``|`` and ``}``. All other characters are kept unchanged.

    Args:
        name: Song name as it appears in the dataset split lists.

    Returns:
        The sanitised name (possibly empty).
    """
    # A raw string is required here: in a normal string literal "\\{" reaches
    # the regex engine as "\{" (an escaped brace), so a literal backslash
    # would never be matched.
    return re.sub(r'[~"#%&*:<>?/\\{|}]', "", name)

In [None]:
def sscs_get_split(split='train'):
    """Return the list stored for one split in the dataset's split file.

    Args:
        split: ``'train'``, ``'validate'`` or ``'test'`` (case-insensitive).

    Returns:
        The value stored under the lower-cased split name in
        ``Datasets/SynthSalienceChoralSet_dataSplits.json``.

    Raises:
        NameError: If ``split`` is not one of the three accepted names.
    """
    splitname = "Datasets/SynthSalienceChoralSet_dataSplits.json"
    key = split.lower()
    # Validate before touching the file system.
    if key not in ('train', 'validate', 'test'):
        raise NameError("Split should be 'train', 'validate' or 'test'.")
    # The context manager closes the handle; a bare open() here would leak it.
    with open(splitname, 'r') as split_file:
        return json.load(split_file)[key]


In [None]:
def checkIntegrity(songlist):
    """Return the songs whose five CSV files are all present in the archive.

    For every song the archive (``zipname``) must contain
    ``sscs/<song>_mix.csv``, ``_S.csv``, ``_A.csv``, ``_T.csv`` and
    ``_B.csv``. Songs missing any of them are dropped and the number of
    dropped songs is printed.

    Args:
        songlist: Iterable of song names. It is not modified.

    Returns:
        A new list with the complete songs, in their original order.
    """
    with zipfile.ZipFile(zipname, "r") as zf:
        # A set gives O(1) membership tests for the five lookups per song.
        archive_members = set(zf.namelist())

    suffixes = ("_mix.csv", "_S.csv", "_A.csv", "_T.csv", "_B.csv")

    # Build a fresh list rather than removing from the list being iterated:
    # removing in place shifts the remaining items and makes the loop skip
    # the element that follows each removal.
    filtered_list = [
        song for song in songlist
        if all(("sscs/" + song + suffix) in archive_members
               for suffix in suffixes)
    ]
    count = len(songlist) - len(filtered_list)

    print(f"{count} songs not present and removed from scanlist.")
    return filtered_list

In [None]:
# Load every split and keep only the songs that pass the archive check.
train = checkIntegrity(sscs_get_split('train'))
validate = checkIntegrity(sscs_get_split('validate'))
test = checkIntegrity(sscs_get_split('test'))

# Sanitised counterparts of the names above, kept index-aligned with them.
train_conformity = list(map(name_conformity, train))
validate_conformity = list(map(name_conformity, validate))
test_conformity = list(map(name_conformity, test))

In [None]:
# Value handed to the `memory` option of each conversion task: 4 * 2**30.
MAX_MEM = 4 * 1024 ** 3

def csv_to_df(songname):
    """Load one CSV member of the dataset archive as a transposed DataFrame.

    Args:
        songname: Member file name inside the archive's ``sscs/`` folder.

    Returns:
        The header-less CSV content, transposed.
    """
    member = "sscs/" + songname
    with zipfile.ZipFile(zipname) as archive, archive.open(member) as csv_file:
        frame = pd.read_csv(csv_file, header=None, engine='pyarrow')
    return frame.T
    
def df_to_hdf5(df, savename, keyname):
    """Append ``df`` to the HDF5 file ``savename`` under the key ``keyname``.

    The frame is stored in 'table' format with blosc compression at level 9.
    The file is opened in append mode, so several keys can share one file.

    Args:
        df: DataFrame to store.
        savename: Path of the HDF5 file.
        keyname: Key (group name) inside the file.
    """
    # `key` is passed by keyword: pandas has deprecated positional arguments
    # to to_hdf other than the path.
    df.to_hdf(savename, key=keyname, mode='a',
              format='table', complevel=9, complib='blosc')

@ray.remote
def csv_to_hdf5(songname, songname_conformity, split='train'):
    """Convert the five CSV files of one song into a single HDF5 file.

    The mix and the soprano/alto/tenor/bass CSVs are read from the archive
    and stored under the keys 'mix', 'soprano', 'alto', 'tenor' and 'bass'
    in ``<h5_pathname>Files/<songname_conformity>.h5``. A song whose output
    file already exists is skipped.

    Args:
        songname: Song name as used inside the archive.
        songname_conformity: Sanitised name used for the output file.
        split: 'train', 'validate' or 'test'; only selects which list the
            progress message is computed from.

    Returns:
        True if the output file was already present or was written
        completely, False if the conversion failed. On failure the partial
        file is deleted.

    NOTE(review): when run as a Ray task this presumably executes in a
    worker process, so the list removals in the failure path would act on
    that process's copy of the split lists and not on the driver's. Callers
    should rely on the return value (or on the output file existing).
    """
    progress_sources = {
        'train': ('Train', train),
        'validate': ('Validate', validate),
        'test': ('Test', test),
    }
    if split in progress_sources:
        label, songs = progress_sources[split]
        # Progress output is cosmetic: a song that is not in the list for
        # `split` must not abort the conversion.
        position = songs.index(songname) if songname in songs else "?"
        print(f"\r{label} split {position}/{len(songs)}", end='')

    suffixes = ("_mix.csv", "_S.csv", "_A.csv", "_T.csv", "_B.csv")
    keynames = ('mix', 'soprano', 'alto', 'tenor', 'bass')
    savename = h5_pathname + "Files/" + songname_conformity + ".h5"
    try:
        if not os.path.exists(savename):
            # Create the output directory on demand; exist_ok tolerates
            # other tasks creating it at the same time.
            os.makedirs(os.path.dirname(savename), exist_ok=True)
            for suffix, keyname in zip(suffixes, keynames):
                df = csv_to_df(songname + suffix)
                df_to_hdf5(df, savename, keyname)
        return True
    except Exception:
        print(f"Problematic file: {songname}")
        # Never leave a partially written file behind: the existence check
        # above would treat it as already converted on the next run.
        if os.path.exists(savename):
            os.remove(savename)

        # Best-effort bookkeeping on this process's copy of the split lists.
        for songs, safe_names in ((train, train_conformity),
                                  (validate, validate_conformity),
                                  (test, test_conformity)):
            try:
                songs.remove(songname)
                safe_names.remove(songname_conformity)
            except ValueError:
                # The song does not belong to this split.
                continue
        return False

In [None]:
# Each task is told which split it belongs to: csv_to_hdf5 defaults to
# split='train' and looks the song up in that split's list for its progress
# message, which raises for songs that belong to another split.
conv_train = [
    csv_to_hdf5.options(memory=MAX_MEM).remote(song, safe_name, split='train')
    for song, safe_name in zip(train, train_conformity)
]
conv_train_get = ray.get(conv_train)

conv_val = [
    csv_to_hdf5.options(memory=MAX_MEM).remote(song, safe_name,
                                               split='validate')
    for song, safe_name in zip(validate, validate_conformity)
]
conv_val_get = ray.get(conv_val)

conv_test = [
    csv_to_hdf5.options(memory=MAX_MEM).remote(song, safe_name, split='test')
    for song, safe_name in zip(test, test_conformity)
]
conv_test_get = ray.get(conv_test)

# Drop songs whose conversion failed. csv_to_hdf5 deletes the output file on
# failure, so the file's existence is the success signal. List removals done
# inside Ray tasks are not relied on here: tasks presumably run in separate
# worker processes, so they would not change these driver-side lists.
for _songs, _safe_names in ((train, train_conformity),
                            (validate, validate_conformity),
                            (test, test_conformity)):
    _kept = [
        (_song, _safe) for _song, _safe in zip(_songs, _safe_names)
        if os.path.exists(h5_pathname + "Files/" + _safe + ".h5")
    ]
    # Slice assignment keeps the module-level names bound to the same lists.
    _songs[:] = [_song for _song, _ in _kept]
    _safe_names[:] = [_safe for _, _safe in _kept]

print("Done.")

In [None]:
# Write the sanitised song names of every split next to the HDF5 output.
hdf5_metadata = {
    'train': train_conformity,
    'validate': validate_conformity,
    'test': test_conformity,
}

metadata_filename = f"{h5_pathname}SynthSalienceChoralSet_hdf5_dataSplits.json"
with open(metadata_filename, "w") as metadata_file:
    json.dump(hdf5_metadata, metadata_file, indent=4)