# SSCS dataset conversion to HDF5

In [None]:
import re
import numpy as np
import pandas as pd
import json
import zipfile
import h5py
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Zip archive containing the dataset CSV files; csv_to_df() opens it and
# reads members stored under the "sscs/" prefix.
zipname = "Datasets/SynthSalienceChoralSet_v1.zip"
# Output directory prefix for everything this notebook writes. Paths are built
# by plain string concatenation, so the trailing slash is required.
h5_pathname = "Datasets/HDF5/"

In [None]:
# Characters removed from song names:  ~ " # % & * : < > ? / \ { | }
# The pattern is a raw string so that `\\` reaches the regex engine as an
# escaped backslash. Written as a normal string literal, `\\{` collapses to
# `\{`, which merely escapes the brace and leaves backslashes in the name.
_NON_CONFORMING_CHARS = re.compile(r'[~"#%&*:<>?/\\{|}]')


def name_conformity(name):
    """Return *name* with every non-conforming character removed.

    Args:
        name: Song name as a string.

    Returns:
        A copy of ``name`` without any of the characters
        ``~ " # % & * : < > ? / \\ { | }``. All other characters are kept
        unchanged and in their original order.
    """
    return _NON_CONFORMING_CHARS.sub("", name)

In [None]:
def sscs_get_split(split='train',
                   splitname="Datasets/SynthSalienceChoralSet_dataSplits.json"):
    """Return the entry of the data-split JSON file for one partition.

    Args:
        split: Partition to read: 'train', 'validate' or 'test'
            (case-insensitive).
        splitname: Path of the JSON file holding the splits. Defaults to the
            dataset's standard location.

    Returns:
        The value stored in the JSON file under the lower-cased split name.
        The rest of this file iterates over it as a sequence of song names.

    Raises:
        NameError: If ``split`` is not one of the three accepted names. The
            exception type is kept as-is so existing handlers keep working.
    """
    key = split.lower()
    if key not in ('train', 'validate', 'test'):
        raise NameError("Split should be 'train', 'validate' or 'test'.")
    # The context manager guarantees the file handle is closed, even when
    # parsing fails or the key is missing.
    with open(splitname, 'r') as split_file:
        return json.load(split_file)[key]
    
# Original song names of each partition, as listed in the data-split file.
train, validate, test = (
    sscs_get_split(part) for part in ('train', 'validate', 'test')
)

# Same names after cleaning with name_conformity(); these are the names used
# for the generated HDF5 files.
train_conformity = list(map(name_conformity, train))
validate_conformity = list(map(name_conformity, validate))
test_conformity = list(map(name_conformity, test))

# Split description for the converted dataset: same partitions, cleaned names.
hdf5_metadata = {
    'train': train_conformity,
    'validate': validate_conformity,
    'test': test_conformity,
}

# Persist the cleaned splits inside the HDF5 output directory.
metadata_filename = f"{h5_pathname}SynthSalienceChoralSet_hdf5_dataSplits.json"
with open(metadata_filename, "w") as metadata_file:
    json.dump(hdf5_metadata, metadata_file)


In [None]:
def csv_to_df(songname):
    """Load one CSV member of the dataset archive as a transposed DataFrame.

    Args:
        songname: File name of the CSV inside the archive's "sscs/" folder.

    Returns:
        The CSV contents, read without a header row and then transposed.
    """
    member = "sscs/" + songname
    with zipfile.ZipFile(zipname) as archive, archive.open(member) as handle:
        frame = pd.read_csv(handle, header=None, engine='pyarrow')
    return frame.T
  
def csv_to_hdf5(songname, songname_conformity):
    """Convert the five CSV files of one song into a single HDF5 file.

    Each CSV (the mix plus the S, A, T and B voices) is read with
    csv_to_df() and stored under its own key of
    ``<h5_pathname>Files/<songname_conformity>.h5``.

    Args:
        songname: Song name as used for the CSV files inside the archive.
        songname_conformity: Cleaned song name used for the output file.
    """
    # (CSV file suffix, HDF5 key) pairs, in the order they are written.
    sources = (
        ("_mix.csv", 'mix'),
        ("_S.csv", 'voice/soprano'),
        ("_A.csv", 'voice/alto'),
        ("_T.csv", 'voice/tenor'),
        ("_B.csv", 'voice/bass'),
    )
    savename = h5_pathname + "Files/" + songname_conformity + ".h5"
    for suffix, key in sources:
        # `key` is passed by keyword: positional use is deprecated in
        # pandas 2.x and rejected from pandas 3.0 on. The frame is not bound
        # to a name, so it can be released before the next CSV is loaded.
        csv_to_df(songname + suffix).to_hdf(
            savename, key=key, mode='a',
            format='table', complevel=9, complib='blosc')
        

In [None]:
def sscs_plot(dataframe):
    """Show *dataframe* as an image using the 'BuPu' colormap.

    The pixel aspect is set to ``(3/8) * columns / rows`` so the drawn image
    keeps a 3:8 height-to-width proportion whatever the data shape, and the
    vertical axis is flipped before the figure is shown.

    Args:
        dataframe: 2-D data accepted by ``Axes.imshow`` and exposing
            ``.shape``.
    """
    n_rows = dataframe.shape[0]
    n_cols = dataframe.shape[1]
    aspect_ratio = (3/8)*n_cols/n_rows

    fig, ax = plt.subplots(figsize=(13, 7))
    ax.imshow(
        dataframe,
        interpolation='nearest',
        aspect=aspect_ratio,
        cmap=mpl.colormaps['BuPu'],
    )
    ax.invert_yaxis()
    plt.show()

In [None]:
# Convert every song of the training split; original and cleaned names are
# parallel lists, so they are walked in lockstep.
for songname, songname_conformity in zip(train, train_conformity):
    csv_to_hdf5(songname, songname_conformity)

In [None]:
# Convert every song of the validation split; original and cleaned names are
# parallel lists, so they are walked in lockstep.
for songname, songname_conformity in zip(validate, validate_conformity):
    csv_to_hdf5(songname, songname_conformity)

In [None]:
# Convert every song of the test split; original and cleaned names are
# parallel lists, so they are walked in lockstep.
for songname, songname_conformity in zip(test, test_conformity):
    csv_to_hdf5(songname, songname_conformity)