In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import json
import pandas as pd

In [None]:
# Dataset locations. Both derived paths live under the training folder, so
# they are built from `root_dir` rather than repeated in full.
root_dir = 'd:/soundofai/nsynth-guitar-subset/train/'
dataset_file = root_dir + 'examples.json'  # JSON metadata loaded further down
audio_dir = root_dir + 'audio/'            # audio sub-folder (not read in the cells shown here)

In [None]:
# Quality tags in index order: the position in the list is the numeric index.
index_to_qualities = dict(enumerate([
    'bright',
    'dark',
    'distortion',
    'fast_decay',
    'long_release',
    'multiphonic',
    'nonlinear_env',
    'percussive',
    'reverb',
    'tempo_sync',
]))

# Reverse lookup: quality name -> numeric index (same ordering as above).
qualities_to_index = {name: idx for idx, name in index_to_qualities.items()}

# Instrument sources, again keyed by their position in the list.
index_to_source = dict(enumerate(['acoustic', 'electronic', 'synthetic']))

# Reverse lookup: source name -> numeric index.
source_to_index = {name: idx for idx, name in index_to_source.items()}

In [None]:
# Load the metadata JSON into `data`; later cells iterate it as a mapping of
# key -> record.
# The encoding is spelled out so parsing does not depend on the platform's
# default text encoding (which is not UTF-8 on every system).
with open(dataset_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# All labels: take one record from `data` and list the fields it carries.
# Only the record is needed (a later cell reads `v`), so the key is not
# unpacked; `_` is deliberately left alone because IPython uses it to hold the
# previous cell output.
v = next(iter(data.values()))

list(v.keys())

In [None]:
# Record fields that are NOT carried over as columns of the table built below.
ignore_cols = [
    # unique identifier, not relevant
    'note',
    # constant: sample rate is 16000 for all notes
    'sample_rate',
    # numeric duplicates of instrument_source_str / instrument_family_str
    'instrument_source',
    'instrument_family',
    # ignored, since we are looking only at family 3
    # (presumably the single family in this subset -- TODO confirm)
    'instrument_family_str',
    # numeric duplicate of qualities_str
    'qualities',
    # the list of qualities is added as independent columns instead
    'qualities_str',
    # unique identifiers, not relevant
    'note_str',
    'instrument',
    'instrument_str',
    # added as columns alongside the qualities instead
    'instrument_source_str',
]

In [None]:
# Final column layout: the record fields that were not ignored, followed by
# one flag column per quality and one per instrument source.
columns = [
    *(field for field in v if field not in ignore_cols),
    *qualities_to_index,
    *source_to_index,
]
print(columns)

In [None]:
# Build a column-oriented table (column name -> list of values) with one
# entry per record in `data`.
dataset = {c: [] for c in columns}

# NOTE: only 'pitch' and 'velocity' are filled from the record itself; any
# other non-flag name present in `columns` would keep an empty list.
for value in data.values():
    # A missing or null pitch/velocity falls back to 0.
    dataset['pitch'].append(value.get('pitch') or 0)
    dataset['velocity'].append(value.get('velocity') or 0)

    # A record without 'qualities_str' (or with a null one) is treated as
    # having no qualities, rather than raising TypeError on the `in` test.
    note_qualities = value.get('qualities_str') or ()
    for q in qualities_to_index:
        dataset[q].append(int(q in note_qualities))

    # One-hot encode the instrument source: 1 for the matching name, else 0.
    note_source = value.get('instrument_source_str')
    for s in source_to_index:
        dataset[s].append(int(s == note_source))

In [None]:
# Turn the column-name -> values mapping into a DataFrame (one column per key).
df = pd.DataFrame(dataset)

In [None]:
# Show the first rows as a quick sanity check of the table.
df.head()

In [None]:
# Summary statistics of the table's columns.
df.describe()

In [None]:
# Side-by-side histograms of pitch (left) and velocity (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
for ax, (col, label) in zip(axes, [('pitch', 'Pitch'), ('velocity', 'Velocity')]):
    df[col].hist(ax=ax)
    ax.set_ylabel('Count')
    ax.set_xlabel(label)
plt.show()

In [None]:
# Keep only the 0/1 flag columns (qualities followed by sources). They are
# selected by name rather than by position, so the result does not silently
# depend on how many other columns precede them.
qualities = df.loc[:, [*qualities_to_index, *source_to_index]]

In [None]:
# Column totals: each flag is 0 or 1, so this counts the records carrying each flag.
qualities.sum()

In [None]:
# 'tempo_sync' does not seem very meaningful, so it is left out of the
# correlation analysis.
# errors='ignore' keeps this cell re-runnable: once the column has been
# dropped, running the cell again is a no-op instead of a KeyError.
qualities = qualities.drop(columns=['tempo_sync'], errors='ignore')
qualities.corr()

In [None]:
# Column labels of the remaining flag columns, in frame order.
q_cols = qualities.columns.tolist()
q_cols

In [None]:
# Heatmap of the pairwise correlations between the flag columns, with a
# colour bar and both axes labelled by column name.
fig, ax = plt.subplots(figsize=(20, 20))
corr_matrix = qualities.corr()
im = ax.matshow(corr_matrix)
fig.colorbar(im)

# One tick per column, labelled with the column name, on each axis.
tick_positions = list(range(len(q_cols)))
ax.set_xticks(tick_positions)
ax.set_xticklabels(q_cols)
ax.set_yticks(tick_positions)
ax.set_yticklabels(q_cols)
plt.show()