In [None]:
import gc
import json
import os
import traceback

import librosa
import matplotlib.pyplot as plt
import numpy
import pandas
from IPython.display import clear_output
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [None]:
# Load the table that pairs each audio file path with its genus label;
# the first CSV column is used as the row index.
genus_vs_file_path = pandas.read_csv(
    'genus_vs_file_path.csv',
    index_col=0,
)

In [None]:
# Preview the first rows of the loaded table.
genus_vs_file_path.head()

In [None]:
# (rows, columns) of the table as loaded, before any filtering.
genus_vs_file_path.shape

In [None]:
# Keep only the rows whose genus occurs more than 5 times in the table.
# reset_index() renumbers the rows and keeps the previous index as a column.
rows_per_genus = genus_vs_file_path.groupby('genus')['genus'].transform(len)
genus_vs_file_path = genus_vs_file_path[rows_per_genus > 5].reset_index()

In [None]:
# (rows, columns) after dropping genera with 5 or fewer rows.
genus_vs_file_path.shape

In [None]:
# Sorting the unique genus names gives every genus a stable integer class id
# (its position in the sorted list).
unique_genera = genus_vs_file_path['genus'].unique()
genus_list = sorted(unique_genera.tolist())
n_classes = len(genus_list)

# Forward (id -> genus) and reverse (genus -> id) lookup tables.
id_to_genus = dict(enumerate(genus_list))
genus_to_id = {genus: class_id for class_id, genus in id_to_genus.items()}

print('Number of unique Genus: {}'.format(n_classes))

In [None]:
# Persist the genus -> class-id mapping so it can be reloaded later.
with open('genus_to_id.json', 'w') as mapping_file:
    json.dump(genus_to_id, mapping_file)

In [None]:
# Persist the class-id -> genus mapping. JSON object keys are always strings,
# so the integer ids are written as "0", "1", ... and must be converted back
# with int() after loading.
with open('id_to_genus.json', 'w') as mapping_file:
    json.dump(id_to_genus, mapping_file)

In [None]:
# Encode the 'genus' column as integer class ids. The nested dict restricts
# the replacement to that column only.
genus_vs_file_path = genus_vs_file_path.replace({'genus': genus_to_id})

In [None]:
# Three-way split of file paths and encoded genus labels
# (train_test_split is imported in the notebook's import cell).
#   train      : 47 % of the rows
#   validation : 43 % of the remaining 53 %  (roughly 23 % overall)
#   test       : 57 % of the remaining 53 %  (roughly 30 % overall)
# Both calls use random_state=42 so the partition is reproducible.
data_X = genus_vs_file_path.file_path
data_y = genus_vs_file_path.genus

X_train, X_holdout, y_train, y_holdout = train_test_split(
    data_X, data_y, test_size=0.53, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.57, random_state=42
)

# The split is not stratified, so verify that every class is present in the
# training set.
assert len(y_train.unique()) == n_classes, (
    'training split is missing at least one genus class'
)

In [None]:
# X_train_spect = numpy.empty((0, 862, 128))
# X_test_spect = numpy.empty((0, 862, 128))
# X_val_spect = numpy.empty((0, 862, 128))
# y_train_spect = numpy.empty((0, 278))
# y_test_spect = numpy.empty((0, 278))
# y_val_spect = numpy.empty((0, 278))

In [None]:
def save_npz(X_data, y_data, file_name, n_frames=862, n_mels=128):
    """Convert audio files to dB-scaled mel-spectrograms and save them with one-hot labels.

    Every file in ``X_data`` is loaded with librosa, turned into a
    mel-spectrogram (``n_fft=2048``, ``hop_length=512``), converted to dB
    relative to its own maximum and transposed to ``(frames, n_mels)``.
    Only spectrograms with exactly ``n_frames`` frames are kept; the other
    files are reported in the returned list.

    Relies on the notebook globals ``n_classes`` (width of the one-hot rows)
    and ``id_to_genus`` (used to name the label of a skipped file).

    Parameters
    ----------
    X_data : pandas.Series
        Audio file paths. Its index labels are used to look up ``y_data``.
    y_data : pandas.Series
        Integer class ids, indexed like ``X_data``.
    file_name : str
        Target passed to ``numpy.savez`` (which appends ``.npz`` if missing).
        The spectrograms are stored as ``arr_0`` and the labels as ``arr_1``.
    n_frames : int, optional
        Required number of spectrogram frames (default 862).
    n_mels : int, optional
        Number of mel bands (default 128, which is also librosa's default).

    Returns
    -------
    list of str
        One ``'Path: ..., Label: ...'`` entry per file that was skipped
        because its spectrogram did not have ``n_frames`` frames.
    """
    # Collect results in Python lists and build the arrays once at the end;
    # numpy.append copies the whole accumulator on every call.
    kept_spectrograms = []
    kept_labels = []
    error_file_list = []
    total_files = X_data.shape[0]

    for index, file_path in X_data.items():
        clear_output(wait=True)
        signal, sample_rate = librosa.load(file_path)
        spect = librosa.feature.melspectrogram(
            y=signal, sr=sample_rate, n_fft=2048, hop_length=512, n_mels=n_mels
        )
        # dB scale relative to the loudest bin, then time-major: (frames, n_mels).
        spect = librosa.power_to_db(spect, ref=numpy.max).T
        label = y_data[index]

        if spect.shape[0] != n_frames:
            error_file_list.append(
                'Path: {}, Label: {}'.format(file_path, id_to_genus[label])
            )
            continue

        kept_spectrograms.append(spect)
        kept_labels.append(label)
        print('Processed {} of {} files'.format(len(kept_spectrograms), total_files))

    if kept_spectrograms:
        # float64 keeps the dtype that the previous append-based accumulator produced.
        X_accumulator = numpy.asarray(kept_spectrograms, dtype=numpy.float64)
    else:
        X_accumulator = numpy.empty((0, n_frames, n_mels))

    # One-hot encode with n_classes columns (previously hard-coded to 278,
    # which failed whenever the number of classes differed).
    y_accumulator = numpy.zeros((len(kept_labels), n_classes))
    y_accumulator[
        numpy.arange(len(kept_labels)), numpy.asarray(kept_labels, dtype=int)
    ] = 1

    numpy.savez(file_name, X_accumulator, y_accumulator)
    return error_file_list

In [None]:
# Build and save the training set via save_npz, then show which files were
# skipped because their spectrogram had an unexpected number of frames.
error_file_list = save_npz(X_train, y_train, 'train_data')
print(error_file_list)
# Ask the garbage collector to reclaim the large temporary arrays.
gc.collect()

In [None]:
# Build and save the validation set, then show which files were skipped
# (the list was previously assigned but never displayed).
error_file_list = save_npz(X_val, y_val, 'validation_data')
print(error_file_list)
# Ask the garbage collector to reclaim the large temporary arrays.
gc.collect()

In [None]:
# Build and save the test set. The list of skipped files was previously
# discarded; keep it under its own name and show it, as the training cell does.
test_error_file_list = save_npz(X_test, y_test, 'test_data')
print(test_error_file_list)
# Ask the garbage collector to reclaim the large temporary arrays.
gc.collect()