In [1]:
import numpy
import os
import librosa
import pandas
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import traceback
from IPython.display import clear_output
import json
from sklearn.model_selection import train_test_split
import gc

In [2]:
# Load the genus / audio-file table; the first CSV column is used as the row index.
genus_vs_file_path = pandas.read_csv(
    filepath_or_buffer='genus_vs_file_path.csv',
    index_col=0,
)

In [4]:
# Display the (rows, columns) of the loaded table as a quick sanity check.
genus_vs_file_path.shape

(6208, 4)

In [5]:
# Sorted unique genus values; a genus's position in this list is its class id.
genus_list = genus_vs_file_path['genus'].unique().tolist()
genus_list.sort()
n_classes = len(genus_list)

# Two-way lookup between integer class ids and genus values.
id_to_genus = dict(enumerate(genus_list))
genus_to_id = {genus: class_id for class_id, genus in id_to_genus.items()}

print('Number of unique Genus: {}'.format(n_classes))

Number of unique Genus: 141


In [6]:
# Inputs are the audio file paths, targets are the genus labels.
data_X = genus_vs_file_path['file_path']
data_y = genus_vs_file_path['genus']

# 80/20 split, stratified on genus, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_X,
    data_y,
    test_size=0.2,
    stratify=data_y,
    random_state=100,
)

# Every genus must appear in the training split.
assert n_classes == len(y_train.unique())

In [5]:
def transform_dataset(x):
    """Convert dB-scaled values to the natural log of their power.

    The input is mapped from decibels back to power using a reference of
    1.0, and the natural logarithm of that power is returned.
    """
    return numpy.log(librosa.core.db_to_power(x, ref=1.0))

In [9]:
def save_npz(X_data, y_data, file_name, n_classes, n_frames=862, n_mels=128):
    """Build mel-spectrogram features with one-hot labels and save them to disk.

    Every file in ``X_data`` is loaded with librosa, converted to a mel
    spectrogram in dB (relative to its own maximum) and transposed to
    ``(frames, mels)``. Spectrograms with exactly ``n_frames`` frames are
    kept; all other files are skipped and reported in the returned list.
    The kept spectrograms are passed through ``transform_dataset`` and
    written with ``numpy.savez`` as positional arrays, i.e. ``arr_0``
    (features) and ``arr_1`` (one-hot labels).

    Relies on the module-level ``genus_to_id`` mapping and on
    ``transform_dataset`` being defined before this function is called.

    Parameters
    ----------
    X_data : pandas.Series
        Audio file paths; iterated with ``.items()``.
    y_data : pandas.Series
        Genus labels, looked up with the same index labels as ``X_data``.
    file_name : str
        Target passed to ``numpy.savez``.
    n_classes : int
        Width of the one-hot label rows.
    n_frames : int, optional
        Required number of spectrogram frames per file (default 862).
    n_mels : int, optional
        Number of mel bands per frame (default 128).

    Returns
    -------
    list of str
        One ``'Path: ..., Label: ...'`` entry per skipped file.
    """
    n_files = X_data.shape[0]
    # Allocate the float64 buffers once. Growing an array with numpy.append
    # inside the loop re-copies everything accumulated so far on every file,
    # which is quadratic in time and briefly doubles memory use.
    X_buffer = numpy.empty((n_files, n_frames, n_mels))
    y_buffer = numpy.zeros((n_files, n_classes))
    progress_counter = 0
    error_file_list = []
    for index, file_path in X_data.items():
        clear_output(wait=True)
        signal, sr = librosa.load(file_path)
        spect = librosa.feature.melspectrogram(
            y=signal, sr=sr, n_fft=2048, hop_length=512, n_mels=n_mels
        )
        # dB relative to the clip's own maximum, then (mels, frames) -> (frames, mels).
        spect = librosa.power_to_db(spect, ref=numpy.max).T
        if spect.shape[0] == n_frames:
            # progress_counter doubles as the next free row in both buffers.
            X_buffer[progress_counter] = spect
            y_buffer[progress_counter, genus_to_id[y_data[index]]] = 1
            progress_counter += 1
            print('Processed {} of {} files'.format(progress_counter, n_files))
        else:
            error_file_list.append('Path: {}, Label: {}'.format(file_path, y_data[index]))
    # Only the first progress_counter rows were filled; drop the unused tail.
    numpy.savez(
        file_name,
        transform_dataset(X_buffer[:progress_counter]),
        y_buffer[:progress_counter],
    )
    return error_file_list

In [11]:
# Build and persist the training archive; keep the list of files that were skipped.
error_file_list = save_npz(
    X_data=X_train,
    y_data=y_train,
    file_name='train_data',
    n_classes=n_classes,
)
# print(error_file_list)
# Reclaim memory held by the temporaries created while building the archive.
gc.collect()

Processed 6207 of 6208 files


18867

In [12]:
# Build and persist the test archive. The skipped-file list gets its own name
# so it does not overwrite (or get confused with) the training one.
test_error_file_list = save_npz(X_test, y_test, 'test_data', n_classes)
# print(test_error_file_list)
# gc.collect must be called; a bare `gc.collect` only references the function
# and never runs a collection.
gc.collect()

Processed 1242 of 1242 files


512