In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import re
import scipy
import time
import collections
import itertools
import librosa
import pickle

### Find the directories of the data

In [5]:
# Root folder of the local NSynth download.
# Raw string: the Windows backslashes must be kept literally, and '\A' / '\D'
# are not valid escape sequences in a normal string literal.
base_path = r'F:\Aurora\Documents/nsynth'

# directory holding the training audio files
train_dir= base_path + '/nsynth-train/audio'

# directory holding the validation audio files
valid_dir= base_path + '/nsynth-valid/audio'
# directory holding the test audio files
test_dir= base_path + '/nsynth-test/audio'

### Sample files from each instrument family

In [6]:
# Read the raw JSON metadata shipped with the training set.
# The JSON file has information about the instrument family, instrument source, pitch,
# velocity, and the audio file name
df_train_raw = pd.read_json(path_or_buf= base_path + '/nsynth-train/examples.json', orient='index')
# Instrument family 9 is excluded from the training pool.
# NOTE(review): the reason is not stated in this notebook - confirm.
df_train_raw = df_train_raw[df_train_raw.instrument_family != 9]

# Sample n files from each instrument family
n = 10
# Seeded generator so that "Restart & Run All" always selects the same files;
# one shared generator is used so each family gets its own successive draw.
sample_rng = np.random.RandomState(0)
df_train_sample = df_train_raw.groupby('instrument_family', as_index=False, #group by instrument family
                               group_keys=False).apply(lambda df: df.sample(n, random_state=sample_rng)) #number of samples

### Collect the filenames for the train (sampled), validation and test datasets

In [None]:
# Training: only the rows sampled per instrument family are used.
# The index entries are later combined with a directory and '.wav' to form file paths.
filenames_train = df_train_sample.index.tolist()

# Collect all file names from the validation dataset
path = base_path + "/nsynth-valid/examples.json"
df_valid = pd.read_json(path_or_buf=path, orient='index')

filenames_valid = df_valid.index.tolist()


# Collect all file names from the test dataset
# (`path` is deliberately reused and now points at the test metadata)
path = base_path + '/nsynth-test/examples.json'
df_test = pd.read_json(path_or_buf=path, orient='index')

# save the test file index as list
filenames_test = df_test.index.tolist()

### Method for extracting data from a file

In [None]:
def feature_extract(file):
    """
    Load one audio file and return its min-max normalised MFCC matrix.

    Note: this returns Mel-frequency cepstral coefficients (MFCCs), not a
    mel spectrogram.

    :param file: Path to an audio file readable by ``librosa.load``
                 (the notebook passes NSynth ``.wav`` paths).
    :return: 2-D array with one row per analysis frame and 13 columns (one per
             MFCC coefficient). Values are scaled over the whole matrix so the
             smallest entry is 0 and the largest is 1. The original note gives
             126x13 for an NSynth file; the number of rows depends on the clip
             length. If all entries are identical, an all-zero matrix is
             returned instead of NaNs.
    """
    # Load the waveform, resampled to 16 kHz so every file shares one sample rate
    y, sr = librosa.load(file, sr=16000)

    # Mel-frequency cepstral coefficients; transposed so that the 13
    # coefficients become the columns
    mfcc = np.transpose(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))

    lowest = np.min(mfcc)
    value_range = np.max(mfcc) - lowest
    if value_range == 0:
        # Constant matrix: (mfcc - min) / 0 would evaluate to 0/0 = NaN everywhere
        return np.zeros_like(mfcc)

    return (mfcc - lowest) / value_range

## Prepare data


In [None]:
def prepare_files(dir, filenames, pickle_file_name, break_after):
    """
    Extract features for a list of audio files and pickle them as one dict.

    For every selected name, ``feature_extract(dir + '/' + name + '.wav')`` is
    called and its result is stored (converted to nested lists) under that name.
    The resulting ``{name: features}`` dict is written to
    ``'CustomData/' + pickle_file_name``; the output folder is created if it
    does not exist yet. Nothing is returned.

    :param dir: The directory containing the audio files
    :param filenames: The file names (without the '.wav' extension) to run
                      feature_extract on
    :param pickle_file_name: The name of the file where the extracted data should be stored
    :param break_after: Upper limit of the number of files to include in the pickle file.
                        Used in order to speed up the process. Set this to -1 if you want
                        to include all files in filenames.
    """
    # A negative limit (documented as -1) or None disables the cap;
    # islice(..., None) iterates over everything.
    if break_after is None or break_after < 0:
        limit = None
    else:
        limit = break_after

    features_by_file = {}
    for file in itertools.islice(filenames, limit):
        # specify directory and .wav
        features = feature_extract(dir + '/' + file + '.wav')
        # store plain nested lists rather than numpy arrays
        features_by_file[file] = features.tolist()

    out_path = 'CustomData/' + pickle_file_name
    # Create the output folder on first use instead of failing with FileNotFoundError
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # save the dictionary to a pickle file
    with open(out_path, 'wb') as f:
        pickle.dump(features_by_file, f)

### Prepare training data

In [None]:
# Extract features for every sampled training file (break_after=-1: no limit)
# and store them in CustomData/traindata1000.pkl
prepare_files(
    dir=train_dir,
    filenames=filenames_train,
    pickle_file_name='traindata1000.pkl',
    break_after=-1,
)

### Prepare test data

In [None]:
# Extract features for the first 100 test files and store them in
# CustomData/testdata100.pkl
prepare_files(
    dir=test_dir,
    filenames=filenames_test,
    pickle_file_name='testdata100.pkl',
    break_after=100,
)

### Prepare validation data

In [None]:
# Extract features for the first 100 validation files and store them in
# CustomData/validdata100.pkl
prepare_files(
    dir=valid_dir,
    filenames=filenames_valid,
    pickle_file_name='validdata100.pkl',
    break_after=100,
)