# <center>Preprocessing the data</center>

### Libraries

In [13]:
import os
import json
import math
import librosa

### Constants

In [26]:
# Filesystem locations: input audio tree and the JSON file the features go to.
DATASET_PATH = '../assets/dataset/'
JSON_PATH = '../assets/data.json'

# Audio geometry: sampling rate in Hz and track length in seconds.
SAMPLE_RATE = 22050
DURATION = 30

# Number of samples in one full-length track at the rate above.
SAMPLES_PER_TRACK = DURATION * SAMPLE_RATE

### MFCC feature extraction

In [27]:
def save_mfcc(dataset_path, json_path, n_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
    """Extract per-segment MFCCs for every file under ``dataset_path`` and dump them as JSON.

    Every directory found below ``dataset_path`` (the root itself excluded) is
    treated as one class: its base name is appended to ``mapping`` and its
    position in that list is used as the integer label for all segments
    extracted from the files it contains.

    Each file is loaded at ``SAMPLE_RATE`` and cut into ``num_segments``
    consecutive slices of ``SAMPLES_PER_TRACK // num_segments`` samples. A
    segment is kept only when its MFCC matrix has exactly the expected number
    of frames, so segments from files shorter than the nominal track length
    are silently dropped.

    Args:
        dataset_path: Root directory whose sub-directories hold the audio files.
        json_path: Destination file; overwritten if it already exists.
        n_mfcc: Number of MFCC coefficients passed to ``librosa.feature.mfcc``.
        n_fft: FFT window size passed to ``librosa.feature.mfcc``.
        hop_length: Hop length passed to ``librosa.feature.mfcc``.
        num_segments: Number of equal slices each track is divided into.

    Returns:
        None. The result is written to ``json_path`` as an object with the
        keys ``mapping``, ``mfcc`` and ``labels``.
    """
    # dictionary to store data
    data = {
        'mapping': [],
        'mfcc': [],
        'labels': []
    }

    samples_per_seg = SAMPLES_PER_TRACK // num_segments

    # expected number of mfcc vectors per segment, rounded up
    mfcc_vectors = math.ceil(samples_per_seg / hop_length)

    # loop through all genres
    for dirpath, dirnames, filenames in os.walk(dataset_path):
        # os.walk honours in-place edits of dirnames (top-down mode); sorting
        # makes the label <-> genre mapping independent of filesystem order.
        dirnames.sort()

        # The root only contains the genre folders. Compare by value: string
        # identity is an implementation detail and must not be relied upon.
        if dirpath == dataset_path:
            continue

        # save the semantic label; basename works with any path separator
        semantic_label = os.path.basename(dirpath)
        # The label is the index this genre is about to occupy in the mapping,
        # so both lists stay aligned by construction.
        label = len(data['mapping'])
        data['mapping'].append(semantic_label)
        print(f'Processing: {semantic_label}')

        # process files for a specific genre (sorted for reproducible output)
        for file in sorted(filenames):
            file_path = os.path.join(dirpath, file)
            signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)

            # process segments extracting mfcc and storing data
            for curr_seg in range(num_segments):
                start_sample = samples_per_seg * curr_seg
                finish_sample = start_sample + samples_per_seg

                mfcc = librosa.feature.mfcc(y=signal[start_sample:finish_sample],
                                            sr=sr,
                                            n_fft=n_fft,
                                            n_mfcc=n_mfcc,
                                            hop_length=hop_length)

                # transpose so the first axis is the one compared against
                # the expected vector count below
                mfcc = mfcc.T

                # store mfcc for segment only if it has the expected length
                if len(mfcc) == mfcc_vectors:
                    data['mfcc'].append(mfcc.tolist())
                    data['labels'].append(label)

    with open(json_path, 'w') as fp:
        json.dump(data, fp, indent=4)

In [28]:
# Run the extraction over the whole dataset, cutting each track into 10 segments.
save_mfcc(
    dataset_path=DATASET_PATH,
    json_path=JSON_PATH,
    num_segments=10,
)

Processing: blues
Processing: classical
Processing: country
Processing: disco
Processing: hiphop
Processing: jazz
Processing: metal
Processing: pop
Processing: reggae
Processing: rock
