# Data pre-process

The drumkit sound dataset used for the training is available here:
https://drive.google.com/file/d/1_yXHmvD7nLrRfbItgmhTledlksneF028/view?usp=sharing

In [11]:
import librosa
import librosa.display
import numpy as np
from glob import glob
%matplotlib inline

N_FFT = 1024
HOP_LENGTH = 256 
SR = 16000
MELSPEC_SIZE = 128;

len_src = 3. 
ref_n_src = int(SR * len_src)

drum_dirs = [r.split('/')[-1].replace('\\', '/') for r in sorted(glob('./drumkit_dataset/*'))]
NB_CLASS = len(drum_dirs)

print (drum_dirs)

def get_melspec(filepath, hop_length=HOP_LENGTH, n_mels=128):
    y_tmp = np.zeros(ref_n_src)
    
    y, sr = librosa.core.load(filepath, sr=SR, mono=True)
    y = y[:ref_n_src]
    y_tmp[:len(y)] = y[:ref_n_src]
        
    # sfft -> mel conversion
    melspec = librosa.feature.melspectrogram(y=y_tmp, sr=sr,
                n_fft=N_FFT, hop_length=hop_length, n_mels=n_mels)
    S = librosa.power_to_db(melspec)  # Corrected line: removed np.max
        
    return S



['drumkit_dataset/0_kick', 'drumkit_dataset/1_snare', 'drumkit_dataset/2_hihat_closed', 'drumkit_dataset/3_hihat_open', 'drumkit_dataset/4_tom_low', 'drumkit_dataset/5_tom_mid', 'drumkit_dataset/6_tom_high', 'drumkit_dataset/7_clap', 'drumkit_dataset/8_rim']


In [31]:
from glob import glob
from tqdm import tqdm
from random import shuffle
import librosa
import numpy as np
import os

# Constants
N_FFT = 1024
HOP_LENGTH = 256 
SR = 16000
MELSPEC_SIZE = 128
len_src = 3  
ref_n_src = int(SR * len_src)

# Function to get Mel Spectrogram
def get_melspec(filepath, hop_length=HOP_LENGTH, n_mels=128):
    y_tmp = np.zeros(ref_n_src)
    y, sr = librosa.load(filepath, sr=SR, mono=True)
    y = y[:ref_n_src]
    y_tmp[:len(y)] = y[:ref_n_src]
    
    # SFFT -> Mel conversion
    melspec = librosa.feature.melspectrogram(y=y_tmp, sr=sr,
                                              n_fft=N_FFT, hop_length=hop_length, n_mels=n_mels)
    S = librosa.power_to_db(melspec)  # Corrected to use only one argument
    return S

# Get drum directories
drum_dirs = [os.path.basename(r) for r in sorted(glob('./drumkit_dataset/*'))]
print("Drum directories:", drum_dirs)

# Get file paths
filepaths = glob("./drumkit_dataset/*/*")
print(f"Total files found: {len(filepaths)}")

shuffle(filepaths)

drum_genres = []
drum_melspecs = []
NB_CLASS = len(drum_dirs)

# Process files
for filepath in tqdm(filepaths):
    dir_ = os.path.basename(os.path.dirname(filepath))
    
    try:
        genre = drum_dirs.index(dir_)  # Get genre index
        melspec = get_melspec(filepath, HOP_LENGTH, MELSPEC_SIZE)

        if melspec.shape[1] > MELSPEC_SIZE:
            melspec = melspec[:, :MELSPEC_SIZE]
        else:
            melspec.resize((MELSPEC_SIZE, MELSPEC_SIZE)) 

        drum_genres.append(genre)
        drum_melspecs.append(melspec)
    except Exception as e:
        print(f"Error processing file: {filepath} | Error: {e}")


Drum directories: ['0_kick', '1_snare', '2_hihat_closed', '3_hihat_open', '4_tom_low', '5_tom_mid', '6_tom_high', '7_clap', '8_rim']
Total files found: 2575


  0%|          | 0/2575 [00:00<?, ?it/s]

100%|██████████| 2575/2575 [00:14<00:00, 178.51it/s]


In [32]:
drum_genres = np.array(drum_genres)
print(drum_genres.shape)

from keras.utils import to_categorical

drum_genres = to_categorical(drum_genres, NB_CLASS)
print(drum_genres.shape)

drum_melspecs = np.array(drum_melspecs)
drum_melspecs = np.expand_dims(drum_melspecs, 3)
print(drum_melspecs.shape)


(2575,)
(2575, 9)
(2575, 128, 128, 1)


In [33]:
np.savez("drum_data_128.npz", melspecs=drum_melspecs, genres=drum_genres)

In [3]:
## loading from pre-processed npz file
# drum_melspecs = np.load("drum_data_128.npz")['melspecs']
# drum_genres = np.load("drum_data_128.npz")['genres']

# training

In [35]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten
from keras.layers import BatchNormalization,Activation
from keras.layers import ELU

from keras.models import Model
from keras import backend as K

SIZE = MELSPEC_SIZE

input_img = Input(shape=(SIZE, SIZE, 1)) # normalized, 128 x 128

x = Conv2D(32, (3, 3), padding='same', kernel_initializer='he_normal')(input_img) #nb_filter, nb_row, nb_col
x = BatchNormalization(axis=1)(x)
x = ELU(alpha=1.0)(x)
x = MaxPooling2D((4, 4))(x)

x = Conv2D(64, (3, 3), padding='same',kernel_initializer='he_normal')(x)
x = BatchNormalization(axis=1)(x)
x = ELU(alpha=1.0)(x)
x = MaxPooling2D((2, 2))(x)


# x = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal')(x)
# x = BatchNormalization(axis=1)(x)
# x = ELU(alpha=1.0)(x)
# x = MaxPooling2D((2, 2), padding='same')(x)
# print K.int_shape(x)

x = Conv2D(32, (3, 3), padding='same', kernel_initializer='he_normal')(x)
x = BatchNormalization(axis=1)(x)
x = ELU(alpha=1.0)(x)
x = MaxPooling2D((2, 4))(x)

x = Conv2D(32, (3, 3), padding='same', kernel_initializer='he_normal')(x)
x = BatchNormalization(axis=1)(x)
x = ELU(alpha=1.0)(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)
x = Dense(NB_CLASS)(x)
y = Activation("softmax")(x)

model = Model(input_img, y)

In [36]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128, 128, 1)]     0         
                                                                 
 conv2d (Conv2D)             (None, 128, 128, 32)      320       
                                                                 
 batch_normalization (Batch  (None, 128, 128, 32)      512       
 Normalization)                                                  
                                                                 
 elu (ELU)                   (None, 128, 128, 32)      0         
                                                                 
 max_pooling2d (MaxPooling2  (None, 32, 32, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 32, 32, 64)        18496 

In [38]:
class_weight = {}
total = drum_genres.shape[0]
for i in range(NB_CLASS):
    nb = np.sum(np.argmax(drum_genres, axis=1) == i)
    class_weight[i] = total / float(nb) 
print (class_weight)
    

{0: 3.9737654320987654, 1: 3.5225718194254445, 2: 13.770053475935828, 3: 9.196428571428571, 4: 13.696808510638299, 5: 18.52517985611511, 6: 14.385474860335195, 7: 21.822033898305083, 8: 24.523809523809526}


In [39]:
nb_total = drum_melspecs.shape[0]
nb_train = int(nb_total * 0.9)
print (nb_total, nb_train)

train_melspecs = drum_melspecs[:nb_train]
train_genres = drum_genres[:nb_train]

val_melspecs = drum_melspecs[nb_train:]
val_genres = drum_genres[nb_train:]

2575 2317


In [40]:
train_melspecs.shape
train_genres.shape


(2317, 9)

In [43]:
from keras.callbacks import EarlyStopping
es = EarlyStopping(verbose=1, patience=5)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['acc'])

model.fit(train_melspecs, train_genres, batch_size=64, 
          epochs=100, verbose=1, 
          shuffle=False,validation_data = (val_melspecs, val_genres), class_weight=class_weight, callbacks=[es])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 9: early stopping


<keras.src.callbacks.History at 0x2494f11ab30>

In [44]:
model.save("model/drum_spec_model_128.h5")

  saving_api.save_model(


## convert the model .5 tensorflow.js format

In [None]:
!pip install tensorflowjs

In [None]:
!tensorflowjs_converter --input_format=keras \
        C:/Users/smsha/Desktop/BeatboxAi/model/drum_spec_model_128.h5 \
        C:/Users/smsha/Desktop/BeatboxAi/model