In [1]:
import utils
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import GRU, Flatten, Concatenate,ELU,Permute,Dropout,MaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping
import os
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelBinarizer
from keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Reshape, Bidirectional, Conv2D, MaxPooling2D, BatchNormalization, Dense, Flatten, Input, ZeroPadding2D
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from keras.metrics import AUC, F1Score, Precision, Accuracy
from sklearn.metrics import accuracy_score





# CRNN

In [2]:
checkpoint_filepath = r'D:\VNtraditionalmusicclassification\data\checkpoint'

def CRNN(input_tensor=None,include_top=True):
    # Determine proper input shape
    if K.image_data_format() == 'channels_first':
        input_shape = (1, 96, 1366)
    else:
        input_shape = (96, 1366, 1)

    if input_tensor is None:
        melgram_input = Input(shape=input_shape)
    else:
        if not K.is_keras_tensor(input_tensor):
            melgram_input = Input(tensor=input_tensor, shape=input_shape)
        else:
            melgram_input = input_tensor

    # Determine input axis
    if K.image_data_format() == 'channels_first':
        channel_axis = 1
        freq_axis = 2
        time_axis = 3
    else:
        channel_axis = 3
        freq_axis = 1
        time_axis = 2

    # Input block
    x = ZeroPadding2D(padding=(0, 37))(melgram_input)
    x = BatchNormalization(axis=freq_axis, name='bn_0_freq')(x)

    # Conv block 1
    x = Conv2D(64, 3, 3, padding='same', name='conv1' )(x)
    x = BatchNormalization(axis=channel_axis, name='bn1')(x)
    x = ELU()(x)
    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name='pool1')(x)
    x = Dropout(0.2, name='dropout1')(x)

    # Conv block 2
    x = Conv2D(128, 3, 3, padding='same', name='conv2')(x)
    x = BatchNormalization(axis=channel_axis, name='bn2')(x)
    x = ELU()(x)
    x = MaxPooling2D(pool_size=(3, 3), strides=(3, 3), name='pool2')(x)
    x = Dropout(0.2, name='dropout2')(x)

    # Conv block 3
    x = Conv2D(128, 3, 3, padding='same', name='conv3')(x)
    x = BatchNormalization(axis=channel_axis, name='bn3')(x)
    x = ELU()(x)
    x = MaxPooling2D((4,4), padding='same')(x)
    x = Dropout(0.2, name='dropout3')(x)

    # Conv block 4
    x = Conv2D(128, 3, 3, padding='same', name='conv4')(x)
    x = BatchNormalization(axis=channel_axis, name='bn4')(x)
    x = ELU()(x)
    x = MaxPooling2D((4,4), padding='same')(x)
    x = Dropout(0.2, name='dropout4')(x)

    # reshaping
    if K.image_data_format() == 'channels_last':
        x = Permute((3, 1, 2))(x)
    x = Reshape((1,1, 128))(x)
    # # Reshape to squeeze out the unnecessary dimensions
    x = Reshape((-1, 128))(x)  # Assuming the last dimension is 128

    # GRU block 1, 2, output
    x = GRU(32, return_sequences=True, name='gru1')(x)
    x = GRU(32, return_sequences=False, name='gru2')(x)
    x = Dropout(0.3)(x)
    if include_top:
        x = Dense(5, activation='sigmoid', name='output')(x)

    # Create model
    model = Model(melgram_input, x)
    if not os.path.exists(checkpoint_filepath + '/lcrnn'):
        os.makedirs(checkpoint_filepath + '/lcrnn')
    checkpoint1= tf.keras.callbacks.ModelCheckpoint(
    filepath= checkpoint_filepath + '/lcrnn' + '/lcrnn_{epoch:02d}_{val_accuracy:.4f}.weights.h5',
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=True,
    verbose=1
    )
    return model,checkpoint1

In [3]:
early = EarlyStopping(monitor='loss',
    patience= 10,
    verbose= 0,
    mode='auto',
    baseline= None,
    restore_best_weights= True)


In [4]:
# Directory containing audio files
audio_dir_ori = r'D:\VNtraditionalmusicclassification\data\VNTM3'

# Process audio files and retrieve log-mel spectrograms and labels
data, labels = utils.process_audio_files_crnn(audio_dir_ori)

In [5]:

# Concatenate the data
reshaped_data = np.transpose(data, (0, 2, 3, 1))

# Split data into training and combined validation-evaluation set
X_train, X_combined, y_train, y_combined = train_test_split(reshaped_data, labels, test_size=0.2, random_state=42, shuffle=True, stratify =labels)

# Split combined set into validation and evaluation sets
X_val, X_eval, y_val, y_eval = train_test_split(X_combined, y_combined, test_size=0.5, random_state=42, shuffle=True, stratify =y_combined)

In [6]:

# Label binarization
label_binarizer = LabelBinarizer()
y_train_one_hot_crnn = label_binarizer.fit_transform(y_train)
y_val_one_hot_crnn = label_binarizer.transform(y_val)
y_eval_one_hot_crnn = label_binarizer.transform(y_eval)



In [7]:
modelcrnn = tf.keras.Sequential()
modelcrnn ,checkpoint1= CRNN(include_top=True)

optimizer = Adam(learning_rate=0.001)
modelcrnn.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                   metrics=['accuracy'])
modelcrnn.summary()

In [8]:
modelcrnn.fit(X_train, y_train_one_hot_crnn, batch_size=32, epochs=100, validation_data=(X_val, y_val_one_hot_crnn), callbacks=[checkpoint1, early], shuffle=True)


Epoch 1/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 358ms/step - accuracy: 0.3825 - loss: 0.6220
Epoch 1: val_accuracy improved from -inf to 0.20000, saving model to D:\VNtraditionalmusicclassification\data\checkpoint/lcrnn/lcrnn_01_0.2000.weights.h5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 374ms/step - accuracy: 0.3831 - loss: 0.6209 - val_accuracy: 0.2000 - val_loss: 0.5595
Epoch 2/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 364ms/step - accuracy: 0.5836 - loss: 0.4010
Epoch 2: val_accuracy did not improve from 0.20000
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 372ms/step - accuracy: 0.5844 - loss: 0.4005 - val_accuracy: 0.2000 - val_loss: 0.5996
Epoch 3/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 366ms/step - accuracy: 0.7520 - loss: 0.2875
Epoch 3: val_accuracy improved from 0.20000 to 0.52800, saving model to D:\VNtraditionalmusicclassification\data\checkpoint/lcrn

<keras.src.callbacks.history.History at 0x2b5c754bad0>

In [9]:
# Evaluate the model
predict = modelcrnn.predict(X_eval)
# Calculate AUC ROC
auc_roc = AUC()
auc_roc.update_state(y_eval_one_hot_crnn, predict)
print("AUC ROC:", auc_roc.result().numpy())

# Calculate F1 Score
f1 = F1Score()
f1.update_state(y_eval_one_hot_crnn, predict)
print("F1 Score:", np.mean(f1.result().numpy()))

# Calculate Precision
pre = Precision()
pre.update_state(y_eval_one_hot_crnn, predict)
print("Precision Score", pre.result().numpy())

# Accuracy Score
acc = Accuracy()
acc.update_state(y_eval_one_hot_crnn, predict)
print("Accuracy Score", pre.result().numpy())

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 103ms/step
AUC ROC: 0.9868181
F1 Score: 0.9483639
Precision Score 0.9516129
Accuracy Score 0.9516129


In [10]:
predicted_labels = label_binarizer.inverse_transform(predict)
for i in range(len(predicted_labels)):
    if predicted_labels[i] == y_eval[i]:
        print(predicted_labels[i], y_eval[i])

cheo cheo
cailuong cailuong
cheo cheo
chauvan chauvan
catru catru
hatxam hatxam
cheo cheo
cailuong cailuong
hatxam hatxam
cailuong cailuong
cailuong cailuong
hatxam hatxam
chauvan chauvan
cheo cheo
cheo cheo
catru catru
cheo cheo
hatxam hatxam
hatxam hatxam
chauvan chauvan
catru catru
chauvan chauvan
hatxam hatxam
hatxam hatxam
chauvan chauvan
catru catru
chauvan chauvan
cheo cheo
cheo cheo
hatxam hatxam
chauvan chauvan
hatxam hatxam
hatxam hatxam
cailuong cailuong
cailuong cailuong
catru catru
chauvan chauvan
cheo cheo
chauvan chauvan
catru catru
hatxam hatxam
cheo cheo
cheo cheo
chauvan chauvan
hatxam hatxam
hatxam hatxam
hatxam hatxam
cheo cheo
chauvan chauvan
cailuong cailuong
cailuong cailuong
chauvan chauvan
catru catru
hatxam hatxam
chauvan chauvan
cailuong cailuong
catru catru
cailuong cailuong
cheo cheo
cheo cheo
catru catru
cailuong cailuong
catru catru
chauvan chauvan
catru catru
cailuong cailuong
catru catru
catru catru
cheo cheo
cheo cheo
hatxam hatxam
cailuong cailuong
ha

# PCRNN

In [2]:
checkpoint_filepath = r'D:\ProjectMusicGenre\data\checkpoint'
dataset_root =r'D:\ProjectMusicGenre\data'
saved_model_path = r'D:\ProjectMusicGenre\data\model'


In [7]:
if not os.path.exists(dataset_root + "/checkpoint"):
    os.makedirs(dataset_root + "/checkpoint")
if not os.path.exists(checkpoint_filepath):
    os.makedirs(checkpoint_filepath)
if not os.path.exists(checkpoint_filepath + '/pcrnn'):
    os.makedirs(checkpoint_filepath + '/pcrnn')

In [1]:
audio_dir = 'D:\ProjectMusicGenre\data\dataset10seconds'
stft_img, stft_data, labels = utils.process_audio_files(audio_dir)


In [5]:
print(stft_data[0].shape)

(513, 128)


In [4]:


# Split data into training and combined validation-evaluation set
img_train, img_conbined, stft_train, stft_combined, y_train, y_combined = train_test_split(stft_img, stft_data, labels, test_size=0.2, random_state=42, shuffle=True, stratify =labels)

# Split combined set into validation and evaluation sets
img_val, img_eval, stft_val, stft_eval, y_val, y_eval = train_test_split(img_conbined, stft_combined, y_combined, test_size=0.5, random_state=42, shuffle=True, stratify =y_combined)



In [5]:
img_train, img_val, img_eval = np.array(img_train), np.array(img_val), np.array(img_eval)
stft_train, stft_val, stft_eval = np.array(stft_train), np.array(stft_val), np.array(stft_eval)


In [6]:
n_class = 5
input_shape = (513, 128,3)
tf.random.set_seed(42)

In [7]:

def pcrnn(model,input_shape,rnn_input_shape):
    # Define input layer
    input_img = Input(shape=input_shape)

    # Define CNN Block
    cnn = Conv2D(filters=16, kernel_size=(3, 3), strides=(1, 1), activation='relu', padding='same')(input_img)
    cnn = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same')(cnn)
    cnn = BatchNormalization()(cnn)
    cnn = Dropout(0.1, name='dropout1')(cnn)
    
    cnn = Conv2D(filters=32, kernel_size=(3, 3), strides=(1, 1), activation='relu', padding='same')(cnn)
    cnn = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same')(cnn)
    cnn = BatchNormalization()(cnn)
    cnn = Dropout(0.1, name='dropout2')(cnn)

    cnn = Conv2D(filters=64, kernel_size=(3, 3), strides=(1, 1), activation='relu', padding='same')(cnn)
    cnn = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same')(cnn)
    cnn = BatchNormalization()(cnn)
    cnn = Dropout(0.1, name='dropout3')(cnn)

    cnn = Conv2D(filters=128, kernel_size=(3, 3), strides=(1, 1), activation='relu', padding='same')(cnn)
    cnn = MaxPooling2D(pool_size=(4, 4), strides=(4, 4), padding='same')(cnn)
    cnn = BatchNormalization()(cnn)
    cnn = Dropout(0.1, name='dropout4')(cnn)

    cnn = Conv2D(filters=64, kernel_size=(3, 3), strides=(1, 1), activation='relu', padding='same')(cnn)
    cnn = MaxPooling2D(pool_size=(4, 4), strides=(4, 4), padding='same')(cnn)
    cnn = BatchNormalization()(cnn)
    cnn = Dropout(0.1, name='dropout5')(cnn)

    cnn = Flatten()(cnn)

    # Define Block 2
    rnn_input = Input(shape=rnn_input_shape)
    rnn = MaxPooling1D(pool_size=2,strides=2,data_format='channels_last')(rnn_input)
    rnn = Bidirectional(GRU(128))(rnn)
    rnn = Dropout(0.3)(rnn)


    # Concatenate CNN and RNN outputs
    concatenated_output = Concatenate()([cnn, rnn])

    # Define output layer
    output = Dense(5, activation='softmax')(concatenated_output)

    # Define the model
    model = Model(inputs=[input_img, rnn_input], outputs=output)
    if not os.path.exists(checkpoint_filepath + '/pcrnn'):
        os.makedirs(checkpoint_filepath + '/pcrnn')
    checkpoint1= tf.keras.callbacks.ModelCheckpoint(
    filepath= checkpoint_filepath + '/pcrnn' + '/pcrnn_{epoch:02d}_{val_accuracy:.4f}.weights.h5',
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=True,
    verbose=1
    )
    return model,checkpoint1

In [8]:
modelpcrnn = tf.keras.Sequential()
modelpcrnn, checkpoint1 = pcrnn(modelpcrnn, input_shape, rnn_input_shape=(513, 128))


In [9]:
modelpcrnn.compile(optimizer='adam',
                  loss='binary_crossentropy',
                   metrics=['accuracy'])
modelpcrnn.summary()

In [10]:
label_binarizer = LabelBinarizer()
y_train_one_hot = label_binarizer.fit_transform(y_train)
y_val_one_hot = label_binarizer.transform(y_val)
y_eval_one_hot = label_binarizer.transform(y_eval)

In [11]:
early = EarlyStopping(monitor='loss',
    patience= 5,
    verbose= 0,
    mode='auto',
    baseline= None,
    restore_best_weights= True)


In [13]:
history = modelpcrnn.fit(x=(img_train, stft_train),
                         y=y_train_one_hot,  
                         batch_size=32,
                         epochs=100,  
                         validation_data=((img_val,stft_val),y_val_one_hot),
                         callbacks=[checkpoint1, early]
)

Epoch 1/100
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 323ms/step - accuracy: 0.6964 - loss: 0.2973
Epoch 1: val_accuracy improved from -inf to 0.58155, saving model to D:\ProjectMusicGenre\data\checkpoint/pcrnn/pcrnn_01_0.5816.weights.h5
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 375ms/step - accuracy: 0.6969 - loss: 0.2968 - val_accuracy: 0.5816 - val_loss: 0.3946
Epoch 2/100
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 318ms/step - accuracy: 0.8890 - loss: 0.1348
Epoch 2: val_accuracy improved from 0.58155 to 0.83155, saving model to D:\ProjectMusicGenre\data\checkpoint/pcrnn/pcrnn_02_0.8316.weights.h5
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 328ms/step - accuracy: 0.8891 - loss: 0.1347 - val_accuracy: 0.8316 - val_loss: 0.1726
Epoch 3/100
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 318ms/step - accuracy: 0.9350 - loss: 0.0916
Epoch 3: val_accuracy improved from 0.8315

In [14]:

# Evaluate the model
predict = modelpcrnn.predict([img_eval, stft_eval])
predicted_labels = label_binarizer.inverse_transform(predict).tolist()

# Calculate AUC ROC
auc_roc = AUC()
auc_roc.update_state(y_eval_one_hot, predict)
print("AUC ROC:", auc_roc.result().numpy())

# Calculate F1 Score
f1 = F1Score()
f1.update_state(y_eval_one_hot, predict)
print("F1 Score:", np.mean(f1.result().numpy()))

# Calculate Precision
pre = Precision()
pre.update_state(y_eval_one_hot, predict)
print("Precision Score", pre.result().numpy())

# Accuracy Score
acc = accuracy_score(y_eval, predicted_labels)
print("Accuracy Score", acc)


[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 90ms/step
AUC ROC: 0.9960858
F1 Score: 0.9852867
Precision Score 0.9853138
Accuracy Score 0.9853137516688919
