In [1]:
# ===== Importing Packages =====
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from joblib import dump, load


In [2]:
# ===== Prepare Data =====
# Cuts every clip in audio_dir into cut_length-second pieces, computes a
# 20-coefficient MFCC matrix for each piece (mfcc_list) and pairs it with the
# label whose time span contains the midpoint of the piece (label_list).
cut_length = 0.5          # length of one piece, in seconds
_n_split = 4              # fold count used by the (commented-out) K-fold training blocks
FLAG_SAVE_MFCC = False    # set to True if want to store MFCC images
SHIFT_SECONDS = 0.05      # time shift applied to every 2nd / 4th clip of the listing

audio_dir = './data/audio/'
image_dir = './data/image/'
label_dir = './data/label/'


def _read_label_row(label_file):
    """Return the next annotation row of `label_file`, or None when it is exhausted.

    The row is the comma-split line with fields 0 and 1 converted to float
    (start / end time); every other field is left exactly as read. Blank lines
    are skipped instead of crashing on float('').
    """
    while True:
        line = label_file.readline()
        if line == '':              # end of file
            return None
        if line.strip() == '':      # tolerate blank lines between rows
            continue
        fields = line.split(',')
        fields[0] = float(fields[0])
        fields[1] = float(fields[1])
        return fields


f_annotation = None
if FLAG_SAVE_MFCC:
    os.makedirs(image_dir, exist_ok=True)   # plt.savefig does not create directories
    f_annotation = open('./data/total_annotation.csv', 'w')
    print('file name, label', file=f_annotation)

# Only .wav files are processed (same filter as the demo cell), so stray files
# such as .DS_Store cannot reach librosa.load.
# NOTE(review): os.listdir order is arbitrary and both the shift cycle below and
# the later train/test split depend on it -- confirm before sorting the listing.
audio_list = [name for name in os.listdir(audio_dir) if name.endswith('.wav')]
print('Num of audio files:', len(audio_list))
mfcc_list = []
label_list = []
state = 0

try:
    # for all audio files
    print('===== Preprocess Data =====')
    for audio in audio_list:
        # load audio
        print('Processing', audio, '...')
        y, sr = librosa.load(audio_dir + audio)

        # 4-step cycle over the listing: unchanged, shifted right, unchanged,
        # shifted left. The label times are NOT shifted with the audio.
        if state == 0:
            state = 1
        elif state == 1:
            state = 2
            y = np.concatenate((np.zeros(int(sr * SHIFT_SECONDS)), y))  # shift right to avoid overfitting
        elif state == 2:
            state = 3
        elif state == 3:
            state = 0
            y = y[int(sr * SHIFT_SECONDS):]     # shift left to avoid overfitting

        # label file is named after the song: file name minus 2 characters and '.wav'
        song_name = audio[:-6]
        clip_len = int(cut_length * sr)     # samples per piece

        with open(label_dir + song_name + '.csv', 'r') as f_label:
            f_label.readline()                      # skip the header line
            temp_label = _read_label_row(f_label)   # first label (None if there is none)

            # cut to small pieces with length of cut_length seconds
            for i in range(len(y) // clip_len):
                y_cut = y[i * clip_len: (i + 1) * clip_len]

                # MFCC
                mfcc = librosa.feature.mfcc(y=y_cut, sr=sr, n_mfcc=20)
                mfcc_list.append(mfcc)
                img_name = ''

                # save as image if FLAG_SAVE_MFCC is True
                if FLAG_SAVE_MFCC:
                    # piece index is zero-padded to at least two digits
                    img_name = audio[:-4] + '_' + str(i).zfill(2) + '.png'
                    plt.figure(figsize=(5, 2), dpi=100)
                    librosa.display.specshow(mfcc)
                    plt.tight_layout()
                    plt.savefig(image_dir + img_name)
                    plt.close()

                # record label: advance to the first label that ends at or after the midpoint
                mid_time = (i + 0.5) * cut_length
                while temp_label is not None and temp_label[1] < mid_time:
                    temp_label = _read_label_row(f_label)
                label = 'no lyrics'
                if temp_label is not None and temp_label[0] < mid_time < temp_label[1]:  # some lyrics
                    # NOTE(review): field 2 is used verbatim, so it keeps a trailing
                    # newline when it is the last column -- confirm against the
                    # saved models before stripping it.
                    label = temp_label[2]
                label_list.append(label)

                if FLAG_SAVE_MFCC:
                    print(img_name, label, sep=', ', file=f_annotation)
finally:
    # always release the annotation file, even if a clip fails to process
    if f_annotation is not None:
        f_annotation.close()
print('Done!')

print('===== Prepare Dataset =====')
# Turn every MFCC matrix into one flat feature vector (one row per piece).
temp_mfccs_list = [piece_mfcc.flatten() for piece_mfcc in mfcc_list]

# Stack features and labels into arrays.
mfcc_array = np.array(temp_mfccs_list)
print('mfcc_array:', mfcc_array.shape)
label_array = np.array(label_list)
print('label_array:', label_array.shape)

# Hold out 20% of the pieces as the test set. The split is taken before the
# labels are encoded below, so y_train / y_test keep the original string labels.
X_train, X_test, y_train, y_test = train_test_split(
    mfcc_array,
    label_array,
    test_size=0.2,
    random_state=23,    # DON'T Modify Random_state!
)
for caption, part in (('X_train:', X_train), ('y_train:', y_train),
                      ('X_test:', X_test), ('y_test:', y_test)):
    print(caption, part.shape)

# Encode labels as integers; from here on label_array holds the integer codes
# and label_classes maps a code back to its label string.
le = preprocessing.LabelEncoder().fit(label_array)
label_array = le.transform(label_array)
label_classes = le.classes_
print('number of classes:', len(label_classes))
print(label_classes)


Num of audio files: 65
===== Preprocess Data =====
Processing 最長的電影01.wav ...
Processing 最長的電影02.wav ...
Processing 最長的電影03.wav ...
Processing 最長的電影04.wav ...
Processing 最長的電影05.wav ...
Processing 最長的電影06.wav ...
Processing 最長的電影07.wav ...
Processing 最長的電影08.wav ...
Processing 最長的電影09.wav ...
Processing 最長的電影10.wav ...
Processing 最長的電影11.wav ...
Processing 最長的電影12.wav ...
Processing 蒲公英的約定01.wav ...
Processing 蒲公英的約定02.wav ...
Processing 蒲公英的約定03.wav ...
Processing 蒲公英的約定04.wav ...
Processing 蒲公英的約定05.wav ...
Processing 蒲公英的約定06.wav ...
Processing 蒲公英的約定07.wav ...
Processing 蒲公英的約定08.wav ...
Processing 蒲公英的約定09.wav ...
Processing 蒲公英的約定10.wav ...
Processing 蒲公英的約定11.wav ...
Processing 蒲公英的約定12.wav ...
Processing 蒲公英的約定13.wav ...
Processing 蒲公英的約定14.wav ...
Processing 蒲公英的約定15.wav ...
Processing 最長的電影13.wav ...
Processing 最長的電影14.wav ...
Processing 最長的電影15.wav ...
Processing 最長的電影16.wav ...
Processing 最長的電影17.wav ...
Processing 最長的電影18.wav ...
Processing 彩虹01.wav ...
Processing 彩虹02.wav

In [3]:
# ========== SVM ==========
# Loads (or, with the training block enabled, fits) a linear SVM and reports
# accuracy / precision / recall on the held-out test split from the data cell.
print('===== Training SVM Model =====')

# # ========== Uncomment this block to train SVM model ==========
# # NOTE: fit() retrains from scratch on every fold, so the model dumped below
# # is the one fitted on the LAST fold's training part only.
# svm_model = SVC(kernel='linear', max_iter=6000)
# # Perform K-fold cross-validation 
# kfold = KFold(n_splits=_n_split)
# for train_index, val_index in kfold.split(X_train):
#     X_train_kfold, X_val_kfold = X_train[train_index], X_train[val_index]
#     y_train_kfold, y_val_kfold = y_train[train_index], y_train[val_index]

#     # train
#     svm_model.fit(X_train_kfold, y_train_kfold)

#     # see validation score
#     pred_val = svm_model.predict(X_val_kfold)
#     score = accuracy_score(y_val_kfold, pred_val)

#     print("Validation score:", score)
# dump(svm_model, './model/svm_model.joblib')
# # ========== end of training block ==========

# ========== Load the trained SVM model (comment this block out when training above) ==========
# joblib.load is pickle-based and can execute arbitrary code: only load model files you trust.
svm_model = load('./model/svm_model.joblib')
# ========== end of loading block ==========

# evaluate with the test set
print('===== Evaluate SVM Model =====')
pred_test = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, pred_test)
# zero_division=0 yields the same scores as the default ("warn" also scores an
# undefined class as 0) but without the UndefinedMetricWarning spam.
precision_macro = precision_score(y_test, pred_test, average='macro', zero_division=0)
precision_micro = precision_score(y_test, pred_test, average='micro', zero_division=0)
precision_weighted = precision_score(y_test, pred_test, average='weighted', zero_division=0)
recall_macro = recall_score(y_test, pred_test, average='macro', zero_division=0)
recall_micro = recall_score(y_test, pred_test, average='micro', zero_division=0)
recall_weighted = recall_score(y_test, pred_test, average='weighted', zero_division=0)
print("Test accuracy:", accuracy)
print("Test precision_macro:", precision_macro)
print("Test precision_micro:", precision_micro)
print("Test precision_weighted:", precision_weighted)
print("Test recall_macro:", recall_macro)
print("Test recall_micro:", recall_micro)
print("Test recall_weighted:", recall_weighted)

===== Training SVM Model =====




Validation score: 0.800997506234414




Validation score: 0.8129675810473815




Validation score: 0.8133000831255195




Validation score: 0.7883272364482873
===== Evaluate SVM Model =====
Test accuracy: 0.7906899418121364
Test precision_macro: 0.7901647749180237
Test precision_micro: 0.7906899418121364
Test precision_weighted: 0.8072738180877355
Test recall_macro: 0.6795775760812488
Test recall_micro: 0.7906899418121364
Test recall_weighted: 0.7906899418121364


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
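# # ========== Decision Tree ==========
# # NOTE: this whole cell is disabled (every line is commented out). The Decision
# # Tree sections of the demo cell are commented out as well, so re-enable both
# # together.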
# print('===== Training Decision Tree Model =====')

# # ========== Uncomment this block to train Decision Tree model ==========
# decision_tree_model = DecisionTreeClassifier()
# # Perform K-fold cross-validation 
# kfold = KFold(n_splits=_n_split)
# for train_index, val_index in kfold.split(X_train):
#     X_train_kfold, X_val_kfold = X_train[train_index], X_train[val_index]
#     y_train_kfold, y_val_kfold = y_train[train_index], y_train[val_index]

#     # train
#     decision_tree_model.fit(X_train_kfold, y_train_kfold)

#     # see validation score
#     pred_val = decision_tree_model.predict(X_val_kfold)
#     score = accuracy_score(y_val_kfold, pred_val)

#     print("Validation score:", score)
# dump(decision_tree_model, './model/decision_tree_model.joblib')
# # ========== end of training block ==========

# # # ========== Uncomment this block to load trained Decision Tree model ==========
# # decision_tree_model = load('./model/decision_tree_model.joblib')
# # # ========== end of loading block ==========

# # evaluate with the test set
# print('===== Evaluate Decision Tree Model =====')
# pred_test = decision_tree_model.predict(X_test)
# accuracy = accuracy_score(y_test, pred_test)
# precision_macro = precision_score(y_test, pred_test, average='macro')
# precision_micro = precision_score(y_test, pred_test, average='micro')
# precision_weighted = precision_score(y_test, pred_test, average='weighted')
# recall_macro = recall_score(y_test, pred_test, average='macro')
# recall_micro = recall_score(y_test, pred_test, average='micro')
# recall_weighted = recall_score(y_test, pred_test, average='weighted')
# print("Test accuracy:", accuracy)
# print("Test precision_macro:", precision_macro)
# print("Test precision_micro:", precision_micro)
# print("Test precision_weighted:", precision_weighted)
# print("Test recall_macro:", recall_macro)
# print("Test recall_micro:", recall_micro)
# print("Test recall_weighted:", recall_weighted)

In [5]:
# ========== MLP ==========
# Loads (or, with the training block enabled, fits) an MLP classifier and reports
# accuracy / precision / recall on the held-out test split from the data cell.
print('===== Training MLP Model =====')

# # ========== Uncomment this block to train MLP model ==========
# # NOTE: fit() retrains from scratch on every fold (warm_start is not set), so
# # the model dumped below is the one fitted on the LAST fold's training part only.
# MLP_model = MLPClassifier(hidden_layer_sizes=(1024, 128), max_iter=800)
# # Perform K-fold cross-validation 
# kfold = KFold(n_splits=_n_split)
# for train_index, val_index in kfold.split(X_train):
#     X_train_kfold, X_val_kfold = X_train[train_index], X_train[val_index]
#     y_train_kfold, y_val_kfold = y_train[train_index], y_train[val_index]

#     # train
#     MLP_model.fit(X_train_kfold, y_train_kfold)

#     # see validation score
#     pred_val = MLP_model.predict(X_val_kfold)
#     score = accuracy_score(y_val_kfold, pred_val)

#     print("Validation score:", score)
# dump(MLP_model, './model/MLP_model.joblib')
# # ========== end of training block ==========

# ========== Load the trained MLP model (comment this block out when training above) ==========
# joblib.load is pickle-based and can execute arbitrary code: only load model files you trust.
MLP_model = load('./model/MLP_model.joblib')
# ========== end of loading block ==========

# evaluate with the test set
print('===== Evaluate MLP Model =====')
pred_test = MLP_model.predict(X_test)
accuracy = accuracy_score(y_test, pred_test)
# zero_division=0 yields the same scores as the default ("warn" also scores an
# undefined class as 0) but without the UndefinedMetricWarning spam.
precision_macro = precision_score(y_test, pred_test, average='macro', zero_division=0)
precision_micro = precision_score(y_test, pred_test, average='micro', zero_division=0)
precision_weighted = precision_score(y_test, pred_test, average='weighted', zero_division=0)
recall_macro = recall_score(y_test, pred_test, average='macro', zero_division=0)
recall_micro = recall_score(y_test, pred_test, average='micro', zero_division=0)
recall_weighted = recall_score(y_test, pred_test, average='weighted', zero_division=0)
print("Test accuracy:", accuracy)
print("Test precision_macro:", precision_macro)
print("Test precision_micro:", precision_micro)
print("Test precision_weighted:", precision_weighted)
print("Test recall_macro:", recall_macro)
print("Test recall_micro:", recall_micro)
print("Test recall_weighted:", recall_weighted)

===== Training MLP Model =====
Validation score: 0.856691604322527
Validation score: 0.8365752285951787
Validation score: 0.8515378221113882
Validation score: 0.820917858330562
===== Evaluate MLP Model =====
Test accuracy: 0.8327514546965918
Test precision_macro: 0.778471248768338
Test precision_micro: 0.8327514546965918
Test precision_weighted: 0.8529033755917654
Test recall_macro: 0.7554771865529839
Test recall_micro: 0.8327514546965918
Test recall_weighted: 0.8327514546965918


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# ===== DEMO with the original song clips =====
# For every .wav file in demo_dir: cut it into cut_length-second pieces, predict
# a label per piece with each model, merge consecutive pieces that share a label
# and write the resulting segments to demo_dir/output/<name>_<model>_no_vote.txt.
# Relies on cut_length, svm_model and MLP_model defined by earlier cells.
demo_dir = './demo/'
output_dir = demo_dir + 'output/'
test_files = os.listdir(demo_dir)
os.makedirs(output_dir, exist_ok=True)  # open(..., 'w') does not create directories


def _merge_segments(segments):
    """Merge consecutive (start, end, label) tuples that carry the same label.

    Returns a new list; `segments` is left untouched. An empty input gives an
    empty list.
    """
    merged = []
    for start, end, label in segments:
        if merged and merged[-1][2] == label:
            # same label as the running segment: extend its end time
            merged[-1] = (merged[-1][0], end, label)
        else:
            merged.append((start, end, label))
    return merged


def _write_segments(path, segments):
    """Write the merged `segments` to `path` as 'start, end, label' lines.

    Times are rounded to 2 decimals. Only the header line is written when
    `segments` is empty (audio shorter than one piece).
    """
    with open(path, 'w') as f_result:
        print('start, end, label', file=f_result)
        for start, end, label in _merge_segments(segments):
            print(str(round(start, 2)) + ', ' + str(round(end, 2)) + ', ' + str(label), file=f_result)


for test_file in test_files:
    if test_file[-4:] != '.wav':
        continue

    print('test file:', test_file)

    svm_predict_list = []
    decision_tree_predict_list = []
    MLP_predict_list = []

    # load the test file
    y, sr = librosa.load(demo_dir + test_file)
    clip_len = int(cut_length * sr)     # samples per piece

    # cut to small pieces with length of cut_length seconds
    for i in range(len(y) // clip_len):
        y_cut = y[i * clip_len: (i + 1) * clip_len]
        mfcc = librosa.feature.mfcc(y=y_cut, sr=sr, n_mfcc=20)
        mfcc_reshape = mfcc.reshape(1, -1)     # one flat feature row, as used for training
        piece_start = i * cut_length
        piece_end = (i + 1) * cut_length
        # SVM
        svm_predict = svm_model.predict(mfcc_reshape)
        svm_predict_list.append( (piece_start, piece_end, svm_predict[0]) )
        # # Decision Tree
        # decision_tree_predict = decision_tree_model.predict(mfcc_reshape)
        # decision_tree_predict_list.append( (piece_start, piece_end, decision_tree_predict[0]) )
        # MLP
        MLP_predict = MLP_model.predict(mfcc_reshape)
        MLP_predict_list.append( (piece_start, piece_end, MLP_predict[0]) )

    base_name = test_file[:-4]
    # SVM
    _write_segments(output_dir + base_name + '_svm_no_vote.txt', svm_predict_list)
    # # Decision Tree
    # _write_segments(output_dir + base_name + '_decision_tree_no_vote.txt', decision_tree_predict_list)
    # MLP
    _write_segments(output_dir + base_name + '_MLP_no_vote.txt', MLP_predict_list)


test file: 最長的電影00.wav
test file: 蒲公英的約定00.wav
test file: 不用麻煩了.wav
test file: 再給我兩分鐘.wav
test file: 決定命運的硬幣.wav
test file: 校長都看我幾.wav
test file: 彩虹00.wav
test file: 青花瓷00.wav
test file: 牛仔很忙00.wav


In [7]:
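# # ===== prepare dataset and dataloader for ResNet50 =====
# # NOTE: disabled cell. It uses Dataset, DataLoader, random_split, read_image and
# # pd, none of which appear in the notebook's import cell; add those imports
# # before uncommenting.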
# class CustomImageDataset(Dataset):
#     def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
#         self.img_labels = pd.read_csv(annotations_file)
#         self.img_dir = img_dir
#         self.transform = transform
#         self.target_transform = target_transform

#     def __len__(self):
#         return len(self.img_labels)

#     def __getitem__(self, idx):
#         img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
#         image = read_image(img_path)[0:3, :, :]
#         label = self.img_labels.iloc[idx, 1]
#         if self.transform:
#             image = self.transform(image)
#         if self.target_transform:
#             label = self.target_transform(label)
#         return image, label
    
# dataset = CustomImageDataset(annotations_file='./data/total_annotation.csv', img_dir='./data/image/')
# # split to train and test
# print(len(dataset))
# train_size = int(0.8 * len(dataset))
# test_size = len(dataset) - train_size
# train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
# print('Num of train data:', len(train_dataset))
# print('Num of test data:', len(test_dataset))
# # dataloader
# train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

# # # ===== Display one image and label =====
# # train_features, train_labels = next(iter(train_dataloader))
# # img = train_features[0].permute(1, 2, 0)
# # label = train_labels[0]
# # plt.imshow(img)
# # plt.axis('off')
# # plt.show()
# # print(f"Label: {label}")
