In [58]:
%matplotlib inline

import tensorflow as tf
from keras import models
from keras import layers
import numpy as np


from score import Audio


## Preparing data

In [62]:
# Load one track and bring its MFCC statistics to a fixed number of segments.
# NOTE(review): absolute, machine-specific path — this cell only runs where
# this exact file exists; consider a configurable data directory.
music_path = r"D:\CODE\Project\Music_score\src\data\music\download\4n9Ie3ac6BM.mp3"
audio = Audio(music_path, duration=60)
_, _, mfcc_stats = audio.get_mfcc(n_mfcc=20)

# Fixed segment count per sample; must stay equal to SEGMENTS, which is used
# when the array is reshaped for the model.
TARGET_SEGMENTS = 259

# Convert once instead of re-wrapping the same object several times.
# Assumes a 3-D result whose last axis is the segment axis — TODO confirm
# against Audio.get_mfcc.
stats = np.asarray(mfcc_stats)

# A clip with more than TARGET_SEGMENTS segments would yield a negative pad
# width, which np.pad rejects; truncate first so the pad width is always >= 0.
stats = stats[:, :, :TARGET_SEGMENTS]
padding_size = TARGET_SEGMENTS - stats.shape[2]

# Zero-pad (np.pad's default constant mode) only at the end of the last axis.
padded_stats = np.pad(stats, ((0, 0), (0, 0), (0, padding_size)))

# Reorder axes (0, 1, 2) -> (1, 2, 0): the padded segment axis becomes axis 1.
data = padded_stats.transpose(1, 2, 0)

data.shape

(7, 20)

In [63]:
TRACKS, SEGMENTS, FEATURES = 20, 259, 7

# The data-preparation cell leaves the padded segment axis at position 1 of
# `data` (its transpose(1, 2, 0) moves the padded axis there). Fail loudly if
# that layout is not what we received.
if data.ndim != 3 or data.shape[1] != SEGMENTS:
    raise ValueError(
        f"expected a 3-D array with {SEGMENTS} segments on axis 1, got shape {data.shape}"
    )

# Move the segment axis to the front BEFORE flattening. A bare reshape to
# (SEGMENTS, TRACKS * FEATURES) would only reinterpret the buffer in row-major
# order, so each output row would mix values from different segments.
data = data.transpose(1, 0, 2).reshape(SEGMENTS, TRACKS * FEATURES)

# Add a leading batch axis of size 1 so the array can be fed to predict().
data = np.expand_dims(data, axis=0)
data.shape

(1, 1, 140)

## Create a model

In [56]:

def create_model():
    """Build, compile and summarise the Sequential Conv1D + LSTM model.

    The input shape is read from the notebook-level constants
    ``(SEGMENTS, TRACKS * FEATURES)``.

    Returns:
        The compiled Sequential model (Adam optimizer, MSE loss). The model
        summary is printed as a side effect.
    """
    model = models.Sequential()
    # Declare the input with an explicit Input layer rather than passing
    # `input_shape=` to the first layer, which Keras 3 deprecates for
    # Sequential models.
    model.add(layers.Input(shape=(SEGMENTS, TRACKS * FEATURES)))
    # NOTE(review): Conv1D does not appear to propagate Keras masks, so this
    # mask likely never reaches the LSTM — confirm whether Masking is useful here.
    model.add(layers.Masking(mask_value=0))
    model.add(layers.Conv1D(filters=64, kernel_size=3, activation='relu'))
    model.add(layers.Conv1D(filters=32, kernel_size=3, activation='relu'))
    model.add(layers.LSTM(64))
    # NOTE(review): a softmax output trained with MSE is an unusual pairing;
    # kept as-is because the intended target is not visible here — confirm.
    model.add(layers.Dense(16, activation='softmax'))
    model.compile(optimizer='adam', loss='mse')
    model.summary()

    return model
    
def create_feature_extraction_model():
    """Build, compile and summarise the functional encoder model.

    The input has shape ``(SEGMENTS, TRACKS * FEATURES)`` and the output is
    the 16-unit ReLU dense layer at the end of the encoder stack.

    Returns:
        The compiled functional model (Adam optimizer, MSE loss). The model
        summary is printed as a side effect.
    """
    # Encoder part: the layers are listed in the order they are applied.
    inputs = layers.Input(shape=(SEGMENTS, TRACKS * FEATURES))
    encoder_stack = [
        layers.Masking(mask_value=0),
        layers.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
        layers.Conv1D(filters=32, kernel_size=3, activation='relu', padding='same'),
        layers.LSTM(64),
        layers.Dense(16, activation='relu'),
    ]

    hidden = inputs
    for layer in encoder_stack:
        hidden = layer(hidden)

    # Feature-extraction model: maps the input tensor to the encoded vector.
    feature_extractor = models.Model(inputs=inputs, outputs=hidden)
    feature_extractor.compile(optimizer='adam', loss='mse')

    # Show the model summary.
    feature_extractor.summary()

    return feature_extractor
    
# Instantiate both networks. Each builder also prints its own summary
# (both call .summary() before returning).
model = create_model()
fe_model = create_feature_extraction_model()





In [57]:
# Run the feature extractor on the prepared batch `data`.
# NOTE(review): no training step is visible before this call, so the weights
# are presumably still at their initial values. The recorded output is all
# zeros, which is worth investigating (e.g. unscaled inputs or the final ReLU
# clamping every unit) — TODO confirm.
fe_model.predict(data)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step


array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
      dtype=float32)