In [None]:
from file_encoder import FileEncoder
from data_manager import DataManager
from audio_utils import SAMPLE_RATE, N_MELS
import numpy as np
import sys
import tensorflow as tf
from tensorflow.keras import layers, models, Model
from tensorflow.keras.optimizers import Adam


# Paths handed to FileEncoder (going by the names: the source singing-voice
# database and the directory the encoder writes to).
TARGET_DIR = "../thirdparty/「波音リツ」歌声データベースVer2/DATABASE"
OUTPUT_DIR = "../master/ust/json"

# When True the database is re-encoded and saved through the DataManager before
# loading. When False the cell goes straight to manager.load()
# (presumably relying on an earlier save — TODO confirm).
need_encode = True

manager = DataManager()

if need_encode:
    encoder = FileEncoder(TARGET_DIR, OUTPUT_DIR)
    names, lyric_indexs, durations, notenums, y = encoder.encode()
    manager.save(lyric_indexs, durations, notenums, names, y)

lyric_indexs, duration_indexs, notenum_indexs, names, y = manager.load()

# Validate immediately after loading. np.max() on an empty array raises an
# opaque ValueError, so the check has to run before any statistic is computed,
# and it names the offending arrays to make the failure actionable.
empty_arrays = [
    label
    for label, values in (
        ("y", y),
        ("lyric_indexs", lyric_indexs),
        ("duration_indexs", duration_indexs),
        ("notenum_indexs", notenum_indexs),
    )
    if len(values) == 0
]
assert not empty_arrays, f"no data loaded for: {', '.join(empty_arrays)}"

(
    (train_lyric, train_duration, train_notenum),
    (test_lyric, test_duration, test_notenum),
    y_train,
    y_test,
) = manager.get_train_and_test_data()

# Largest lyric index in the whole data set; used to size the embedding table.
max_lyric_index = np.max(lyric_indexs)

print("building model...")

print("train_lyric.shape", train_lyric.shape)
print("train_duration.shape", train_duration.shape)
print("train_notenum.shape", train_notenum.shape)

# Layer sizes, named once so the architecture can be tuned in one place.
lyric_embedding_dim = 128
lstm_units = 256
n_mels = 128  # number of mel-spectrogram frequency bins emitted per time step

# Input tensors: the per-sample shape of each one is taken from the matching
# training array (everything after the batch axis).
lyric_input = tf.keras.Input(shape=train_lyric.shape[1:], name="lyric_input", dtype="int32")
duration_input = tf.keras.Input(shape=train_duration.shape[1:], name="duration_input", dtype="float32")
notenum_input = tf.keras.Input(shape=train_notenum.shape[1:], name="notenum_input", dtype="float32")

# Embedding layer: indices run from 0 to max_lyric_index inclusive, hence the +1.
lyric_embedded = layers.Embedding(
    input_dim=max_lyric_index + 1,
    output_dim=lyric_embedding_dim,
)(lyric_input)

# Append a trailing feature axis of size 1 so the scalar-per-step inputs can be
# concatenated with the embedding.
# NOTE(review): assumes each input is (batch, steps) — confirm against the
# shapes printed above.
duration_reshaped = layers.Reshape((-1, 1))(duration_input)
notenum_reshaped = layers.Reshape((-1, 1))(notenum_input)

# Merge the three inputs along the last (feature) axis.
merged = layers.Concatenate()([lyric_embedded, duration_reshaped, notenum_reshaped])

# Two stacked LSTM layers, both returning the full sequence.
lstm_out = layers.LSTM(lstm_units, return_sequences=True)(merged)
lstm_out = layers.LSTM(lstm_units, return_sequences=True)(lstm_out)

# Output head: project every time step to n_mels values, then reshape to the
# per-sample shape of the training targets.
# NOTE(review): the Reshape only succeeds when steps * n_mels equals the number
# of elements in y_train.shape[1:] — confirm the target frame count matches.
output = layers.Dense(n_mels)(lstm_out)
output = layers.Reshape(y_train.shape[1:])(output)

# Assemble the model.
model = tf.keras.Model(
    inputs=[lyric_input, duration_input, notenum_input],
    outputs=output,
    name="vocaloid_model"
)

# Training hyper-parameters.
BATCH_SIZE = 64
EPOCHS = 200

optimizer = Adam()

model.compile(
    optimizer=optimizer,
    loss='mse',
    metrics=['mae']
)

model.summary()

# Report loss/MAE on the held-out split after every epoch so over-fitting is
# visible. Validation does not influence the weight updates. The guard skips it
# when the test split is empty.
fit_kwargs = {}
if len(y_test) > 0:
    fit_kwargs["validation_data"] = (
        [test_lyric, test_duration, test_notenum],
        y_test,
    )

model.fit(
    [train_lyric, train_duration, train_notenum],  # inputs, in the order the model declares them
    y_train,  # training targets
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    **fit_kwargs,
)

manager.save_model(model)

print("Done!")

Data length: 12
note length example: 460


In [2]:
# Scratch arithmetic. NOTE(review): what 2048, 128 and 1280 stand for is not stated anywhere in this notebook — document or delete.
2048 * 128 / 1280


204.8

In [6]:
# The divisor was missing, which made this cell a SyntaxError; 1280 reproduces
# the output recorded for this cell (204.8).
2048 * 128 / 1280

204.8

In [7]:
# Scratch arithmetic. NOTE(review): the meaning of 2048 and 256 is not stated anywhere in this notebook — document or delete.
2048 / 256

8.0