# 這篇程式碼是用來記錄一位講者的聲音向量
藉由輸入特定講者的聲音資料集，利用已訓練的模型預測該講者的聲音向量。<br>
把所有預測向量平均後儲存到後台，達到增加可辨識講者的效果。

In [None]:
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
!pip install tqdm

In [None]:
import os, librosa
import numpy as np
import tensorflow as tf
from tqdm import tqdm

def load_audio(path, sr=22050, top_db=20):
    """Load an audio file and strip its silent segments.

    Args:
        path: Path of the audio file to load.
        sr: Sampling rate handed to ``librosa.load``. Defaults to 22050.
        top_db: Silence threshold handed to ``librosa.effects.split``.
            Defaults to 20.

    Returns:
        A ``(wav_output, sr)`` tuple: the non-silent intervals of the
        waveform joined back to back, and the sampling rate reported by
        ``librosa.load``.
    """
    # Load the input audio file.
    wav, sr = librosa.load(path, sr=sr)
    intervals = librosa.effects.split(wav, top_db=top_db)
    # Nothing was detected as non-silent: return an empty waveform that keeps
    # the original dtype (np.concatenate would raise on an empty list).
    if len(intervals) == 0:
        return wav[:0], sr
    # Drop the silent parts by joining only the non-silent slices. A single
    # vectorized concatenate avoids copying every sample through a Python list.
    wav_output = np.concatenate([wav[start:end] for start, end in intervals])
    return wav_output, sr

def load_model(model_path, layer_name='global_max_pooling'):
    """Load a trained Keras model and expose an inner layer as its output.

    The returned model shares the loaded model's input but outputs the
    activations of ``layer_name`` instead of the final layer, so predictions
    are feature vectors rather than the outputs of the dense head.

    Args:
        model_path: Path of the saved model passed to
            ``tf.keras.models.load_model``.
        layer_name: Name of the layer whose output becomes the feature
            vector. Models saved before 2/23 did not use a fixed layer name;
            pass the actual name for those instead of editing this function.

    Returns:
        A ``tf.keras.Model`` mapping the original input to the output of the
        selected layer.
    """
    model = tf.keras.models.load_model(model_path)
    return tf.keras.Model(inputs=model.input, outputs=model.get_layer(layer_name).output)
    
def expect_prediction(data_path, model):
    """Compute the average predicted feature vector over a folder of audio.

    Every regular file directly inside ``data_path`` is loaded with
    ``load_audio``, converted to MFCC features and fed to ``model.predict``;
    the per-file predictions are then averaged.

    Args:
        data_path: Directory holding one speaker's audio files.
        model: Object exposing ``predict``, e.g. the model returned by
            ``load_model``.

    Returns:
        The mean of all per-file predictions, taken across files (axis 0), so
        the result has the same shape as a single ``model.predict`` output.

    Raises:
        ValueError: If ``data_path`` contains no regular files.
    """
    # Only regular files are processed: sub-directories (for example Jupyter's
    # ``.ipynb_checkpoints``) cannot be loaded as audio and would abort the run.
    candidates = (os.path.join(data_path, name) for name in os.listdir(data_path))
    audio_paths = [p for p in candidates if os.path.isfile(p)]
    if not audio_paths:
        # Fail early; averaging an empty list would only produce NaN.
        raise ValueError('no audio files found in {!r}'.format(data_path))

    feature = []
    for audio_path in tqdm(audio_paths):
        # Load the audio and convert it to MFCC features (128 coefficients).
        wav_output, sr = load_audio(audio_path)
        data = librosa.feature.mfcc(y=wav_output, sr=sr, n_mfcc=128, hop_length=256).astype(np.float32)
        # Add a leading batch axis and a trailing channel axis for the model.
        data = data[np.newaxis, ..., np.newaxis]
        # Keep this file's predicted vector.
        feature.append(model.predict(data))
    # Return the average of the predicted vectors.
    return np.mean(np.array(feature), axis=0)

In [None]:
# Build the feature extractor from the saved model file (resnet.h5).
model = load_model('./models/train_mfcc1/resnet.h5')
# Averaged feature vector computed from the Speaker0 folder; the comparison
# cell below reads it as `feature2`.
feature2 = expect_prediction('./source/test/Speaker0', model)

In [None]:
import math

# Base name of the log file for this run (written to ./logs/<NAME>.log).
# NOTE(review): 'mcff' looks like a typo of 'mfcc'; left unchanged because it
# determines the output file name.
NAME = 'cmp_mcff1'
# Compare the "<folder>_1.wav" sample of every entry under ./source/test/
# against Speaker0's averaged vector (`feature2`).
# The `with` block closes the log even if an iteration raises, and the explicit
# UTF-8 encoding keeps the Chinese log lines writable on any platform default.
with open('./logs/{}.log'.format(NAME), 'w', encoding='utf-8') as log:
    for path in os.listdir('./source/test/'):
        wav_input, sr = load_audio('./source/test/{}/{}_1.wav'.format(path, path))
        data = librosa.feature.mfcc(y=wav_input, sr=sr, n_mfcc=128, hop_length=256).astype(np.float32)
        # Add batch and channel axes before predicting.
        data = data[np.newaxis, ..., np.newaxis]
        feature1 = model.predict(data)
        # Cosine similarity between the two vectors.
        dist = np.dot(feature1[0], feature2[0]) / (np.linalg.norm(feature1[0]) * np.linalg.norm(feature2[0]))
        # Floating-point rounding can push the cosine slightly outside [-1, 1],
        # which would make math.acos raise "math domain error"; clamp first.
        dist = float(np.clip(dist, -1.0, 1.0))
        # Map the angle between the vectors to a score: 1 for identical
        # direction, 0 for orthogonal, -1 for opposite.
        dist = 1 - math.acos(dist) * 2 / math.pi
        message = 'Speaker0 與 {} 相似度為: {}'.format(path, dist)
        print(message, file=log)
        print(message)