In [7]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from safetensors.torch import load
import random
from IPython.display import display, Audio, Markdown
from espnet2.fileio.sound_scp import SoundScpReader
from tqdm import tqdm

model_path = "./PromptEncoder/output/model_2023-12-07_143028/model.safetensors"
with open(model_path, "rb") as f:
    data = f.read()
loaded = load(data)

dump_path = '/mnt/data/users/snegishi/M2/Satoru-Negishi/espnet/egs2/jvs_ms_prompttts++/tts1/dump/22k/raw/tr_no_dev'

vector_size = 256


# プロンプトエンコーダ

In [8]:
from transformers.modeling_outputs import ModelOutput


# BERTのトークナイザー読み込み
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

#モデルの定義-------------------------------------------------------------------------------------------
class BERTClass(torch.nn.Module):
    def __init__(self, pretrained_model, num_labels ,loss_function=None,):
        super(BERTClass, self).__init__()
        self.l1 = pretrained_model
        self.l2 = torch.nn.Linear(768, 512)
        self.relu1 = torch.nn.ReLU()
        self.l3 = torch.nn.Linear(512, 256) #<<<
        self.relu2 = torch.nn.ReLU()
        self.l4 = torch.nn.Linear(256,256)
        self.loss_function = loss_function
        self.config = pretrained_model.config
        
        self.config.num_labels = num_labels

    def forward(self, input_ids, attention_mask=None, position_ids=None, token_type_ids=None, output_attentions=False, output_hidden_states=False, list=None):
        output_1= self.l1(input_ids, attention_mask = attention_mask, position_ids = position_ids, token_type_ids = token_type_ids, output_attentions = output_attentions, output_hidden_states = output_hidden_states)
        output_2 = self.l2(output_1.last_hidden_state[:,0,:])
        output_2r = self.relu1(output_2)
        output_3 = self.l3(output_2r)
        output_3r = self.relu2(output_3)
        output = self.l4(output_3r)
        
        y = torch.autograd.Variable(torch.Tensor(output.size(0)).cuda().fill_(1.0))
        loss = None

        if list is not None and self.loss_function is not None:
            loss = self.loss_function(output, list,y)

        attentions=None
        if output_attentions:
            attentions=output_1.attentions

        hidden_states=None
        if output_hidden_states:
            hidden_states=output_1.hidden_states

        return ModelOutput(
            logits = output,
            loss = loss,
            last_hidden_state=output_1.last_hidden_state,
            attentions=attentions,
            hidden_states=hidden_states
        )

# loss_fct = CosineSimilarity(reduction='mean')
loss_fct = torch.nn.CosineEmbeddingLoss()
pretrained_model = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
model = BERTClass(pretrained_model, num_labels=vector_size ,loss_function=loss_fct)
# model.load_state_dict(torch.load(model_bin))
model.load_state_dict(loaded)


<All keys matched successfully>

# 複数話者音声合成器

In [9]:
from espnet2.bin.tts_inference import Text2Speech
from espnet2.train.dataset import ESPnetDataset
import os

# 実験ディレクトリのパス
exp_dir = "./exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_prosody"

# 学習設定ファイルのパス
train_config_path = os.path.join(exp_dir, "config.yaml")

# モデルファイルのパス
model_path = os.path.join(exp_dir, "train.loss.ave_5best.pth")
# model_path = os.path.join(exp_dir, "train.loss.best.pth")

# ボコーダモデルの名前
vocoder_tag = "parallel_wavegan/jsut_parallel_wavegan.v1"

# text2speechインスタンスの作成
# text2speech = Text2Speech.from_pretrained(
#     train_config=train_config_path,
#     model_file=model_path,
#     vocoder_tag=vocoder_tag,
#     use_style_embedding=True
# )
text2speech = Text2Speech.from_pretrained(
    train_config=train_config_path,
    model_file=model_path,
    vocoder_tag=vocoder_tag,
    )



# 音声合成関数

In [10]:
def TTS(spembs,text="おはようございます。よろしくお願いします。",comp = False):
    """
    spembsと読み上げテキストで音声を合成し表示する
    """
    # result = text2speech(text, spembs=spembs)
    result = text2speech(text=text)
    display(Markdown("- 合成"))
    display(Audio(result["wav"].cpu(), rate=text2speech.fs))
    
    # if comp:
    #     comp3(xvector,text)

In [11]:
def inference(prompt,tts=False,comp=False):
    """
    話者プロンプトからspembsを推定する
    """
    encoded_sequences = tokenizer(prompt, padding=True, truncation=True, return_tensors="pt")
    outputs = model(**encoded_sequences)
    print(outputs)
    spkembs = outputs.logits
    
    # print(spkembs)
    if tts:
        TTS(spembs=spkembs,comp=comp)
    # else:
    #     return xvector[0]

# コーパス話者データの読み込み

In [12]:
df = pd.read_csv('./PromptEncoder/data/id_and_NL_emb.csv',header=0)

In [13]:
df

Unnamed: 0,id,NL,emb
0,10,おとなしく幼さを感じる女子中学生,"[0.103718491, -0.00934963916, -0.106388815, -0..."
1,11,中性的な声の青年,"[0.10368413, -0.00943414, -0.10669071, -0.0295..."
2,14,少年ぽさを感じる女の子,"[0.103654917, -0.00974015798, -0.105651211, -0..."
3,15,20歳程度のお姉さん,"[0.10427354, -0.00957899239, -0.105432502, -0...."
4,18,ゆっくり間を開けて話す女性,"[0.10311142, -0.00847343, -0.10722332, -0.0286..."
...,...,...,...
74,92,早口で話すかすれた声のおばさん,"[0.103464354, -0.00918067495, -0.10675806, -0...."
75,95,20～30歳ぐらいの女性,"[0.103017924, -0.00849153176, -0.107310012, -0..."
76,96,高く細い声の女の人,"[0.103096794, -0.00915894711, -0.107083128, -0..."
77,97,ふわふわした高い声の男性,"[0.103617905, -0.00870063514, -0.106498085, -0..."


In [14]:
embs = df['emb'].values.tolist()
for emb,i in zip(embs, range(len(embs))):
    emb = emb.replace('\n','').replace('[','').replace(']','').replace(',','').split()
    emb = [float(x) for x in emb]
    embs[i] = emb
df  = pd.DataFrame({'id':df['id'],'NL':df['NL'],'emb':embs})

# speech抽出

In [15]:
def sound_reader(rate, audio):
    speech = np.expand_dims(audio, 0)
    speech_lengths = np.array([speech.shape[0]])
    return speech, speech_lengths

In [16]:
spk2utt = dict()
with open(os.path.join(dump_path, "spk2utt"), "r") as reader:
    for line in reader:
        details = line.split()
        spk2utt[details[0]] = details[1:]
wav_scp = SoundScpReader(os.path.join(dump_path, "wav.scp"), np.float32)
for speaker in tqdm(spk2utt):
    style_embeds = []
    for utt in spk2utt[speaker]:
        rate, audio = wav_scp[utt]
        speech, speech_lengths = sound_reader(rate, audio)
        break

100%|██████████| 98/98 [00:00<00:00, 4449.71it/s]


In [17]:
speech

array([[-0.00231934, -0.00259399, -0.00241089, ...,  0.00082397,
         0.00073242,  0.00091553]], dtype=float32)

# 音声合成

In [18]:
inference(df["NL"][3],tts=True)

ModelOutput([('logits', tensor([[ 1.8493e-01, -1.7390e-02, -1.8958e-01, -5.0626e-02,  1.4217e-01,
          3.6218e-01,  2.3346e-02,  5.9731e-02, -3.1348e-01, -6.3672e-02,
         -2.6061e-01, -2.7105e-05, -1.6257e-01,  1.4026e-01, -3.8290e-02,
          1.6834e-01, -1.3030e-01, -2.1093e-01,  8.1493e-02, -4.2336e-02,
         -7.1276e-03, -4.2020e-03,  1.3281e-01,  1.7482e-01,  1.8692e-01,
         -3.9292e-02,  6.5265e-02, -1.7059e-01, -1.2706e-01,  8.0389e-03,
          3.0436e-02, -1.7557e-01, -4.3374e-03,  9.3686e-02, -3.0473e-01,
         -3.9417e-02,  5.0352e-02,  2.7266e-02,  5.7573e-02, -2.2064e-01,
          4.5618e-03, -1.2887e-01,  1.8201e-01,  5.1200e-04,  1.0299e-02,
         -2.5425e-01, -1.6939e-01,  1.8325e-02, -1.1725e-01,  8.4785e-02,
          2.0731e-01, -7.6176e-02,  1.2444e-01,  1.4873e-01, -1.3769e-01,
         -1.1162e-01,  8.1439e-02,  4.8526e-02, -6.5803e-02, -1.0755e-01,
         -3.6803e-02, -5.2666e-02,  2.9110e-01, -9.8572e-02, -2.8575e-03,
          1.07

- 合成