In [None]:
# Root directory of the HiFiTTS dataset; the manifest files are listed from here.
hifitts_path = './hi_fi_tts_v0'

# Формирование единого датафрейма по всему датасету HiFiTTS

In [None]:
import json
import os
import sys
from io import BytesIO

import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import torch
from scipy.io.wavfile import write

# The tacotron2 checkout has to be on sys.path before its modules are imported.
sys.path.append('tacotron2/')
from tacotron2.hparams import create_hparams
from tacotron2.layers import TacotronSTFT


# Hyper-parameters as returned by tacotron2's create_hparams().
hp = create_hparams()

# STFT module configured from those hyper-parameters; flac_to_mel uses it
# to compute the mel spectrograms that are saved to disk.
stft = TacotronSTFT(
    hp.filter_length, hp.hop_length, hp.win_length,
    hp.n_mel_channels, hp.sampling_rate,
    hp.mel_fmin, hp.mel_fmax,
)

def read_json(json_path):
    """Read a manifest made of concatenated JSON objects and tag each record.

    The file is expected to hold one JSON object after another (typically one
    per line). Records are decoded one at a time, so blank lines and trailing
    whitespace between them are tolerated.

    Args:
        json_path: Path to the manifest file. The part of the file name after
            the last underscore, with ``.json`` removed, is used as the
            dataset type (``..._dev.json`` -> ``dev``).

    Returns:
        A list of dicts, one per record, each with an added ``dataset_type``
        key. An empty file yields an empty list.

    Raises:
        json.JSONDecodeError: If the file contains malformed JSON.
    """
    # Only the file name carries the split name; underscores in directory
    # names must not leak into it.
    file_name = os.path.basename(json_path)
    dataset_type = file_name.split('_')[-1].replace('.json', '')

    with open(json_path, encoding='utf-8') as f:
        raw_text = f.read()

    decoder = json.JSONDecoder()
    json_data = []
    position, end = 0, len(raw_text)
    while position < end:
        # raw_decode rejects leading whitespace, so skip separators by hand.
        # Commas are skipped too, so comma-separated records keep working.
        if raw_text[position] in ' \t\r\n,':
            position += 1
            continue
        item, position = decoder.raw_decode(raw_text, position)
        item['dataset_type'] = dataset_type
        json_data.append(item)
    return json_data

def flac_to_mel(load_flac_path, save_mel_path, dataset_type, txt_line, max_frames=1000):
    """Convert one recording into a mel spectrogram saved with ``np.save``.

    Steps, in order:
      1. Load the audio with librosa and compute a librosa mel spectrogram.
      2. Skip the recording if that spectrogram has ``max_frames`` frames or more.
      3. Invert the spectrogram back to a waveform with ``mel_to_audio``.
      4. Round-trip the waveform through an in-memory WAV file.
      5. Compute the final mel spectrogram with the module-level ``stft`` and
         save it to ``save_mel_path``.
      6. Append ``txt_line`` to ``./hifitts/<dataset_type>.txt``.

    The text line is written last, so a failure in any earlier step cannot
    leave an entry that points at a spectrogram which was never saved.

    Args:
        load_flac_path: Path of the audio file to load.
        save_mel_path: Destination handed to ``np.save``.
        dataset_type: Split name; selects which text file is appended to.
        txt_line: Line appended to that text file.
        max_frames: Frame-count threshold at or above which the recording is
            skipped.

    Returns:
        ``False`` if the recording was skipped for being too long, ``True``
        if the spectrogram was saved and the text line appended.
    """
    flac_data, sample_rate = librosa.load(load_flac_path)
    melspec_1 = librosa.feature.melspectrogram(y=flac_data, sr=sample_rate)
    if melspec_1.shape[1] >= max_frames:
        return False

    audio = librosa.feature.inverse.mel_to_audio(melspec_1, sr=sample_rate)

    with BytesIO() as buf:
        write(buf, sample_rate, audio)
        # soundfile interprets a bytes argument as a file name, so it must be
        # given the file-like buffer itself, rewound to the start.
        buf.seek(0)
        buf_data, _ = sf.read(buf)

    # NOTE(review): the waveform above is floating point, yet it is still
    # divided by hp.max_wav_value here; this looks like double normalisation.
    # Confirm the intended scale before relying on the saved spectrograms.
    norm_data = torch.from_numpy(buf_data.astype(np.float32)) / hp.max_wav_value
    norm_data = norm_data.unsqueeze(0)  # add the leading dimension passed to stft

    melspec_2 = stft.mel_spectrogram(norm_data)
    melspec_2 = torch.squeeze(melspec_2, 0)
    np.save(save_mel_path, melspec_2.detach().cpu().numpy())

    # Use the same encoding as the manifests, not the platform default.
    with open('./hifitts/' + dataset_type + '.txt', 'a', encoding='utf-8') as f:
        f.write(txt_line)
    return True

In [None]:
# os.listdir returns entries in arbitrary order. Sorting keeps the row indices
# (used in mel_path) and the speaker numbering reproducible across runs.
manifests = sorted(manifest for manifest in os.listdir(hifitts_path) if 'manifest' in manifest)
manifest_paths = [f'{hifitts_path}/{manifest}' for manifest in manifests]
manifest_jsons = [read_json(manifest_path) for manifest_path in manifest_paths]
manifest_dfs = [pd.DataFrame(manifest_json) for manifest_json in manifest_jsons]
manifests_df = pd.concat(manifest_dfs, axis=0)

df = manifests_df.reset_index(drop=True).copy()

# The reader id is the prefix of the second path component of audio_filepath.
df['reader_id'] = df['audio_filepath'].apply(lambda x: x.split('/')[1].split('_')[0])

# Map each reader id to a consecutive number (stored as a string) in order of first appearance.
readers_list = list(df['reader_id'].unique())
readers_dict = {reader_id: str(position) for position, reader_id in enumerate(readers_list)}
df['reader_id_norm'] = df['reader_id'].map(readers_dict)

# Per-row output path and the "<mel_path>|<text>|<speaker>" line for the text files.
df['mel_path'] = 'mels/' + df.index.astype('string') + '_' + df['dataset_type'] + '_' + df['reader_id']
df['txt_line'] = df['mel_path'] + '|' + df['text'] + '|' + df['reader_id_norm'] + '\n'

df = df[['dataset_type', 'reader_id', 'reader_id_norm', 'text', 'audio_filepath', 'mel_path', 'txt_line']]
# Rows from the 'dev' manifests are excluded.
df = df[df['dataset_type'] != 'dev']
df.head()

In [None]:
# Show the number of rows and columns in df.
df.shape

# Создание мел-спектрограмм и текстовых файлов

In [None]:
# Output layout: ./hifitts/<split>.txt text files and ./hifitts/mels/ for the
# spectrograms. exist_ok lets the cell be re-run without failing here.
os.makedirs('./hifitts/mels', exist_ok=True)

# flac_to_mel appends to the per-split text files, so empty them first;
# otherwise every re-run of this cell would duplicate their lines.
for split_name in df['dataset_type'].unique():
    open('./hifitts/' + split_name + '.txt', 'w', encoding='utf-8').close()

# Iterate over the columns directly. Joining the fields into one string and
# splitting it again would truncate any txt_line whose text contains the
# separator character.
for audio_filepath, mel_path, dataset_type, txt_line in zip(
    df['audio_filepath'], df['mel_path'], df['dataset_type'], df['txt_line']
):
    flac_to_mel(
        # Resolve the audio path against the directory the manifests were read from.
        os.path.join(hifitts_path, audio_filepath),
        # Save inside ./hifitts so the file lands in the mels directory created above.
        os.path.join('./hifitts', mel_path),
        dataset_type,
        txt_line,
    )