This repository has been archived by the owner on Jan 18, 2024. It is now read-only.

Commit fdb43fb
Minor bugfix.
Tomiinek committed Apr 7, 2020
1 parent 9a43f79 commit fdb43fb
Showing 7 changed files with 104,550 additions and 104,470 deletions.
114,326 changes: 57,163 additions & 57,163 deletions data/css10/train.txt

Large diffs are not rendered by default.

1,280 changes: 640 additions & 640 deletions data/css10/val.txt

Large diffs are not rendered by default.

92,014 changes: 46,007 additions & 46,007 deletions data/css_comvoi/train.txt

Large diffs are not rendered by default.

1,312 changes: 656 additions & 656 deletions data/css_comvoi/val.txt

Large diffs are not rendered by default.

77 changes: 77 additions & 0 deletions data/prepare_css_spectrograms.py
@@ -0,0 +1,77 @@
import sys
import os
import numpy as np

sys.path.insert(0, "../")

from utils import audio
from params.params import Params as hp


if __name__ == '__main__':
    import argparse
    import re

    parser = argparse.ArgumentParser()
    parser.add_argument("--css10_directory", type=str, default="css10", help="Base directory of CSS10.")
    parser.add_argument("--css_comvoi_directory", type=str, default="css_comvoi", help="Base directory of CSS10 with Common Voice.")
    parser.add_argument("--comvoi_directory", type=str, default="comvoi_clean", help="Base directory of Common Voice.")
    parser.add_argument("--sample_rate", type=int, default=22050, help="Sample rate.")
    parser.add_argument("--num_fft", type=int, default=1102, help="Number of FFT frequencies.")
    parser.add_argument("--num_mels", type=int, default=80, help="Number of mel bins.")
    parser.add_argument("--stft_window_ms", type=float, default=50, help="STFT window size.")
    parser.add_argument("--stft_shift_ms", type=float, default=12.5, help="STFT window shift.")
    parser.add_argument("--no_preemphasis", action='store_false', help="Do not use preemphasis.")
    parser.add_argument("--preemphasis", type=float, default=0.97, help="Strength of preemphasis.")

    args = parser.parse_args()

    hp.sample_rate = args.sample_rate
    hp.num_fft = args.num_fft

    # metafiles whose audio entries get spectrograms computed and paths rewritten
    files_to_solve = [
        (args.css10_directory, "train.txt"),
        (args.css10_directory, "val.txt"),
        (args.css_comvoi_directory, "train.txt"),
        (args.css_comvoi_directory, "val.txt"),
    ]

    spectrogram_dirs = [os.path.join(args.comvoi_directory, 'spectrograms'),
                        os.path.join(args.comvoi_directory, 'linear_spectrograms'),
                        os.path.join(args.css10_directory, 'spectrograms'),
                        os.path.join(args.css10_directory, 'linear_spectrograms')]
    for x in spectrogram_dirs:
        if not os.path.exists(x): os.makedirs(x)

    # load metafiles: each line is a pipe-separated record
    metadata = []
    for d, fs in files_to_solve:
        with open(os.path.join(d, fs), 'r', encoding='utf-8') as f:
            metadata.append((d, fs, [line.rstrip().split('|') for line in f]))

    print(f'Please wait, this may take a very long time.')
    for d, fs, m in metadata:
        print(f'Creating spectrograms for: {fs}')

        with open(os.path.join(d, fs), 'w', encoding='utf-8') as f:
            for i in m:
                idx, s, l, a, _, _, raw_text, ph = i
                spec_name = idx + '.npy'
                audio_path = os.path.join(d, a)
                audio_data = audio.load(audio_path)

                # audio referenced outside the dataset directory (paths starting
                # with "..") keeps its spectrograms next to the audio itself
                splitted_a = a.split("/")
                if splitted_a[0] == "..":
                    mel_path_partial = os.path.join(splitted_a[0], splitted_a[1], "spectrograms", spec_name)
                    lin_path_partial = os.path.join(splitted_a[0], splitted_a[1], "linear_spectrograms", spec_name)
                else:
                    mel_path_partial = os.path.join("spectrograms", spec_name)
                    lin_path_partial = os.path.join("linear_spectrograms", spec_name)

                # compute and cache mel and linear spectrograms unless already present
                mel_path = os.path.join(d, mel_path_partial)
                if not os.path.exists(mel_path):
                    np.save(mel_path, audio.spectrogram(audio_data, True))
                lin_path = os.path.join(d, lin_path_partial)
                if not os.path.exists(lin_path):
                    np.save(lin_path, audio.spectrogram(audio_data, False))

                # rewrite the metafile line with the spectrogram paths filled in
                print(f'{idx}|{s}|{l}|{a}|{mel_path_partial}|{lin_path_partial}|{raw_text}|{ph}', file=f)
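For reference, the new script might be invoked from the data directory roughly like this; the flags and defaults come from the argparse definition above, and the directory names are illustrative:

python prepare_css_spectrograms.py --css10_directory css10 --css_comvoi_directory css_comvoi --comvoi_directory comvoi_clean --sample_rate 22050 --num_fft 1102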
6 changes: 3 additions & 3 deletions modules/tacotron2.py
@@ -368,15 +368,15 @@ def forward(self, text, text_length, target, target_length, speakers, languages,
         speaker_prediction = self._reversal_classifier(encoded) if hp.reversal_classifier else None

         # decode
-        if languages is not None:
+        if languages is not None and languages.dim() == 3:
             languages = torch.argmax(languages, dim=2) # convert one-hot into indices
         decoded = self._decoder(encoded, text_length, target, teacher_forcing_ratio, speakers, languages)
         prediction, stop_token, alignment = decoded
         pre_prediction = prediction.transpose(1,2)
         post_prediction = self._postnet(pre_prediction, target_length)

         # mask output paddings
-        target_mask = lengths_to_mask(target_length, target.size(2))
+        target_mask = utils.lengths_to_mask(target_length, target.size(2))
         stop_token.masked_fill_(~target_mask, 1000)
         target_mask = target_mask.unsqueeze(1).float()
         pre_prediction = pre_prediction * target_mask
@@ -398,7 +398,7 @@ def inference(self, text, speaker=None, language=None):
         encoded = self._encoder(embedded, torch.LongTensor([text.size(1)]), language)

         # decode with respect to speaker and language embeddings
-        if language is not None:
+        if language is not None and language.dim() == 3:
             language = torch.argmax(language, dim=2) # convert one-hot into indices
         prediction = self._decoder.inference(encoded, speaker, language)
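The new dim() == 3 guard distinguishes one-hot language conditioning of shape (batch, time, num_languages), where torch.argmax(..., dim=2) recovers language indices, from tensors that already hold indices and must not be reduced. A minimal sketch of the two cases; the shapes and values here are illustrative, not taken from the repository:

import torch

num_languages = 3

# one-hot conditioning: (batch, time, num_languages) -> guard passes, argmax applies
one_hot = torch.nn.functional.one_hot(torch.tensor([[0, 2, 1]]), num_classes=num_languages).float()
assert one_hot.dim() == 3
indices = torch.argmax(one_hot, dim=2)  # tensor([[0, 2, 1]])

# index conditioning: (batch, time) -> guard fails, argmax is correctly skipped
already_indices = torch.tensor([[0, 2, 1]])
assert already_indices.dim() == 2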
5 changes: 4 additions & 1 deletion train.py
@@ -211,6 +211,9 @@ def __getattr__(self, name):
     # find out number of unique speakers and languages
     hp.speaker_number = 0 if not hp.multi_speaker else dataset.train.get_num_speakers()
     hp.language_number = 0 if not hp.multi_language else len(hp.languages)
+    # save all found speakers to hyper parameters
+    if hp.multi_speaker and not args.checkpoint:
+        hp.unique_speakers = dataset.train.unique_speakers

     # acquire dataset-dependent constants, these should probably be the same while going from checkpoint
     if not args.checkpoint:
@@ -239,7 +242,7 @@ def __getattr__(self, name):
         {'params': encoder_params, 'lr': hp.learning_rate_encoder}
     ], lr=hp.learning_rate, weight_decay=hp.weight_decay)
     scheduler = torch.optim.lr_scheduler.StepLR(optimizer, hp.learning_rate_decay_each // len(train_data), gamma=hp.learning_rate_decay)
-    criterion = TacotronLoss(hp.guided_attention_steps, hp.guided_attention_toleration, hp.guided_attention_gain, hp.language_number)
+    criterion = TacotronLoss(hp.guided_attention_steps, hp.guided_attention_toleration, hp.guided_attention_gain)

     # load model weights and optimizer, scheduler states from checkpoint state dictionary
     initial_epoch = 0
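Persisting hp.unique_speakers means a restored checkpoint can map speaker names back to the embedding indices they had during training. A hypothetical sketch of that lookup; the list contents and the helper function are invented for illustration:

# hp.unique_speakers as saved at training time (hypothetical values)
unique_speakers = ['speaker_a', 'speaker_b', 'speaker_c']

def speaker_index(name, unique_speakers):
    # map a speaker name to the embedding index used during training
    return unique_speakers.index(name)

print(speaker_index('speaker_b', unique_speakers))  # -> 1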
