This repository has been archived by the owner on Jan 18, 2024. It is now read-only.

Commit fdb43fb
Minor bugfix.
Tomiinek committed Apr 7, 2020
1 parent 9a43f79 commit fdb43fb
Showing 7 changed files with 104,550 additions and 104,470 deletions.
114,326 changes: 57,163 additions & 57,163 deletions data/css10/train.txt

Large diffs are not rendered by default.

1,280 changes: 640 additions & 640 deletions data/css10/val.txt

Large diffs are not rendered by default.

92,014 changes: 46,007 additions & 46,007 deletions data/css_comvoi/train.txt

Large diffs are not rendered by default.

1,312 changes: 656 additions & 656 deletions data/css_comvoi/val.txt

Large diffs are not rendered by default.

77 changes: 77 additions & 0 deletions data/prepare_css_spectrograms.py
@@ -0,0 +1,77 @@
import sys
import os
import numpy as np

sys.path.insert(0, "../")

from utils import audio
from params.params import Params as hp


if __name__ == '__main__':
    import argparse
    import re

    parser = argparse.ArgumentParser()
    parser.add_argument("--css10_directory", type=str, default="css10", help="Base directory of CSS10.")
    parser.add_argument("--css_comvoi_directory", type=str, default="css_comvoi", help="Base directory of CSS10 with Common Voice.")
    parser.add_argument("--comvoi_directory", type=str, default="comvoi_clean", help="Base directory of Common Voice.")
    parser.add_argument("--sample_rate", type=int, default=22050, help="Sample rate.")
    parser.add_argument("--num_fft", type=int, default=1102, help="Number of FFT frequencies.")
    parser.add_argument("--num_mels", type=int, default=80, help="Number of mel bins.")
    parser.add_argument("--stft_window_ms", type=float, default=50, help="STFT window size.")
    parser.add_argument("--stft_shift_ms", type=float, default=12.5, help="STFT window shift.")
    parser.add_argument("--no_preemphasis", action='store_false', help="Do not use preemphasis.")
    parser.add_argument("--preemphasis", type=float, default=0.97, help="Strength of preemphasis.")

    args = parser.parse_args()

    hp.sample_rate = args.sample_rate
    hp.num_fft = args.num_fft

    # metafiles whose audio entries get spectrograms computed and paths rewritten
    files_to_solve = [
        (args.css10_directory, "train.txt"),
        (args.css10_directory, "val.txt"),
        (args.css_comvoi_directory, "train.txt"),
        (args.css_comvoi_directory, "val.txt"),
    ]

    spectrogram_dirs = [os.path.join(args.comvoi_directory, 'spectrograms'),
                        os.path.join(args.comvoi_directory, 'linear_spectrograms'),
                        os.path.join(args.css10_directory, 'spectrograms'),
                        os.path.join(args.css10_directory, 'linear_spectrograms')]
    for x in spectrogram_dirs:
        if not os.path.exists(x): os.makedirs(x)

    # load metafiles: each line is a pipe-separated record
    metadata = []
    for d, fs in files_to_solve:
        with open(os.path.join(d, fs), 'r', encoding='utf-8') as f:
            metadata.append((d, fs, [line.rstrip().split('|') for line in f]))

    print(f'Please wait, this may take a very long time.')
    for d, fs, m in metadata:
        print(f'Creating spectrograms for: {fs}')

        with open(os.path.join(d, fs), 'w', encoding='utf-8') as f:
            for i in m:
                idx, s, l, a, _, _, raw_text, ph = i
                spec_name = idx + '.npy'
                audio_path = os.path.join(d, a)
                audio_data = audio.load(audio_path)

                # audio referenced outside the dataset directory (paths starting
                # with "..") keeps its spectrograms next to the audio itself
                splitted_a = a.split("/")
                if splitted_a[0] == "..":
                    mel_path_partial = os.path.join(splitted_a[0], splitted_a[1], "spectrograms", spec_name)
                    lin_path_partial = os.path.join(splitted_a[0], splitted_a[1], "linear_spectrograms", spec_name)
                else:
                    mel_path_partial = os.path.join("spectrograms", spec_name)
                    lin_path_partial = os.path.join("linear_spectrograms", spec_name)

                # compute and cache mel and linear spectrograms unless already present
                mel_path = os.path.join(d, mel_path_partial)
                if not os.path.exists(mel_path):
                    np.save(mel_path, audio.spectrogram(audio_data, True))
                lin_path = os.path.join(d, lin_path_partial)
                if not os.path.exists(lin_path):
                    np.save(lin_path, audio.spectrogram(audio_data, False))

                # rewrite the metafile line with the spectrogram paths filled in
                print(f'{idx}|{s}|{l}|{a}|{mel_path_partial}|{lin_path_partial}|{raw_text}|{ph}', file=f)
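For reference, the new script might be invoked from the data directory roughly like this; the flags and defaults come from the argparse definition above, and the directory names are illustrative:

python prepare_css_spectrograms.py --css10_directory css10 --css_comvoi_directory css_comvoi --comvoi_directory comvoi_clean --sample_rate 22050 --num_fft 1102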
6 changes: 3 additions & 3 deletions modules/tacotron2.py
@@ -368,15 +368,15 @@ def forward(self, text, text_length, target, target_length, speakers, languages,
         speaker_prediction = self._reversal_classifier(encoded) if hp.reversal_classifier else None

         # decode
-        if languages is not None:
+        if languages is not None and languages.dim() == 3:
             languages = torch.argmax(languages, dim=2) # convert one-hot into indices
         decoded = self._decoder(encoded, text_length, target, teacher_forcing_ratio, speakers, languages)
         prediction, stop_token, alignment = decoded
         pre_prediction = prediction.transpose(1,2)
         post_prediction = self._postnet(pre_prediction, target_length)

         # mask output paddings
-        target_mask = lengths_to_mask(target_length, target.size(2))
+        target_mask = utils.lengths_to_mask(target_length, target.size(2))
         stop_token.masked_fill_(~target_mask, 1000)
         target_mask = target_mask.unsqueeze(1).float()
         pre_prediction = pre_prediction * target_mask
@@ -398,7 +398,7 @@ def inference(self, text, speaker=None, language=None):
         encoded = self._encoder(embedded, torch.LongTensor([text.size(1)]), language)

         # decode with respect to speaker and language embeddings
-        if language is not None:
+        if language is not None and language.dim() == 3:
             language = torch.argmax(language, dim=2) # convert one-hot into indices
         prediction = self._decoder.inference(encoded, speaker, language)
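The new dim() == 3 guard distinguishes one-hot language conditioning of shape (batch, time, num_languages), where torch.argmax(..., dim=2) recovers language indices, from tensors that already hold indices and must not be reduced. A minimal sketch of the two cases; the shapes and values here are illustrative, not taken from the repository:

import torch

num_languages = 3

# one-hot conditioning: (batch, time, num_languages) -> guard passes, argmax applies
one_hot = torch.nn.functional.one_hot(torch.tensor([[0, 2, 1]]), num_classes=num_languages).float()
assert one_hot.dim() == 3
indices = torch.argmax(one_hot, dim=2)  # tensor([[0, 2, 1]])

# index conditioning: (batch, time) -> guard fails, argmax is correctly skipped
already_indices = torch.tensor([[0, 2, 1]])
assert already_indices.dim() == 2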
5 changes: 4 additions & 1 deletion train.py
@@ -211,6 +211,9 @@ def __getattr__(self, name):
     # find out number of unique speakers and languages
     hp.speaker_number = 0 if not hp.multi_speaker else dataset.train.get_num_speakers()
     hp.language_number = 0 if not hp.multi_language else len(hp.languages)
+    # save all found speakers to hyper parameters
+    if hp.multi_speaker and not args.checkpoint:
+        hp.unique_speakers = dataset.train.unique_speakers

     # acquire dataset-dependent constants, these should probably be the same while going from checkpoint
     if not args.checkpoint:
@@ -239,7 +242,7 @@ def __getattr__(self, name):
         {'params': encoder_params, 'lr': hp.learning_rate_encoder}
     ], lr=hp.learning_rate, weight_decay=hp.weight_decay)
     scheduler = torch.optim.lr_scheduler.StepLR(optimizer, hp.learning_rate_decay_each // len(train_data), gamma=hp.learning_rate_decay)
-    criterion = TacotronLoss(hp.guided_attention_steps, hp.guided_attention_toleration, hp.guided_attention_gain, hp.language_number)
+    criterion = TacotronLoss(hp.guided_attention_steps, hp.guided_attention_toleration, hp.guided_attention_gain)

     # load model weights and optimizer, scheduler states from checkpoint state dictionary
     initial_epoch = 0
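Persisting hp.unique_speakers means a restored checkpoint can map speaker names back to the embedding indices they had during training. A hypothetical sketch of that lookup; the list contents and the helper function are invented for illustration:

# hp.unique_speakers as saved at training time (hypothetical values)
unique_speakers = ['speaker_a', 'speaker_b', 'speaker_c']

def speaker_index(name, unique_speakers):
    # map a speaker name to the embedding index used during training
    return unique_speakers.index(name)

print(speaker_index('speaker_b', unique_speakers))  # -> 1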
