# PIP INSTALLS

In [1]:
!pip install pandas tensorboardX sentencepiece soundfile librosa==0.9.1 g2p_en matplotlib

[0m

# IMPORTS

In [2]:
import io
import csv
import os
from pathlib import Path
from typing import Tuple, Union

from typing import Optional, List, Dict
import zipfile
import tempfile
from dataclasses import dataclass
from itertools import groupby

import torchaudio
from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio.datasets.utils import _extract_tar

import argparse
import logging
import shutil
from tempfile import NamedTemporaryFile
from collections import Counter, defaultdict

import pandas as pd
import torch
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import os

from fairseq.examples.speech_to_text.data_utils import save_df_to_tsv, create_zip, gen_config_yaml, gen_vocab, get_zip_manifest, load_tsv_to_dicts
from fairseq.data.audio.audio_utils import convert_waveform, TTSSpectrogram, TTSMelScale, parse_path, read_from_stored_zip, is_npy_data
from fairseq_cli import train
from fairseq import options
from fairseq.distributed import utils as distributed_utils
from fairseq.examples.speech_synthesis import generate_waveform

log = logging.getLogger(__name__)

SPLITS = ["train", "dev", "test"]

out_path = '/workspace/production/LJ/'
in_path = '/workspace/LJSpeech-1.1/wavs'

# DATASETS

In [3]:
class LJSPEECH(Dataset):
    """*LJSpeech-1.1* :cite:`ljspeech17` dataset.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        url (str, optional): The URL to download the dataset from.
            (default: ``"https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"``)
        folder_in_archive (str, optional):
            The top-level directory of the dataset. (default: ``"wavs"``)
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
    """

    def __init__(
        self,
    ) -> None:

        self._parse_filesystem()

    def _parse_filesystem(self) -> None:
        root = '/workspace/LJSpeech-1.1'

        self._path = root + '/wavs'
        self._metadata_path = root + "/metadata.csv"

        with open(self._metadata_path, "r", newline="") as metadata:
            flist = csv.reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE)
            self._flist = list(flist)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                Transcript
            str:
                Normalized Transcript
        """
        line = self._flist[n]
        fileid, transcript, normalized_transcript = line
        fileid_audio = self._path +'/'+(fileid + ".wav")

        # Load audio
        waveform, sample_rate = torchaudio.load(fileid_audio)

        return (
            waveform,
            sample_rate,
            transcript,
            normalized_transcript,
        )


    def __len__(self) -> int:
        return len(self._flist)

# Audio Manifest

In [4]:
def processAudioManifest():
    out_root = Path(out_path).absolute()
    out_root.mkdir(parents=True, exist_ok=True)

    # Generate TSV manifest
    print("Generating manifest...")
    # following FastSpeech's splits
    dataset = LJSPEECH()
    id_to_split = {}
    for x in dataset._flist:
        id_ = x[0]
        speaker = id_.split("-")[0]
        id_to_split[id_] = {
            "LJ001": "test", "LJ002": "test", "LJ003": "dev"
        }.get(speaker, "train")
    manifest_by_split = {split: defaultdict(list) for split in SPLITS}
    progress = tqdm(enumerate(dataset), total=len(dataset))
    for i, (waveform, _, utt, normalized_utt) in progress:
        sample_id = dataset._flist[i][0]
        split = id_to_split[sample_id]
        manifest_by_split[split]["id"].append(sample_id)
        audio_path = f"{dataset._path}/{sample_id}.wav"
        manifest_by_split[split]["audio"].append(audio_path)
        manifest_by_split[split]["n_frames"].append(len(waveform[0]))
        manifest_by_split[split]["tgt_text"].append(normalized_utt)
        manifest_by_split[split]["speaker"].append("ljspeech")
        manifest_by_split[split]["src_text"].append(utt)

    manifest_root = Path(out_path).absolute()
    manifest_root.mkdir(parents=True, exist_ok=True)
    for split in SPLITS:
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest_by_split[split]),
            manifest_root / f"{split}.audio.tsv"
        )

In [5]:
processAudioManifest()

Generating manifest...


100%|████████████████████████████████████████████████████████████████████████████| 13100/13100 [00:57<00:00, 228.39it/s]


# Feature Manifest Dependencies

In [6]:
def trim_or_pad_to_target_length(
        data_1d_or_2d: np.ndarray, target_length: int
) -> np.ndarray:
    assert len(data_1d_or_2d.shape) in {1, 2}
    delta = data_1d_or_2d.shape[0] - target_length
    if delta >= 0:  # trim if being longer
        data_1d_or_2d = data_1d_or_2d[: target_length]
    else:  # pad if being shorter
        if len(data_1d_or_2d.shape) == 1:
            data_1d_or_2d = np.concatenate(
                [data_1d_or_2d, np.zeros(-delta)], axis=0
            )
        else:
            data_1d_or_2d = np.concatenate(
                [data_1d_or_2d, np.zeros((-delta, data_1d_or_2d.shape[1]))],
                axis=0
            )
    return data_1d_or_2d


def extract_logmel_spectrogram(
        waveform: torch.Tensor, sample_rate: int,
        output_path: Optional[Path] = None, win_length: int = 1024,
        hop_length: int = 256, n_fft: int = 1024,
        win_fn: callable = torch.hann_window, n_mels: int = 80,
        f_min: float = 0., f_max: float = 8000, eps: float = 1e-5,
        overwrite: bool = False, target_length: Optional[int] = None
):
    if output_path is not None and output_path.is_file() and not overwrite:
        return

    spectrogram_transform = TTSSpectrogram(
        n_fft=n_fft, win_length=win_length, hop_length=hop_length,
        window_fn=win_fn
    )
    mel_scale_transform = TTSMelScale(
        n_mels=n_mels, sample_rate=sample_rate, f_min=f_min, f_max=f_max,
        n_stft=n_fft // 2 + 1
    )
    spectrogram = spectrogram_transform(waveform)
    mel_spec = mel_scale_transform(spectrogram)
    logmel_spec = torch.clamp(mel_spec, min=eps).log()
    assert len(logmel_spec.shape) == 3 and logmel_spec.shape[0] == 1
    logmel_spec = logmel_spec.squeeze().t()  # D x T -> T x D
    if target_length is not None:
        logmel_spec = trim_or_pad_to_target_length(logmel_spec, target_length)

    if output_path is not None:
        np.save(output_path.as_posix(), logmel_spec)
    else:
        return logmel_spec


def extract_pitch(
        waveform: torch.Tensor, sample_rate: int,
        output_path: Optional[Path] = None, hop_length: int = 256,
        log_scale: bool = True, phoneme_durations: Optional[List[int]] = None
):
    if output_path is not None and output_path.is_file():
        return

    try:
        import pyworld
    except ImportError:
        raise ImportError("Please install PyWORLD: pip install pyworld")

    _waveform = waveform.squeeze(0).double().numpy()
    pitch, t = pyworld.dio(
        _waveform, sample_rate, frame_period=hop_length / sample_rate * 1000
    )
    pitch = pyworld.stonemask(_waveform, pitch, t, sample_rate)

    if phoneme_durations is not None:
        pitch = trim_or_pad_to_target_length(pitch, sum(phoneme_durations))
        try:
            from scipy.interpolate import interp1d
        except ImportError:
            raise ImportError("Please install SciPy: pip install scipy")
        nonzero_ids = np.where(pitch != 0)[0]
        if len(nonzero_ids) == 0:
            print((f"{output_path} has all empty values in the pitch contour"))
            return
        elif len(nonzero_ids) == 1:
            print((f"{output_path} has only one non-zero values in the pitch contour"))
            return
        else:
            interp_fn = interp1d(
                nonzero_ids,
                pitch[nonzero_ids],
                fill_value=(pitch[nonzero_ids[0]], pitch[nonzero_ids[-1]]),
                bounds_error=False,
            )
            pitch = interp_fn(np.arange(0, len(pitch)))
        d_cumsum = np.cumsum(np.concatenate([np.array([0]), phoneme_durations]))
        pitch = np.array(
            [
                np.mean(pitch[d_cumsum[i-1]: d_cumsum[i]])
                for i in range(1, len(d_cumsum))
            ]
        )
        assert len(pitch) == len(phoneme_durations)

    if log_scale:
        pitch = np.log(pitch + 1)

    if output_path is not None:
        np.save(output_path.as_posix(), pitch)
    else:
        return pitch


def extract_energy(
        waveform: torch.Tensor, output_path: Optional[Path] = None,
        hop_length: int = 256, n_fft: int = 1024, log_scale: bool = True,
        phoneme_durations: Optional[List[int]] = None
):
    if output_path is not None and output_path.is_file():
        return

    assert len(waveform.shape) == 2 and waveform.shape[0] == 1
    waveform = waveform.view(1, 1, waveform.shape[1])
    waveform = F.pad(
        waveform.unsqueeze(1), [n_fft // 2, n_fft // 2, 0, 0],
        mode="reflect"
    )
    waveform = waveform.squeeze(1)

    fourier_basis = np.fft.fft(np.eye(n_fft))
    cutoff = int((n_fft / 2 + 1))
    fourier_basis = np.vstack(
        [np.real(fourier_basis[:cutoff, :]),
         np.imag(fourier_basis[:cutoff, :])]
    )

    forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
    forward_transform = F.conv1d(
        waveform, forward_basis, stride=hop_length, padding=0
    )

    real_part = forward_transform[:, :cutoff, :]
    imag_part = forward_transform[:, cutoff:, :]
    magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2)
    energy = torch.norm(magnitude, dim=1).squeeze(0).numpy()

    if phoneme_durations is not None:
        energy = trim_or_pad_to_target_length(energy, sum(phoneme_durations))
        d_cumsum = np.cumsum(np.concatenate([np.array([0]), phoneme_durations]))
        energy = np.array(
            [
                np.mean(energy[d_cumsum[i - 1]: d_cumsum[i]])
                for i in range(1, len(d_cumsum))
            ]
        )
        assert len(energy) == len(phoneme_durations)

    if log_scale:
        energy = np.log(energy + 1)

    if output_path is not None:
        np.save(output_path.as_posix(), energy)
    else:
        return energy


def get_global_cmvn(feature_root: Path, output_path: Optional[Path] = None):
    mean_x, mean_x2, n_frames = None, None, 0
    feature_paths = feature_root.glob("*.npy")
    for p in tqdm(feature_paths):
        with open(p, 'rb') as f:
            frames = np.load(f).squeeze()

        n_frames += frames.shape[0]

        cur_mean_x = frames.sum(axis=0)
        if mean_x is None:
            mean_x = cur_mean_x
        else:
            mean_x += cur_mean_x

        cur_mean_x2 = (frames ** 2).sum(axis=0)
        if mean_x2 is None:
            mean_x2 = cur_mean_x2
        else:
            mean_x2 += cur_mean_x2

    mean_x /= n_frames
    mean_x2 /= n_frames
    var_x = mean_x2 - mean_x ** 2
    std_x = np.sqrt(np.maximum(var_x, 1e-10))

    if output_path is not None:
        with open(output_path, 'wb') as f:
            np.savez(f, mean=mean_x, std=std_x)
    else:
        return {"mean": mean_x, "std": std_x}


def ipa_phonemize(text, lang="en-us", use_g2p=False):
    if use_g2p:
        assert lang == "en-us", "g2pE phonemizer only works for en-us"
        try:
            from g2p_en import G2p
            g2p = G2p()
            return " ".join("|" if p == " " else p for p in g2p(text))
        except ImportError:
            raise ImportError(
                "Please install phonemizer: pip install g2p_en"
            )
    else:
        try:
            from phonemizer import phonemize
            from phonemizer.separator import Separator
            return phonemize(
                text, backend='espeak', language=lang,
                separator=Separator(word="| ", phone=" ")
            )
        except ImportError:
            raise ImportError(
                "Please install phonemizer: pip install phonemizer"
            )


@dataclass
class ForceAlignmentInfo(object):
    tokens: List[str]
    frame_durations: List[int]
    start_sec: Optional[float]
    end_sec: Optional[float]


def get_mfa_alignment_by_sample_id(
        textgrid_zip_path: str, sample_id: str, sample_rate: int,
        hop_length: int, silence_phones: List[str] = ("sil", "sp", "spn")
) -> ForceAlignmentInfo:
    try:
        import tgt
    except ImportError:
        raise ImportError("Please install TextGridTools: pip install tgt")

    filename = f"{sample_id}.TextGrid"
    out_root = Path(tempfile.gettempdir())
    tgt_path = out_root / filename
    with zipfile.ZipFile(textgrid_zip_path) as f_zip:
        f_zip.extract(filename, path=out_root)
    textgrid = tgt.io.read_textgrid(tgt_path.as_posix())
    os.remove(tgt_path)

    phones, frame_durations = [], []
    start_sec, end_sec, end_idx = 0, 0, 0
    for t in textgrid.get_tier_by_name("phones")._objects:
        s, e, p = t.start_time, t.end_time, t.text
        # Trim leading silences
        if len(phones) == 0:
            if p in silence_phones:
                continue
            else:
                start_sec = s
        phones.append(p)
        if p not in silence_phones:
            end_sec = e
            end_idx = len(phones)
        r = sample_rate / hop_length
        frame_durations.append(int(np.round(e * r) - np.round(s * r)))
    # Trim tailing silences
    phones = phones[:end_idx]
    frame_durations = frame_durations[:end_idx]

    return ForceAlignmentInfo(
        tokens=phones, frame_durations=frame_durations, start_sec=start_sec,
        end_sec=end_sec
    )


def get_mfa_alignment(
        textgrid_zip_path: str, sample_ids: List[str], sample_rate: int,
        hop_length: int
) -> Dict[str, ForceAlignmentInfo]:
    return {
        i: get_mfa_alignment_by_sample_id(
            textgrid_zip_path, i, sample_rate, hop_length
        ) for i in tqdm(sample_ids)
    }


def get_unit_alignment(
        id_to_unit_tsv_path: str, sample_ids: List[str]
) -> Dict[str, ForceAlignmentInfo]:
    id_to_units = {
        e["id"]: e["units"] for e in load_tsv_to_dicts(id_to_unit_tsv_path)
    }
    id_to_units = {i: id_to_units[i].split() for i in sample_ids}
    id_to_units_collapsed = {
        i: [uu for uu, _ in groupby(u)] for i, u in id_to_units.items()
    }
    id_to_durations = {
        i: [len(list(g)) for _, g in groupby(u)] for i, u in id_to_units.items()
    }

    return {
        i: ForceAlignmentInfo(
            tokens=id_to_units_collapsed[i], frame_durations=id_to_durations[i],
            start_sec=None, end_sec=None
        )
        for i in sample_ids
    }


def get_feature_value_min_max(feature_paths: List[str]):
    v_min, v_max = 1e-8, -1e-8
    for p in tqdm(feature_paths):
        _path, slice_ptr = parse_path(p)
        assert len(slice_ptr) == 2
        byte_data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
        assert is_npy_data(byte_data)
        path_or_fp = io.BytesIO(byte_data)
        features = np.load(path_or_fp).squeeze()
        v_min = min(v_min, features.min().item())
        v_max = max(v_max, features.max().item())
    return v_min, v_max

# LOG-MEL SPECTROGRAM FEATURE MANIFEST

In [7]:
def processFeatureManifest(args):
    assert "train" in args.splits
    out_root = Path(args.output_root).absolute()
    out_root.mkdir(exist_ok=True)

    print("Fetching data...")
    audio_manifest_root = Path(args.audio_manifest_root).absolute()
    samples = []
    for s in args.splits:
        for e in load_tsv_to_dicts(audio_manifest_root / f"{s}.audio.tsv"):
            e["split"] = s
            samples.append(e)
    sample_ids = [s["id"] for s in samples]

    # Get alignment info
    id_to_alignment = None
    if args.textgrid_zip is not None:
        assert args.id_to_units_tsv is None
        id_to_alignment = get_mfa_alignment(
            args.textgrid_zip, sample_ids, args.sample_rate, args.hop_length
        )
    elif args.id_to_units_tsv is not None:
        # assume identical hop length on the unit sequence
        id_to_alignment = get_unit_alignment(args.id_to_units_tsv, sample_ids)

    # Extract features and pack features into ZIP
    feature_name = "logmelspec80"
    zip_path = out_root / f"{feature_name}.zip"
    pitch_zip_path = out_root / "pitch.zip"
    energy_zip_path = out_root / "energy.zip"
    gcmvn_npz_path = out_root / "gcmvn_stats.npz"
    if zip_path.exists() and gcmvn_npz_path.exists():
        print(f"{zip_path} and {gcmvn_npz_path} exist.")
    else:
        feature_root = out_root / feature_name
        feature_root.mkdir(exist_ok=True)
        pitch_root = out_root / "pitch"
        energy_root = out_root / "energy"
        if args.add_fastspeech_targets:
            pitch_root.mkdir(exist_ok=True)
            energy_root.mkdir(exist_ok=True)
        print("Extracting Mel spectrogram features...")
        for sample in tqdm(samples):
            waveform, sample_rate = torchaudio.load(sample["audio"])
            waveform, sample_rate = convert_waveform(
                waveform, sample_rate, normalize_volume=args.normalize_volume,
                to_sample_rate=args.sample_rate
            )
            sample_id = sample["id"]
            target_length = None
            if id_to_alignment is not None:
                a = id_to_alignment[sample_id]
                target_length = sum(a.frame_durations)
                if a.start_sec is not None and a.end_sec is not None:
                    start_frame = int(a.start_sec * sample_rate)
                    end_frame = int(a.end_sec * sample_rate)
                    waveform = waveform[:, start_frame: end_frame]
            extract_logmel_spectrogram(
                waveform, sample_rate, feature_root / f"{sample_id}.npy",
                win_length=args.win_length, hop_length=args.hop_length,
                n_fft=args.n_fft, n_mels=args.n_mels, f_min=args.f_min,
                f_max=args.f_max, target_length=target_length
            )
            if args.add_fastspeech_targets:
                assert id_to_alignment is not None
                extract_pitch(
                    waveform, sample_rate, pitch_root / f"{sample_id}.npy",
                    hop_length=args.hop_length, log_scale=True,
                    phoneme_durations=id_to_alignment[sample_id].frame_durations
                )
                extract_energy(
                    waveform, energy_root / f"{sample_id}.npy",
                    hop_length=args.hop_length, n_fft=args.n_fft,
                    log_scale=True,
                    phoneme_durations=id_to_alignment[sample_id].frame_durations
                )
        print("ZIPing features...")
        create_zip(feature_root, zip_path)
        get_global_cmvn(feature_root, gcmvn_npz_path)
        shutil.rmtree(feature_root)
        if args.add_fastspeech_targets:
            create_zip(pitch_root, pitch_zip_path)
            shutil.rmtree(pitch_root)
            create_zip(energy_root, energy_zip_path)
            shutil.rmtree(energy_root)

    print("Fetching ZIP manifest...")
    audio_paths, audio_lengths = get_zip_manifest(zip_path)
    pitch_paths, pitch_lengths, energy_paths, energy_lengths = [None] * 4
    if args.add_fastspeech_targets:
        pitch_paths, pitch_lengths = get_zip_manifest(pitch_zip_path)
        energy_paths, energy_lengths = get_zip_manifest(energy_zip_path)
    # Generate TSV manifest
    print("Generating manifest...")
    id_to_cer = None
    if args.cer_threshold is not None:
        assert Path(args.cer_tsv_path).is_file()
        id_to_cer = {
            x["id"]: x["uer"] for x in load_tsv_to_dicts(args.cer_tsv_path)
        }
    manifest_by_split = {split: defaultdict(list) for split in args.splits}
    for sample in tqdm(samples):
        sample_id, split = sample["id"], sample["split"]

        if args.snr_threshold is not None and "snr" in sample \
                and sample["snr"] < args.snr_threshold:
            continue
        if args.cer_threshold is not None \
                and id_to_cer[sample_id] > args.cer_threhold:
            continue

        normalized_utt = sample["tgt_text"]
        if id_to_alignment is not None:
            normalized_utt = " ".join(id_to_alignment[sample_id].tokens)
        elif args.ipa_vocab:
            normalized_utt = ipa_phonemize(
                normalized_utt, lang=args.lang, use_g2p=args.use_g2p
            )
        manifest_by_split[split]["id"].append(sample_id)
        manifest_by_split[split]["audio"].append(audio_paths[sample_id])
        manifest_by_split[split]["n_frames"].append(audio_lengths[sample_id])
        manifest_by_split[split]["tgt_text"].append(normalized_utt)
        manifest_by_split[split]["speaker"].append(sample["speaker"])
        manifest_by_split[split]["src_text"].append(sample["src_text"])
        if args.add_fastspeech_targets:
            assert id_to_alignment is not None
            duration = " ".join(
                str(d) for d in id_to_alignment[sample_id].frame_durations
            )
            manifest_by_split[split]["duration"].append(duration)
            manifest_by_split[split]["pitch"].append(pitch_paths[sample_id])
            manifest_by_split[split]["energy"].append(energy_paths[sample_id])
    for split in args.splits:
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest_by_split[split]),
            out_root / f"{split}.tsv"
        )
    # Generate vocab
    vocab_name, spm_filename = None, None
    if id_to_alignment is not None or args.ipa_vocab:
        vocab = Counter()
        for t in manifest_by_split["train"]["tgt_text"]:
            vocab.update(t.split(" "))
        vocab_name = "vocab.txt"
        with open(out_root / vocab_name, "w") as f:
            for s, c in vocab.most_common():
                f.write(f"{s} {c}\n")
    else:
        spm_filename_prefix = "spm_char"
        spm_filename = f"{spm_filename_prefix}.model"
        with NamedTemporaryFile(mode="w") as f:
            for t in manifest_by_split["train"]["tgt_text"]:
                f.write(t + "\n")
            f.flush()  # needed to ensure gen_vocab sees dumped text
            gen_vocab(Path(f.name), out_root / spm_filename_prefix, "char")
    # Generate speaker list
    speakers = sorted({sample["speaker"] for sample in samples})
    speakers_path = out_root / "speakers.txt"
    with open(speakers_path, "w") as f:
        for speaker in speakers:
            f.write(f"{speaker}\n")
    # Generate config YAML
    win_len_t = args.win_length / args.sample_rate
    hop_len_t = args.hop_length / args.sample_rate
    extra = {
        "sample_rate": args.sample_rate,
        "features": {
            "type": "spectrogram+melscale+log",
            "eps": 1e-5, "n_mels": args.n_mels, "n_fft": args.n_fft,
            "window_fn": "hann", "win_length": args.win_length,
            "hop_length": args.hop_length, "sample_rate": args.sample_rate,
            "win_len_t": win_len_t, "hop_len_t": hop_len_t,
            "f_min": args.f_min, "f_max": args.f_max,
            "n_stft": args.n_fft // 2 + 1
        }
    }
    if len(speakers) > 1:
        extra["speaker_set_filename"] = "speakers.txt"
    if args.add_fastspeech_targets:
        pitch_min, pitch_max = get_feature_value_min_max(
            [(out_root / n).as_posix() for n in pitch_paths.values()]
        )
        energy_min, energy_max = get_feature_value_min_max(
            [(out_root / n).as_posix() for n in energy_paths.values()]
        )
        extra["features"]["pitch_min"] = pitch_min
        extra["features"]["pitch_max"] = pitch_max
        extra["features"]["energy_min"] = energy_min
        extra["features"]["energy_max"] = energy_max
    gen_config_yaml(
        out_root, spm_filename=spm_filename, vocab_name=vocab_name,
        audio_root=out_root.as_posix(), input_channels=None,
        input_feat_per_channel=None, specaugment_policy=None,
        cmvn_type="global", gcmvn_path=gcmvn_npz_path, extra=extra
    )

In [8]:
parser = argparse.ArgumentParser()
parser.add_argument("--audio-manifest-root", "-m", required=True, type=str)
parser.add_argument("--output-root", "-o", required=True, type=str)
parser.add_argument("--splits", "-s", type=str, nargs="+",
                    default=["train", "dev", "test"])
parser.add_argument("--ipa-vocab", action="store_true")
parser.add_argument("--use-g2p", action="store_true")
parser.add_argument("--lang", type=str, default="en-us")
parser.add_argument("--win-length", type=int, default=1024)
parser.add_argument("--hop-length", type=int, default=256)
parser.add_argument("--n-fft", type=int, default=1024)
parser.add_argument("--n-mels", type=int, default=80)
parser.add_argument("--f-min", type=int, default=20)
parser.add_argument("--f-max", type=int, default=8000)
parser.add_argument("--sample-rate", type=int, default=22050)
parser.add_argument("--normalize-volume", "-n", action="store_true")
parser.add_argument("--textgrid-zip", type=str, default=None)
parser.add_argument("--id-to-units-tsv", type=str, default=None)
parser.add_argument("--add-fastspeech-targets", action="store_true")
parser.add_argument("--snr-threshold", type=float, default=None)
parser.add_argument("--cer-threshold", type=float, default=None)
parser.add_argument("--cer-tsv-path", type=str, default="")
args = parser.parse_args(['--audio-manifest-root', out_path, '--output-root', out_path, '--ipa-vocab', '--use-g2p']) #, '--add-fastspeech-targets'])

print(args)

Namespace(audio_manifest_root='/workspace/process/LJ/', output_root='/workspace/process/LJ/', splits=['train', 'dev', 'test'], ipa_vocab=True, use_g2p=True, lang='en-us', win_length=1024, hop_length=256, n_fft=1024, n_mels=80, f_min=20, f_max=8000, sample_rate=22050, normalize_volume=False, textgrid_zip=None, id_to_units_tsv=None, add_fastspeech_targets=False, snr_threshold=None, cer_threshold=None, cer_tsv_path='')


In [12]:
processFeatureManifest(args)

Fetching data...
/workspace/process/LJ/logmelspec80.zip and /workspace/process/LJ/gcmvn_stats.npz exist.
Fetching ZIP manifest...


100%|████████████████████████████████████████████████████████████████████████████| 13100/13100 [00:23<00:00, 552.03it/s]


Generating manifest...


100%|███████████████████████████████████████████████████████████████████████████| 13100/13100 [2:36:59<00:00,  1.39it/s]


In [2]:
torch.cuda.is_available()

True

In [3]:
torch.cuda.get_device_name(0)

'NVIDIA TITAN RTX'

# TRAIN

In [6]:
trainParser = options.get_training_parser()
args = options.parse_args_and_arch(trainParser, [out_path, '--save-dir', out_path, 
                          '--config-yaml', 'config.yaml', '--train-subset', 'train', '--valid-subset', 'dev', 
                          '--num-workers', '3', '--max-tokens', '30000', '--max-update', '200000', 
                          '--task', 'text_to_speech', '--criterion', 'tacotron2', '--arch', 'tts_transformer',
                          '--clip-norm', '5.0', '--n-frames-per-step', '4', '--bce-pos-weight', '5.0',
                          '--dropout', '0.1', '--attention-dropout', '0.1', '--activation-dropout', '0.1',
                          '--encoder-normalize-before', '--decoder-normalize-before', 
                          '--optimizer', 'adam', '--lr', '2e-3', '--lr-scheduler', 'inverse_sqrt', '--warmup-updates', '4000',
                          '--seed', '1', '--update-freq', '5', '--eval-inference', '--best-checkpoint-metric', 'mcd_loss'])

print(args)

Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='tacotron2', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='text_to_speech', num_workers=3, skip_invalid_size_inputs_valid_test=False, max_tokens=30000, batch_si

In [7]:
cfg = train.convert_namespace_to_omegaconf(args)

if cfg.common.use_plasma_view:
    server = PlasmaStore(path=cfg.common.plasma_path)
    logger.info(
        f"Started plasma server pid {server.server.pid} {cfg.common.plasma_path}"
    )

if args.profile:
    with torch.cuda.profiler.profile():
        with torch.autograd.profiler.emit_nvtx():
            distributed_utils.call_main(cfg, main)
else:
    distributed_utils.call_main(cfg, train.main)

2023-07-24 18:21:44 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': N

KeyboardInterrupt: 

# Inference

In [22]:
waveformParser = generate_waveform.make_parser()
args = options.parse_args_and_arch(waveformParser, [out_path, '--config-yaml', 'config.yaml', '--gen-subset', 'test', '--task', 'text_to_speech', '--path', out_path+'checkpoint_avg_last_5.pt', '--max-tokens', '50000', '--spec-bwd-max-iter', '32', '--dump-waveforms', '--results-path', out_path+'results', '--dump-target', '--vocoder', 'hifigan'])

print(args)

Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_entropy', tokenizer=None, bpe=None, optimizer=None, lr_scheduler='fixed', scoring='bleu', task='text_to_speech', num_workers=1, skip_invalid_size_inputs_valid_test=False, max_tokens=50000, batch_size=No

In [23]:
generate_waveform.main(args)

2023-07-24 21:36:33 | INFO | fairseq.examples.speech_synthesis.generate_waveform | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_entropy', tokenizer=None, bpe=None, optimizer=None, lr_scheduler='fixed', scoring='bleu', task='text_to_speech', num_wo

AssertionError: 

NameError: name 'vocoder_cfg' is not defined