# Prepare filelists for ISSAI TTS speakers 


In [None]:
import json 


def write_records_to_json(records, filename):
    """Serialize ``records`` as JSON into ``filename``, overwriting the file.

    Args:
        records: JSON-serializable mapping; in this notebook the keys are
            speaker IDs and the values are file paths.
        filename: Path of the JSON file to create or overwrite.
    """
    with open(filename, 'w') as handle:
        json.dump(obj=records, fp=handle)

def read_records_from_json(filename):
    """Load the records stored in a JSON file.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    locale encoding, so non-ASCII content (for example Cyrillic names or
    paths) loads identically on every machine.

    Args:
        filename: Path of the JSON file to read from.

    Returns:
        The decoded JSON document (a dictionary of records for files
        produced by ``write_records_to_json``).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load the speaker metadata. The loading loops further down treat it as a
# mapping from speaker ID to a manifest path that is appended to `dir_data`.
speaker_metadata = read_records_from_json("../../../metadata/speaker_metadata.json")
speaker_metadata  # bare expression: shown as the notebook cell output

In [None]:
# See: https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md
# (presumably the reference for the espeak language code "kk" used for phonemization below)
dir_data = "/home/temduck/vits2_unofficial/"  # prefix concatenated directly (no separator added) with each manifest path
config = "../config.yaml"  # hyperparameter file passed to get_hparams_from_file
symlink = "F1"  # NOTE(review): not referenced anywhere else in the visible notebook
n_val = 100  # number of shuffled rows written to the validation filelist
n_test = 500  # number of shuffled rows written to the test filelist

## Get hyperparameters from config file


In [None]:
import pandas as pd
from utils.hparams import get_hparams_from_file

# Load the hyperparameters from the file named by `config`; later cells read
# `hps.data.text_cleaners` and `hps.data.language` from the result.
hps = get_hparams_from_file(config)

## Read dataset

Load the ISSAI dataset manifest of every speaker.

In [None]:
# Read each speaker's manifest: the path comes from `speaker_metadata` and is
# prefixed with `dir_data`.
speakers_datasets = {}
for speaker_id, manifest_path in speaker_metadata.items():
    speakers_datasets[speaker_id] = pd.read_csv(
        dir_data + manifest_path,
        header=0,  # the file's first line is consumed as a header and replaced by `names`
        names=["file", "text"],
    )

# EDA of the speakers' text

In [None]:
from tqdm import tqdm
from collections import defaultdict

def get_charset(manifest_data, text_key='cleaned_text'):
    """Count how often each lower-cased character occurs in a manifest.

    Args:
        manifest_data: Iterable of rows, each indexable by ``text_key``.
            NOTE(review): iterating a pandas DataFrame directly yields column
            names, not rows -- presumably callers pass something like
            ``df.to_dict("records")``; confirm at the call site.
        text_key: Key of the text field to scan. Defaults to
            ``'cleaned_text'``, the field that used to be hard-coded here.

    Returns:
        A ``defaultdict(int)`` mapping each lower-cased character to the
        number of times it occurs.
    """
    charset = defaultdict(int)
    for row in tqdm(manifest_data, desc="Computing character set"):
        for character in row[text_key]:
            charset[character.lower()] += 1
    return charset


In [None]:
import re
import pandas as pd 
from phonemizer.backend import EspeakBackend
import text_normalizer as nums_normalizer
from symbols import cyrillic_mapping
from tqdm import tqdm 
tqdm.pandas()  # registers `progress_apply` on pandas objects (used by the loops below)


# Repeat the loading steps of the earlier cells: read the speaker metadata,
# then read every speaker's manifest into a DataFrame.
speaker_metadata = read_records_from_json("../../../metadata/speaker_metadata.json")
speakers_datasets = {}
for speaker_id, manifest_path in speaker_metadata.items():
    speakers_datasets[speaker_id] = pd.read_csv(
        dir_data + manifest_path,
        header=0,  # the file's first line is consumed as a header and replaced by `names`
        names=["file", "text"],
    )


# Ordered (compiled pattern, handler name) pairs consumed by `expand_numbers`.
# Each handler name is looked up on `nums_normalizer` and used as the `re.sub`
# replacement callback; the patterns are applied in list order.
# NOTE(review): `\b\d{1,3}\b` runs before the suffix patterns at the end of the
# list, so a short number such as the "5" in "5-ші" is matched by it first --
# confirm that this ordering is intended.
_numerals = [
    (re.compile(pattern), handler_name)
    for pattern, handler_name in [
        (r'\b(3[01]|[12][0-9]|[1-9])\s(қаңтар|ақпан|наурыз|сәуір|мамыр|маусым|шілде|тамыз|қыркүйек|қазан|қараша|желтоқсан)', '_replace_nums_pair_word'), # nums pair kazakh month
        (r'\b\d{4}\s(жыл)', '_replace_nums_pair_word'), # nums pair kazakh year
        (r'\b\d{1,3}\b', '_replace_nums'), # hundreds
        (r'[а-яА-ЯӘәҒғҚқҢңӨөҰұҮүҺһІі]+\d+', '_remove_nums'), # kazakh word with digit
        (r'\d+[а-яА-ЯӘәҒғҚқҢңӨөҰұҮүҺһІі]+', '_remove_nums'), # digit with kazakh word
        # Grouped alternation: a `[...]` character class here would match only
        # ONE character (including a literal "|") instead of a whole suffix.
        # Non-capturing, so the handler still sees a pattern without groups.
        # NOTE(review): the handler now receives the full suffix in group(0);
        # verify that `_replace_ordinal_nums` does not append it again.
        (r'\d+-(?:інші|ыншы|сыншы|ші|шы)', '_replace_ordinal_nums'),  # ordinal numerals with suffix
        (r'\d+-(ден|тан|тен)', '_replace_group_nums') # group numerals with suffix
    ]
]

# Ordered (compiled case-insensitive pattern, replacement) pairs applied by
# `remove_trash`: the first rule turns dash variants into "-", the second
# deletes the listed symbols, line breaks and the word "noise".
_issaitts_trash = [
    (re.compile(pattern, flags=re.IGNORECASE), substitute)
    for pattern, substitute in (
        ('–|—|−|－', '-'),
        ("\n|noise|ʨ|ɕ|»|–|«|—|̆|“|”|…|−|－|●", ''),
    )
]


def expand_numbers(text):
    """Run every `_numerals` rule over ``text`` and return the rewritten string.

    The rules are applied one after another in list order. For each rule the
    handler is fetched by name from `nums_normalizer` and handed to the
    compiled pattern's ``sub`` as the replacement callback.
    """
    result = text
    for pattern, handler_name in _numerals:
        result = pattern.sub(getattr(nums_normalizer, handler_name), result)
    return result

def remove_trash(text):
    """Apply each `_issaitts_trash` substitution to ``text`` in order and return the result."""
    cleaned = text
    for pattern, substitute in _issaitts_trash:
        cleaned = pattern.sub(substitute, cleaned)
    return cleaned
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered


# Translation table that deletes the listed punctuation/symbol characters.
# Built once here instead of on every call, because the cleaner runs per row.
_ISSAITTS_DELETE_TABLE = str.maketrans(dict.fromkeys('#$%&\'()*+/:;<=>@[\\]^_`{|}~—…"«»“”'))


def kazakh_cleaners_issaitts(text):
    """Normalize one raw transcript of the Kazakh TTS speaker datasets.

    Steps, in order: lower-case the text, expand numbers (`expand_numbers`),
    delete the characters of `_ISSAITTS_DELETE_TABLE`, apply the
    `remove_trash` substitutions, then map every character through
    `cyrillic_mapping` (characters without an entry are kept unchanged).
    No phonemization (g2p) happens in this function.

    Args:
        text: Raw transcript string.

    Returns:
        The normalized transcript string.
    """
    text = lowercase(text)
    text = expand_numbers(text)
    text = text.translate(_ISSAITTS_DELETE_TABLE)
    text = remove_trash(text)
    return ''.join(cyrillic_mapping.get(char, char) for char in text)


# Add a "normalized_text" column to every speaker's DataFrame by running the
# cleaner over the "text" column (with a progress bar).
for speaker_id in speaker_metadata:
    speaker_frame = speakers_datasets[speaker_id]
    speaker_frame["normalized_text"] = speaker_frame["text"].progress_apply(kazakh_cleaners_issaitts)


## Text cleaners

This may take a while, so it is better to preprocess the text once and save the result to a file in advance.

**Note:** `phonemize_text` takes the longest time.


In [None]:
# Get index of tokenize_text
# Everything from "tokenize_text" onwards is set aside as `token_cleaners`.
# `.index` raises ValueError (for a list) when "tokenize_text" is not configured.
text_cleaners = hps.data.text_cleaners

token_idx = text_cleaners.index("tokenize_text")
token_cleaners = text_cleaners[token_idx:]
print(token_cleaners)


# Extract phonemize_text
def separate_text_cleaners(text_cleaners):
    """Group cleaner names into stages, isolating every "phonemize_text".

    Consecutive names other than "phonemize_text" are collected into one
    list; each "phonemize_text" becomes a single-item list of its own.
    The original order is preserved.

    Args:
        text_cleaners: Iterable of cleaner names.

    Returns:
        A list of lists of cleaner names (empty for empty input).
    """
    stages = []
    pending = []

    for name in text_cleaners:
        if name != "phonemize_text":
            pending.append(name)
            continue
        # Flush whatever was collected before the phonemizer, then emit the
        # phonemizer as its own stage.
        if pending:
            stages.append(pending)
            pending = []
        stages.append([name])

    if pending:
        stages.append(pending)

    return stages


# Keep only the cleaners that come before "tokenize_text", then group them so
# that every "phonemize_text" entry forms its own single-item stage.
text_cleaners = text_cleaners[:token_idx]
text_cleaners = separate_text_cleaners(text_cleaners)
print(text_cleaners)

In [None]:
from text import tokenizer
from torchtext.vocab import Vocab
import torchtext

# Run the pre-tokenization cleaner stages over the raw "text" column of one speaker.
speaker_id = "M2"
torchtext.disable_torchtext_deprecation_warning()
data = speakers_datasets[speaker_id]
text_norm = data["text"].tolist()
# NOTE(review): the `Vocab` class itself is passed here, whereas the later
# tokenization cells pass a vocabulary instance -- presumably these cleaner
# stages never consult it; confirm in `text.tokenizer`.
for cleaners in text_cleaners:
    print(f"Cleaning with {cleaners} ...")
    if cleaners[0] != "phonemize_text":
        # Every other stage processes one utterance per call.
        text_norm = [
            tokenizer(text, Vocab, cleaners, language=hps.data.language)
            for text in text_norm
        ]
    else:
        # The phonemizer stage receives the whole list in a single call.
        text_norm = tokenizer(text_norm, Vocab, cleaners, language=hps.data.language)

data = data.assign(cleaned_text=text_norm)
data.head()

## Generate and save vocabulary


In [None]:
from torchtext.vocab import build_vocab_from_iterator
from utils.task import load_vocab, save_vocab
from text.symbols import special_symbols, UNK_ID
from typing import List


def yield_tokens(cleaned_text: List[str]):
    """Lazily yield the whitespace-separated tokens of each cleaned sentence.

    Args:
        cleaned_text: Cleaned sentences whose tokens are separated by whitespace.

    Yields:
        One list of token strings per input sentence, in input order.
    """
    for sentence in cleaned_text:
        tokens = sentence.split()
        yield tokens


# Build the vocabulary from the whitespace-separated tokens of every cleaned sentence.
text_norm = data["cleaned_text"].tolist()
vocab = build_vocab_from_iterator(yield_tokens(text_norm), specials=special_symbols)
vocab.set_default_index(UNK_ID)  # look-ups of tokens missing from the vocabulary return UNK_ID

# Save the vocabulary to a per-speaker file, then load it back from that file.
vocab_file = f"../vocab_{speaker_id}.txt"
save_vocab(vocab, vocab_file)

vocab = load_vocab(vocab_file)
print(f"Size of vocabulary: {len(vocab)}")
print(vocab.get_itos())

## Token cleaners


In [None]:
from text import detokenizer

# Turn every cleaned sentence into a tab-separated string of tokenizer output
# and store it in a new "tokens" column.
text_norm = []
for text in data["cleaned_text"].tolist():
    token_ids = tokenizer(text, vocab, token_cleaners, language=hps.data.language)
    # Stop at the first sentence whose tokenization contains UNK_ID.
    assert UNK_ID not in token_ids, f"Found unknown symbol:\n{text}\n{detokenizer(token_ids)}"
    text_norm.append("\t".join(map(str, token_ids)))

data = data.assign(tokens=text_norm)
data.head()

## Save train, val, test filelists


In [None]:
from tqdm import tqdm
from collections import defaultdict

def get_charset(manifest_data, text_key='normalized_text'):
    """Count how often each lower-cased character occurs in a manifest.

    Args:
        manifest_data: Iterable of rows, each indexable by ``text_key``.
            NOTE(review): iterating a pandas DataFrame directly yields column
            names, not rows -- presumably callers pass something like
            ``df.to_dict("records")``; confirm at the call site.
        text_key: Key of the text field to scan. Defaults to
            ``'normalized_text'``, the field that used to be hard-coded here.

    Returns:
        A ``defaultdict(int)`` mapping each lower-cased character to the
        number of times it occurs.
    """
    charset = defaultdict(int)
    for row in tqdm(manifest_data, desc="Computing character set"):
        for character in row[text_key]:
            charset[character.lower()] += 1
    return charset

In [None]:
from phonemizer import phonemize
from typing import List
from phonemizer.separator import Separator
from phonemizer.backend import EspeakBackend
# Output layout of the phonemizer: phones are separated by a space, words by
# the literal marker "<space>".
separator = Separator(word="<space>", phone=" ")
# Punctuation characters handed to the phonemizer as punctuation marks.
_punctuation = ';:,.!?¡¿—…"«»“”'
# Matches one of those punctuation characters or any "<...>" tag (non-greedy).
_preserved_symbols_re = re.compile(rf"[{_punctuation}]|<.*?>")
# espeak backend for language "kk", created once and reused by the phonemization loop below.
backend = EspeakBackend(language="kk", preserve_punctuation=True, with_stress=True, punctuation_marks=_preserved_symbols_re)
def phonemize_text(text: List[str] | str, *args, language="kk", **kwargs):
    """Phonemize ``text`` with espeak through ``phonemizer.phonemize``.

    Args:
        text: A single string or a list of strings to phonemize.
        *args: Accepted but ignored.
        language: espeak language code; defaults to ``"kk"``.
        **kwargs: Accepted but ignored.

    Returns:
        Whatever ``phonemize`` returns for ``text``.
    """
    options = dict(
        language=language,
        backend="espeak",
        separator=separator,
        strip=True,
        preserve_punctuation=True,
        punctuation_marks=_preserved_symbols_re,
        with_stress=True,
        njobs=8,
    )
    return phonemize(text, **options)

# Phonemize every normalized sentence with the shared `backend`, one backend
# call per row (through `progress_apply`), and store the result as "cleaned_text".
for speaker_id in speaker_metadata:
    speaker_frame = speakers_datasets[speaker_id]
    speaker_frame["cleaned_text"] = speaker_frame["normalized_text"].progress_apply(
        lambda text: backend.phonemize([text], strip=True, separator=separator)[0]
    )
# clean_text  = phonemize_text(speakers_datasets[speaker_id]["normalized_text"].head().tolist())

In [None]:
from text import tokenizer
from torchtext.vocab import Vocab
import torchtext
# Quick manual check of the "add_spaces" cleaner on one phonemized sentence
# (the result is shown as the notebook cell output).
tokenizer('e n d e ʃ ˈe<space>q ɑ ɫ ˈɑ j<space>m e n ˈɪ<space>ʒ ˈʊ m ə s q ɑ<space>ˈɑ ɫ d ə?', Vocab, ["add_spaces"], language="kk")

In [None]:
# Run the "add_spaces" cleaner over every phonemized sentence, overwriting
# the "cleaned_text" column in place.
for speaker_id in speaker_metadata:
    speaker_frame = speakers_datasets[speaker_id]
    speaker_frame["cleaned_text"] = speaker_frame["cleaned_text"].progress_apply(
        lambda text: tokenizer(text, Vocab, ["add_spaces"], language="kk")
    )

In [None]:
# Stack all speakers' DataFrames into a single frame with a fresh 0..N-1 index.
data = pd.concat(speakers_datasets.values()).reset_index(drop=True)

In [None]:
from torchtext.vocab import build_vocab_from_iterator
from utils.task import load_vocab, save_vocab
from text.symbols import special_symbols, UNK_ID
from typing import List

def yield_tokens(cleaned_text: List[str]):
    """Generate, sentence by sentence, the list of whitespace-separated tokens.

    Args:
        cleaned_text: Cleaned sentences whose tokens are separated by whitespace.

    Yields:
        One list of token strings per input sentence, in input order.
    """
    yield from (sentence.split() for sentence in cleaned_text)


# Build one vocabulary from the cleaned sentences of the combined `data`
# frame, save it to a file and load it back from that file.
text_norm = data["cleaned_text"].tolist()
token_stream = yield_tokens(text_norm)
vocab = build_vocab_from_iterator(token_stream, specials=special_symbols)
vocab.set_default_index(UNK_ID)

vocab_file = "../vocab.txt"
save_vocab(vocab, vocab_file)

vocab = load_vocab(vocab_file)
print(f"Size of vocabulary: {len(vocab)}")
print(vocab.get_itos())

In [None]:
from text import detokenizer

# Tokenize every cleaned sentence of the combined frame with `vocab` and the
# cleaners from "tokenize_text" onwards (`token_cleaners`).
text_norm = data["cleaned_text"].tolist()
for idx, text in enumerate(text_norm):
    temp = tokenizer(text, vocab, token_cleaners, language=hps.data.language)
    # Abort on the first sentence whose tokenization contains UNK_ID.
    assert UNK_ID not in temp, f"Found unknown symbol:\n{text}\n{detokenizer(temp)}"
    text_norm[idx] = temp

# Store each sentence as tab-separated tokenizer output in a new "tokens" column.
text_norm = ["\t".join(map(str, text)) for text in text_norm]
data = data.assign(tokens=text_norm)
data.head()

In [None]:
def get_tokens(text):
    """Tokenize one cleaned sentence and return it as a tab-separated string.

    Uses the module-level `vocab`, `token_cleaners` and `hps.data.language`.

    Args:
        text: One cleaned sentence.

    Returns:
        The tokenizer output items, each converted with ``str`` and joined by tabs.

    Raises:
        AssertionError: If the tokenizer output contains ``UNK_ID``.
    """
    token_ids = tokenizer(text, vocab, token_cleaners, language=hps.data.language)
    assert UNK_ID not in token_ids, f"Found unknown symbol:\n{text}\n{detokenizer(token_ids)}"
    return "\t".join(str(token_id) for token_id in token_ids)

In [None]:
# Add a "tokens" column to every speaker's DataFrame by tokenizing its
# "cleaned_text" column row by row.
for speaker_id in speaker_metadata:
    speaker_frame = speakers_datasets[speaker_id]
    speaker_frame["tokens"] = speaker_frame["cleaned_text"].progress_apply(get_tokens)

In [None]:
# Write shuffled train / val / test filelists ("file|tokens" rows) for one speaker.
speaker_id = "M2"
# Fixed seed: without it every re-run reshuffles the rows, so utterances from
# an earlier test split can end up in a later train split.
split_seed = 1234
# Same location as before, derived from `dir_data` instead of repeating the
# absolute path for every split.
filelist_dir = f"{dir_data}datasets/issai_base/filelists/issai_speakers"

data = speakers_datasets[speaker_id][["file", "tokens"]]
if len(data) <= n_val + n_test:
    # Otherwise the training filelist would silently be written empty.
    raise ValueError(
        f"Speaker {speaker_id} has {len(data)} rows; more than "
        f"{n_val + n_test} are needed for val + test + a non-empty train split"
    )
data = data.sample(frac=1, random_state=split_seed).reset_index(drop=True)

# Layout of the shuffled rows: [0, n_val) -> val, [n_val, n_val + n_test) -> test, rest -> train.
data_val = data.iloc[:n_val]
data_test = data.iloc[n_val: n_val + n_test]
data_train = data.iloc[n_val + n_test:]

for split_name, split_frame in (("train", data_train), ("val", data_val), ("test", data_test)):
    split_frame.to_csv(
        f"{filelist_dir}/{speaker_id}_{split_name}_filelist.txt",
        sep="|", index=False, header=False,
    )

In [None]:
# Export the processed columns of every speaker as a headerless "|"-separated file.
columns_s = ["file", "normalized_text", "cleaned_text", "tokens"]
for speaker_id in speaker_metadata.keys():
    speakers_datasets[speaker_id].to_csv(
        # File name spelling ("nomalized") kept unchanged so existing consumers still find the file.
        f"{dir_data}metadata/{speaker_id}_file_nomalized_cleaned_tokens.csv",
        # Restrict the export to the listed columns, in this order. Without it
        # every DataFrame column (including the raw "text") was written, which
        # matches neither `columns_s` nor the file name.
        columns=columns_s,
        sep="|", index=False, header=False,
    )