# Notebook for preprocessing Wikipedia (English) dataset

### Initilizing phonemizer and tokenizer

In [1]:
import yaml

config_path = "Configs/config.yml" # you can change it to anything else
config = yaml.safe_load(open(config_path))

In [2]:
import phonemizer
from phonemizer.separator import Separator

global_phonemizer = phonemizer.backend.EspeakBackend(language='hi', preserve_punctuation=True,  with_stress=True)
separator = Separator(phone='', word='', syllable='')

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to



### Process dataset

In [4]:
config['dataset_params']['tokenizer']

'bert-base-uncased'

In [5]:
def extract_hindi_features(text):
    backend = 'espeak'
    language = 'hi'
    # Correctly initialize the Separator
    separator = Separator(phone='', word='', syllable='')
    try:
        # Use separate parameters for phone, word, and syllable separators
        phonemes = global_phonemizer.phonemize([text], separator=separator, njobs=8)
        # Tokenize the text
        tokens = []
        current_word = ""
        for char in text:
            if char == " ":
                if current_word:
                    tokens.append(current_word)
                    current_word = ""
            else:
                current_word += char
        if current_word:
            tokens.append(current_word)
        return {
            'tokens': tokens,
            'words': text.split(),
            'phonemes': phonemes[0]  # phonemize returns a list, so we take the first element
        }
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

text = "मैं स्कूल जा रहा हूँ।"
result = extract_hindi_features(text)
print(result)

{'tokens': ['मैं', 'स्कूल', 'जा', 'रहा', 'हूँ।'], 'words': ['मैं', 'स्कूल', 'जा', 'रहा', 'हूँ।'], 'phonemes': 'mɛ̃skˈuːlɟˈaːɾˌəhaːhũ'}


In [6]:
from datasets import load_dataset
dataset = load_dataset("graelo/wikipedia", "20230601.hi",trust_remote_code=True)['train'] # you can use other version of this dataset

Downloading builder script:   0%|          | 0.00/4.81k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/201k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/230M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160068 [00:00<?, ? examples/s]

In [7]:
root_directory = "./wiki_phoneme" # set up root directory for multiprocessor processing

In [8]:
import os
num_shards = 300

def process_shard(i):
    directory = root_directory + "/shard_" + str(i)
    if os.path.exists(directory):
        print("Shard %d already exists!" % i)
        return
    print('Processing shard %d ...' % i)
    shard = dataset.shard(num_shards=num_shards, index=i)
    processed_dataset = shard.map(lambda t: extract_hindi_features(t['text']), remove_columns=['text'])
    if not os.path.exists(directory):
        os.makedirs(directory)
    processed_dataset.save_to_disk(directory)

#### Note: You will need to run the following cell multiple times to process all shards because some will fail. Depending on how fast you process each shard, you will need to change the timeout to a longer value to make more shards processed before being killed.


In [10]:
from multiprocessing import Pool as ProcessPool
from concurrent.futures import TimeoutError

In [13]:
max_workers = 64 
with ProcessPool(processes=max_workers) as pool:
    pool.map(process_shard, range(num_shards))

Shard 6 already exists!Shard 2 already exists!Shard 16 already exists!Shard 26 already exists!Shard 22 already exists!Shard 30 already exists!Shard 20 already exists!Shard 10 already exists!Shard 8 already exists!Shard 0 already exists!Shard 14 already exists!Shard 4 already exists!Shard 32 already exists!Shard 40 already exists!Shard 28 already exists!Shard 24 already exists!Shard 18 already exists!Shard 46 already exists!Shard 12 already exists!Shard 34 already exists!Shard 36 already exists!


Shard 38 already exists!Shard 50 already exists!Shard 44 already exists!Shard 42 already exists!Shard 48 already exists!
Shard 64 already exists!
Shard 54 already exists!Shard 52 already exists!Shard 60 already exists!Shard 70 already exists!Shard 62 already exists!Shard 74 already exists!Shard 56 already exists!Shard 58 already exists!
Shard 68 already exists!
Shard 76 already exists!Shard 66 already exists!


Shard 72 already exists!Shard 88 already exists!
Shard 78 already exists!

Shard 86

### Collect all shards to form the processed dataset

In [14]:
dataset

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 160068
})

In [15]:
from datasets import load_from_disk, concatenate_datasets

output = [dI for dI in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory,dI))]
datasets = []
for o in output:
    directory = root_directory + "/" + o
    try:
        shard = load_from_disk(directory)
        datasets.append(shard)
        print("%s loaded" % o)
    except:
        continue

shard_265 loaded
shard_267 loaded
shard_285 loaded
shard_279 loaded
shard_295 loaded
shard_284 loaded
shard_291 loaded
shard_281 loaded
shard_266 loaded
shard_197 loaded
shard_245 loaded
shard_229 loaded
shard_247 loaded
shard_243 loaded
shard_159 loaded
shard_107 loaded
shard_253 loaded
shard_299 loaded
shard_273 loaded
shard_297 loaded
shard_191 loaded
shard_278 loaded
shard_287 loaded
shard_289 loaded
shard_293 loaded
shard_3 loaded
shard_283 loaded
shard_217 loaded
shard_249 loaded
shard_294 loaded
shard_277 loaded
shard_261 loaded
shard_271 loaded
shard_269 loaded
shard_228 loaded
shard_275 loaded
shard_179 loaded
shard_163 loaded
shard_290 loaded
shard_280 loaded
shard_244 loaded
shard_196 loaded
shard_263 loaded
shard_183 loaded
shard_225 loaded
shard_219 loaded
shard_259 loaded
shard_137 loaded
shard_246 loaded
shard_231 loaded
shard_257 loaded
shard_251 loaded
shard_255 loaded
shard_235 loaded
shard_252 loaded
shard_241 loaded
shard_69 loaded
shard_298 loaded
shard_199 loaded


In [16]:
dataset = concatenate_datasets(datasets)
dataset.save_to_disk(config['data_folder'])
print('Dataset saved to %s' % config['data_folder'])

Saving the dataset (0/5 shards):   0%|          | 0/160068 [00:00<?, ? examples/s]

Dataset saved to wikipedia_20220301.hi.processed


In [17]:
# check the dataset size
dataset

Dataset({
    features: ['id', 'url', 'title', 'tokens', 'words', 'phonemes'],
    num_rows: 160068
})

In [18]:
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset and add input_ids
def tokenize_function(examples):
    return tokenizer(examples['phonemes'], padding="max_length", truncation=True, return_tensors="pt")

# Apply the tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=8)

# Add input_ids to the dataset
dataset = tokenized_dataset




Map (num_proc=8):   0%|          | 0/160068 [00:00<?, ? examples/s]

In [19]:
dataset

Dataset({
    features: ['id', 'url', 'title', 'tokens', 'words', 'phonemes', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 160068
})

In [20]:
dataset.save_to_disk(config['data_folder'])
print('Dataset saved to %s' % config['data_folder'])

Saving the dataset (0/6 shards):   0%|          | 0/160068 [00:00<?, ? examples/s]

Dataset saved to wikipedia_20220301.hi.processed


### Remove unneccessary tokens from the pre-trained tokenizer
The pre-trained tokenizer contains a lot of tokens that are not used in our dataset, so we need to remove these tokens. We also want to predict the word in lower cases because cases do not matter that much for TTS. Pruning the tokenizer is much faster than training a new tokenizer from scratch. 

In [21]:
# Load the dataset from disk
loaded_dataset = load_from_disk(config['data_folder'])

# Display the dataset columns
print(loaded_dataset.column_names)

['id', 'url', 'title', 'tokens', 'words', 'phonemes', 'input_ids', 'token_type_ids', 'attention_mask']


In [31]:
from simple_loader import FilePathDataset, build_dataloader

file_data = FilePathDataset(dataset)
loader = build_dataloader(file_data, num_workers=32, batch_size=128)

In [32]:
special_token = config['dataset_params']['word_separator']

In [33]:
# get all unique tokens in the entire dataset

from tqdm import tqdm

unique_index = [special_token]
for _, batch in enumerate(tqdm(loader)):
    unique_index.extend(batch)
    unique_index = list(set(unique_index))

100%|██████████| 1250/1250 [00:19<00:00, 65.48it/s]


In [37]:
lower_tokens = []
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    lower_tokens.append(t)  # Keep the original token

lower_tokens = list(set(lower_tokens))  # Ensure uniqueness


100%|██████████| 1622/1622 [00:00<00:00, 47319.43it/s]


In [39]:
token_maps = {}
for t in tqdm(unique_index):
    word = tokenizer.decode([t])  # Decode the token to get the word
    new_t = tokenizer.encode(word, add_special_tokens=False)[0]  # Encode the original word without lowercasing
    token_maps[t] = {'word': word, 'token': lower_tokens.index(t)}  # Use original token for mapping



100%|██████████| 1622/1622 [00:00<00:00, 11805.67it/s]


In [40]:
import pickle
with open(config['dataset_params']['token_maps'], 'wb') as handle:
    pickle.dump(token_maps, handle)
print('Token mapper saved to %s' % config['dataset_params']['token_maps'])

Token mapper saved to token_maps.pkl


### Test the dataset with dataloader


In [41]:
from dataloader import build_dataloader

train_loader = build_dataloader(dataset, batch_size=32, num_workers=0, dataset_config=config['dataset_params'])

177


In [42]:
_, (words, labels, phonemes, input_lengths, masked_indices) = next(enumerate(train_loader))