# Notebook for preprocessing Persian Wikipedia dataset

In [1]:
!pip install pebble -q

### Initialize configuration

In [2]:
import yaml

config_path = "Configs/config_fa.yml"  # Persian config
config = yaml.safe_load(open(config_path))

### Initialize phonemizer and tokenizer

In [3]:
# !pip install git+https://git@github.com/SadeghKrmi/pernorm.git
# !pip install git+https://git@github.com/SadeghKrmi/zirneshane.git
# !pip install git+https://git@github.com/SadeghKrmi/vaguye.git
# !pip install git+https://git@github.com/SadeghKrmi/hamnevise.git

In [4]:
from phonemize_fa import phonemize

In [5]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(config['dataset_params']['tokenizer'])  # Persian BERT tokenizer

### Load Persian Wikipedia dataset

In [6]:
from load_persian_dataset import load_persian_wikipedia
dataset = load_persian_wikipedia("./datasets/wikipedia-fa-cleaned-samples.txt")

Loading Persian Wikipedia from ./datasets/wikipedia-fa-cleaned-samples.txt...
Loaded 258 lines from Persian Wikipedia
Created dataset with 258 examples


### Setup multiprocessing for dataset processing

In [7]:
root_directory = "./datasets/wiki_phoneme_fa"  # set up root directory for multiprocessor processing

In [None]:
import os
num_shards = 2000  # Adjusted for 1.3M lines (approx 650 lines per shard)

def process_shard(i):
    directory = root_directory + "/shard_" + str(i)
    if os.path.exists(directory):
        print("Shard %d already exists!" % i)
        return
    print('Processing shard %d ...' % i)
    shard = dataset.shard(num_shards=num_shards, index=i)
    processed_dataset = shard.map(lambda t: phonemize(t['text'], tokenizer), remove_columns=['text'])
    if not os.path.exists(directory):
        os.makedirs(directory)
    processed_dataset.save_to_disk(directory)

In [9]:
from pebble import ProcessPool
from concurrent.futures import TimeoutError

#### Note: You may need to run the following cell multiple times to process all shards because some will fail. Depending on how fast you process each shard, you may need to change the timeout to a longer value.

In [None]:
max_workers = 32  # change this to the number of CPU cores your machine has

with ProcessPool(max_workers=max_workers) as pool:
    # Increased timeout to 300s (5 mins) to be safe
    future = pool.map(process_shard, range(num_shards), timeout=300)
    for result in future.result():
        pass


### Collect all shards to form the processed dataset

In [11]:
from datasets import load_from_disk, concatenate_datasets

output = [dI for dI in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory,dI))]
datasets = []
for o in output:
    directory = root_directory + "/" + o
    try:
        shard = load_from_disk(directory)
        datasets.append(shard)
        print("%s loaded" % o)
    except:
        continue

shard_3 loaded
shard_15 loaded
shard_11 loaded
shard_5 loaded
shard_9 loaded
shard_17 loaded
shard_0 loaded
shard_13 loaded
shard_10 loaded
shard_16 loaded
shard_18 loaded
shard_19 loaded
shard_1 loaded
shard_14 loaded
shard_12 loaded
shard_2 loaded
shard_7 loaded
shard_6 loaded
shard_8 loaded
shard_4 loaded


In [12]:
dataset = concatenate_datasets(datasets)
dataset.save_to_disk(config['data_folder'])
print('Dataset saved to %s' % config['data_folder'])

Saving the dataset (0/1 shards):   0%|          | 0/258 [00:00<?, ? examples/s]

Dataset saved to wikipedia_fa.processed


In [13]:
# check the dataset size
dataset

Dataset({
    features: ['input_ids', 'phonemes'],
    num_rows: 258
})

In [14]:
print(dataset[1]['phonemes'])

['viːkiːpedˈjɒː', 'tɒː', 'ˈdo', 'ordiːbeˈheʃt', 'ˈjek', 'heˈzɒːr', 'o', 'siːˈsæd', 'o', 'næˈvæd', 'o', 'ˈjek', 'drbrɡiːrndeˈje', 'deˈviːst', 'o', 'hæfˈtɒːd', 'o', 'ˈnæ', 'zæˈbɒːn', 'bɒː', 'ˌbiːʃ', 'æz', 'ˈdæh', 'mæqɒːˈleː', 'buːdeːˈˌæst', '.']


### Generate token maps for Persian vocabulary
We need to create mappings from BERT token IDs to reduced vocabulary for efficient training.

In [None]:
from simple_loader import FilePathDataset, build_dataloader

file_data = FilePathDataset(dataset)
loader = build_dataloader(file_data, num_workers=32, batch_size=128)

In [16]:
special_token = config['dataset_params']['word_separator']

In [17]:
# get all unique tokens in the entire dataset

from tqdm import tqdm

unique_index = [special_token]
for _, batch in enumerate(tqdm(loader)):
    unique_index.extend(batch)
    unique_index = list(set(unique_index))

  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:00<00:00, 41.39it/s]


In [18]:
# For Persian BERT, we don't need to lowercase (parsbert is already uncased)
# Just create direct mapping

lower_tokens = list(set(unique_index))

In [19]:
# create the mapping for Persian tokens

token_maps = {}
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    token_maps[t] = {'word': word, 'token': lower_tokens.index(t)}

100%|██████████| 1638/1638 [00:00<00:00, 59953.14it/s]


In [20]:
import pickle
with open(config['dataset_params']['token_maps'], 'wb') as handle:
    pickle.dump(token_maps, handle)
print('Token mapper saved to %s' % config['dataset_params']['token_maps'])

Token mapper saved to token_fa_maps.pkl


### Test the dataset with dataloader

In [21]:
from dataloader import build_dataloader

train_loader = build_dataloader(dataset, batch_size=32, num_workers=0, dataset_config=config['dataset_params'])

177


In [22]:
_, (words, labels, phonemes, input_lengths, masked_indices) = next(enumerate(train_loader))