# Notebook for preprocessing the Persian Wikipedia dataset

### Initialize configuration

In [1]:
import yaml

config_path = "Configs/config_fa.yml"  # Persian config
# Open via a context manager so the file handle is closed as soon as parsing is done,
# and read as UTF-8 explicitly rather than relying on the locale's default encoding.
with open(config_path, encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

### Initialize phonemizer and tokenizer

In [2]:
from phonemize_fa import phonemize

📥 Loading dictionary files from: /root/Persian-PL-BERT/.venv/lib/python3.12/site-packages/vaguye/persian-dict
✅ Loaded persian-primary.json
✅ Loaded persian-secondary.json
📚 Total entries loaded: 64183


In [3]:
from transformers import BertTokenizer
# Persian BERT tokenizer: the checkpoint identifier is read from the config
tokenizer_name = config['dataset_params']['tokenizer']
tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


### Load Persian Wikipedia dataset

In [4]:
from load_persian_dataset import load_persian_wikipedia
# Load the raw text file through the project helper; the run recorded below reports
# 104122 lines read and a dataset of 104122 examples.
dataset = load_persian_wikipedia("./datasets/wikipedia-fa.txt")

Loading Persian Wikipedia from ./datasets/wikipedia-fa.txt...
Loaded 104122 lines from Persian Wikipedia
Created dataset with 104122 examples


### Setup multiprocessing for dataset processing

In [5]:
# Each processed shard is saved in its own "shard_<i>" sub-directory of this folder.
root_directory = "./wiki_phoneme_fa"  # set up root directory for multiprocessor processing

In [6]:
import os
# Number of pieces the dataset is split into; each piece is processed and saved independently.
# NOTE(review): 2000 was sized for ~1.3M lines (~650 lines per shard), but the load recorded
# above reports 104,122 lines, i.e. roughly 52 examples per shard — confirm the intended input.
num_shards = 2000

def process_shard(i):
    """Phonemize shard ``i`` of ``dataset`` and save it to ``root_directory/shard_<i>``.

    Relies on the notebook globals ``root_directory``, ``dataset``, ``num_shards``,
    ``phonemize`` and ``tokenizer``.

    The shard is first written to a staging directory next to ``root_directory``
    and only renamed into place once saving has finished. A worker that is
    interrupted mid-save (for example by the pool timeout) therefore never leaves
    a half-written ``shard_<i>`` directory that later runs would skip as
    "already exists".

    Args:
        i: Index of the shard to process, in ``range(num_shards)``.

    Returns:
        None. The function prints progress and writes the shard to disk.
    """
    import shutil  # local import: only needed to discard stale staging output

    shard_name = "shard_" + str(i)
    directory = os.path.join(root_directory, shard_name)
    if os.path.exists(directory):
        print("Shard %d already exists!" % i)
        return
    print('Processing shard %d ...' % i)
    shard = dataset.shard(num_shards=num_shards, index=i)
    processed_dataset = shard.map(lambda t: phonemize(t['text'], tokenizer), remove_columns=['text'])

    # Stage outside root_directory so code that lists root_directory never sees partial shards.
    staging_root = os.path.normpath(root_directory) + "_incomplete"
    staging_directory = os.path.join(staging_root, shard_name)
    if os.path.exists(staging_directory):
        # Left over from an earlier interrupted attempt at this shard.
        shutil.rmtree(staging_directory)
    os.makedirs(staging_root, exist_ok=True)
    os.makedirs(root_directory, exist_ok=True)

    processed_dataset.save_to_disk(staging_directory)
    # Publish the finished shard in one step.
    os.replace(staging_directory, directory)

In [7]:
from pebble import ProcessPool
from concurrent.futures import TimeoutError

ModuleNotFoundError: No module named 'pebble'

#### Note: You may need to run the following cell several times to process all shards, because some shards may fail or time out on a given run. If shards take longer than the timeout to process, increase the timeout value.

In [None]:
max_workers = 32  # change this to the number of CPU cores your machine has

with ProcessPool(max_workers=max_workers) as pool:
    # Increased timeout to 300s (5 mins) to be safe
    future = pool.map(process_shard, range(num_shards), timeout=300)

    # The map future itself is not iterable; its result() is the iterator over
    # per-shard outcomes. Each next() either yields a result or re-raises what
    # happened to that shard, so one failure does not stop the remaining shards.
    iterator = future.result()
    while True:
        try:
            next(iterator)
        except StopIteration:
            break
        except TimeoutError:
            print("Shard processing timed out")
        except Exception as error:
            print(f"Shard processing failed: {error}")

### Collect all shards to form the processed dataset

In [None]:
from datasets import load_from_disk, concatenate_datasets

# Every sub-directory of root_directory is treated as a candidate shard.
# Sorting makes the load (and therefore concatenation) order reproducible,
# since os.listdir returns entries in arbitrary order.
output = sorted(dI for dI in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory, dI)))
datasets = []
failed_shards = []
for o in output:
    directory = root_directory + "/" + o
    try:
        shard = load_from_disk(directory)
    except Exception as error:
        # Keep going (best effort), but report the shard instead of dropping it silently.
        # Catching Exception rather than using a bare except lets Ctrl-C interrupt the loop.
        failed_shards.append(o)
        print("%s could not be loaded, skipping: %s" % (o, error))
        continue
    datasets.append(shard)
    print("%s loaded" % o)

if failed_shards:
    print("%d of %d shard directories could not be loaded" % (len(failed_shards), len(output)))

In [None]:
# Merge all loaded shards into one dataset, then persist it to the configured folder
dataset = concatenate_datasets(datasets)
data_folder = config['data_folder']
dataset.save_to_disk(data_folder)
print('Dataset saved to %s' % data_folder)

In [None]:
# check the dataset size (a bare expression on the last line makes the notebook display it)
dataset

### Generate token maps for Persian vocabulary
We need to create a mapping from BERT token IDs to a reduced vocabulary for efficient training.

In [None]:
from simple_loader import FilePathDataset, build_dataloader

# Wrap the concatenated dataset and build a loader for the token-collection pass below.
# NOTE(review): FilePathDataset and build_dataloader come from the project-local
# simple_loader module; what each batch contains is not visible here — presumably
# token IDs, confirm against simple_loader.
file_data = FilePathDataset(dataset)
loader = build_dataloader(file_data, num_workers=32, batch_size=128)

In [None]:
# Word-separator entry from the config; it seeds the unique-token list built below
# so it is always part of the resulting token map.
special_token = config['dataset_params']['word_separator']

In [None]:
# get all unique tokens in the entire dataset

from tqdm import tqdm

# Accumulate into a single set so each batch only costs its own size, instead of
# rebuilding the whole list and set of everything seen so far on every iteration.
unique_tokens = {special_token}
for batch in tqdm(loader):
    unique_tokens.update(batch)

# Later cells expect a list of the unique tokens (order is arbitrary, as before).
unique_index = list(unique_tokens)

In [None]:
# For Persian BERT, we don't need to lowercase (parsbert is already uncased)
# Just create direct mapping
# The position of each token in this list is what the mapping cell stores as its 'token' value.

lower_tokens = list(set(unique_index))

In [None]:
# create the mapping for Persian tokens

# Look-up table from token to its position in lower_tokens, built once up front.
# setdefault keeps the first position of each token, matching list.index(), while
# avoiding a linear scan of lower_tokens per token (which made the loop quadratic).
token_positions = {}
for position, token in enumerate(lower_tokens):
    token_positions.setdefault(token, position)

# token id -> {'word': decoded text of that single token, 'token': reduced-vocabulary index}
token_maps = {}
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    token_maps[t] = {'word': word, 'token': token_positions[t]}

In [None]:
import pickle
# Serialize the token map to the path named in the config
token_maps_path = config['dataset_params']['token_maps']
with open(token_maps_path, 'wb') as handle:
    pickle.dump(token_maps, handle)
print('Token mapper saved to %s' % token_maps_path)

### Test the dataset with dataloader

In [None]:
# NOTE(review): this import rebinds `build_dataloader`, replacing the `simple_loader`
# function imported for the token-map section. Re-running that earlier cell after this
# one would call this function instead — consider aliasing one of the two imports.
from dataloader import build_dataloader

# Build a loader over the processed dataset, passing the dataset section of the config.
train_loader = build_dataloader(dataset, batch_size=32, num_workers=0, dataset_config=config['dataset_params'])

In [None]:
# Pull one batch as a smoke test. next(iter(...)) fetches the first batch directly,
# without a throwaway enumerate index and without rebinding `_`, which IPython uses
# for the last cell output.
words, labels, phonemes, input_lengths, masked_indices = next(iter(train_loader))