In [1]:
# from huggingface_hub import snapshot_download

# snapshot_download(
#     repo_id="malaysia-ai/Multilingual-TTS", 
#     repo_type="dataset",
#     allow_patterns="*/*.parquet",
#     local_dir="./Multilingual-TTS",
# )

In [2]:
from glob import glob
import pandas as pd
import json
import os
import torch
import IPython.display as ipd

torch.set_grad_enabled(False)

from transformers import AutoTokenizer, AddedToken
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import numpy as np
from tqdm import tqdm
from multiprocess import Pool
import itertools

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
    'audio': 'str',
    'text': 'str'
}
hashes = 'sha1', 'xxh64'

def new_path(f):
    splitted = f.split('/')
    folder = f.split('/')[0]
    folder = folder + '_neucodec'
    new_f = os.path.join(folder, '/'.join(splitted[1:]))
    new_f = new_f.replace('.mp3', '.json').replace('.wav', '.json')
    return new_f

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def loop(files):
    files, _ = files
    data = []
    for f in tqdm(files):
        df = pd.read_parquet(f).to_dict(orient = 'records')
        for i in range(len(df)):
            token_filename = new_path(df[i]['audio_filename'])
            if not os.path.exists(token_filename):
                continue
            df[i]['token_filename'] = token_filename
            data.append(df[i])
    return data

In [4]:
files = glob('Multilingual-TTS/*/*.parquet')
files = [f for f in files if 'Malaysian-TTS-v2' not in f]
data = multiprocessing(files, loop, cores = 30)
len(data)

100%|██████████| 2/2 [00:02<00:00,  1.34s/it]
100%|██████████| 2/2 [00:02<00:00,  1.44s/it]
100%|██████████| 2/2 [00:03<00:00,  1.91s/it]
100%|██████████| 2/2 [00:04<00:00,  2.09s/it]
100%|██████████| 2/2 [00:01<00:00,  1.92it/s]
100%|██████████| 2/2 [00:04<00:00,  2.38s/it]
100%|██████████| 2/2 [00:06<00:00,  3.37s/it]
100%|██████████| 2/2 [00:02<00:00,  1.35s/it]
100%|██████████| 2/2 [00:07<00:00,  3.59s/it]
100%|██████████| 2/2 [00:07<00:00,  3.73s/it]
100%|██████████| 2/2 [00:00<00:00,  9.57it/s]

100%|██████████| 2/2 [00:08<00:00,  4.42s/it]
100%|██████████| 2/2 [00:09<00:00,  4.51s/it]
100%|██████████| 2/2 [00:09<00:00,  4.54s/it]
100%|██████████| 2/2 [00:00<00:00,  2.23it/s]
100%|██████████| 2/2 [00:09<00:00,  4.78s/it]
100%|██████████| 2/2 [00:10<00:00,  5.08s/it]
100%|██████████| 2/2 [00:10<00:00,  5.38s/it]
100%|██████████| 2/2 [00:11<00:00,  5.56s/it]
100%|██████████| 2/2 [00:11<00:00,  5.60s/it]
100%|██████████| 2/2 [00:11<00:00,  5.71s/it]
100%|██████████| 2/2 [00:11<00:00

17871888

In [5]:
malaysian = pd.read_parquet('Multilingual-TTS/Malaysian-TTS-v2/train-00000-of-00001.parquet').to_dict(orient = 'records')

In [6]:
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-1.7B-Base')
extra = [AddedToken('<|speech_start|>')]
for i in range(65536):
    extra.append(AddedToken(f'<|s_{i}|>'))
tokenizer.add_tokens(extra)

65537

In [7]:
data[0]

{'audio_filename': 'ORAA-MUPE-ASR_audio/ORAA-MUPE-ASR-data-train-00044-of-00074_0.mp3',
 'text': 'Tchan, tchan, tchan, tchan.',
 'speaker': 'ORAA-MUPE-ASR_audio_MA_HV186',
 'token_filename': 'ORAA-MUPE-ASR_audio_neucodec/ORAA-MUPE-ASR-data-train-00044-of-00074_0.json'}

In [13]:
columns = {
    'input_ids': 'uint32',
}

def loop(rows):
    rows, index = rows
    out_root = f'tokenized-4k-qwen3/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):

            text = row['text']
            
            try:
                with open(row['token_filename']) as fopen:
                    token = json.load(fopen)
            except:
                continue

            if len(text.split()) > len(token):
                continue

            left = row['speaker'] +': ' + text
            
            token = ''.join([f'<|s_{t}|>' for t in token])
            prompt = f'<|im_start|>{left}<|speech_start|>{token}<|im_end|>'
            
            outputs = tokenizer(prompt, add_special_tokens = False)
            input_ids = outputs['input_ids']
            input_ids = np.array(input_ids).astype(np.uint32)

            out.write({
                'input_ids': input_ids,
            })
            

In [11]:
combined = data + malaysian

In [17]:
loop((combined[:10], 0))
dataset = LocalDataset('tokenized-4k-qwen3/tokenized-0')
dataset[0]

100%|██████████| 10/10 [00:00<00:00, 1405.08it/s]


{'input_ids': array([151644,    868,   6029,   5251,     52,   1740,     12,   1911,
            49,  29688,  85311,   2039,     53,     16,     23,     21,
            25,    350,   5658,     11,    259,   5658,     11,    259,
          5658,     11,    259,   5658,     13, 151669, 192623, 203889,
        204262, 204040, 207452, 208679, 207405, 206385, 208296, 191069,
        191628, 208186, 206095, 207422, 212668, 206024, 206155, 206043,
        207100, 207116, 205976, 194860, 190700, 192801, 174381, 203002,
        190882, 189855, 206728, 185211, 188483, 168478, 190142, 174920,
        210075, 206515, 176531, 190460, 152255, 186878, 205775, 158540,
        207384, 210167, 211636, 193935, 178120, 168844, 204862, 184823,
        207866, 168951, 172121, 212763, 203303, 208471, 215608, 210356,
        193184, 157092, 193683, 200924, 173784, 198986, 176701, 171774,
        199388, 159174, 153356, 154366, 198416, 158203, 170704, 200190,
        178956, 170747, 170494, 199368, 178427, 171

In [None]:
multiprocessing(combined, loop, cores = 40, returned = False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av