In [9]:
from glob import glob
import pandas as pd
import json
import os
import torch
import IPython.display as ipd

torch.set_grad_enabled(False)

from transformers import AutoTokenizer, AddedToken
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import numpy as np
from tqdm import tqdm
from multiprocess import Pool
import itertools

def chunks(l, n):
    """Yield `(piece, piece_number)` pairs for consecutive slices of `l`.

    Each piece holds up to `n` items (the last one may be shorter) and
    `piece_number` counts the pieces from zero.
    """
    for piece_number, start in enumerate(range(0, len(l), n)):
        stop = start + n
        yield (l[start:stop], piece_number)

def multiprocessing(strings, function, cores=6, returned=True):
    """Split `strings` into chunks and map `function` over them in a process pool.

    Args:
        strings: sliceable sequence of work items.
        function: worker called once per chunk with a `(items, chunk_index)`
            tuple, as yielded by `chunks`.
        cores: number of worker processes; also determines the chunk size.
        returned: when True, each worker result is treated as an iterable and
            the results are concatenated into a single list.

    Returns:
        The flattened list of worker results when `returned` is True,
        otherwise None.
    """
    # Floor division gives 0 when there are fewer items than cores, which
    # would make `range()` inside `chunks` raise ValueError (zero step).
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)
    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Always release the workers, even when a worker raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))
    return None

class UInt32(Encoding):
    """MDS column encoding that stores an array as raw uint32 bytes."""

    def encode(self, obj) -> bytes:
        """Serialize `obj` as uint32 bytes.

        The input is coerced to uint32 first so that `decode`, which always
        reads uint32, returns the same values even if the caller passed an
        array of another integer dtype (or a plain list).
        """
        return np.asarray(obj, dtype=np.uint32).tobytes()

    def decode(self, data: bytes):
        """Return a 1-D uint32 array view over `data`."""
        return np.frombuffer(data, np.uint32)

# Register the custom encoding under the name used in `columns` below.
_encodings['uint32'] = UInt32

# Column name -> encoding name for every MDS shard written by this notebook.
columns = dict(
    input_ids='uint32',
    position_ids='uint32',
    attention_mask='uint32',
    audio='str',
    text='str',
)
# Hash algorithms passed to MDSWriter.
hashes = ('sha1', 'xxh64')

def new_path(f):
    """Map an audio path to the path of its token JSON file.

    The first path component gets a `_neucodec` suffix and a trailing
    `.mp3` / `.wav` extension is replaced by `.json`, e.g.
    `folder/sub/clip.mp3` -> `folder_neucodec/sub/clip.json`.

    Only the extension at the very end of the path is rewritten; a `.mp3`
    or `.wav` substring inside a directory name is left untouched. Paths
    with any other extension are returned with only the folder renamed.
    """
    folder, _, rest = f.partition('/')
    new_f = os.path.join(folder + '_neucodec', rest)
    for ext in ('.mp3', '.wav'):
        if new_f.endswith(ext):
            new_f = new_f[:-len(ext)] + '.json'
            break
    return new_f

In [2]:
from datasets import load_dataset

# Load the "default" configuration of the ExpressiveSpeech dataset; the cell
# output below reports a single train split of 507,607 examples.
ds = load_dataset("Scicom-intl/ExpressiveSpeech", "default")

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00004.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

data/train-00001-of-00004.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

data/train-00002-of-00004.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

data/train-00003-of-00004.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/507607 [00:00<?, ? examples/s]

In [3]:
# Base tokenizer, extended with the two marker tokens used in the prompts.
tokenizer = AutoTokenizer.from_pretrained('Scicom-intl/Multilingual-TTS-1.7B-Base')
extra = [
    AddedToken(marker)
    for marker in ('<|description|>', '<|description_category|>')
]
# The cell displays the number of tokens actually added.
tokenizer.add_tokens(extra)

tokenizer_config.json:   0%|          | 0.00/11.7M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/23.7M [00:00<?, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

2

In [13]:
def loop(rows):
    """Keep the rows whose token file exists and parses as JSON.

    NOTE(review): a later cell defines another function called `loop` (the
    packing worker). Run the cache-building cell before that redefinition.

    Args:
        rows: `(rows, chunk_index)` tuple as yielded by `chunks`; the index is
            ignored. Each row is a dict with at least `audio_filename`.

    Returns:
        A list of dicts restricted to the `selected` keys, one per row whose
        token file could be read. Rows with a missing/unreadable/invalid token
        file, or lacking one of the selected keys, are skipped.
    """
    rows, _ = rows
    data = []

    selected = ['token_filename', 'text', 'speaker', 'description', 'description_category']
    for r in tqdm(rows):
        # `new_path` is the path mapper defined in this notebook; the cached
        # record shown further down matches its `<folder>_neucodec/...json`
        # output. The previous name `new_path_neucodec` is not defined here.
        token_filename = new_path(r['audio_filename'])
        try:
            with open(token_filename) as fopen:
                # Parsed only to validate the file; the content is not kept.
                json.load(fopen)
            # Build a new dict instead of mutating the caller's row.
            record = {**r, 'token_filename': token_filename}
            data.append({s: record[s] for s in selected})
        except (OSError, ValueError, KeyError):
            # Best-effort filter: skip rows that cannot be used, but let
            # unrelated errors (and KeyboardInterrupt) propagate.
            continue
    return data

In [16]:
# Reuse the cached list of usable records when present; otherwise rebuild it.
# NOTE: `loop` here must be the cache-building worker, i.e. this cell has to
# run before the later cell that redefines `loop` as the packing worker.
try:
    with open('cache-expressive.json') as fopen:
        data = json.load(fopen)
except (FileNotFoundError, json.JSONDecodeError):
    # Only a missing or corrupt cache triggers a rebuild; anything else
    # (including KeyboardInterrupt) is allowed to surface.
    rows = ds['train'].to_list()
    data = multiprocessing(rows, loop, cores = 20)
    with open('cache-expressive.json', 'w') as fopen:
        json.dump(data, fopen)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25380/25380 [00:02<00:00, 9441.91it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25380/25380 [00:02<00:00, 9148.35it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25380/25380 [00:02<00:00, 9321.04it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25380/25380 [00:02<00:00, 9104.23it/s]
100%|███████████████████████

481252

In [35]:
# not_in = set()
# for r in rows:
#     s = new_path_neucodec(r['audio_filename'])
#     if s not in token_filenames:
#         not_in.add(s)

In [37]:
# from collections import defaultdict

# counts = defaultdict(int)
# for s in not_in:
#     counts[os.path.split(s)[0]] += 1

# counts

In [38]:
import gc

def collator(batch, batch_position_ids):
    """Concatenate several sequences into one packed sample.

    Args:
        batch: list of token-id sequences.
        batch_position_ids: list of position-id sequences, indexed in parallel
            with `batch`.

    Returns:
        A dict with flat uint32 arrays `input_ids` and `position_ids`, an
        `attention_mask` array holding the length of each packed sequence
        (not a 0/1 mask), and empty `audio` / `text` strings.
    """
    flat_ids, flat_positions, lengths = [], [], []
    for idx, sequence in enumerate(batch):
        seq_len = len(sequence)
        flat_ids.extend(sequence)
        flat_positions.extend(batch_position_ids[idx])
        lengths.append(seq_len)

    packed = {
        'input_ids': np.array(flat_ids).astype(np.uint32),
        'position_ids': np.array(flat_positions).astype(np.uint32),
        'attention_mask': np.array(lengths).astype(np.uint32),
        'audio': '',
        'text': '',
    }
    return packed

def slice_and_balance(nested_list, size):
    """Split a list of sequences at a total budget of `size` items.

    Sequences are taken in order until `size` items have been collected; the
    sequence that crosses the budget is cut in two.

    Returns:
        `(first, balance)` where `first` holds the sequences (or the leading
        part of one) that fit within `size`, and `balance` holds everything
        left over, in the original order.
    """
    head, overflow = [], []
    used = 0

    for piece in nested_list:
        if used >= size:
            # Budget already exhausted: everything else is leftover.
            overflow.append(piece)
            continue

        room = size - used
        if len(piece) > room:
            # This piece crosses the budget: split it at the boundary.
            head.append(piece[:room])
            overflow.append(piece[room:])
            used = size
        else:
            head.append(piece)
            used += len(piece)

    return head, overflow

In [39]:
# Inspect the first cached record to check its fields
# (token_filename, text, speaker, description, ...).
data[0]

{'token_filename': 'multilingual-tts_audio_neucodec/multilingual-tts-data-train-00000-of-00004-c699f04e5a4da714_2787.json',
 'text': 'Мария (Spanish) sagte "Ich liebe Phở" (Vietnamese), während sie in der Berliner (German) Sonne das Buch "El Quijote" (Spanish) las.',
 'speaker': 'multilingual-tts_audio_Grace',
 'description': 'The audio features a middle-aged female with a distinct Germanic accent, speaking in a neutral tone at a very slow pace. Her speech is consistently fluent and clear throughout, with a very low pitch that lends a deep, resonant quality to her voice. The recording environment is very confined, giving the audio a tight, enclosed sound. Despite the monotone delivery, the clarity and quality of the speech are excellent, making it easy to follow and understand. The background is free of any distracting noise, enhancing the overall listening experience. The content seems to blend elements from different languages and cultures, suggesting a multicultural or multilingual 

In [40]:
import time

# Token budget per packed row (10,240); used as the default `block_size`
# of the packing worker `loop` defined below.
sequence_length = 1024 * 10

def loop(files, block_size=sequence_length):
    """Tokenize rows into prompts and pack them into MDS rows of at most `block_size` tokens.

    NOTE(review): this redefines `loop`, shadowing the cache-building worker
    of the same name from an earlier cell; re-running that cache cell after
    this one would dispatch to this function instead.

    For every row whose token file can be read, four prompts are built (two
    text -> speech, two speech -> description) and tokenized. Prompts are
    appended to the current block until the next one would overflow
    `block_size`; the block is then written and a new one started.

    Args:
        files: `(rows, index)` tuple as yielded by `chunks`. Each row is a dict
            with `text`, `token_filename`, `speaker`, `description` and
            `description_category`. `index` selects the output shard directory.
        block_size: maximum number of tokens per written row.

    Returns:
        None. Output is written to
        `gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-{index}`.
    """
    import shutil  # local import so this cell works without editing the import cell

    rows, index = files
    out_root = f'gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-{index}'
    # Start from an empty shard directory without spawning a shell.
    shutil.rmtree(out_root, ignore_errors=True)
    count = 0
    temp = []
    position_ids = []

    def write_pending():
        # Write the block accumulated so far, unless it is empty.
        o = collator(temp, position_ids)
        if o['input_ids'].shape[0] > 0:
            out.write(o)

    def add_prompt(prompt):
        nonlocal count, temp, position_ids
        ids = tokenizer(prompt, add_special_tokens=False)['input_ids']
        if len(ids) > block_size:
            # A single prompt that cannot fit in any block would otherwise be
            # written as an over-length row; drop it instead.
            return
        # Positions restart at 0 for every prompt inside a packed block.
        position = range(len(ids))
        if count + len(ids) > block_size:
            write_pending()
            temp = [ids]
            position_ids = [position]
            count = len(ids)
        else:
            temp.append(ids)
            position_ids.append(position)
            count += len(ids)

    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):
            text = row['text']
            try:
                with open(row['token_filename']) as fopen:
                    token = json.load(fopen)
            except (OSError, ValueError):
                # Missing or invalid token file: skip the row, but do not
                # swallow unrelated errors or KeyboardInterrupt.
                continue

            # Sanity filter: skip rows with more whitespace-separated words
            # than speech tokens.
            if len(text.split()) > len(token):
                continue

            left = row['speaker'] + ': ' + text
            token = ''.join([f'<|s_{t}|>' for t in token])

            # NOTE(review): the second prompt tags `description_category` with
            # `<|description|>` while the fourth uses `<|description_category|>`;
            # kept as-is, confirm this asymmetry is intended.
            prompts = [
                f'<|im_start|>{left}<|description|>{row["description"]}<|speech_start|>{token}<|im_end|>',
                f'<|im_start|>{left}<|description|>{row["description_category"]}<|speech_start|>{token}<|im_end|>',
                f'<|im_start|>{token}<|description|>{row["description"]}<|im_end|>',
                f'<|im_start|>{token}<|description_category|>{row["description_category"]}<|im_end|>',
            ]
            for prompt in prompts:
                add_prompt(prompt)

        # Flush the final, partially filled block.
        if temp:
            write_pending()

In [41]:
# Smoke-test the packing worker on the first 100 records, written as shard 0,
# before launching the full parallel run.
loop((data[:100], 0))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 201.01it/s]


In [44]:
# Read back the smoke-test shard and check how many packed rows it holds.
dataset = LocalDataset('gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-0')
len(dataset)

23

In [46]:
# Decode the first packed row back to text to eyeball the prompt format.
tokenizer.decode(dataset[0]['input_ids'])

'<|im_start|>multilingual-tts_audio_Grace: Мария (Spanish) sagte "Ich liebe Phở" (Vietnamese), während sie in der Berliner (German) Sonne das Buch "El Quijote" (Spanish) las.<|description|>The audio features a middle-aged female with a distinct Germanic accent, speaking in a neutral tone at a very slow pace. Her speech is consistently fluent and clear throughout, with a very low pitch that lends a deep, resonant quality to her voice. The recording environment is very confined, giving the audio a tight, enclosed sound. Despite the monotone delivery, the clarity and quality of the speech are excellent, making it easy to follow and understand. The background is free of any distracting noise, enhancing the overall listening experience. The content seems to blend elements from different languages and cultures, suggesting a multicultural or multilingual context, possibly from a literary or cultural discussion. This audio could be categorized as a cultural or literary commentary, suitable for

In [47]:
# Pack the full dataset in parallel. `returned = False` because `loop` writes
# its shard to disk and returns nothing. With 10 cores the floor-divided chunk
# size leaves a small remainder chunk, hence the 11 shard folders listed below.
multiprocessing(data, loop, cores = 10, returned = False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48125/48125 [03:41<00:00, 217.16it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 245.72it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48125/48125 [03:47<00:00, 211.88it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48125/48125 [04:39<00:00, 171.93it/s]
100%|███████████████████████

In [48]:
# Collect the per-worker shard directories and order them by their numeric
# suffix (so tokenized-10 comes after tokenized-9, not after tokenized-1).
folders = glob('gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-*')
folders.sort(key=lambda path: int(path.split('-')[-1]))
folders

['gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-0',
 'gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-1',
 'gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-2',
 'gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-3',
 'gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-4',
 'gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-5',
 'gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-6',
 'gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-7',
 'gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-8',
 'gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-9',
 'gfs/01be5b33/tokenized-10k-qwen3-expressive/tokenized-10']

In [49]:
# Remove any previous merged output before it is rewritten in the next cell.
!rm -rf gfs/01be5b33/multipacking-10k-expressive

In [50]:
# Merge every per-worker shard directory into one MDS dataset.
with MDSWriter(out='gfs/01be5b33/multipacking-10k-expressive', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
        except Exception as e:
            # Best-effort: skip a shard directory that cannot be opened, but
            # say which one so an incomplete merge does not go unnoticed.
            print(f'skipping {f}: {e!r}')
            continue
        # Read/write errors are deliberately NOT caught here: swallowing them
        # would leave a silently truncated merged dataset.
        for i in tqdm(range(len(dataset))):
            out.write(dataset[i])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11278/11278 [00:15<00:00, 712.60it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11294/11294 [00:20<00:00, 556.41it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11250/11250 [00:20<00:00, 535.78it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11263/11263 [00:24<00:00, 461.09it/s]
100%|███████████████████████

In [51]:
dataset = LocalDataset('gfs/01be5b33/multipacking-10k-expressive')
# Estimated size in billions of tokens. Rows are packed up to
# `sequence_length` tokens each, so this is an upper bound rather than an
# exact count; the packing constant is used instead of a hard-coded 10240.
(len(dataset) * sequence_length) / 1e9

1.15283968

In [None]:
# !hf upload Scicom-intl/expressive-multipacking-10k \
# gfs/01be5b33/multipacking-10k-expressive --repo-type=dataset --private