In [1]:
from glob import glob
import pandas as pd
import json
import os
import torch
import IPython.display as ipd

torch.set_grad_enabled(False)

from transformers import AutoTokenizer, AddedToken
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import numpy as np
from tqdm import tqdm
from multiprocess import Pool
import itertools

def chunks(l, n):
    """Yield ``(slice, slice_index)`` pairs covering `l` in consecutive pieces of length `n`.

    The final piece may be shorter than `n`. `slice_index` counts the pieces
    from zero.
    """
    starts = range(0, len(l), n)
    for piece_index, start in enumerate(starts):
        piece = l[start: start + n]
        yield (piece, piece_index)

def multiprocessing(strings, function, cores=6, returned=True):
    """Split `strings` into chunks and map `function` over them in a process pool.

    Args:
        strings: sized, sliceable sequence of work items.
        function: callable invoked once per chunk with the tuple
            ``(chunk, chunk_index)`` produced by ``chunks``.
        cores: number of worker processes; also used to derive the chunk size
            (``len(strings) // cores``), so a non-zero remainder produces one
            extra, shorter chunk.
        returned: when true, the per-chunk results are concatenated and
            returned as one flat list; otherwise nothing is returned.

    Returns:
        A flat list of all chunk results when `returned` is true, else None.
    """
    # With fewer items than cores the integer division yields 0, which would
    # make range() inside chunks() raise on a zero step; fall back to 1.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Always release the worker processes, even if a worker raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))

class UInt32(Encoding):
    """MDS column codec that stores an array as its raw bytes and reads it back as uint32."""

    def encode(self, obj) -> bytes:
        """Serialize `obj` by dumping its underlying buffer via ``tobytes()``."""
        raw = obj.tobytes()
        return raw

    def decode(self, data: bytes):
        """Reinterpret `data` as a flat array of ``np.uint32`` values."""
        return np.frombuffer(data, dtype=np.uint32)

# Register the custom codec under the name 'uint32' so it can be referenced
# by name in the column schema below.
_encodings['uint32'] = UInt32

# Column name -> encoding name for every sample handed to MDSWriter
# (passed as `columns=` further down).
columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
    'audio': 'str',
    'text': 'str'
}
# Hash algorithm names passed to MDSWriter as `hashes=`.
hashes = 'sha1', 'xxh64'

def new_path(f):
    """Return `f` with ``_trim`` appended to its first ``/``-separated component.

    Everything after the first ``/`` is kept unchanged.
    """
    root, separator, remainder = f.partition('/')
    return root + '_trim' + separator + remainder

def new_path_neucodec(f):
    """Map an audio path to the matching JSON path under a ``*_neucodec`` root.

    ``_neucodec`` is appended to the first ``/``-separated component and every
    ``.mp3`` / ``.wav`` occurrence in the resulting path is replaced by
    ``.json``.
    """
    parts = f.split('/')
    root = parts[0] + '_neucodec'
    relative = '/'.join(parts[1:])
    candidate = os.path.join(root, relative)
    # Same replacement order as before: '.mp3' first, then '.wav'.
    for audio_ext in ('.mp3', '.wav'):
        candidate = candidate.replace(audio_ext, '.json')
    return candidate

  import pynvml  # type: ignore[import]


In [None]:
# NOTE(review): this import lives outside the top import cell; consider moving it there.
from datasets import load_dataset

# Load the voice-conversion dataset by its repository id.
ds = load_dataset("Scicom-intl/Emilia-YODAS-Voice-Conversion")

In [None]:
# Materialize the train split as a list so it can be sliced into chunks
# for the worker processes.
rows = ds['train'].to_list()

In [None]:
# Spot-check the path mapping on the first row: the first path component gets
# '_trim' and then '_neucodec' appended, and .mp3/.wav becomes .json.
f = new_path_neucodec(new_path(rows[0]['reference_audio']))

In [13]:
# Extend the Qwen3 base tokenizer with one <|speech_start|> marker plus
# 65536 <|s_i|> tokens (presumably one per codec codebook entry — TODO confirm).
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-1.7B-Base')
extra = [AddedToken('<|speech_start|>')]
for i in range(65536):
    extra.append(AddedToken(f'<|s_{i}|>'))
# The value of this call is the cell output (65537 in the recorded run).
tokenizer.add_tokens(extra)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

65537

In [None]:
import gc

def collator(batch, batch_position_ids):
    """Concatenate several token sequences into one packed sample dict.

    Args:
        batch: list of token-id sequences.
        batch_position_ids: list of position-id sequences, indexed in
            parallel with `batch`.

    Returns:
        Dict with flat ``input_ids`` and ``position_ids`` arrays,
        ``attention_mask`` holding the length of each packed sequence (not a
        0/1 mask), and empty ``audio`` / ``text`` strings; all arrays are
        ``np.uint32``.
    """
    flat_tokens, flat_positions, segment_lengths = [], [], []
    for idx, tokens in enumerate(batch):
        flat_tokens.extend(tokens)
        flat_positions.extend(batch_position_ids[idx])
        segment_lengths.append(len(tokens))

    packed = {}
    packed['input_ids'] = np.array(flat_tokens).astype(np.uint32)
    packed['position_ids'] = np.array(flat_positions).astype(np.uint32)
    packed['attention_mask'] = np.array(segment_lengths).astype(np.uint32)
    packed['audio'] = ''
    packed['text'] = ''
    return packed

def slice_and_balance(nested_list, size):
    """Split `nested_list` into a head holding at most `size` items and the remainder.

    Sublists are consumed in order. The sublist that crosses the `size`
    boundary is cut in two; every sublist after the budget is exhausted goes
    to the remainder untouched.

    Returns:
        ``(first, balance)`` — two lists of sublists.
    """
    head, overflow = [], []
    used = 0

    for piece in nested_list:
        room = size - used
        if room <= 0:
            # Budget exhausted: everything else is carried over.
            overflow.append(piece)
            continue
        if len(piece) > room:
            head.append(piece[:room])
            overflow.append(piece[room:])
            used = size
        else:
            head.append(piece)
            used += len(piece)

    return head, overflow

In [None]:
import time

# Token budget per packed sample; used as the default `block_size` of loop().
sequence_length = 1024 * 10
def loop(files, block_size = sequence_length):
    """Tokenize one chunk of rows and pack them into MDS samples.

    Args:
        files: tuple ``(rows, index)`` as yielded by ``chunks``. ``rows`` is a
            sequence of dicts read through the ``reference_audio``,
            ``target_audio``, ``reference_text`` and ``target_text`` keys;
            ``index`` selects the output directory
            ``emilia-yodas/tokenized-{index}``.
        block_size: token budget per packed sample. A new sample is started
            whenever adding the next row would exceed it.

    Rows are skipped when either codec JSON file cannot be loaded, or when a
    text has more whitespace-separated words than its codec token list.

    NOTE(review): a single row longer than `block_size` is still written as
    its own, oversized sample — confirm whether that is intended.
    """
    import shutil  # local import so this cell does not depend on the import cell

    rows, index = files
    out_root = f'emilia-yodas/tokenized-{index}'
    # Start from a clean output directory without spawning a shell.
    shutil.rmtree(out_root, ignore_errors=True)

    def read_codes(row, key):
        """Load the codec JSON for `row[key]`; return None when it is unavailable."""
        try:
            with open(new_path_neucodec(new_path(row[key]))) as fopen:
                return json.load(fopen)
        except Exception:
            # Best-effort: unreadable or missing files just drop the row.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            return None

    def flush(out, sequences, positions):
        """Write the currently buffered sequences as one packed sample, if any."""
        if not sequences:
            return
        o = collator(sequences, positions)
        if o['input_ids'].shape[0] > 0:
            out.write(o)

    count = 0
    temp = []
    position_ids = []
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):

            left = read_codes(row, 'reference_audio')
            if left is None:
                continue

            right = read_codes(row, 'target_audio')
            if right is None:
                continue

            left_text = row['reference_text']
            right_text = row['target_text']

            # Sanity filter: more words than codec tokens is treated as a
            # bad text/audio pair.
            if len(left_text.split()) > len(left):
                continue

            if len(right_text.split()) > len(right):
                continue

            left_token = ''.join([f'<|s_{t}|>' for t in left])
            right_token = ''.join([f'<|s_{t}|>' for t in right])

            left_prompt = f'<|im_start|>{left_text}<|speech_start|>{left_token}<|im_end|>'
            right_prompt = f'<|im_start|>{right_text}<|speech_start|>{right_token}<|im_end|>'

            prompt = left_prompt + right_prompt

            token_ids = tokenizer(prompt, add_special_tokens = False)['input_ids']
            length = len(token_ids)
            # Positions restart at 0 for every row inside a packed sample.
            position = range(length)

            if count + length > block_size:
                # The buffer is full: emit it and start a new one with this row.
                flush(out, temp, position_ids)
                temp = [token_ids]
                position_ids = [position]
                count = length
            else:
                temp.append(token_ids)
                position_ids.append(position)
                count += length

        # Emit whatever is left in the buffer after the last row.
        flush(out, temp, position_ids)

In [None]:
# Tokenize and pack all rows in parallel. Each chunk writes its own
# emilia-yodas/tokenized-{index} directory, so no result is collected.
multiprocessing(rows, loop, cores = 40, returned = False)

In [4]:
# Collect the per-chunk output directories, ordered by their numeric suffix.
folders = sorted(glob('emilia-yodas/tokenized-*'), key = lambda x: int(x.split('-')[-1]))

In [5]:
!rm -rf multipacking-emilia-yodas

In [6]:
# Merge every per-chunk directory into a single MDS dataset.
failed_folders = []
with MDSWriter(out='multipacking-emilia-yodas', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            # Best-effort merge: keep going, but record which folder failed.
            # Samples written before the failure stay in the output.
            failed_folders.append(f)
            print(f'{f}: {e}')

# Surface incomplete merges explicitly so they are noticed before the
# per-chunk directories are deleted.
if failed_folders:
    print(f'{len(failed_folders)} folder(s) could not be merged completely: {failed_folders}')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 85024/85024 [00:17<00:00, 4762.79it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 85556/85556 [00:17<00:00, 4858.27it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 85644/85644 [00:17<00:00, 4796.64it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 85189/85189 [00:17<00:00, 4887.86it/s]
100%|███████████████████████

In [7]:
# Re-open the merged dataset and show how many packed samples it contains.
dataset = LocalDataset('multipacking-emilia-yodas')
len(dataset)

2743166

In [14]:
# Decode the first packed sample back to text to eyeball the prompt format.
tokenizer.decode(dataset[0]['input_ids'])

'<|im_start|>而更重要的事情是今年瑞赤人影年,也就是我们陈部长的破关之年。这种破关之年,诸事不利,行事倒行逆施。虽然他本人可能没有这个心态,也没有这个想法,但是他当他去实行他的业务的时候,就是会产生这样的结果。<|speech_start|><|s_50598|><|s_52429|><|s_55786|><|s_36294|><|s_3312|><|s_52470|><|s_22196|><|s_19085|><|s_21880|><|s_10782|><|s_12345|><|s_9741|><|s_5945|><|s_6062|><|s_35192|><|s_50328|><|s_35301|><|s_52422|><|s_22093|><|s_26500|><|s_43788|><|s_45848|><|s_30220|><|s_30486|><|s_16100|><|s_7154|><|s_56213|><|s_55613|><|s_48686|><|s_15161|><|s_13385|><|s_12907|><|s_15661|><|s_29166|><|s_28333|><|s_21210|><|s_2958|><|s_1852|><|s_17229|><|s_6953|><|s_39209|><|s_18473|><|s_44379|><|s_6937|><|s_18254|><|s_39467|><|s_50987|><|s_18201|><|s_41598|><|s_9269|><|s_34219|><|s_17664|><|s_23066|><|s_55594|><|s_3189|><|s_19494|><|s_27403|><|s_1818|><|s_17166|><|s_6192|><|s_12875|><|s_9768|><|s_37254|><|s_51761|><|s_50583|><|s_40289|><|s_2529|><|s_36342|><|s_51581|><|s_42039|><|s_12838|><|s_29862|><|s_25195|><|s_49122|><|s_41938|><|s_17273|><|s_788|><|s_553|><|s_34325|><|s_33613|><|s_1813|><|

In [8]:
!du -hs multipacking-emilia-yodas

198G	multipacking-emilia-yodas


In [9]:
!rm -rf emilia-yodas

In [None]:
!hf upload Scicom-intl/Emilia-YODAS-multipacking-10k \
multipacking-emilia-yodas --repo-type=dataset --private

It seems you are trying to upload a large folder at once. This might take some time and then fail if the folder is too large. For such cases, it is recommended to upload in smaller batches or to use `HfApi().upload_large_folder(...)`/`hf upload-large-folder` instead. For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#upload-a-large-folder.
Start hashing 3160 files.
Finished hashing 3160 files.
Processing Files (0 / 0)      : |                  |  0.00B /  0.00B            
New Data Upload               : |                  |  0.00B /  0.00B            [A

  ...lia-yodas/shard.00002.mds:   4%|▌             | 2.59MB / 67.1MB            [A[A


  ...lia-yodas/shard.00000.mds:   4%|▌             | 2.61MB / 67.1MB            [A[A[A

  ...lia-yodas/shard.00002.mds:   4%|▌             | 2.59MB / 67.1MB            [A[A


Processing Files (0 / 2)      :   0%|              | 5.20MB /  212GB, 3.71MB/s  [A[A[A
New Data Upload               :   1