In [1]:
from glob import glob
import pandas as pd
import json
import os
import torch
import IPython.display as ipd

torch.set_grad_enabled(False)

from transformers import AutoTokenizer, AddedToken
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import numpy as np
from tqdm import tqdm
from multiprocess import Pool
import itertools

def chunks(l, n):
    """
    Lazily cut `l` into consecutive slices of at most `n` items.

    Yields `(slice, slice_number)` tuples, where `slice_number` counts the
    slices from 0 in the order they are produced.
    """
    for chunk_number, start in enumerate(range(0, len(l), n)):
        stop = start + n
        yield (l[start:stop], chunk_number)

def multiprocessing(strings, function, cores=6, returned=True):
    """
    Map `function` over contiguous chunks of `strings` using a process pool.

    Parameters
    ----------
    strings : sequence
        Anything supporting `len()` and slicing; it is split with `chunks`.
    function : callable
        Called once per chunk with a single `(chunk, chunk_index)` tuple.
    cores : int
        Number of worker processes; also the divisor used to size the chunks.
        When `len(strings)` is not a multiple of `cores`, the remainder forms
        one additional, shorter chunk.
    returned : bool
        If True, every per-chunk result is expected to be iterable and the
        results are flattened into one list. If False, nothing is returned.

    Returns
    -------
    list or None
    """
    # Floor division gives 0 when there are fewer items than cores, and
    # range() refuses a zero step, so never let the chunk size drop below 1.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)
    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Release the workers even when a task raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain.from_iterable(pooled))

class UInt32(Encoding):
    """Custom MDS encoding that stores an array as raw uint32 bytes.

    Only the flat values survive a round trip: `decode` returns a
    one-dimensional uint32 array regardless of the shape given to `encode`.
    """

    def encode(self, obj) -> bytes:
        # decode() always reinterprets the buffer as uint32, so coerce first;
        # serializing any other dtype as-is would round-trip to wrong values.
        return np.asarray(obj, dtype=np.uint32).tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

# Register the encoding under the name used in the `columns` schema.
_encodings['uint32'] = UInt32

# Column schema handed to every MDSWriter in this notebook. 'uint32' is the
# custom encoding registered in `_encodings`; 'audio' and 'text' are always
# written as empty strings by `collator`.
columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
    'audio': 'str',
    'text': 'str'
}
# Hash algorithm names passed to MDSWriter via its `hashes` argument.
hashes = 'sha1', 'xxh64'

def new_path(f):
    """Append '_trim' to the first '/'-separated component of `f`; the rest is unchanged."""
    top_level, separator, remainder = f.partition('/')
    return top_level + '_trim' + separator + remainder

def new_path_neucodec(f):
    """
    Map an audio path to its JSON counterpart under a '<top-level>_neucodec' folder.

    '_neucodec' is appended to the first '/'-separated component, and every
    occurrence of '.mp3' or '.wav' in the resulting path becomes '.json'.
    """
    parts = f.split('/')
    target_dir = parts[0] + '_neucodec'
    relative = '/'.join(parts[1:])
    candidate = os.path.join(target_dir, relative)
    # Same order as before: '.mp3' first, then '.wav'.
    for audio_ext in ('.mp3', '.wav'):
        candidate = candidate.replace(audio_ext, '.json')
    return candidate

  import pynvml  # type: ignore[import]


In [2]:
from datasets import load_dataset

# Load the Malaysian-Emilia dataset; only its 'train' split is used below.
ds = load_dataset("Scicom-intl/Malaysian-Emilia")

README.md:   0%|          | 0.00/752 [00:00<?, ?B/s]

data/train-00000-of-00015.parquet:   0%|          | 0.00/44.9M [00:00<?, ?B/s]

data/train-00001-of-00015.parquet:   0%|          | 0.00/44.5M [00:00<?, ?B/s]

data/train-00002-of-00015.parquet:   0%|          | 0.00/44.9M [00:00<?, ?B/s]

data/train-00003-of-00015.parquet:   0%|          | 0.00/44.8M [00:00<?, ?B/s]

data/train-00004-of-00015.parquet:   0%|          | 0.00/44.6M [00:00<?, ?B/s]

data/train-00005-of-00015.parquet:   0%|          | 0.00/44.9M [00:00<?, ?B/s]

data/train-00006-of-00015.parquet:   0%|          | 0.00/44.5M [00:00<?, ?B/s]

data/train-00007-of-00015.parquet:   0%|          | 0.00/62.6M [00:00<?, ?B/s]

data/train-00008-of-00015.parquet:   0%|          | 0.00/69.0M [00:00<?, ?B/s]

data/train-00009-of-00015.parquet:   0%|          | 0.00/68.8M [00:00<?, ?B/s]

data/train-00010-of-00015.parquet:   0%|          | 0.00/68.8M [00:00<?, ?B/s]

data/train-00011-of-00015.parquet:   0%|          | 0.00/60.8M [00:00<?, ?B/s]

data/train-00012-of-00015.parquet:   0%|          | 0.00/59.6M [00:00<?, ?B/s]

data/train-00013-of-00015.parquet:   0%|          | 0.00/57.9M [00:00<?, ?B/s]

data/train-00014-of-00015.parquet:   0%|          | 0.00/57.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8664602 [00:00<?, ? examples/s]

In [3]:
# Materialize the whole train split as a Python list so it can be sliced into per-worker chunks.
rows = ds['train'].to_list()

In [4]:
# Total number of rows in the train split.
len(rows)

8664602

In [5]:
# Distinct reference clips, leaving out rows whose top-level folder contains 'malaysian-chinese'.
files = {
    r['reference_audio']
    for r in tqdm(rows)
    if 'malaysian-chinese' not in r['reference_audio'].split('/')[0]
}
len(files)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8664602/8664602 [00:03<00:00, 2390703.29it/s]


1414505

In [6]:
# Derived '_neucodec' JSON path for every kept row, evaluated lazily row by row.
expected_paths = (
    new_path_neucodec(new_path(r['reference_audio']))
    for r in tqdm(rows)
    if 'malaysian-chinese' not in r['reference_audio'].split('/')[0]
)
# Keep only the paths that are missing on disk.
not_exists = [path for path in expected_paths if not os.path.exists(path)]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8664602/8664602 [00:24<00:00, 360866.62it/s]


In [9]:
# Derived JSON path for the first row's reference audio.
f = new_path_neucodec(new_path(rows[0]['reference_audio']))

In [10]:
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-1.7B-Base')
# One '<|speech_start|>' marker followed by one '<|s_i|>' token for each i in range(65536).
speech_vocab = ['<|speech_start|>', *(f'<|s_{i}|>' for i in range(65536))]
extra = [AddedToken(token_text) for token_text in speech_vocab]
tokenizer.add_tokens(extra)

65537

In [11]:
import gc

def collator(batch, batch_position_ids):
    """
    Flatten several tokenized samples into one packed record.

    `batch[k]` and `batch_position_ids[k]` are the token ids and position ids
    of sample k. They are concatenated in order, while 'attention_mask'
    carries the length of each sample (not a 0/1 mask). 'audio' and 'text'
    are always empty strings.
    """
    def as_uint32(values):
        return np.array(values).astype(np.uint32)

    flat_tokens, flat_positions, lengths = [], [], []
    for idx, sample in enumerate(batch):
        lengths.append(len(sample))
        flat_tokens.extend(sample)
        flat_positions.extend(batch_position_ids[idx])

    packed = {
        'input_ids': as_uint32(flat_tokens),
        'position_ids': as_uint32(flat_positions),
        'attention_mask': as_uint32(lengths),
    }
    packed['audio'] = ''
    packed['text'] = ''
    return packed

def slice_and_balance(nested_list, size):
    """
    Split `nested_list` into a head holding at most `size` items in total and the leftover.

    Sublists are consumed in order. The sublist that crosses the limit is cut
    in two: its leading part completes the head and its tail opens the
    leftover. Everything after that goes to the leftover untouched.

    Returns `(head, leftover)`, both lists of sublists.
    """
    head, leftover = [], []
    filled = 0

    for item in nested_list:
        room = size - filled
        if room <= 0:
            # Head is already full.
            leftover.append(item)
            continue
        if len(item) > room:
            # Straddles the limit: split at the remaining capacity.
            head.append(item[:room])
            leftover.append(item[room:])
            filled = size
            continue
        head.append(item)
        filled += len(item)

    return head, leftover

In [12]:
import time

# Default upper bound on the number of tokens packed into one written sample.
sequence_length = 1024 * 10


def _load_codes(audio_path):
    """Return the parsed JSON stored for `audio_path`, or None when it cannot be loaded."""
    try:
        with open(new_path_neucodec(new_path(audio_path))) as fopen:
            return json.load(fopen)
    except Exception:
        # Best-effort: a missing or unreadable file only drops the row. Unlike
        # a bare `except:`, this still lets KeyboardInterrupt/SystemExit through.
        return None


def _speech_prompt(text, codes):
    """Render one utterance as '<|im_start|>{text}<|speech_start|>{speech tokens}<|im_end|>'."""
    speech = ''.join(f'<|s_{t}|>' for t in codes)
    return f'<|im_start|>{text}<|speech_start|>{speech}<|im_end|>'


def _write_block(out, token_lists, position_lists):
    """Collate the pending samples and write them as one packed record, if there is anything to write."""
    if not token_lists:
        return
    o = collator(token_lists, position_lists)
    if o['input_ids'].shape[0] > 0:
        out.write(o)


def loop(files, block_size = sequence_length):
    """
    Tokenize one chunk of rows and pack the results into an MDS directory.

    Parameters
    ----------
    files : tuple
        `(rows, index)` as produced by `chunks`; `index` names the output
        directory 'malaysian-emilia/tokenized-{index}', which is wiped first.
    block_size : int
        Maximum number of tokens accumulated before a packed record is written.

    A row is skipped when:
    - its reference folder contains 'malaysian-chinese';
    - either JSON code file cannot be loaded;
    - either text has more whitespace-separated words than there are codes;
    - its tokenized prompt alone is longer than `block_size`.

    Position ids restart at 0 for every row inside a packed record.
    Returns None; the result is the data written to disk.
    """
    import shutil

    rows, index = files
    out_root = f'malaysian-emilia/tokenized-{index}'
    # Remove any previous output without going through a shell.
    shutil.rmtree(out_root, ignore_errors=True)
    count = 0
    temp = []
    position_ids = []
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):

            if 'malaysian-chinese' in row['reference_audio'].split('/')[0]:
                continue

            left = _load_codes(row['reference_audio'])
            if left is None:
                continue

            right = _load_codes(row['target_audio'])
            if right is None:
                continue

            left_text = row['reference_text']
            right_text = row['target_text']

            if len(left_text.split()) > len(left):
                continue

            if len(right_text.split()) > len(right):
                continue

            # Reference utterance first, target utterance second.
            prompt = _speech_prompt(left_text, left) + _speech_prompt(right_text, right)

            outputs = tokenizer(prompt, add_special_tokens = False)
            length = len(outputs['input_ids'])
            position = range(length)

            if length > block_size:
                # A single row that cannot fit would otherwise be written as
                # an oversized block; drop it like other unusable rows.
                continue

            if count + length > block_size:
                # Current block is full: write it and start a new one with this row.
                _write_block(out, temp, position_ids)
                temp = [outputs['input_ids']]
                position_ids = [position]
                count = length
            else:
                temp.append(outputs['input_ids'])
                position_ids.append(position)
                count += length

        # Write whatever is left in the last, partially filled block.
        _write_block(out, temp, position_ids)

In [None]:
# Tokenize and pack in parallel with a pool of 40 workers. Each chunk of `rows`
# goes to `loop`, which writes its own 'malaysian-emilia/tokenized-{index}'
# directory; returned = False because `loop` returns nothing to collect.
multiprocessing(rows, loop, cores = 40, returned = False)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 216615/216615 [00:00<00:00, 3426690.00it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 216615/216615 [00:00<00:00, 3044664.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 216615/216615 [00:00<00:00, 3058171.66it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 216615/216615 [00:00<00:00, 2898906.42it/s]
100%|███████████████████████

In [None]:
# Per-chunk output directories, ordered numerically by the index after the last '-'.
folders = sorted(glob('malaysian-emilia/tokenized-*'), key = lambda x: int(x.split('-')[-1]))

In [None]:
# Remove any previous merged output before it is rewritten.
!rm -rf multipacking-malaysian-emilia

In [None]:
# Concatenate every per-chunk directory into a single MDS dataset. The merge is
# best-effort: a directory that fails is reported by name and skipped, so
# samples copied from it before the failure stay in the output.
failed_folders = []
with MDSWriter(out='multipacking-malaysian-emilia', columns=columns, compression=None, hashes=hashes) as out:
    for folder in folders:
        try:
            dataset = LocalDataset(local=folder)
            for i in tqdm(range(len(dataset)), desc=folder):
                out.write(dataset[i])
        except Exception as e:
            failed_folders.append(folder)
            print(f'{folder}: {e}')

if failed_folders:
    print(f'{len(failed_folders)} folder(s) failed during merge: {failed_folders}')

In [None]:
# Reopen the merged output directory as a dataset.
dataset = LocalDataset('multipacking-malaysian-emilia')

In [19]:
# Disk usage of the merged output directory.
!du -hs multipacking-malaysian-emilia

58G	multipacking-malaysian-emilia


In [18]:
# Number of packed samples in the merged dataset.
len(dataset)

797570

In [None]:
# Upload the merged directory as a private dataset repository.
# NOTE(review): the recorded output of this cell warns that uploading a large folder in one go
# may fail and recommends `hf upload-large-folder` instead — consider switching.
!hf upload Scicom-intl/Malaysian-Emilia-multipacking-10k multipacking-malaysian-emilia --repo-type=dataset --private

It seems you are trying to upload a large folder at once. This might take some time and then fail if the folder is too large. For such cases, it is recommended to upload in smaller batches or to use `HfApi().upload_large_folder(...)`/`hf upload-large-folder` instead. For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#upload-a-large-folder.
Start hashing 915 files.
