In [1]:
import itertools
from glob import glob

import numpy as np
import pandas as pd
from multiprocess import Pool
from streaming import LocalDataset
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from tqdm import tqdm

def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of at most ``n`` items.

    Yields ``(slice, index)`` pairs, where ``index`` is the 0-based position
    of the slice among all chunks. The last slice is shorter than ``n`` when
    ``len(l)`` is not a multiple of ``n``.
    """
    for chunk_no, start in enumerate(range(0, len(l), n)):
        yield (l[start: start + n], chunk_no)

def multiprocessing(strings, function, cores=6, returned=True):
    """Apply ``function`` to chunks of ``strings`` in a pool of worker processes.

    ``strings`` is split with ``chunks`` into slices of ``len(strings) // cores``
    items (never fewer than one item per slice), and ``function`` is called
    once per slice with a ``(slice, chunk_index)`` tuple.

    Args:
        strings: Sequence of work items; must support ``len`` and slicing.
        function: Callable taking one ``(slice, chunk_index)`` tuple and
            returning an iterable of results.
        cores: Number of worker processes to start.
        returned: When true, return the per-chunk results flattened into a
            single list (in chunk order); otherwise return ``None``.

    Returns:
        A flat list of results if ``returned`` is true, else ``None``.
    """
    # Nothing to process: do not spawn workers. An empty input used to give a
    # chunk size of 0 and crash inside ``range``.
    if len(strings) == 0:
        return [] if returned else None

    # Floor division is 0 whenever there are fewer items than cores, which is
    # an invalid step for ``chunks``; clamp to one item per chunk.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Always release the worker processes, even when a worker raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))

class UInt32(Encoding):
    """Column encoding that stores a NumPy array as raw ``uint32`` bytes."""

    def encode(self, obj) -> bytes:
        """Serialize ``obj`` to the raw bytes of its ``uint32`` representation.

        The input is coerced to ``np.uint32`` first so the byte layout always
        matches what ``decode`` reads back. Without the coercion, an array of
        another dtype (e.g. ``int64``) would be written with a different item
        size and decode to the wrong values.
        """
        return np.asarray(obj, dtype=np.uint32).tobytes()

    def decode(self, data: bytes):
        """Interpret ``data`` as a flat array of ``uint32`` values.

        ``np.frombuffer`` returns a view over ``data`` rather than a copy, so
        the resulting array is read-only when ``data`` is ``bytes``.
        """
        return np.frombuffer(data, np.uint32)

# Register the codec under the name 'uint32' in streaming's encoding table
# (presumably so datasets declaring a 'uint32' column can be read below).
_encodings['uint32'] = UInt32

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Gather every shard directory and order them by the integer after the last
# '-', so that e.g. "tokenized-10" sorts after "tokenized-9".
shard_dirs = glob('tokenized-4k-qwen3/tokenized-*')
folders = sorted(shard_dirs, key=lambda path: int(path.split('-')[-1]))
folders

['tokenized-4k-qwen3/tokenized-0',
 'tokenized-4k-qwen3/tokenized-1',
 'tokenized-4k-qwen3/tokenized-2',
 'tokenized-4k-qwen3/tokenized-3',
 'tokenized-4k-qwen3/tokenized-4',
 'tokenized-4k-qwen3/tokenized-5',
 'tokenized-4k-qwen3/tokenized-6',
 'tokenized-4k-qwen3/tokenized-7',
 'tokenized-4k-qwen3/tokenized-8',
 'tokenized-4k-qwen3/tokenized-9',
 'tokenized-4k-qwen3/tokenized-10',
 'tokenized-4k-qwen3/tokenized-11',
 'tokenized-4k-qwen3/tokenized-12',
 'tokenized-4k-qwen3/tokenized-13',
 'tokenized-4k-qwen3/tokenized-14',
 'tokenized-4k-qwen3/tokenized-15',
 'tokenized-4k-qwen3/tokenized-16',
 'tokenized-4k-qwen3/tokenized-17',
 'tokenized-4k-qwen3/tokenized-18',
 'tokenized-4k-qwen3/tokenized-19',
 'tokenized-4k-qwen3/tokenized-20',
 'tokenized-4k-qwen3/tokenized-21',
 'tokenized-4k-qwen3/tokenized-22',
 'tokenized-4k-qwen3/tokenized-23',
 'tokenized-4k-qwen3/tokenized-24',
 'tokenized-4k-qwen3/tokenized-25',
 'tokenized-4k-qwen3/tokenized-26',
 'tokenized-4k-qwen3/tokenized-27',
 '

In [3]:
# Open the first shard directory and display its first sample.
first_folder = folders[0]
dataset = LocalDataset(first_folder)
dataset[0]

{'input_ids': array([151644,    868,   6029,   5251,     52,   1740,     12,   1911,
            49,  29688,  85311,   2039,     53,     16,     23,     21,
            25,    350,   5658,     11,    259,   5658,     11,    259,
          5658,     11,    259,   5658,     13, 151669, 192623, 203889,
        204262, 204040, 207452, 208679, 207405, 206385, 208296, 191069,
        191628, 208186, 206095, 207422, 212668, 206024, 206155, 206043,
        207100, 207116, 205976, 194860, 190700, 192801, 174381, 203002,
        190882, 189855, 206728, 185211, 188483, 168478, 190142, 174920,
        210075, 206515, 176531, 190460, 152255, 186878, 205775, 158540,
        207384, 210167, 211636, 193935, 178120, 168844, 204862, 184823,
        207866, 168951, 172121, 212763, 203303, 208471, 215608, 210356,
        193184, 157092, 193683, 200924, 173784, 198986, 176701, 171774,
        199388, 159174, 153356, 154366, 198416, 158203, 170704, 200190,
        178956, 170747, 170494, 199368, 178427, 171

In [5]:
# Number of tokens in the first sample (size of its `input_ids` array).
first_input_ids = dataset[0]['input_ids']
first_input_ids.shape[0]

130

In [4]:
def loop(folders):
    """Record the token length of every sample in the given dataset folders.

    Args:
        folders: A ``(folder_list, chunk_index)`` tuple, as produced by
            ``chunks``. The chunk index is ignored.

    Returns:
        A list with one dict per sample: ``'f'`` is the folder path, ``'l'``
        the length of the sample's ``input_ids`` and ``'i'`` the sample's
        index inside that folder's dataset.
    """
    shard_dirs, _chunk_index = folders
    records = []
    for shard_dir in shard_dirs:
        shard = LocalDataset(shard_dir)
        for sample_idx in tqdm(range(len(shard))):
            n_tokens = shard[sample_idx]['input_ids'].shape[0]
            records.append({'f': shard_dir, 'l': n_tokens, 'i': sample_idx})
    return records

In [5]:
# Try `loop` on the first folder only before running it over everything.
first_folder_chunk = (folders[:1], 0)
data = loop(first_folder_chunk)

100%|██████████| 464077/464077 [00:12<00:00, 36059.91it/s]


In [6]:
# Inspect the last ten records produced by the single-folder run.
data[-10:]

[{'f': 'tokenized-4k-qwen3/tokenized-0', 'l': 501, 'i': 464067},
 {'f': 'tokenized-4k-qwen3/tokenized-0', 'l': 504, 'i': 464068},
 {'f': 'tokenized-4k-qwen3/tokenized-0', 'l': 553, 'i': 464069},
 {'f': 'tokenized-4k-qwen3/tokenized-0', 'l': 461, 'i': 464070},
 {'f': 'tokenized-4k-qwen3/tokenized-0', 'l': 534, 'i': 464071},
 {'f': 'tokenized-4k-qwen3/tokenized-0', 'l': 389, 'i': 464072},
 {'f': 'tokenized-4k-qwen3/tokenized-0', 'l': 519, 'i': 464073},
 {'f': 'tokenized-4k-qwen3/tokenized-0', 'l': 393, 'i': 464074},
 {'f': 'tokenized-4k-qwen3/tokenized-0', 'l': 459, 'i': 464075},
 {'f': 'tokenized-4k-qwen3/tokenized-0', 'l': 451, 'i': 464076}]

In [7]:
# Scan every folder in parallel with one worker process per folder, then
# report how many records were collected in total.
worker_count = len(folders)
data = multiprocessing(folders, loop, cores=worker_count)
len(data)

100%|██████████| 37/37 [00:00<00:00, 2664.97it/s]it/s]s]
100%|██████████| 464077/464077 [00:16<00:00, 27593.85it/s]
100%|██████████| 464071/464071 [01:42<00:00, 4543.02it/s]
100%|██████████| 464062/464062 [01:48<00:00, 4259.45it/s]
100%|██████████| 464070/464070 [01:49<00:00, 4237.68it/s]
100%|██████████| 464035/464035 [01:49<00:00, 4224.13it/s]
100%|██████████| 464076/464076 [01:51<00:00, 4150.68it/s]
100%|██████████| 463891/463891 [01:52<00:00, 4140.57it/s]
100%|██████████| 464064/464064 [01:52<00:00, 4142.10it/s]
100%|██████████| 464043/464043 [01:52<00:00, 4143.04it/s]
100%|██████████| 464067/464067 [01:55<00:00, 4032.56it/s]
100%|██████████| 464077/464077 [01:56<00:00, 3996.37it/s]
100%|██████████| 463742/463742 [01:56<00:00, 3992.09it/s]
100%|██████████| 464071/464071 [01:57<00:00, 3963.97it/s]
100%|██████████| 464077/464077 [01:58<00:00, 3931.34it/s]
100%|██████████| 464068/464068 [01:58<00:00, 3923.39it/s]
100%|██████████| 464063/464063 [01:58<00:00, 3923.71it/s]
100%|█████████

18562408

In [8]:
# Preview the first ten records as a table (pandas is imported in the top
# import cell).
pd.DataFrame(data[:10])

Unnamed: 0,f,l,i
0,tokenized-4k-qwen3/tokenized-0,130,0
1,tokenized-4k-qwen3/tokenized-0,324,1
2,tokenized-4k-qwen3/tokenized-0,77,2
3,tokenized-4k-qwen3/tokenized-0,174,3
4,tokenized-4k-qwen3/tokenized-0,110,4
5,tokenized-4k-qwen3/tokenized-0,383,5
6,tokenized-4k-qwen3/tokenized-0,102,6
7,tokenized-4k-qwen3/tokenized-0,591,7
8,tokenized-4k-qwen3/tokenized-0,246,8
9,tokenized-4k-qwen3/tokenized-0,457,9


In [9]:
# Write all records (folder, token length, sample index) to a Parquet file.
# pandas is imported in the top import cell, so it is not re-imported here.
pd.DataFrame(data).to_parquet('sort-merge.parquet')

In [10]:
# Check the on-disk size of the Parquet file written above.
!ls -lh sort-merge.parquet

-rw-rw-r-- 1 ubuntu ubuntu 100M Nov 11 06:36 sort-merge.parquet


In [11]:
# Total on-disk size of the tokenized dataset directory.
!du -hs tokenized-4k-qwen3

24G	tokenized-4k-qwen3


In [12]:
# Upload the Parquet file to the Scicom-intl/sort-multilingual-tts dataset
# repo, keeping the same file name inside the repo.
!hf upload Scicom-intl/sort-multilingual-tts sort-merge.parquet sort-merge.parquet --repo-type=dataset

Processing Files (0 / 0)      : |                  |  0.00B /  0.00B            
New Data Upload               : |                  |  0.00B /  0.00B            [A

  sort-merge.parquet          :   4%|▌             | 3.82MB /  104MB            [A[A

Processing Files (0 / 1)      :   4%|▌             | 3.82MB /  104MB,   ???B/s  [A[A

Processing Files (0 / 1)      :  13%|█▊            | 13.2MB /  104MB, 46.9MB/s  [A[A

  sort-merge.parquet          :  13%|█▊            | 13.2MB /  104MB            [A[A

  sort-merge.parquet          :  13%|█▊            | 13.2MB /  104MB            [A[A

  sort-merge.parquet          :  13%|█▊            | 13.2MB /  104MB            [A[A

  sort-merge.parquet          :  13%|█▊            | 13.2MB /  104MB            [A[A

  sort-merge.parquet          :  13%|█▊            | 13.2MB /  104MB            [A[A

  sort-merge.parquet          :  13%|█▊            | 13.2MB /  104MB            [A[A

  sort-merge.parquet          :  13%|█▊   

In [18]:
# Upload the whole tokenized directory to the same dataset repo.
# NOTE(review): the CLI output of this cell warns that uploading a large
# folder in one go may fail and recommends `hf upload-large-folder` (or
# `HfApi().upload_large_folder(...)`) instead — consider switching.
!hf upload Scicom-intl/sort-multilingual-tts tokenized-4k-qwen3 tokenized-4k-qwen3 --repo-type=dataset

It seems you are trying to upload a large folder at once. This might take some time and then fail if the folder is too large. For such cases, it is recommended to upload in smaller batches or to use `HfApi().upload_large_folder(...)`/`hf upload-large-folder` instead. For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#upload-a-large-folder.
Start hashing 438 files.
Finished hashing 438 files.
Processing Files (0 / 0)      : |                  |  0.00B /  0.00B            
New Data Upload               : |                  |  0.00B /  0.00B            [A

  ...kenized-0/shard.00004.mds:   1%|▏             |  912kB / 67.1MB            [A[A


  ...kenized-0/shard.00001.mds:   1%|▏             |  908kB / 67.1MB            [A[A[A

  ...kenized-0/shard.00004.mds:   1%|▏             |  912kB / 67.1MB            [A[A


Processing Files (0 / 2)      :   0%|              | 1.82MB / 16.5GB, 1.14MB/s  [A[A[A
New Data Upload               :   0%|