In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AddedToken
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import numpy as np
from tqdm import tqdm
from multiprocess import Pool
import itertools
import json
import os

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
    'audio': 'str',
    'text': 'str'
}
hashes = 'sha1', 'xxh64'

datasets = ['ayat_aktif_pasif', 'coding', 'malaysian_reasoning', 'meta_prompt', 'multiple_choice_qa']

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]


In [2]:
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-72B-Instruct')

In [3]:
ds = load_dataset("Scicom-intl/Malaysian-Instructions", datasets[0])

In [4]:
data = []

ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'ayat_aktif_pasif')
for i in range(len(ds['train'])):
    messages = [
        {'role': 'user', 'content': ds['train'][i]['input']},
        {'role': 'assistant', 'content': ds['train'][i]['output']}
    ]
    data.append(messages)

In [5]:
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'coding')
for i in range(len(ds['train'])):
    messages = [
        {'role': 'user', 'content': ds['train'][i]['question']},
        {'role': 'assistant', 'content': ds['train'][i]['answer']}
    ]
    data.append(messages)

In [6]:
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'malaysian_reasoning')
for i in range(len(ds['train'])):
    messages = [
        {'role': 'system', 'content': ds['train'][i]['system']},
        {'role': 'user', 'content': ds['train'][i]['input']},
        {'role': 'assistant', 'content': ds['train'][i]['reasoning']}
    ]
    data.append(messages)

In [7]:
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'meta_prompt')
for i in range(len(ds['train'])):
    messages = [
        {'role': 'user', 'content': ds['train'][i]['input']},
        {'role': 'assistant', 'content': ds['train'][i]['answer']}
    ]
    data.append(messages)

In [8]:
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'multiple_choice_qa')
for i in range(len(ds['train'])):
    messages = [
        {'role': 'user', 'content': ds['train'][i]['question']},
        {'role': 'assistant', 'content': ds['train'][i]['answer']}
    ]
    data.append(messages)

In [9]:
len(data)

225265

In [15]:
!rm -rf tokenized-qwen2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
def collator(batch, batch_position_ids):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
        'audio': '',
        'text': '',
    }

sequence_length = 1024 * 16
def loop(files, block_size = sequence_length):
    rows, index = files
    out_root = f'tokenized-qwen2/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):

            t = tokenizer.apply_chat_template(row, tokenize=False)
            outputs = tokenizer(t, add_special_tokens=False)
            position = range(len(outputs['input_ids']))
            length = len(outputs['input_ids'])

            if length > block_size:
                continue
            
            if count + length > block_size:
                o = collator(temp, position_ids)
                if o['input_ids'].shape[0] > 0:
                    out.write(o)
                temp = [outputs['input_ids']]
                position_ids = [position]
                count = length
                
            else:
                temp.append(outputs['input_ids'])
                position_ids.append(range(len(outputs['input_ids'])))
                count += len(outputs['input_ids'])
        
        if len(temp):
            o = collator(temp, position_ids)
            if o['input_ids'].shape[0] > 0:
                out.write(o)

In [17]:
loop((data[:100], 0))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 3184.16it/s]


In [18]:
multiprocessing(data, loop, cores = 20, returned=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [19]:
from glob import glob

folders = sorted(glob('tokenized-qwen2/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders

['tokenized-qwen2/tokenized-0',
 'tokenized-qwen2/tokenized-1',
 'tokenized-qwen2/tokenized-2',
 'tokenized-qwen2/tokenized-3',
 'tokenized-qwen2/tokenized-4',
 'tokenized-qwen2/tokenized-5',
 'tokenized-qwen2/tokenized-6',
 'tokenized-qwen2/tokenized-7',
 'tokenized-qwen2/tokenized-8',
 'tokenized-qwen2/tokenized-9',
 'tokenized-qwen2/tokenized-10',
 'tokenized-qwen2/tokenized-11',
 'tokenized-qwen2/tokenized-12',
 'tokenized-qwen2/tokenized-13',
 'tokenized-qwen2/tokenized-14',
 'tokenized-qwen2/tokenized-15',
 'tokenized-qwen2/tokenized-16',
 'tokenized-qwen2/tokenized-17',
 'tokenized-qwen2/tokenized-18',
 'tokenized-qwen2/tokenized-19',
 'tokenized-qwen2/tokenized-20']

In [20]:
!rm -rf multipacking-qwen2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
with MDSWriter(out='multipacking-qwen2', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1578/1578 [00:00<00:00, 3624.92it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 420/420 [00:00<00:00, 2323.01it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 418/418 [00:00<00:00, 2489.92it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 416/416 [00:00<00:00, 14958.21it/s]
100%|███████████████████████

In [22]:
dataset = LocalDataset('multipacking-qwen2')
len(dataset)

10253

In [23]:
dataset[0]

{'attention_mask': array([ 93,  93,  75,  75,  81,  82,  89,  89,  80,  80, 103, 103,  79,
         79, 101, 102,  79,  79,  80,  81,  99,  99,  84,  84,  83,  84,
         88,  89,  89,  90,  91,  91,  87,  88,  71,  72,  68,  67,  73,
         73,  99,  99, 101, 101,  93,  94,  92,  92,  79,  79,  80,  80,
         86,  85,  83,  83,  79,  79,  93,  93, 102, 102,  84,  84,  83,
         82,  87,  88,  81,  81,  93,  93,  91,  91,  85,  86,  86,  86,
         82,  82,  91,  91,  83,  84,  88,  87,  85,  85,  90,  90,  87,
         87,  84,  83,  99,  99,  81,  81,  76,  76,  99,  98,  97,  97,
         89,  89,  99,  99,  87,  88,  93,  93,  81,  82,  86,  85,  84,
         84,  80,  80,  81,  82,  81,  81,  73,  74,  86,  85,  85,  85,
         85,  85,  82,  81,  94,  93,  95,  95,  85,  85,  78,  78,  76,
         75,  74,  73,  96,  95,  91,  91,  86,  87,  78,  78,  85,  85,
         82,  81,  81,  81,  86,  87,  84,  84,  89,  89,  85,  85,  89,
         90,  84,  85,  88,  88, 