In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AddedToken
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import numpy as np
from tqdm import tqdm
from multiprocess import Pool
import itertools
import json
import os

def chunks(l, n):
    """
    Yield consecutive slices of `l` of at most `n` items, paired with their index.

    Each yielded value is a tuple `(piece, piece_index)` where `piece` is
    `l[start:start + n]` and `piece_index` counts pieces from 0.
    The last piece may be shorter than `n`.
    """
    for piece_index, start in enumerate(range(0, len(l), n)):
        stop = start + n
        yield (l[start:stop], piece_index)

def multiprocessing(strings, function, cores=6, returned=True):
    """
    Split `strings` into chunks and run `function` on each chunk in a process pool.

    Parameters
    ----------
    strings : sequence
        Items to process; must support `len()` and slicing.
    function : callable
        Called once per chunk with a tuple `(chunk, chunk_index)` as produced
        by `chunks`.
    cores : int
        Number of worker processes. It also sets the chunk size
        (`len(strings) // cores`), so a non-divisible length produces one
        extra, smaller trailing chunk.
    returned : bool
        When True, every worker result is expected to be an iterable and the
        results are concatenated into one list. When False, nothing is
        returned (None).
    """
    # Floor division gives 0 when there are fewer items than workers; a zero
    # chunk size makes `chunks` call range() with a zero step and raise
    # ValueError, so clamp it to at least one item per chunk.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
        pool.close()
        pool.join()
    except BaseException:
        # Do not leave worker processes running if a task (or Ctrl-C) fails.
        pool.terminate()
        raise

    if returned:
        return list(itertools.chain(*pooled))

class UInt32(Encoding):
    """
    MDS column encoding that stores an array as raw uint32 bytes.

    `decode` always interprets the stored bytes as uint32, so `encode` must
    write exactly that layout.
    """

    def encode(self, obj) -> bytes:
        """
        Serialize `obj` as a flat buffer of uint32 values.

        The input is coerced to uint32 first. Previously an array of another
        dtype (e.g. int64) was dumped byte-for-byte and then decoded as uint32,
        which silently produced wrong values. Values are expected to fit in
        uint32.
        """
        return np.asarray(obj, dtype=np.uint32).tobytes()

    def decode(self, data: bytes):
        """Rebuild a 1-D uint32 array from the bytes written by `encode`."""
        return np.frombuffer(data, np.uint32)

# Register the custom array encoding under the 'uint32' name so that MDS columns
# declared as 'uint32' below are written and read with the UInt32 class.
# NOTE(review): `_encodings` is a private registry of the streaming package; if the
# installed version already defines a 'uint32' entry this overrides it - confirm.
_encodings['uint32'] = UInt32

# Schema of every MDS sample: three uint32 arrays plus two string fields.
columns = dict(
    input_ids='uint32',
    position_ids='uint32',
    attention_mask='uint32',
    audio='str',
    text='str',
)

# Hash algorithms passed to MDSWriter.
hashes = ('sha1', 'xxh64')

# Config names of the instruction dataset used in this notebook.
datasets = [
    'ayat_aktif_pasif',
    'coding',
    'malaysian_reasoning',
    'meta_prompt',
    'multiple_choice_qa',
]

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]


In [2]:
# Tokenizer for GLM-4.5-Air. Its chat template is used to render each conversation
# and its vocabulary is used to tokenize and later decode the packed samples.
tokenizer = AutoTokenizer.from_pretrained('zai-org/GLM-4.5-Air')

In [3]:
# Load the first config listed in `datasets` ('ayat_aktif_pasif').
# NOTE(review): the next cell loads the same config again by name, so this
# load looks redundant - confirm before removing.
ds = load_dataset("Scicom-intl/Malaysian-Instructions", datasets[0])

In [4]:
# Start the shared list of conversations. The following cells append to it, so
# re-run this cell first whenever `data` has to be rebuilt from scratch.
data = []

ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'ayat_aktif_pasif')
# Iterate over the rows directly: each record is fetched once instead of once
# per field through repeated ds['train'][i][...] lookups.
for row in ds['train']:
    data.append([
        {'role': 'user', 'content': row['input']},
        {'role': 'assistant', 'content': row['output']},
    ])

In [5]:
# Append the 'coding' config as user/assistant pairs (question -> answer).
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'coding')
# Iterate over the rows directly: each record is fetched once instead of once
# per field through repeated ds['train'][i][...] lookups.
for row in ds['train']:
    data.append([
        {'role': 'user', 'content': row['question']},
        {'role': 'assistant', 'content': row['answer']},
    ])

In [6]:
# Append the 'meta_prompt' config as user/assistant pairs (input -> answer).
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'meta_prompt')
# Iterate over the rows directly: each record is fetched once instead of once
# per field through repeated ds['train'][i][...] lookups.
for row in ds['train']:
    data.append([
        {'role': 'user', 'content': row['input']},
        {'role': 'assistant', 'content': row['answer']},
    ])

In [7]:
# Append the 'multiple_choice_qa' config as user/assistant pairs (question -> answer).
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'multiple_choice_qa')
# Iterate over the rows directly: each record is fetched once instead of once
# per field through repeated ds['train'][i][...] lookups.
for row in ds['train']:
    data.append([
        {'role': 'user', 'content': row['question']},
        {'role': 'assistant', 'content': row['answer']},
    ])

In [8]:
# Append the 'malaysian_reasoning' config. These conversations also carry a
# system prompt, and the assistant turn keeps the reasoning trace under
# 'reasoning_content' next to the final answer.
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'malaysian_reasoning')
# Iterate over the rows directly: each record is fetched once instead of once
# per field through repeated ds['train'][i][...] lookups.
for row in ds['train']:
    data.append([
        {'role': 'system', 'content': row['system']},
        {'role': 'user', 'content': row['input']},
        {'role': 'assistant', 'content': row['answer'], 'reasoning_content': row['reasoning']},
    ])

In [16]:
# Sanity check: total number of conversations collected from all configs.
len(data)

225265

In [17]:
# Remove the whole output folder of a previous run so that stale `tokenized-N`
# shard folders are not picked up later by the glob over 'tokenized-glm/tokenized-*'.
!rm -rf tokenized-glm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
def collator(batch, batch_position_ids):
    """
    Concatenate several tokenized sequences into a single packed sample.

    Parameters
    ----------
    batch : list
        Token-id sequences to pack, in order.
    batch_position_ids : list
        Position sequences, parallel to `batch` (entry i belongs to batch[i]).

    Returns
    -------
    dict
        'input_ids' and 'position_ids' are the flattened inputs as uint32
        arrays. 'attention_mask' is a uint32 array holding the length of each
        packed sequence (not a per-token 0/1 mask). 'audio' and 'text' are
        empty strings.
    """
    def as_uint32(values):
        return np.array(values).astype(np.uint32)

    packed_tokens = [token for sequence in batch for token in sequence]
    packed_positions = [
        position
        for idx in range(len(batch))
        for position in batch_position_ids[idx]
    ]
    segment_lengths = [len(sequence) for sequence in batch]

    return dict(
        input_ids=as_uint32(packed_tokens),
        position_ids=as_uint32(packed_positions),
        attention_mask=as_uint32(segment_lengths),
        audio='',
        text='',
    )

# Token budget of one packed sample.
sequence_length = 1024 * 16
def loop(files, block_size = sequence_length):
    """
    Tokenize one chunk of conversations and write them as packed MDS samples.

    Parameters
    ----------
    files : tuple
        `(rows, index)`: `rows` is a list of chat-message lists and `index`
        names the output folder `tokenized-glm/tokenized-{index}`.
    block_size : int
        Maximum number of tokens in one packed sample.

    Each conversation is rendered with the tokenizer's chat template and
    tokenized without extra special tokens. Conversations are appended to the
    current pack until the next one would exceed `block_size`; the pack is then
    written and a new one is started. Conversations longer than `block_size`
    are skipped. Position ids restart at 0 for every conversation.
    Nothing is returned; the result is the MDS folder on disk.
    """
    import shutil  # local import: only needed to clear the output folder

    rows, index = files
    out_root = f'tokenized-glm/tokenized-{index}'
    # Start from a clean folder without shelling out to `rm -rf`.
    # NOTE(review): presumably MDSWriter refuses to write into an existing,
    # non-empty folder - confirm.
    shutil.rmtree(out_root, ignore_errors=True)

    count = 0          # tokens accumulated in the current pack
    temp = []          # token-id lists of the current pack
    position_ids = []  # matching position ranges of the current pack

    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:

        def flush():
            # Write the current pack unless it holds no tokens at all.
            o = collator(temp, position_ids)
            if o['input_ids'].shape[0] > 0:
                out.write(o)

        for row in tqdm(rows):

            t = tokenizer.apply_chat_template(row, tokenize=False)
            input_ids = tokenizer(t, add_special_tokens=False)['input_ids']
            length = len(input_ids)
            position = range(length)

            # A conversation that cannot fit in a single sample is dropped.
            if length > block_size:
                continue

            if count + length > block_size:
                # The pack is full: write it and start a new one with this conversation.
                flush()
                temp = [input_ids]
                position_ids = [position]
                count = length
            else:
                temp.append(input_ids)
                position_ids.append(position)
                count += length

        # Write whatever is left in the last, partially filled pack.
        if len(temp):
            flush()

In [19]:
# Smoke test on the first 100 conversations in the current process. It writes to
# 'tokenized-glm/tokenized-0', which the full parallel run recreates afterwards.
loop((data[:100], 0))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 3308.44it/s]


In [20]:
# Tokenize and pack everything with 20 worker processes. `loop` writes its result
# to disk and returns nothing, hence returned=False. Because the chunk size is
# len(data) // 20, a length that is not a multiple of 20 yields one extra small
# chunk, i.e. 21 output folders.
multiprocessing(data, loop, cores = 20, returned=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [21]:
from glob import glob

def _shard_number(path):
    """Return the integer after the last '-' in a shard folder path."""
    return int(path.rsplit('-', 1)[-1])

# Order the shard folders numerically (tokenized-2 before tokenized-10),
# not lexicographically.
folders = sorted(glob('tokenized-glm/tokenized-*'), key=_shard_number)
folders

['tokenized-glm/tokenized-0',
 'tokenized-glm/tokenized-1',
 'tokenized-glm/tokenized-2',
 'tokenized-glm/tokenized-3',
 'tokenized-glm/tokenized-4',
 'tokenized-glm/tokenized-5',
 'tokenized-glm/tokenized-6',
 'tokenized-glm/tokenized-7',
 'tokenized-glm/tokenized-8',
 'tokenized-glm/tokenized-9',
 'tokenized-glm/tokenized-10',
 'tokenized-glm/tokenized-11',
 'tokenized-glm/tokenized-12',
 'tokenized-glm/tokenized-13',
 'tokenized-glm/tokenized-14',
 'tokenized-glm/tokenized-15',
 'tokenized-glm/tokenized-16',
 'tokenized-glm/tokenized-17',
 'tokenized-glm/tokenized-18',
 'tokenized-glm/tokenized-19',
 'tokenized-glm/tokenized-20']

In [22]:
# Remove the merged output folder of a previous run before it is written again.
!rm -rf multipacking-glm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
# Merge every per-worker shard folder into the single MDS dataset 'multipacking-glm'.
# The merge stays best-effort (a broken shard does not abort the run), but each
# failure is reported together with its folder and summarized at the end, so an
# incomplete merge cannot go unnoticed.
failed_folders = []
with MDSWriter(out='multipacking-glm', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            # Samples written before the failure remain in the merged output.
            print(f'failed to merge {f}: {e!r}')
            failed_folders.append(f)

if failed_folders:
    print(f'WARNING: {len(failed_folders)} shard folder(s) were not fully merged: {failed_folders}')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2341/2341 [00:00<00:00, 3696.00it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 59/59 [00:00<00:00, 14699.37it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 16046.42it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 82/82 [00:00<00:00, 16033.42it/s]
100%|███████████████████████

In [24]:
# Reopen the merged output and count how many packed samples it contains.
dataset = LocalDataset('multipacking-glm')
len(dataset)

10062

In [25]:
# Inspect the first packed sample. Note that 'attention_mask' holds the token
# count of each packed conversation (as built by `collator`), not a 0/1 mask.
dataset[0]

{'attention_mask': array([70, 70, 52, 52, 58, 59, 66, 66, 56, 56, 80, 80, 56, 56, 78, 79, 56,
        56, 57, 58, 76, 76, 61, 61, 60, 61, 65, 66, 66, 67, 68, 68, 64, 65,
        48, 49, 45, 44, 50, 50, 76, 76, 78, 78, 70, 71, 69, 69, 56, 56, 57,
        57, 61, 60, 60, 60, 56, 56, 69, 68, 75, 75, 61, 61, 60, 59, 62, 63,
        58, 58, 68, 68, 68, 68, 62, 63, 63, 63, 59, 59, 68, 68, 60, 61, 65,
        64, 62, 62, 67, 67, 64, 64, 61, 60, 74, 74, 58, 58, 53, 53, 76, 75,
        74, 74, 64, 64, 76, 76, 64, 65, 70, 70, 58, 59, 61, 60, 59, 59, 57,
        57, 58, 59, 58, 58, 50, 51, 61, 60, 62, 62, 62, 62, 57, 56, 71, 70,
        70, 70, 62, 62, 55, 55, 51, 50, 49, 48, 73, 72, 68, 68, 63, 64, 55,
        55, 62, 62, 59, 58, 58, 58, 63, 64, 59, 59, 66, 66, 60, 60, 66, 67,
        61, 62, 63, 63, 59, 59, 74, 74, 57, 56, 63, 64, 74, 74, 65, 64, 63,
        63, 66, 66, 54, 53, 70, 70, 67, 66, 56, 57, 58, 58, 64, 63, 62, 62,
        57, 57, 54, 54, 49, 49, 51, 50, 61, 61, 59, 58, 59, 58, 71, 72

In [26]:
# Decode the first packed sample back to text to verify the chat formatting
# and that several conversations were concatenated into one sample.
tokenizer.decode(dataset[0]['input_ids'])

'[gMASK]<sop><|user|>\ntukar ayat aktif ke ayat pasif: Encik Razak mengajar pelajar-pelajar tentang kepentingan menjaga alam sekitar.<|assistant|>\n<think></think>\nPelajar-pelajar diajar tentang kepentingan menjaga alam sekitar oleh Encik Razak.[gMASK]<sop><|user|>\ntukar ayat pasif ke ayat aktif: Pelajar-pelajar diajar tentang kepentingan menjaga alam sekitar oleh Encik Razak.<|assistant|>\n<think></think>\nEncik Razak mengajar pelajar-pelajar tentang kepentingan menjaga alam sekitar.[gMASK]<sop><|user|>\ntukar ayat aktif ke ayat pasif: Ibu memasak nasi lemak untuk sarapan pagi.<|assistant|>\n<think></think>\nNasi lemak dimasak oleh ibu untuk sarapan pagi.[gMASK]<sop><|user|>\ntukar ayat pasif ke ayat aktif: Nasi lemak dimasak oleh ibu untuk sarapan pagi.<|assistant|>\n<think></think>\nIbu memasak nasi lemak untuk sarapan pagi.[gMASK]<sop><|user|>\ntukar ayat aktif ke ayat pasif: Doktor Wong merawat pesakit yang mengalami demam denggi.<|assistant|>\n<think></think>\nPesakit yang meng