In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AddedToken
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import numpy as np
from tqdm import tqdm
from multiprocess import Pool
import itertools
import json
import os

def chunks(l, n):
    """Yield consecutive slices of ``l`` paired with their running index.

    Args:
        l: A sliceable sequence (anything supporting ``len`` and slicing).
        n: Maximum number of items per slice.

    Yields:
        ``(piece, piece_index)`` tuples, where ``piece`` is ``l[start:start + n]``
        and ``piece_index`` counts the slices from 0. The last slice may be
        shorter than ``n``.
    """
    for piece_index, start in enumerate(range(0, len(l), n)):
        stop = start + n
        yield (l[start:stop], piece_index)

def multiprocessing(strings, function, cores=6, returned=True):
    """Apply ``function`` to chunks of ``strings`` on a pool of worker processes.

    ``strings`` is split with ``chunks`` into slices of ``len(strings) // cores``
    items (at least 1). Because the chunk size is rounded down, the number of
    chunks may be larger than ``cores`` when the length is not an exact multiple.

    Args:
        strings: Sliceable sequence of work items.
        function: Callable invoked in a worker with one ``(slice, chunk_index)``
            tuple as produced by ``chunks``.
        cores: Number of worker processes in the pool.
        returned: When true, the per-chunk results (each must be iterable) are
            concatenated into a single list and returned.

    Returns:
        The flattened list of results when ``returned`` is true, otherwise None.
    """
    # Floor division gives 0 when there are fewer items than cores (or none at
    # all), and range() rejects a zero step, so clamp the chunk size to >= 1.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)
    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Always release the worker processes, even if a worker raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))

class UInt32(Encoding):
    """MDS column encoding that stores an array as raw ``uint32`` bytes."""

    def encode(self, obj) -> bytes:
        """Serialize ``obj`` to the raw bytes of a ``uint32`` array.

        ``obj`` is coerced to ``np.uint32`` first so the bytes written always
        match the dtype that ``decode`` assumes; an array that is already
        ``uint32`` is serialized unchanged.
        """
        return np.asarray(obj, dtype=np.uint32).tobytes()

    def decode(self, data: bytes):
        """Reinterpret ``data`` as a 1-D ``uint32`` array (shape is not stored)."""
        return np.frombuffer(data, np.uint32)

# Register the encoding under the name used in the ``columns`` schema.
_encodings['uint32'] = UInt32

# Schema of each MDS sample: column name -> encoding name. The three array
# columns use the 'uint32' encoding; 'audio' and 'text' are plain strings.
columns = dict(
    input_ids='uint32',
    position_ids='uint32',
    attention_mask='uint32',
    audio='str',
    text='str',
)

# Hash algorithms passed to MDSWriter for the written shards.
hashes = ('sha1', 'xxh64')

# Config names of the Malaysian-Instructions dataset used in this notebook.
datasets = [
    'ayat_aktif_pasif',
    'coding',
    'malaysian_reasoning',
    'meta_prompt',
    'multiple_choice_qa',
]

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]


In [14]:
# Module-level tokenizer; loop() reads this global to apply the chat template
# and to tokenize each conversation.
tokenizer = AutoTokenizer.from_pretrained('unsloth/Meta-Llama-3.1-70B-Instruct')

In [4]:
# Load the first config listed in `datasets` ('ayat_aktif_pasif').
# NOTE(review): the next cell loads the same config again, so this call looks
# redundant apart from warming the download cache — confirm before removing.
ds = load_dataset("Scicom-intl/Malaysian-Instructions", datasets[0])

In [5]:
# Accumulator for every conversation (a list of chat-message dicts) gathered
# from all dataset configs; this is the only cell that resets it.
data = []

# 'ayat_aktif_pasif': `input` is the user turn, `output` the assistant turn.
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'ayat_aktif_pasif')
# Iterate rows directly so each row is fetched once rather than once per field.
for row in ds['train']:
    messages = [
        {'role': 'user', 'content': row['input']},
        {'role': 'assistant', 'content': row['output']}
    ]
    data.append(messages)

In [6]:
# 'coding': `question` is the user turn, `answer` the assistant turn.
# Appends to the shared `data` list, so re-running this cell without first
# resetting `data` adds these rows a second time.
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'coding')
# Iterate rows directly so each row is fetched once rather than once per field.
for row in ds['train']:
    messages = [
        {'role': 'user', 'content': row['question']},
        {'role': 'assistant', 'content': row['answer']}
    ]
    data.append(messages)

In [7]:
# 'malaysian_reasoning': `system` / `input` / `reasoning` become the system,
# user and assistant turns respectively.
# Appends to the shared `data` list, so re-running this cell without first
# resetting `data` adds these rows a second time.
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'malaysian_reasoning')
# Iterate rows directly so each row is fetched once rather than once per field.
for row in ds['train']:
    messages = [
        {'role': 'system', 'content': row['system']},
        {'role': 'user', 'content': row['input']},
        {'role': 'assistant', 'content': row['reasoning']}
    ]
    data.append(messages)

In [8]:
# 'meta_prompt': `input` is the user turn, `answer` the assistant turn.
# Appends to the shared `data` list, so re-running this cell without first
# resetting `data` adds these rows a second time.
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'meta_prompt')
# Iterate rows directly so each row is fetched once rather than once per field.
for row in ds['train']:
    messages = [
        {'role': 'user', 'content': row['input']},
        {'role': 'assistant', 'content': row['answer']}
    ]
    data.append(messages)

In [9]:
# 'multiple_choice_qa': `question` is the user turn, `answer` the assistant turn.
# Appends to the shared `data` list, so re-running this cell without first
# resetting `data` adds these rows a second time.
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'multiple_choice_qa')
# Iterate rows directly so each row is fetched once rather than once per field.
for row in ds['train']:
    messages = [
        {'role': 'user', 'content': row['question']},
        {'role': 'assistant', 'content': row['answer']}
    ]
    data.append(messages)

In [10]:
# Total number of conversations collected across all configs.
len(data)

225265

In [11]:
# Remove the output directory that loop() writes its per-chunk shards into,
# so the run starts from a clean state.
!rm -rf tokenized-llama3

In [12]:
def collator(batch, batch_position_ids):
    """Pack several tokenized documents into one flat sample.

    Args:
        batch: List of token-id sequences, one per document.
        batch_position_ids: List of position-id sequences; entry ``k`` belongs
            to ``batch[k]``.

    Returns:
        A dict matching the ``columns`` schema: ``input_ids`` and
        ``position_ids`` are the per-document sequences concatenated in order,
        ``attention_mask`` holds the length of each document (not a 0/1 mask),
        and ``audio`` / ``text`` are empty strings. All arrays are ``uint32``.
    """
    doc_count = len(batch)
    flat_tokens = [token for doc in batch for token in doc]
    flat_positions = [
        pos for k in range(doc_count) for pos in batch_position_ids[k]
    ]
    doc_lengths = [len(doc) for doc in batch]

    packed = {}
    packed['input_ids'] = np.array(flat_tokens).astype(np.uint32)
    packed['position_ids'] = np.array(flat_positions).astype(np.uint32)
    packed['attention_mask'] = np.array(doc_lengths).astype(np.uint32)
    packed['audio'] = ''
    packed['text'] = ''
    return packed

# Maximum number of tokens packed into a single written sample.
sequence_length = 1024 * 16

def loop(files, block_size = sequence_length):
    """Tokenize one chunk of conversations and write packed samples to MDS.

    Each conversation is rendered with the tokenizer's chat template and
    tokenized. Conversations are appended to the current pack until adding the
    next one would exceed ``block_size`` tokens; the pack is then written via
    ``collator`` and a new pack is started. Conversations longer than
    ``block_size`` on their own are skipped.

    Args:
        files: ``(rows, index)`` tuple, where ``rows`` is an iterable of
            chat-message lists and ``index`` names the output directory
            ``tokenized-llama3/tokenized-{index}``.
        block_size: Maximum total token count per written sample.

    Returns:
        None. The result is the MDS dataset written to disk; any existing
        output directory for ``index`` is deleted first.
    """
    import shutil  # local import: only needed for the cleanup below

    rows, index = files
    out_root = f'tokenized-llama3/tokenized-{index}'
    # Start from a clean directory without spawning a shell.
    shutil.rmtree(out_root, ignore_errors=True)
    count = 0  # tokens accumulated in the current pack
    temp = []  # token-id lists of the current pack
    position_ids = []  # matching per-document position ranges
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):

            t = tokenizer.apply_chat_template(row, tokenize=False)
            # The template output is tokenized with add_special_tokens=False.
            input_ids = tokenizer(t, add_special_tokens=False)['input_ids']
            length = len(input_ids)
            # Positions restart at 0 for every document inside a pack.
            position = range(length)

            if length > block_size:
                # A single conversation that cannot fit in one pack is dropped.
                continue

            if count + length > block_size:
                # Current pack is full: flush it and start a new one with this row.
                o = collator(temp, position_ids)
                if o['input_ids'].shape[0] > 0:
                    out.write(o)
                temp = [input_ids]
                position_ids = [position]
                count = length

            else:
                temp.append(input_ids)
                position_ids.append(position)
                count += length

        # Flush the final, partially filled pack.
        if len(temp):
            o = collator(temp, position_ids)
            if o['input_ids'].shape[0] > 0:
                out.write(o)

In [15]:
# Smoke test: run loop() in-process on the first 100 conversations as chunk 0.
# This writes tokenized-llama3/tokenized-0, which loop() deletes and rewrites
# whenever it is later called with index 0.
loop((data[:100], 0))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 2836.94it/s]


In [16]:
# Tokenize the full dataset on a pool of 20 processes. loop() returns nothing
# (its result is the shards written to disk), hence returned=False.
multiprocessing(data, loop, cores = 20, returned=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [17]:
from glob import glob

# Collect the per-chunk output directories and order them by the integer
# after the last '-', so that e.g. tokenized-10 sorts after tokenized-9.
_chunk_dirs = glob('tokenized-llama3/tokenized-*')
folders = sorted(_chunk_dirs, key=lambda path: int(path.rsplit('-', 1)[-1]))
folders

['tokenized-llama3/tokenized-0',
 'tokenized-llama3/tokenized-1',
 'tokenized-llama3/tokenized-2',
 'tokenized-llama3/tokenized-3',
 'tokenized-llama3/tokenized-4',
 'tokenized-llama3/tokenized-5',
 'tokenized-llama3/tokenized-6',
 'tokenized-llama3/tokenized-7',
 'tokenized-llama3/tokenized-8',
 'tokenized-llama3/tokenized-9',
 'tokenized-llama3/tokenized-10',
 'tokenized-llama3/tokenized-11',
 'tokenized-llama3/tokenized-12',
 'tokenized-llama3/tokenized-13',
 'tokenized-llama3/tokenized-14',
 'tokenized-llama3/tokenized-15',
 'tokenized-llama3/tokenized-16',
 'tokenized-llama3/tokenized-17',
 'tokenized-llama3/tokenized-18',
 'tokenized-llama3/tokenized-19',
 'tokenized-llama3/tokenized-20']

In [18]:
# Remove any previous merged dataset before it is rewritten by the next cell.
!rm -rf multipacking-llama3

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
# Merge every per-chunk dataset into a single MDS dataset, in `folders` order.
with MDSWriter(out='multipacking-llama3', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            # Best-effort merge: keep going with the remaining folders, but say
            # which folder failed so an incomplete merge is not overlooked.
            print(f'failed while merging {f}: {e!r}')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1564/1564 [00:00<00:00, 4123.98it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 430/430 [00:00<00:00, 2623.45it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 430/430 [00:00<00:00, 2563.98it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 426/426 [00:00<00:00, 16387.46it/s]
100%|███████████████████████

In [20]:
# Re-open the merged dataset and show how many packed samples it contains.
dataset = LocalDataset('multipacking-llama3')
len(dataset)

10316

In [21]:
# Inspect the first packed sample; `attention_mask` holds the token length of
# each document in the pack, as produced by collator().
dataset[0]

{'attention_mask': array([ 96,  96,  78,  78,  84,  85,  92,  92,  83,  83, 106, 106,  82,
         82, 104, 105,  81,  81,  82,  83, 100, 100,  87,  87,  86,  87,
         91,  92,  91,  92,  94,  94,  90,  91,  73,  74,  71,  70,  75,
         75, 100, 100, 102, 102,  96,  97,  95,  95,  82,  82,  83,  83,
         89,  88,  86,  86,  82,  82,  96,  96, 101, 101,  85,  85,  86,
         85,  90,  91,  82,  82,  92,  92,  94,  94,  87,  88,  89,  89,
         85,  85,  94,  94,  86,  87,  91,  90,  86,  86,  93,  93,  90,
         90,  87,  86, 100, 100,  84,  84,  79,  79, 102, 101, 100, 100,
         90,  90, 102, 102,  90,  91,  96,  96,  84,  85,  86,  85,  85,
         85,  83,  83,  84,  85,  84,  84,  75,  76,  86,  85,  88,  88,
         88,  88,  83,  82,  97,  96,  96,  96,  88,  88,  81,  81,  76,
         75,  75,  74,  99,  98,  94,  94,  89,  90,  81,  81,  88,  88,
         83,  82,  84,  84,  89,  90,  85,  85,  92,  92,  86,  86,  91,
         92,  87,  88,  89,  89, 