In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AddedToken
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import numpy as np
from tqdm import tqdm
from multiprocess import Pool
import itertools
import json
import os

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
    'audio': 'str',
    'text': 'str'
}
hashes = 'sha1', 'xxh64'

datasets = ['ayat_aktif_pasif', 'coding', 'malaysian_reasoning', 'meta_prompt', 'multiple_choice_qa']

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]


In [2]:
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-32B')

In [3]:
ds = load_dataset("Scicom-intl/Malaysian-Instructions", datasets[0])

In [4]:
data = []

ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'ayat_aktif_pasif')
for i in range(len(ds['train'])):
    messages = [
        {'role': 'user', 'content': ds['train'][i]['input']},
        {'role': 'assistant', 'content': ds['train'][i]['output']}
    ]
    data.append(messages)

In [5]:
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'coding')
for i in range(len(ds['train'])):
    messages = [
        {'role': 'user', 'content': ds['train'][i]['question']},
        {'role': 'assistant', 'content': ds['train'][i]['answer']}
    ]
    data.append(messages)

In [6]:
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'malaysian_reasoning')
for i in range(len(ds['train'])):
    messages = [
        {'role': 'system', 'content': ds['train'][i]['system']},
        {'role': 'user', 'content': ds['train'][i]['input']},
        {'role': 'assistant', 'content': ds['train'][i]['answer'], 'reasoning_content': ds['train'][i]['reasoning']}
    ]
    data.append(messages)

In [None]:
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'meta_prompt')
for i in range(len(ds['train'])):
    messages = [
        {'role': 'user', 'content': ds['train'][i]['input']},
        {'role': 'assistant', 'content': ds['train'][i]['answer']}
    ]
    data.append(messages)

In [None]:
ds = load_dataset("Scicom-intl/Malaysian-Instructions", 'multiple_choice_qa')
for i in range(len(ds['train'])):
    messages = [
        {'role': 'user', 'content': ds['train'][i]['question']},
        {'role': 'assistant', 'content': ds['train'][i]['answer']}
    ]
    data.append(messages)

In [None]:
len(data)

In [None]:
!rm -rf tokenized-qwen3

In [None]:
def collator(batch, batch_position_ids):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
        'audio': '',
        'text': '',
    }

sequence_length = 1024 * 16
def loop(files, block_size = sequence_length):
    rows, index = files
    out_root = f'tokenized-qwen3/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):

            t = tokenizer.apply_chat_template(row, tokenize=False)
            outputs = tokenizer(t, add_special_tokens=False)
            position = range(len(outputs['input_ids']))
            length = len(outputs['input_ids'])

            if length > block_size:
                continue
            
            if count + length > block_size:
                o = collator(temp, position_ids)
                if o['input_ids'].shape[0] > 0:
                    out.write(o)
                temp = [outputs['input_ids']]
                position_ids = [position]
                count = length
                
            else:
                temp.append(outputs['input_ids'])
                position_ids.append(range(len(outputs['input_ids'])))
                count += len(outputs['input_ids'])
        
        if len(temp):
            o = collator(temp, position_ids)
            if o['input_ids'].shape[0] > 0:
                out.write(o)

In [None]:
loop((data[:100], 0))

In [None]:
multiprocessing(data, loop, cores = 20, returned=False)

In [None]:
from glob import glob

folders = sorted(glob('tokenized-qwen3/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders

In [None]:
!rm -rf multipacking-qwen3

In [None]:
with MDSWriter(out='multipacking-qwen3', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

In [None]:
dataset = LocalDataset('multipacking-qwen3')
len(dataset)

In [None]:
dataset[0]

In [1]:
from transformers import Qwen3ForCausalLM

model = Qwen3ForCausalLM.from_pretrained('Qwen/Qwen3-0.6B')

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]


In [2]:
model

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layer

In [4]:
import math
import torch
from torch import nn
import torch.nn.functional as F
import torch.nn.init as init

class LinearLoRA(nn.Module):
    def __init__(self, linear: nn.Linear, r=4, alpha=1.0):
        super().__init__()
        self.linear = linear
        self.r = r
        self.alpha = alpha
        self.scaling = alpha / r

        in_features = linear.in_features
        out_features = linear.out_features
        
        device = self.linear.weight.device
        dtype = self.linear.weight.dtype

        self.lora_A = nn.ModuleDict({})
        self.lora_B = nn.ModuleDict({})
        
        self.lora_A['e'] = nn.Linear(
            in_features, r, bias=False, 
            device = device,
            dtype = torch.float32,
        )
        self.lora_B['e'] = nn.Linear(
            r, out_features, bias=False, 
            device = device,
            dtype = torch.float32,
        )

        for param in self.lora_A['e'].parameters():
            param.requires_grad = True
        for param in self.lora_B['e'].parameters():
            param.requires_grad = True

        # https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/layer.py#L260
        init.kaiming_uniform_(self.lora_A['e'].weight, a=math.sqrt(5))
        init.zeros_(self.lora_B['e'].weight)

    def forward(self, x):
        out = self.linear(x)
        lora_update = self.lora_B['e'](self.lora_A['e'](x.to(self.lora_A['e'].weight.dtype))) * self.scaling
        return out + lora_update.to(x.dtype)

selected = [
    "q_proj", 
    "k_proj", 
    "v_proj", 
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj"
]
for name, module in model.named_modules():
    for child_name, child in module.named_children():
        if len(child_name) and any([a in child_name for a in selected]) and isinstance(child, nn.Linear):
            lora = LinearLoRA(child, r=128, alpha=256)
            setattr(module, child_name, lora)

In [9]:
import os
from torch.distributed.device_mesh import init_device_mesh

dp_size = 4
device_type = torch.accelerator.current_accelerator().type
device_mesh = init_device_mesh(device_type, (dp_size,), mesh_dim_names=("dp",))

ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set