In [1]:
from transformers import Qwen3MoeForCausalLM, AutoTokenizer, TextStreamer
from tqdm import tqdm
from torch import nn
import torch
import torch.nn.functional as F
import os

  import pynvml  # type: ignore[import]
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0             Please see https://github.com/pytorch/ao/issues/2919 for more info


In [2]:
from glob import glob

files = sorted(glob('gfs-01be5b33-Qwen3-30B-A3B-Instruct-2507-stack/*'))
files

['gfs-01be5b33-Qwen3-30B-A3B-Instruct-2507-stack/319-model_state_dict.pt',
 'gfs-01be5b33-Qwen3-30B-A3B-Instruct-2507-stack/639-model_state_dict.pt',
 'gfs-01be5b33-Qwen3-30B-A3B-Instruct-2507-stack/959-model_state_dict.pt']

In [3]:
tokenizer = AutoTokenizer.from_pretrained('gfs/01be5b33/Qwen3-30B-A3B-Instruct-2507')

In [4]:
for f in files:
    print(f)

    epoch = os.path.split(f)[1].split('-')[0]

    model = Qwen3MoeForCausalLM.from_pretrained(
        'gfs/01be5b33/Qwen3-30B-A3B-Instruct-2507', 
        torch_dtype="auto",
        device_map="auto",
    )
    
    mapping = torch.load(f, map_location='cpu')
    keys = mapping.keys()
    state_dict = model.state_dict()
    scaling = 2.0  # alpha / r
    
    for i in tqdm(range(model.config.num_hidden_layers)):
        # gate_proj
        A = f'model.layers.{i}.mlp.gate_dora.A'
        if A in mapping:
            B = f'model.layers.{i}.mlp.gate_dora.B'
            M = f'model.layers.{i}.mlp.gate_dora.magnitude'
            a = mapping[A]
            for k in range(a.shape[0]):
                W_key = f'model.layers.{i}.mlp.experts.{k}.gate_proj.weight'
                W = state_dict[W_key]
                A_ = mapping[A][k].to(W.device)  # (in_dim, r)
                B_ = mapping[B][k].to(W.device)  # (r, out_dim)
                mag = mapping[M][k].to(W.device)  # (out_dim,)
                
                # W is (out_dim, in_dim), need to work in that space
                # adapted = W + scaling * (A @ B).T
                lora_update = (A_ @ B_).T  # (out_dim, in_dim)
                adapted = W.float() + scaling * lora_update.float()
                
                # normalize and apply magnitude
                direction = F.normalize(adapted, dim=1)
                merged = mag.unsqueeze(1).float() * direction
                
                state_dict[W_key].copy_(merged.to(W.dtype))
                
        # up_proj
        A = f'model.layers.{i}.mlp.up_dora.A'
        if A in mapping:
            B = f'model.layers.{i}.mlp.up_dora.B'
            M = f'model.layers.{i}.mlp.up_dora.magnitude'
            a = mapping[A]
            for k in range(a.shape[0]):
                W_key = f'model.layers.{i}.mlp.experts.{k}.up_proj.weight'
                W = state_dict[W_key]
                A_ = mapping[A][k].to(W.device)
                B_ = mapping[B][k].to(W.device)
                mag = mapping[M][k].to(W.device)
                
                lora_update = (A_ @ B_).T
                adapted = W.float() + scaling * lora_update.float()
                direction = F.normalize(adapted, dim=1)
                merged = mag.unsqueeze(1).float() * direction
                
                state_dict[W_key].copy_(merged.to(W.dtype))
    
        # down_proj
        A = f'model.layers.{i}.mlp.down_dora.A'
        if A in mapping:
            B = f'model.layers.{i}.mlp.down_dora.B'
            M = f'model.layers.{i}.mlp.down_dora.magnitude'
            a = mapping[A]
            for k in range(a.shape[0]):
                W_key = f'model.layers.{i}.mlp.experts.{k}.down_proj.weight'
                W = state_dict[W_key]
                A_ = mapping[A][k].to(W.device)
                B_ = mapping[B][k].to(W.device)
                mag = mapping[M][k].to(W.device)
                
                lora_update = (A_ @ B_).T
                adapted = W.float() + scaling * lora_update.float()
                direction = F.normalize(adapted, dim=1)
                merged = mag.unsqueeze(1).float() * direction
                
                state_dict[W_key].copy_(merged.to(W.dtype))
    
    # handle non-expert DoRA layers (attention, etc)
    keys_dora = [k.rsplit('.', 1)[0] for k in keys if '.dora.' in k or k.endswith('.magnitude')]
    keys_dora = sorted(list(set([k for k in keys_dora if 'A' not in k.split('.')[-1]])))
    
    for k in tqdm(keys_dora):
        # skip expert layers already handled
        if 'gate_dora' in k or 'up_dora' in k or 'down_dora' in k:
            continue
            
        k_ori = k.replace('_orig_mod.', '').replace('.dora', '') + '.weight'
        if k_ori not in state_dict:
            continue
            
        A = k + '.A' if (k + '.A') in mapping else k + '.lora_A'
        B = k + '.B' if (k + '.B') in mapping else k + '.lora_B'
        M = k + '.magnitude'
        
        if A not in mapping or M not in mapping:
            continue
        
        W = state_dict[k_ori]
        A_ = mapping[A].to(W.device)
        B_ = mapping[B].to(W.device)
        mag = mapping[M].to(W.device)
        
        # A: (r, in), B: (out, r) -> lora_update: (out, in)
        lora_update = B_.float() @ A_.float()
        adapted = W.float() + scaling * lora_update
        direction = F.normalize(adapted, dim=1)
        merged = mag.unsqueeze(1).float() * direction
        
        state_dict[k_ori].copy_(merged.to(W.dtype))

    new_path = f'gfs/01be5b33/Qwen3-30B-A3B-Instruct-2507-dora-256-{epoch}'
    print(new_path)
    model.save_pretrained(new_path)
    tokenizer.save_pretrained(new_path)

    del model, mapping
    torch.cuda.empty_cache()

`torch_dtype` is deprecated! Use `dtype` instead!


gfs-01be5b33-Qwen3-30B-A3B-Instruct-2507-stack/319-model_state_dict.pt


Loading checkpoint shards:   0%|          | 0/16 [00:00<?, ?it/s]

100%|███████████████████████████████████████| 48/48 [00:00<00:00, 495878.31it/s]
100%|████████████████████████████████████████| 336/336 [00:00<00:00, 583.56it/s]
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


gfs/01be5b33/Qwen3-30B-A3B-Instruct-2507-dora-256-319
gfs-01be5b33-Qwen3-30B-A3B-Instruct-2507-stack/639-model_state_dict.pt


Loading checkpoint shards:   0%|          | 0/16 [00:00<?, ?it/s]

100%|███████████████████████████████████████| 48/48 [00:00<00:00, 524288.00it/s]
100%|███████████████████████████████████████| 336/336 [00:00<00:00, 3380.72it/s]


gfs/01be5b33/Qwen3-30B-A3B-Instruct-2507-dora-256-639
gfs-01be5b33-Qwen3-30B-A3B-Instruct-2507-stack/959-model_state_dict.pt


Loading checkpoint shards:   0%|          | 0/16 [00:00<?, ?it/s]

100%|███████████████████████████████████████| 48/48 [00:00<00:00, 633102.49it/s]
100%|███████████████████████████████████████| 336/336 [00:00<00:00, 3332.02it/s]


gfs/01be5b33/Qwen3-30B-A3B-Instruct-2507-dora-256-959


In [5]:
# streamer = TextStreamer(tokenizer)

In [6]:
# q = """
# Budak itu sangat nakal, pantang orang leka sedikit, duit syiling pun dikebasnya.

# terjemah ke kedah
# """

# q = """
# Pasangan algoritma yang digunakan untuk melakukan penyulitan dan nyahsulit dikenali sebagai
# A. kunci (keys)
# B. Sifer (cipher)
# C. Teks sifer (ciphertext)
# """

# system = 'First, you try to think step-by-step in {{lang}}, after that, put your final answer within $\\boxed{}$.'
# messages = [
#     {"role": "system", "content": system.replace('{{lang}}', 'malay')},
#     {"role": "user", "content": q},
# ]

# row = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt=True,
#     tokenize=False,
# )
# input_ids = tokenizer(row, add_special_tokens = False, return_tensors = 'pt').to(model.device)['input_ids']
# input_ids

In [7]:
# gen_kwargs = {
#     "max_new_tokens": 1024, 
#     "do_sample": True, 
#     "temperature": 0.9, 
#     "top_p": None, 
#     "top_k": None,
#     "streamer": streamer,
# }

# output_ids = model.generate(input_ids, **gen_kwargs)
# response = tokenizer.batch_decode(output_ids)[0]
# response