In [1]:
import gc
import os

import torch
from torch import nn
from tqdm import tqdm
from transformers import Glm4MoeForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0             Please see https://github.com/pytorch/ao/issues/2919 for more info


In [2]:
from glob import glob

# Gather every entry in the stacked-checkpoint directory. The ordering is a
# plain lexicographic string sort of the matched paths.
checkpoint_pattern = 'ramdisk/GLM-4.5-Air-stack_checkpoint/*'
files = glob(checkpoint_pattern)
files.sort()
files

['ramdisk/GLM-4.5-Air-stack_checkpoint/314-model_state_dict.pt',
 'ramdisk/GLM-4.5-Air-stack_checkpoint/629-model_state_dict.pt',
 'ramdisk/GLM-4.5-Air-stack_checkpoint/944-model_state_dict.pt']

In [3]:
# The tokenizer is loaded once from the base model directory; the merge loop
# later re-saves it alongside every merged checkpoint.
tokenizer_source = 'ramdisk/GLM-4.5-Air'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)

In [4]:
# Directory of the base model that every LoRA checkpoint is merged into.
BASE_MODEL_PATH = 'ramdisk/GLM-4.5-Air'
# Constant factor applied to every LoRA product before it is added to the base
# weight. NOTE(review): presumably alpha / rank from the training run -- confirm.
LORA_SCALE = 2.0


def _merge_expert_lora(mapping, state_dict, layer_idx, proj):
    """Fold the stacked per-expert LoRA of one MoE projection into the base weights.

    Looks up ``_orig_mod.model.layers.{layer_idx}.mlp.{proj}_lora.A`` / ``.B``
    in ``mapping``. When the ``A`` key is absent the layer is skipped.
    Otherwise, for every index ``k`` along the first dimension of ``A``, adds
    ``(A[k] @ B[k] * LORA_SCALE).T`` in place to
    ``model.layers.{layer_idx}.mlp.experts.{k}.{proj}_proj.weight``.

    Args:
        mapping: checkpoint dict loaded with ``torch.load``.
        state_dict: ``model.state_dict()``; its tensors are modified in place.
        layer_idx: index of the decoder layer.
        proj: projection name, one of ``'gate'``, ``'up'``, ``'down'``.

    Raises:
        KeyError: if ``A`` is present but ``B`` or the target weight is missing.
    """
    key_a = f'_orig_mod.model.layers.{layer_idx}.mlp.{proj}_lora.A'
    if key_a not in mapping:
        return
    key_b = f'_orig_mod.model.layers.{layer_idx}.mlp.{proj}_lora.B'
    # assumes A and B are stacked per expert along dim 0 -- TODO confirm shapes
    stacked_a = mapping[key_a]
    stacked_b = mapping[key_b]
    for expert_idx in range(stacked_a.shape[0]):
        weight = state_dict[
            f'model.layers.{layer_idx}.mlp.experts.{expert_idx}.{proj}_proj.weight'
        ]
        lora_a = stacked_a[expert_idx].to(weight.device)
        lora_b = stacked_b[expert_idx].to(weight.device)
        delta = torch.matmul(lora_a, lora_b) * LORA_SCALE
        # In-place add: state_dict tensors share storage with the model
        # parameters, so this edits the model itself.
        weight += delta.T.to(weight.dtype)


def _merge_linear_lora(mapping, state_dict):
    """Fold every ``<module>.lora_A`` / ``<module>.lora_B`` pair into ``<module>.weight``.

    Module names are taken from the checkpoint keys that contain ``'.lora'``;
    the ``_orig_mod.`` prefix is stripped to find the matching base weight.
    The per-expert ``*_lora.A`` / ``*_lora.B`` keys do not contain ``'.lora'``
    and are therefore not touched here.

    Args:
        mapping: checkpoint dict loaded with ``torch.load``.
        state_dict: ``model.state_dict()``; its tensors are modified in place.

    Raises:
        KeyError: if a ``lora_A``/``lora_B`` tensor or the base weight is missing.
    """
    lora_modules = sorted({key.split('.lora')[0] for key in mapping if '.lora' in key})
    for module_key in tqdm(lora_modules):
        weight = state_dict[module_key.replace('_orig_mod.', '') + '.weight']
        lora_a = mapping[module_key + '.lora_A'].to(weight.device)
        lora_b = mapping[module_key + '.lora_B'].to(weight.device)
        # (A^T @ B^T)^T is B @ A for 2-D tensors; the original expression is
        # kept so the merged values stay numerically identical.
        delta = torch.matmul(lora_a.t(), lora_b.t()) * LORA_SCALE
        weight += delta.T.to(weight.dtype)


# For each checkpoint: reload a fresh base model, merge the LoRA deltas into
# its weights, and save the merged model plus tokenizer to a new directory.
for f in files:
    print(f)

    # Leading token of the file name, e.g. '314' from '314-model_state_dict.pt'.
    epoch = os.path.split(f)[1].split('-')[0]

    model = Glm4MoeForCausalLM.from_pretrained(
        BASE_MODEL_PATH,
        dtype="auto",  # `torch_dtype` is deprecated in favour of `dtype`
        device_map="auto",
    )

    mapping = torch.load(f, map_location='cpu')
    state_dict = model.state_dict()

    # No autograd graph is wanted while rewriting weights in place.
    with torch.no_grad():
        for i in tqdm(range(model.config.num_hidden_layers)):
            for proj in ('gate', 'up', 'down'):
                _merge_expert_lora(mapping, state_dict, i, proj)
        _merge_linear_lora(mapping, state_dict)

    new_path = f'ramdisk/GLM-4.5-Air-lora-256-{epoch}'
    print(new_path)
    model.save_pretrained(new_path)
    tokenizer.save_pretrained(new_path)

    # state_dict references every parameter tensor, so it must be dropped
    # together with the model; otherwise the memory stays allocated while the
    # next iteration loads another full copy of the base model.
    del model, mapping, state_dict
    gc.collect()
    torch.cuda.empty_cache()

`torch_dtype` is deprecated! Use `dtype` instead!


ramdisk/GLM-4.5-Air-stack_checkpoint/314-model_state_dict.pt


Loading checkpoint shards: 100%|████████████████| 47/47 [00:33<00:00,  1.41it/s]
Some weights of the model checkpoint at ramdisk/GLM-4.5-Air were not used when initializing Glm4MoeForCausalLM: ['model.layers.46.eh_proj.weight', 'model.layers.46.embed_tokens.weight', 'model.layers.46.enorm.weight', 'model.layers.46.hnorm.weight', 'model.layers.46.input_layernorm.weight', 'model.layers.46.mlp.experts.0.down_proj.weight', 'model.layers.46.mlp.experts.0.gate_proj.weight', 'model.layers.46.mlp.experts.0.up_proj.weight', 'model.layers.46.mlp.experts.1.down_proj.weight', 'model.layers.46.mlp.experts.1.gate_proj.weight', 'model.layers.46.mlp.experts.1.up_proj.weight', 'model.layers.46.mlp.experts.10.down_proj.weight', 'model.layers.46.mlp.experts.10.gate_proj.weight', 'model.layers.46.mlp.experts.10.up_proj.weight', 'model.layers.46.mlp.experts.100.down_proj.weight', 'model.layers.46.mlp.experts.100.gate_proj.weight', 'model.layers.46.mlp.experts.100.up_proj.weight', 'model.layers.46.mlp.exper

ramdisk/GLM-4.5-Air-lora-256-314
ramdisk/GLM-4.5-Air-stack_checkpoint/629-model_state_dict.pt


Loading checkpoint shards: 100%|████████████████| 47/47 [00:29<00:00,  1.61it/s]
Some weights of the model checkpoint at ramdisk/GLM-4.5-Air were not used when initializing Glm4MoeForCausalLM: ['model.layers.46.eh_proj.weight', 'model.layers.46.embed_tokens.weight', 'model.layers.46.enorm.weight', 'model.layers.46.hnorm.weight', 'model.layers.46.input_layernorm.weight', 'model.layers.46.mlp.experts.0.down_proj.weight', 'model.layers.46.mlp.experts.0.gate_proj.weight', 'model.layers.46.mlp.experts.0.up_proj.weight', 'model.layers.46.mlp.experts.1.down_proj.weight', 'model.layers.46.mlp.experts.1.gate_proj.weight', 'model.layers.46.mlp.experts.1.up_proj.weight', 'model.layers.46.mlp.experts.10.down_proj.weight', 'model.layers.46.mlp.experts.10.gate_proj.weight', 'model.layers.46.mlp.experts.10.up_proj.weight', 'model.layers.46.mlp.experts.100.down_proj.weight', 'model.layers.46.mlp.experts.100.gate_proj.weight', 'model.layers.46.mlp.experts.100.up_proj.weight', 'model.layers.46.mlp.exper

ramdisk/GLM-4.5-Air-lora-256-629


Saving checkpoint shards: 100%|█████████████████| 43/43 [02:15<00:00,  3.15s/it]


ramdisk/GLM-4.5-Air-stack_checkpoint/944-model_state_dict.pt


Loading checkpoint shards: 100%|████████████████| 47/47 [00:34<00:00,  1.38it/s]
Some weights of the model checkpoint at ramdisk/GLM-4.5-Air were not used when initializing Glm4MoeForCausalLM: ['model.layers.46.eh_proj.weight', 'model.layers.46.embed_tokens.weight', 'model.layers.46.enorm.weight', 'model.layers.46.hnorm.weight', 'model.layers.46.input_layernorm.weight', 'model.layers.46.mlp.experts.0.down_proj.weight', 'model.layers.46.mlp.experts.0.gate_proj.weight', 'model.layers.46.mlp.experts.0.up_proj.weight', 'model.layers.46.mlp.experts.1.down_proj.weight', 'model.layers.46.mlp.experts.1.gate_proj.weight', 'model.layers.46.mlp.experts.1.up_proj.weight', 'model.layers.46.mlp.experts.10.down_proj.weight', 'model.layers.46.mlp.experts.10.gate_proj.weight', 'model.layers.46.mlp.experts.10.up_proj.weight', 'model.layers.46.mlp.experts.100.down_proj.weight', 'model.layers.46.mlp.experts.100.gate_proj.weight', 'model.layers.46.mlp.experts.100.up_proj.weight', 'model.layers.46.mlp.exper

ramdisk/GLM-4.5-Air-lora-256-944
