In [1]:
%load_ext autoreload
%autoreload 2
%env CUDA_VISIBLE_DEVICES=2
%env TRANSFORMERS_CACHE=/mnt/LLM/hub
%env HF_HOME=/mnt/LLM/hub
%env OMP_NUM_THREADS=16

import os
import sys
sys.path.insert(0, '..')

import time
import random
from tqdm.auto import trange, tqdm
import numpy as np
import ipynbname  # pip install ipynbname

import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers

import ipynbname  # pip install ipynbname
import wandb

from src.aq import QuantizedWeight, QuantizedLinear
from src.modelutils import get_model
from src.datautils import get_loaders


torch.set_num_threads(16)
torch.backends.cudnn.allow_tf32 = False
torch.backends.cuda.matmul.allow_tf32 = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# wandb.login(relogin=True)

env: CUDA_VISIBLE_DEVICES=2
env: TRANSFORMERS_CACHE=/mnt/LLM/hub
env: HF_HOME=/mnt/LLM/hub
env: OMP_NUM_THREADS=16




In [2]:
class args:
    base_model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
    quant_model = "/extra_disk_1/jheuristic/tinyllama-3t-2x8g8/"
    dtype = 'bfloat16'
    model_seqlen = 1024  # can be 2048 for 1.1B, 4096-8192 for larger models
    device_map = 'auto'
    
    dataset = 'pajama'
    nsamples = 256
    seed = 42
    beam_size = 4
    stochastic_rounding_tau = 0.0
    
    code_lr = 1e-2
    code_lr_plateau_scale = 0.5
    code_betas = (0.0, 0.95)
    delta_decay = 1.0
    codebook_lr = 1e-5
    codebook_betas = (0.9, 0.95)
    codebook_grad_accumulation_steps = 8
    
    
    autocast_dtype = torch.bfloat16  # bfloat16 or None (not using grad scaler!)
    training_dtype = torch.float32
    gradient_checkpointing = False
    devices = [device]

In [3]:
train_data = get_loaders(
    args.dataset,
    nsamples=args.nsamples * 10,
    seed=args.seed,
    model_path=args.base_model,
    seqlen=args.model_seqlen,
)

base_model = get_model(args.base_model, None, args.dtype, args.device_map)
if not args.device_map:
    base_model = base_model.to(device)
    
quantized_model = get_model(args.base_model, args.quant_model, args.dtype, args.device_map)
if not args.device_map:
    quantized_model = quantized_model.to(device)
quantized_model = quantized_model.to(args.training_dtype)

Loading red_pajama from togethercomputer/RedPajama-Data-1T-Sample


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
                                                                                                                                                                                                                                                                                          

Loaded data from pajama; len(data)=2560 sequences
Loading pretrained model ...
Model loaded sucсessfully ...
Initializing model with random weights...
Loading quantized model ...
Model loaded sucсessfully ...


In [4]:
from src.pv_ops import create_dequantized_model
dequantized_model, master_parameters = create_dequantized_model(
    quantized_model, reuse_non_quantized=True, dequantized_dtype=args.autocast_dtype
)
for param in dequantized_model.parameters():
    param.data = param.data.to(args.autocast_dtype)

In [6]:
from src.pv_ops import StraightThroughAdamW

optimizer = StraightThroughAdamW(
    named_dequantized_params=dict(dequantized_model.named_parameters()),
    named_master_params=master_parameters,
    update_codes=dict(lr=args.code_lr, betas=args.code_betas),
    max_code_change_per_step=1e-3,
    beam_size=args.beam_size,
    stochastic_rounding_tau=args.stochastic_rounding_tau,
    clamp_value=1e9,
    debias=True,
    dequantized_dtype=torch.float32,
)

In [7]:

if args.gradient_checkpointing:
    quantized_model.gradient_checkpointing_enable()
    quantized_model.enable_input_require_grads()
    for module in quantized_model.modules():
        if isinstance(module, QuantizedLinear):
            module.use_checkpoint = True
    dequantized_model.gradient_checkpointing_enable()
    dequantized_model.enable_input_require_grads()


In [8]:
def _run_one_step(args, base_model, dequantized_model, optimizer, train_data, **kwargs):
    optimizer.zero_grad(set_to_none=True)
    with tqdm(train_data, desc="V step") as progress:

        total_loss = 0.0
        for i, batch in enumerate(progress):
            batch = torch.as_tensor(batch, device=device)
            with torch.no_grad():
                teacher_logits = base_model(batch).logits
            student_logits = dequantized_model(batch).logits  # forward accumulates XTX statistics
            loss = kl_div(student_logits, teacher_logits)
            (loss / len(train_data)).backward()  # backward accumulates gradient
            total_loss = loss.item() / (i + 1) + total_loss * i / (i + 1)
            progress.desc = f"V step: accumulating gradients, loss = {total_loss:.9f}"
            del student_logits, teacher_logits, loss
    optimizer.step(**kwargs)
    optimizer.zero_grad(set_to_none=True)  # reset statistics for the next step
    
    return total_loss


def kl_div(student_hiddens, teacher_hiddens):
    C = student_hiddens.shape[-1]  # num classes
    return F.kl_div(
        input=F.log_softmax(student_hiddens.view(-1, C), dim=-1),
        target=F.log_softmax(teacher_hiddens.view(-1, C), dim=-1),
        log_target=True,
        reduction="batchmean",
    )


In [9]:
eval_data = get_loaders(
    'wikitext2',
    seed=args.seed,
    model_path=args.base_model,
    seqlen=args.model_seqlen,
    eval_mode=True,
)

@torch.inference_mode()
def eval_ppl_naive(model, eval_data):
    eval_inps = [
        eval_data[:, start: start + args.model_seqlen] for start in range(0, eval_data.shape[1], args.model_seqlen)
    ]
    total_tokens = 0
    nlls = []
    for input_ids in tqdm(eval_inps):
        input_ids = input_ids.to(device)
        lm_logits = model(input_ids).logits

        shift_logits = lm_logits[:, :-1, :].contiguous()
        shift_labels = input_ids[:, 1:]
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        neg_log_likelihood = loss.float() * args.model_seqlen
        nlls.append(neg_log_likelihood)
        total_tokens += shift_labels.numel()
    ppl = torch.exp(torch.stack(nlls).sum() / total_tokens)
    return ppl

Loaded data from wikitext2; len(data)=1 sequences


In [10]:
os.environ["WANDB_NOTEBOOK_NAME"] = os.path.join(os.getcwd(), ipynbname.name() + ".ipynb")
wandb.init(
    project="aqml-full-finetuning",
    config = {
        k: v for k, v
        in args.__dict__.items()
        if not k.startswith("__")
    },
    settings=wandb.Settings(code_dir="."),
    name=ipynbname.name(),
)

[34m[1mwandb[0m: Currently logged in as: [33mdenismazur8[0m ([33mrouting-experiments[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
POINTER = 0
def next_train_data():
    global POINTER
    batch = []
    for i in range(args.nsamples):
        batch.append(train_data[POINTER % len(train_data)])
        POINTER += 1
    return batch

In [12]:
# demo: run the algorithm on a fixed training set till it overfits
# TODO train on large corpora, feed next portion of data into each p and v step

last_ppl_eval_score = None
for i in range(100):
    print("STEP", i)
    if i % 5 == 0:
        ppl_eval_score = eval_ppl_naive(dequantized_model, eval_data).item()
        last_ppl_eval_score = ppl_eval_score
        print("wikitext2 ppl (naive)", ppl_eval_score)

    v_step_train_loss = _run_one_step(args, base_model, dequantized_model, optimizer, next_train_data())
    print("v_step_train_loss", v_step_train_loss)
    state = {
        "ppl_eval_score": ppl_eval_score,
        "v_step_train_loss": v_step_train_loss,
        "step": i,
    }
    wandb.log(state, step=i, commit=True)


STEP 0


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 18.942625045776367


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0
model.layers.0.self_attn.k_proj.weight change rate: 0.0
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0
model.layers.1.self_attn.k_proj.weight change rate: 0.0
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.0
model.layers.2.self_attn.k_proj.weight change rate: 0.0
model.layers.2.self_attn.v_proj.weight change rate: 0.0
model.layers.2.self_attn.o_proj.weight change rate: 0.0
model.layers.2

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0
model.layers.0.self_attn.k_proj.weight change rate: 0.0
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0
model.layers.1.self_attn.k_proj.weight change rate: 0.0
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.0
model.layers.2.self_attn.k_proj.weight change rate: 0.0
model.layers.2.self_attn.v_proj.weight change rate: 0.0
model.layers.2.self_attn.o_proj.weight change rate: 0.0
model.layers.2

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0
model.layers.0.self_attn.k_proj.weight change rate: 0.0
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0
model.layers.1.self_attn.k_proj.weight change rate: 0.0
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.0
model.layers.2.self_attn.k_proj.weight change rate: 0.0
model.layers.2.self_attn.v_proj.weight change rate: 0.0
model.layers.2.self_attn.o_proj.weight change rate: 0.0
model.layers.2

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 1.621246337890625e-05
model.layers.0.self_attn.k_proj.weight change rate: 0.0
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0
model.layers.1.self_attn.k_proj.weight change rate: 0.0
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.0
model.layers.2.self_attn.k_proj.weight change rate: 0.0
model.layers.2.self_attn.v_proj.weight change rate: 0.0
model.layers.2.self_attn.o_proj.weight change rate: 

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 4.38690185546875e-05
model.layers.0.self_attn.k_proj.weight change rate: 2.288818359375e-05
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0
model.layers.1.self_attn.k_proj.weight change rate: 0.0
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.0
model.layers.2.self_attn.k_proj.weight change rate: 0.0
model.layers.2.self_attn.v_proj.weight change rate: 0.0
model.layers.2.self_attn.o_proj.weight

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 18.94213104248047


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00017642974853515625
model.layers.0.self_attn.k_proj.weight change rate: 9.1552734375e-05
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0
model.layers.1.self_attn.k_proj.weight change rate: 0.0
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.0
model.layers.2.self_attn.k_proj.weight change rate: 0.0
model.layers.2.self_attn.v_proj.weight change rate: 0.0
model.layers.2.self_attn.o_proj.weight

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00025463104248046875
model.layers.0.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 9.5367431640625e-07
model.layers.1.self_attn.k_proj.weight change rate: 3.0517578125e-05
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.0
model.layers.2.self_attn.k_proj.weight change rate: 0.0
model.layers.2.self_attn.v_proj.weight change rate: 0.0
model.l

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0005674362182617188
model.layers.0.self_attn.k_proj.weight change rate: 0.00054168701171875
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 1.811981201171875e-05
model.layers.1.self_attn.k_proj.weight change rate: 0.0
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.0
model.layers.2.self_attn.k_proj.weight change rate: 0.0
model.layers.2.self_attn.v_proj.weight change rate: 0.0
model.layers.2.sel

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0005750656127929688
model.layers.0.self_attn.k_proj.weight change rate: 0.00051116943359375
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 2.956390380859375e-05
model.layers.1.self_attn.k_proj.weight change rate: 0.00014495849609375
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.0
model.layers.2.self_attn.k_proj.weight change rate: 6.103515625e-05
model.layers.2.self_attn.v_proj.weight change 

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0005817413330078125
model.layers.0.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 5.054473876953125e-05
model.layers.1.self_attn.k_proj.weight change rate: 0.0004119873046875
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 5.7220458984375e-06
model.layers.2.self_attn.k_proj.weight change rate: 0.0001220703125
model.layers.2.self_attn.v_proj.

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 18.756591796875


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007877349853515625
model.layers.0.self_attn.k_proj.weight change rate: 0.00054168701171875
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 9.059906005859375e-05
model.layers.1.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.0
model.layers.2.self_attn.k_proj.weight change rate: 0.0001068115234375
model.layers.2.self_attn.v_proj.weight chan

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0005979537963867188
model.layers.0.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0002079010009765625
model.layers.1.self_attn.k_proj.weight change rate: 0.00074005126953125
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 3.0517578125e-05
model.layers.2.self_attn.k_proj.weight change rate: 0.0009002685546875
model.layers.2.self_attn.v_proj

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006170272827148438
model.layers.0.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0008707046508789062
model.layers.1.self_attn.k_proj.weight change rate: 0.000518798828125
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 4.38690185546875e-05
model.layers.2.self_attn.k_proj.weight change rate: 0.0003204345703125
model.layers.2.self_attn.v_pr

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0008325576782226562
model.layers.0.self_attn.k_proj.weight change rate: 0.0006561279296875
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 0.0
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 3.467906708465307e-06
model.layers.1.self_attn.q_proj.weight change rate: 0.0004444122314453125
model.layers.1.self_attn.k_proj.weight change rate: 0.0005340576171875
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 9.250640869140625e-05
model.layers.2.self_attn.k_proj.weight change rate: 0.00051116943359375
model.la

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007371902465820312
model.layers.0.self_attn.k_proj.weight change rate: 0.00070953369140625
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 7.62939453125e-06
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 1.3871626833861228e-06
model.layers.1.self_attn.q_proj.weight change rate: 0.00040340423583984375
model.layers.1.self_attn.k_proj.weight change rate: 0.00087738037109375
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 6.935813416930614e-07
model.layers.2.self_attn.q_proj.weight change rate: 0.0001468658447265625
model.layers.2.self_attn.k_proj.weight chang

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 16.677682876586914


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007305145263671875
model.layers.0.self_attn.k_proj.weight change rate: 0.0006256103515625
model.layers.0.self_attn.v_proj.weight change rate: 7.62939453125e-06
model.layers.0.self_attn.o_proj.weight change rate: 1.239776611328125e-05
model.layers.0.mlp.gate_proj.weight change rate: 6.935813416930614e-07
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.00040531158447265625
model.layers.1.self_attn.k_proj.weight change rate: 0.0007781982421875
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.000217437744140625
model.layers.2.self_attn.k_proj.weight change ra

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006761550903320312
model.layers.0.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.0.self_attn.v_proj.weight change rate: 0.0
model.layers.0.self_attn.o_proj.weight change rate: 2.86102294921875e-06
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 6.935813416930614e-07
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0008974075317382812
model.layers.1.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 1.9073486328125e-06
model.layers.1.mlp.gate_proj.weight change rate: 1.3871626833861228e-06
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 1.3871626833861228e-06
model.layers.2.self_attn.q_proj.weight change rate: 0.00028705596923828125
model.la

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006227493286132812
model.layers.0.self_attn.k_proj.weight change rate: 0.00069427490234375
model.layers.0.self_attn.v_proj.weight change rate: 7.62939453125e-06
model.layers.0.self_attn.o_proj.weight change rate: 5.7220458984375e-06
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 3.467906708465307e-06
model.layers.1.self_attn.q_proj.weight change rate: 0.0005655288696289062
model.layers.1.self_attn.k_proj.weight change rate: 0.00079345703125
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.000469207763671875
model.layers.2.self_attn.k_proj.weight change rate: 

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0008096694946289062
model.layers.0.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.0.self_attn.v_proj.weight change rate: 0.0001068115234375
model.layers.0.self_attn.o_proj.weight change rate: 8.7738037109375e-05
model.layers.0.mlp.gate_proj.weight change rate: 2.7743253667722456e-06
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0008392333984375
model.layers.1.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.1.self_attn.v_proj.weight change rate: 1.52587890625e-05
model.layers.1.self_attn.o_proj.weight change rate: 1.1444091796875e-05
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 2.0807440250791842e-06
model.layers.2.self_attn.q_proj.weight change rate: 0.00016021728515625
mod

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006494522094726562
model.layers.0.self_attn.k_proj.weight change rate: 0.00080108642578125
model.layers.0.self_attn.v_proj.weight change rate: 8.392333984375e-05
model.layers.0.self_attn.o_proj.weight change rate: 5.7220458984375e-05
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 2.0807440250791842e-06
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0008020401000976562
model.layers.1.self_attn.k_proj.weight change rate: 0.00080108642578125
model.layers.1.self_attn.v_proj.weight change rate: 4.57763671875e-05
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 6.935813416930614e-07
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.00016880035400390625
model.layers.

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 15.578177452087402


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000652313232421875
model.layers.0.self_attn.k_proj.weight change rate: 0.00067138671875
model.layers.0.self_attn.v_proj.weight change rate: 3.814697265625e-05
model.layers.0.self_attn.o_proj.weight change rate: 7.152557373046875e-05
model.layers.0.mlp.gate_proj.weight change rate: 3.467906708465307e-06
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0007658004760742188
model.layers.1.self_attn.k_proj.weight change rate: 0.0007171630859375
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 2.288818359375e-05
model.layers.1.mlp.gate_proj.weight change rate: 6.935813416930614e-07
model.layers.1.mlp.up_proj.weight change rate: 6.935813416930614e-07
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.0002689361572265625

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006570816040039062
model.layers.0.self_attn.k_proj.weight change rate: 0.00063323974609375
model.layers.0.self_attn.v_proj.weight change rate: 3.0517578125e-05
model.layers.0.self_attn.o_proj.weight change rate: 8.106231689453125e-05
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 3.467906708465307e-06
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0007619857788085938
model.layers.1.self_attn.k_proj.weight change rate: 0.0006103515625
model.layers.1.self_attn.v_proj.weight change rate: 0.0
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 6.935813416930614e-07
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.0003147125244140625
model.layers.2.self_attn.k_proj.w

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000823974609375
model.layers.0.self_attn.k_proj.weight change rate: 0.00064849853515625
model.layers.0.self_attn.v_proj.weight change rate: 3.814697265625e-05
model.layers.0.self_attn.o_proj.weight change rate: 0.00023937225341796875
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 2.0807440250791842e-06
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.000568389892578125
model.layers.1.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.1.self_attn.v_proj.weight change rate: 7.62939453125e-05
model.layers.1.self_attn.o_proj.weight change rate: 1.9073486328125e-06
model.layers.1.mlp.gate_proj.weight change rate: 1.3871626833861228e-06
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.00047016143798828125

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006923675537109375
model.layers.0.self_attn.k_proj.weight change rate: 0.000701904296875
model.layers.0.self_attn.v_proj.weight change rate: 0.000213623046875
model.layers.0.self_attn.o_proj.weight change rate: 0.00030994415283203125
model.layers.0.mlp.gate_proj.weight change rate: 7.62939453125e-06
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 3.1211158784572035e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0005540847778320312
model.layers.1.self_attn.k_proj.weight change rate: 0.00086212158203125
model.layers.1.self_attn.v_proj.weight change rate: 2.288818359375e-05
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 5.548650733544491e-06
model.layers.2.self_attn.q_proj.weight change rate: 0.0005311965942382812

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006856918334960938
model.layers.0.self_attn.k_proj.weight change rate: 0.00067138671875
model.layers.0.self_attn.v_proj.weight change rate: 0.0001220703125
model.layers.0.self_attn.o_proj.weight change rate: 0.00041866302490234375
model.layers.0.mlp.gate_proj.weight change rate: 9.016557669383474e-06
model.layers.0.mlp.up_proj.weight change rate: 6.935813416930614e-06
model.layers.0.mlp.down_proj.weight change rate: 9.016557669383474e-06
model.layers.1.self_attn.q_proj.weight change rate: 0.0005598068237304688
model.layers.1.self_attn.k_proj.weight change rate: 0.0007781982421875
model.layers.1.self_attn.v_proj.weight change rate: 2.288818359375e-05
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 4.855069164477754e-06
model.layers.1.mlp.up_proj.weight change rate: 3.467906708465307e-06
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 14.835488319396973


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00069427490234375
model.layers.0.self_attn.k_proj.weight change rate: 0.00060272216796875
model.layers.0.self_attn.v_proj.weight change rate: 0.00031280517578125
model.layers.0.self_attn.o_proj.weight change rate: 0.0005702972412109375
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 6.935813416930614e-06
model.layers.0.mlp.down_proj.weight change rate: 2.080743979604449e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0005578994750976562
model.layers.1.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.v_proj.weight change rate: 2.288818359375e-05
model.layers.1.self_attn.o_proj.weight change rate: 0.000316619873046875
model.layers.1.mlp.gate_proj.weight change rate: 2.0807440250791842e-06
model.layers.1.mlp.up_proj.weight change rate: 2.0807440250791842e-06
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.we

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007038116455078125
model.layers.0.self_attn.k_proj.weight change rate: 0.00074005126953125
model.layers.0.self_attn.v_proj.weight change rate: 9.918212890625e-05
model.layers.0.self_attn.o_proj.weight change rate: 0.000644683837890625
model.layers.0.mlp.gate_proj.weight change rate: 1.3178045264794491e-05
model.layers.0.mlp.up_proj.weight change rate: 8.322976100316737e-06
model.layers.0.mlp.down_proj.weight change rate: 1.6645952200633474e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.000553131103515625
model.layers.1.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.1.self_attn.v_proj.weight change rate: 0.000244140625
model.layers.1.self_attn.o_proj.weight change rate: 0.0001811981201171875
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 6.935813416930614e-06
model.layers.1.mlp.down_proj.weight change rate: 9.016557669383474e-06
model.layers.2.self

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0008363723754882812
model.layers.0.self_attn.k_proj.weight change rate: 0.0006103515625
model.layers.0.self_attn.v_proj.weight change rate: 0.00046539306640625
model.layers.0.self_attn.o_proj.weight change rate: 0.0006456375122070312
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 2.7743253667722456e-06
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0005645751953125
model.layers.1.self_attn.k_proj.weight change rate: 0.00064849853515625
model.layers.1.self_attn.v_proj.weight change rate: 0.00017547607421875
model.layers.1.self_attn.o_proj.weight change rate: 2.86102294921875e-06
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 2.7743253667722456e-06
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.00054931640625
mod

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007953643798828125
model.layers.0.self_attn.k_proj.weight change rate: 0.00072479248046875
model.layers.0.self_attn.v_proj.weight change rate: 0.0002593994140625
model.layers.0.self_attn.o_proj.weight change rate: 0.0006818771362304688
model.layers.0.mlp.gate_proj.weight change rate: 2.7743253667722456e-05
model.layers.0.mlp.up_proj.weight change rate: 5.548650733544491e-06
model.layers.0.mlp.down_proj.weight change rate: 3.814697265625e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0009145736694335938
model.layers.1.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.1.self_attn.v_proj.weight change rate: 8.392333984375e-05
model.layers.1.self_attn.o_proj.weight change rate: 3.814697265625e-06
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 4.855069164477754e-06
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weigh

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00074005126953125
model.layers.0.self_attn.k_proj.weight change rate: 0.00063323974609375
model.layers.0.self_attn.v_proj.weight change rate: 0.000244140625
model.layers.0.self_attn.o_proj.weight change rate: 0.0006952285766601562
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 2.2194602934177965e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0005731582641601562
model.layers.1.self_attn.k_proj.weight change rate: 0.0005645751953125
model.layers.1.self_attn.v_proj.weight change rate: 0.00057220458984375
model.layers.1.self_attn.o_proj.weight change rate: 1.1444091796875e-05
model.layers.1.mlp.gate_proj.weight change rate: 1.8033115338766947e-05
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 0.000911712646484375
m

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.996399879455566


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0008077621459960938
model.layers.0.self_attn.k_proj.weight change rate: 0.000579833984375
model.layers.0.self_attn.v_proj.weight change rate: 0.00025177001953125
model.layers.0.self_attn.o_proj.weight change rate: 0.0006341934204101562
model.layers.0.mlp.gate_proj.weight change rate: 0.0
model.layers.0.mlp.up_proj.weight change rate: 2.4275346731883474e-05
model.layers.0.mlp.down_proj.weight change rate: 3.7453391996677965e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0005664825439453125
model.layers.1.self_attn.k_proj.weight change rate: 0.00079345703125
model.layers.1.self_attn.v_proj.weight change rate: 0.00038909912109375
model.layers.1.self_attn.o_proj.weight change rate: 0.00090789794921875
model.layers.1.mlp.gate_proj.weight change rate: 6.935813416930614e-06
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 2.288818359375e-05
model.layers.2.self_attn.q_proj.weight

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007305145263671875
model.layers.0.self_attn.k_proj.weight change rate: 0.00066375732421875
model.layers.0.self_attn.v_proj.weight change rate: 0.00028228759765625
model.layers.0.self_attn.o_proj.weight change rate: 0.0006618499755859375
model.layers.0.mlp.gate_proj.weight change rate: 4.022771827294491e-05
model.layers.0.mlp.up_proj.weight change rate: 1.8033115338766947e-05
model.layers.0.mlp.down_proj.weight change rate: 3.675981133710593e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0005578994750976562
model.layers.1.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.1.self_attn.v_proj.weight change rate: 0.00014495849609375
model.layers.1.self_attn.o_proj.weight change rate: 1.52587890625e-05
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 1.8033115338766947e-05
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007123947143554688
model.layers.0.self_attn.k_proj.weight change rate: 0.00077056884765625
model.layers.0.self_attn.v_proj.weight change rate: 0.00029754638671875
model.layers.0.self_attn.o_proj.weight change rate: 0.0005817413330078125
model.layers.0.mlp.gate_proj.weight change rate: 2.0113859136472456e-05
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 4.9937858420889825e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.000560760498046875
model.layers.1.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.1.self_attn.v_proj.weight change rate: 0.00075531005859375
model.layers.1.self_attn.o_proj.weight change rate: 0.0006666183471679688
model.layers.1.mlp.gate_proj.weight change rate: 3.467906708465307e-07
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 1.9420276657911018e-05
model.layers.2.self_attn.q_pr

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007162094116210938
model.layers.0.self_attn.k_proj.weight change rate: 0.00070953369140625
model.layers.0.self_attn.v_proj.weight change rate: 0.00081634521484375
model.layers.0.self_attn.o_proj.weight change rate: 0.0007266998291015625
model.layers.0.mlp.gate_proj.weight change rate: 2.6356090529588982e-05
model.layers.0.mlp.up_proj.weight change rate: 3.467906572041102e-05
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0005655288696289062
model.layers.1.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.1.self_attn.v_proj.weight change rate: 0.0007476806640625
model.layers.1.self_attn.o_proj.weight change rate: 3.719329833984375e-05
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 2.2194602934177965e-05
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rat

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007162094116210938
model.layers.0.self_attn.k_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.0.self_attn.o_proj.weight change rate: 0.00061798095703125
model.layers.0.mlp.gate_proj.weight change rate: 2.913041680585593e-05
model.layers.0.mlp.up_proj.weight change rate: 1.8033115338766947e-05
model.layers.0.mlp.down_proj.weight change rate: 7.62939453125e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0005731582641601562
model.layers.1.self_attn.k_proj.weight change rate: 0.00060272216796875
model.layers.1.self_attn.v_proj.weight change rate: 0.0006256103515625
model.layers.1.self_attn.o_proj.weight change rate: 3.62396240234375e-05
model.layers.1.mlp.gate_proj.weight change rate: 3.0517578125e-05
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 2.358176607231144e-05
model.layers.2.self_attn

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.486318588256836


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0008230209350585938
model.layers.0.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.0.self_attn.v_proj.weight change rate: 0.0007781982421875
model.layers.0.self_attn.o_proj.weight change rate: 0.0007266998291015625
model.layers.0.mlp.gate_proj.weight change rate: 2.913041680585593e-05
model.layers.0.mlp.up_proj.weight change rate: 2.4968929210444912e-05
model.layers.0.mlp.down_proj.weight change rate: 3.467906708465307e-07
model.layers.1.self_attn.q_proj.weight change rate: 0.0009126663208007812
model.layers.1.self_attn.k_proj.weight change rate: 0.0006103515625
model.layers.1.self_attn.v_proj.weight change rate: 0.00034332275390625
model.layers.1.self_attn.o_proj.weight change rate: 0.00095367431640625
model.layers.1.mlp.gate_proj.weight change rate: 1.733953286020551e-05
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weigh

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00080108642578125
model.layers.0.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.0.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.0.self_attn.o_proj.weight change rate: 0.0006465911865234375
model.layers.0.mlp.gate_proj.weight change rate: 3.1211158784572035e-05
model.layers.0.mlp.up_proj.weight change rate: 3.6066230677533895e-05
model.layers.0.mlp.down_proj.weight change rate: 3.467906708465307e-07
model.layers.1.self_attn.q_proj.weight change rate: 0.0005617141723632812
model.layers.1.self_attn.k_proj.weight change rate: 0.0008392333984375
model.layers.1.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.1.self_attn.o_proj.weight change rate: 0.00077056884765625
model.layers.1.mlp.gate_proj.weight change rate: 1.52587890625e-05
model.layers.1.mlp.up_proj.weight change rate: 3.1211158784572035e-05
model.layers.1.mlp.down_proj.weight change rate: 2.843683432729449e-05
mod

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007982254028320312
model.layers.0.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.0.self_attn.v_proj.weight change rate: 0.00075531005859375
model.layers.0.self_attn.o_proj.weight change rate: 0.0006628036499023438
model.layers.0.mlp.gate_proj.weight change rate: 3.467906572041102e-05
model.layers.0.mlp.up_proj.weight change rate: 2.843683432729449e-05
model.layers.0.mlp.down_proj.weight change rate: 0.00013282083091326058
model.layers.1.self_attn.q_proj.weight change rate: 0.0008792877197265625
model.layers.1.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.v_proj.weight change rate: 0.00067138671875
model.layers.1.self_attn.o_proj.weight change rate: 7.534027099609375e-05
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 1.6645952200633474e-05
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.we

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007228851318359375
model.layers.0.self_attn.k_proj.weight change rate: 0.00060272216796875
model.layers.0.self_attn.v_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.o_proj.weight change rate: 0.0007047653198242188
model.layers.0.mlp.gate_proj.weight change rate: 4.300204454921186e-05
model.layers.0.mlp.up_proj.weight change rate: 3.0517578125e-05
model.layers.0.mlp.down_proj.weight change rate: 7.941506191855296e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0005693435668945312
model.layers.1.self_attn.k_proj.weight change rate: 0.00075531005859375
model.layers.1.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.1.self_attn.o_proj.weight change rate: 0.0009222030639648438
model.layers.1.mlp.gate_proj.weight change rate: 3.259832374169491e-05
model.layers.1.mlp.up_proj.weight change rate: 2.080743979604449e-05
model.layers.1.mlp.down_proj.weight change rate: 2.4275346731883474e-05

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007009506225585938
model.layers.0.self_attn.k_proj.weight change rate: 0.000732421875
model.layers.0.self_attn.v_proj.weight change rate: 0.0006561279296875
model.layers.0.self_attn.o_proj.weight change rate: 0.0006322860717773438
model.layers.0.mlp.gate_proj.weight change rate: 4.022771827294491e-05
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 8.531050843885168e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.000568389892578125
model.layers.1.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.1.self_attn.v_proj.weight change rate: 0.00035858154296875
model.layers.1.self_attn.o_proj.weight change rate: 0.0007915496826171875
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 2.4968929210444912e-05
model.layers.1.mlp.down_proj.weight change rate: 3.259832374169491e-05
model.layers.2.self_attn.q_proj.weigh

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.237404823303223


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007104873657226562
model.layers.0.self_attn.k_proj.weight change rate: 0.0007171630859375
model.layers.0.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.000644683837890625
model.layers.0.mlp.gate_proj.weight change rate: 5.930120460106991e-05
model.layers.0.mlp.up_proj.weight change rate: 6.0341575590427965e-05
model.layers.0.mlp.down_proj.weight change rate: 8.461692050332204e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0005559921264648438
model.layers.1.self_attn.k_proj.weight change rate: 0.000732421875
model.layers.1.self_attn.v_proj.weight change rate: 0.00077056884765625
model.layers.1.self_attn.o_proj.weight change rate: 0.0007715225219726562
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 2.0113859136472456e-05
model.layers.2.self_attn.q_proj.weig

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0008020401000976562
model.layers.0.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.0.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.0.self_attn.o_proj.weight change rate: 0.0006389617919921875
model.layers.0.mlp.gate_proj.weight change rate: 1.7339533542326535e-06
model.layers.0.mlp.up_proj.weight change rate: 6.935813416930614e-07
model.layers.0.mlp.down_proj.weight change rate: 6.935813416930614e-07
model.layers.1.self_attn.q_proj.weight change rate: 0.0008630752563476562
model.layers.1.self_attn.k_proj.weight change rate: 0.00072479248046875
model.layers.1.self_attn.v_proj.weight change rate: 0.00069427490234375
model.layers.1.self_attn.o_proj.weight change rate: 0.000179290771484375
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 5.826083361171186e-05
model.layers.1.mlp.down_proj.weight change rate: 2.358176607231144e-05
model.layers

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007801055908203125
model.layers.0.self_attn.k_proj.weight change rate: 0.000732421875
model.layers.0.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.0.self_attn.o_proj.weight change rate: 0.0005893707275390625
model.layers.0.mlp.gate_proj.weight change rate: 9.293990297010168e-05
model.layers.0.mlp.up_proj.weight change rate: 1.0403720125395921e-06
model.layers.0.mlp.down_proj.weight change rate: 1.0403720125395921e-06
model.layers.1.self_attn.q_proj.weight change rate: 0.000827789306640625
model.layers.1.self_attn.k_proj.weight change rate: 0.00063323974609375
model.layers.1.self_attn.v_proj.weight change rate: 0.00069427490234375
model.layers.1.self_attn.o_proj.weight change rate: 0.0008668899536132812
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 3.710660166689195e-05
model.layers.1.mlp.down_proj.weight change rate: 3.329190440126695e-05
model.layers.2.s

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007266998291015625
model.layers.0.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.0.self_attn.v_proj.weight change rate: 0.00075531005859375
model.layers.0.self_attn.o_proj.weight change rate: 0.0005712509155273438
model.layers.0.mlp.gate_proj.weight change rate: 7.62939453125e-05
model.layers.0.mlp.up_proj.weight change rate: 0.00011270696995779872
model.layers.0.mlp.down_proj.weight change rate: 3.467906708465307e-07
model.layers.1.self_attn.q_proj.weight change rate: 0.000591278076171875
model.layers.1.self_attn.k_proj.weight change rate: 0.00081634521484375
model.layers.1.self_attn.v_proj.weight change rate: 0.000701904296875
model.layers.1.self_attn.o_proj.weight change rate: 0.00021648406982421875
model.layers.1.mlp.gate_proj.weight change rate: 0.00012727217108476907
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 4.646995148505084e-05
model.layers.2.sel

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0008020401000976562
model.layers.0.self_attn.k_proj.weight change rate: 0.00069427490234375
model.layers.0.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.0005865097045898438
model.layers.0.mlp.gate_proj.weight change rate: 6.589022814296186e-05
model.layers.0.mlp.up_proj.weight change rate: 1.0403720125395921e-06
model.layers.0.mlp.down_proj.weight change rate: 0.00031453915289603174
model.layers.1.self_attn.q_proj.weight change rate: 0.0008115768432617188
model.layers.1.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.1.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.1.self_attn.o_proj.weight change rate: 0.0008554458618164062
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 4.092129893251695e-05
model.layers.2.self_attn.q_pro

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.015673637390137


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00075531005859375
model.layers.0.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.0.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.0.self_attn.o_proj.weight change rate: 0.000568389892578125
model.layers.0.mlp.gate_proj.weight change rate: 9.918212890625e-05
model.layers.0.mlp.up_proj.weight change rate: 1.0403720125395921e-06
model.layers.0.mlp.down_proj.weight change rate: 2.0807440250791842e-06
model.layers.1.self_attn.q_proj.weight change rate: 0.0007762908935546875
model.layers.1.self_attn.k_proj.weight change rate: 0.0007171630859375
model.layers.1.self_attn.v_proj.weight change rate: 0.000701904296875
model.layers.1.self_attn.o_proj.weight change rate: 0.0007953643798828125
model.layers.1.mlp.gate_proj.weight change rate: 3.467906708465307e-07
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight 

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006961822509765625
model.layers.0.self_attn.k_proj.weight change rate: 0.00070953369140625
model.layers.0.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.0.self_attn.o_proj.weight change rate: 0.0005664825439453125
model.layers.0.mlp.gate_proj.weight change rate: 3.467906708465307e-06
model.layers.0.mlp.up_proj.weight change rate: 1.0403720125395921e-06
model.layers.0.mlp.down_proj.weight change rate: 1.7339533542326535e-06
model.layers.1.self_attn.q_proj.weight change rate: 0.0007562637329101562
model.layers.1.self_attn.k_proj.weight change rate: 0.00066375732421875
model.layers.1.self_attn.v_proj.weight change rate: 0.00051116943359375
model.layers.1.self_attn.o_proj.weight change rate: 0.0007810592651367188
model.layers.1.mlp.gate_proj.weight change rate: 0.00014738603204023093
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 9.71013869275339e-05
model.laye

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006999969482421875
model.layers.0.self_attn.k_proj.weight change rate: 0.0005645751953125
model.layers.0.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.0005884170532226562
model.layers.0.mlp.gate_proj.weight change rate: 4.1614880501583684e-06
model.layers.0.mlp.up_proj.weight change rate: 0.00022333319066092372
model.layers.0.mlp.down_proj.weight change rate: 1.7339533542326535e-06
model.layers.1.self_attn.q_proj.weight change rate: 0.0007410049438476562
model.layers.1.self_attn.k_proj.weight change rate: 0.000823974609375
model.layers.1.self_attn.v_proj.weight change rate: 0.000701904296875
model.layers.1.self_attn.o_proj.weight change rate: 0.00030231475830078125
model.layers.1.mlp.gate_proj.weight change rate: 3.467906708465307e-07
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 7.490678399335593e-05
model.layers.2

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007190704345703125
model.layers.0.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.0.self_attn.v_proj.weight change rate: 0.000762939453125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008382797241210938
model.layers.0.mlp.gate_proj.weight change rate: 3.121116151305614e-06
model.layers.0.mlp.up_proj.weight change rate: 1.7339533542326535e-06
model.layers.0.mlp.down_proj.weight change rate: 2.7743253667722456e-06
model.layers.1.self_attn.q_proj.weight change rate: 0.0005970001220703125
model.layers.1.self_attn.k_proj.weight change rate: 0.0006256103515625
model.layers.1.self_attn.v_proj.weight change rate: 0.000518798828125
model.layers.1.self_attn.o_proj.weight change rate: 0.0008144378662109375
model.layers.1.mlp.gate_proj.weight change rate: 0.00013455477892421186
model.layers.1.mlp.up_proj.weight change rate: 6.935813416930614e-07
model.layers.1.mlp.down_proj.weight change rate: 7.525357796112075e-0

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007381439208984375
model.layers.0.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.0.self_attn.v_proj.weight change rate: 0.000701904296875
model.layers.0.self_attn.o_proj.weight change rate: 0.00069427490234375
model.layers.0.mlp.gate_proj.weight change rate: 5.548650733544491e-06
model.layers.0.mlp.up_proj.weight change rate: 3.814697265625e-06
model.layers.0.mlp.down_proj.weight change rate: 4.855069164477754e-06
model.layers.1.self_attn.q_proj.weight change rate: 0.0005559921264648438
model.layers.1.self_attn.k_proj.weight change rate: 0.00081634521484375
model.layers.1.self_attn.v_proj.weight change rate: 0.00074005126953125
model.layers.1.self_attn.o_proj.weight change rate: 0.0003871917724609375
model.layers.1.mlp.gate_proj.weight change rate: 8.739125041756779e-05
model.layers.1.mlp.up_proj.weight change rate: 0.0003218217461835593
model.layers.1.mlp.down_proj.weight change rate: 6.935813416930614e-07
mod

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.942079544067383


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0008325576782226562
model.layers.0.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.0.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.0.self_attn.o_proj.weight change rate: 0.0006132125854492188
model.layers.0.mlp.gate_proj.weight change rate: 7.62939453125e-06
model.layers.0.mlp.up_proj.weight change rate: 0.000270496733719483
model.layers.0.mlp.down_proj.weight change rate: 0.0007230585324577987
model.layers.1.self_attn.q_proj.weight change rate: 0.0005617141723632812
model.layers.1.self_attn.k_proj.weight change rate: 0.000823974609375
model.layers.1.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.1.self_attn.o_proj.weight change rate: 0.00032711029052734375
model.layers.1.mlp.gate_proj.weight change rate: 0.00011270696995779872
model.layers.1.mlp.up_proj.weight change rate: 6.935813416930614e-07
model.layers.1.mlp.down_proj.weight change rate: 3.467906708465307e-07
mo

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007085800170898438
model.layers.0.self_attn.k_proj.weight change rate: 0.00078582763671875
model.layers.0.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.0005731582641601562
model.layers.0.mlp.gate_proj.weight change rate: 0.0005565989995375276
model.layers.0.mlp.up_proj.weight change rate: 0.00016125765978358686
model.layers.0.mlp.down_proj.weight change rate: 4.855069164477754e-06
model.layers.1.self_attn.q_proj.weight change rate: 0.0008382797241210938
model.layers.1.self_attn.k_proj.weight change rate: 0.00070953369140625
model.layers.1.self_attn.v_proj.weight change rate: 0.00067901611328125
model.layers.1.self_attn.o_proj.weight change rate: 0.0003910064697265625
model.layers.1.mlp.gate_proj.weight change rate: 3.467906708465307e-06
model.layers.1.mlp.up_proj.weight change rate: 1.7339533542326535e-06
model.layers.1.mlp.down_proj.weight change rate: 0.0002285350492

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007123947143554688
model.layers.0.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.0.self_attn.v_proj.weight change rate: 0.00078582763671875
model.layers.0.self_attn.o_proj.weight change rate: 0.000560760498046875
model.layers.0.mlp.gate_proj.weight change rate: 0.000209808349609375
model.layers.0.mlp.up_proj.weight change rate: 0.00015709617582615465
model.layers.0.mlp.down_proj.weight change rate: 1.1444091796875e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.000774383544921875
model.layers.1.self_attn.k_proj.weight change rate: 0.0006561279296875
model.layers.1.self_attn.v_proj.weight change rate: 0.0006866455078125
model.layers.1.self_attn.o_proj.weight change rate: 0.0003986358642578125
model.layers.1.mlp.gate_proj.weight change rate: 0.0002094615629175678
model.layers.1.mlp.up_proj.weight change rate: 3.467906708465307e-07
model.layers.1.mlp.down_proj.weight change rate: 1.3871626833861228e-06


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000736236572265625
model.layers.0.self_attn.k_proj.weight change rate: 0.00064849853515625
model.layers.0.self_attn.v_proj.weight change rate: 0.000762939453125
model.layers.0.self_attn.o_proj.weight change rate: 0.000797271728515625
model.layers.0.mlp.gate_proj.weight change rate: 0.000244140625
model.layers.0.mlp.up_proj.weight change rate: 0.00017755682347342372
model.layers.0.mlp.down_proj.weight change rate: 0.0006550875841639936
model.layers.1.self_attn.q_proj.weight change rate: 0.0005788803100585938
model.layers.1.self_attn.k_proj.weight change rate: 0.00067138671875
model.layers.1.self_attn.v_proj.weight change rate: 0.00067901611328125
model.layers.1.self_attn.o_proj.weight change rate: 0.00047969818115234375
model.layers.1.mlp.gate_proj.weight change rate: 3.467906708465307e-06
model.layers.1.mlp.up_proj.weight change rate: 2.7743253667722456e-06
model.layers.1.mlp.down_proj.weight change rate: 0.00025281039415858686
mode

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007162094116210938
model.layers.0.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.0.self_attn.v_proj.weight change rate: 0.00051116943359375
model.layers.0.self_attn.o_proj.weight change rate: 0.0005941390991210938
model.layers.0.mlp.gate_proj.weight change rate: 0.00024448742624372244
model.layers.0.mlp.up_proj.weight change rate: 7.62939453125e-06
model.layers.0.mlp.down_proj.weight change rate: 0.00033083828748203814
model.layers.1.self_attn.q_proj.weight change rate: 0.0005731582641601562
model.layers.1.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.1.self_attn.v_proj.weight change rate: 0.0006256103515625
model.layers.1.self_attn.o_proj.weight change rate: 0.0008831024169921875
model.layers.1.mlp.gate_proj.weight change rate: 4.1614880501583684e-06
model.layers.1.mlp.up_proj.weight change rate: 0.000492095947265625
model.layers.1.mlp.down_proj.weight change rate: 0.00018449263006914407


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.860671997070312


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000804901123046875
model.layers.0.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.0.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.0005788803100585938
model.layers.0.mlp.gate_proj.weight change rate: 2.080743979604449e-05
model.layers.0.mlp.up_proj.weight change rate: 0.0002989335625898093
model.layers.0.mlp.down_proj.weight change rate: 0.00038909912109375
model.layers.1.self_attn.q_proj.weight change rate: 0.000823974609375
model.layers.1.self_attn.k_proj.weight change rate: 0.0008392333984375
model.layers.1.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007829666137695312
model.layers.1.mlp.gate_proj.weight change rate: 0.0003630898427218199
model.layers.1.mlp.up_proj.weight change rate: 6.589022632397246e-06
model.layers.1.mlp.down_proj.weight change rate: 3.814697265625e-06
model.la

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007581710815429688
model.layers.0.self_attn.k_proj.weight change rate: 0.00060272216796875
model.layers.0.self_attn.v_proj.weight change rate: 0.000762939453125
model.layers.0.self_attn.o_proj.weight change rate: 0.0007734298706054688
model.layers.0.mlp.gate_proj.weight change rate: 2.115423012583051e-05
model.layers.0.mlp.up_proj.weight change rate: 1.52587890625e-05
model.layers.0.mlp.down_proj.weight change rate: 2.4275346731883474e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0007724761962890625
model.layers.1.self_attn.k_proj.weight change rate: 0.00080108642578125
model.layers.1.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007505416870117188
model.layers.1.mlp.gate_proj.weight change rate: 0.00018587980594020337
model.layers.1.mlp.up_proj.weight change rate: 6.242232302611228e-06
model.layers.1.mlp.down_proj.weight change rate: 3.814697265625e-06
m

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000782012939453125
model.layers.0.self_attn.k_proj.weight change rate: 0.0006103515625
model.layers.0.self_attn.v_proj.weight change rate: 0.00051116943359375
model.layers.0.self_attn.o_proj.weight change rate: 0.0006647109985351562
model.layers.0.mlp.gate_proj.weight change rate: 0.0006221424555405974
model.layers.0.mlp.up_proj.weight change rate: 1.4565208402927965e-05
model.layers.0.mlp.down_proj.weight change rate: 0.0006169406115077436
model.layers.1.self_attn.q_proj.weight change rate: 0.0005817413330078125
model.layers.1.self_attn.k_proj.weight change rate: 0.00077056884765625
model.layers.1.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.1.self_attn.o_proj.weight change rate: 0.0005159378051757812
model.layers.1.mlp.gate_proj.weight change rate: 0.00021674417075701058
model.layers.1.mlp.up_proj.weight change rate: 8.669766430102754e-06
model.layers.1.mlp.down_proj.weight change rate: 6.935813416930614e-

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007429122924804688
model.layers.0.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.0.self_attn.v_proj.weight change rate: 0.00080108642578125
model.layers.0.self_attn.o_proj.weight change rate: 0.000652313232421875
model.layers.0.mlp.gate_proj.weight change rate: 3.155795275233686e-05
model.layers.0.mlp.up_proj.weight change rate: 0.0005624944460578263
model.layers.0.mlp.down_proj.weight change rate: 3.398548506083898e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0005702972412109375
model.layers.1.self_attn.k_proj.weight change rate: 0.00080108642578125
model.layers.1.self_attn.v_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007724761962890625
model.layers.1.mlp.gate_proj.weight change rate: 0.00023685803171247244
model.layers.1.mlp.up_proj.weight change rate: 0.0006419095443561673
model.layers.1.mlp.down_proj.weight change rate: 8.322976100316737e-0

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007925033569335938
model.layers.0.self_attn.k_proj.weight change rate: 0.00081634521484375
model.layers.0.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.0006160736083984375
model.layers.0.mlp.gate_proj.weight change rate: 0.0006072304677218199
model.layers.0.mlp.up_proj.weight change rate: 2.9823997465427965e-05
model.layers.0.mlp.down_proj.weight change rate: 0.0007147355936467648
model.layers.1.self_attn.q_proj.weight change rate: 0.000823974609375
model.layers.1.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.1.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.1.self_attn.o_proj.weight change rate: 0.000732421875
model.layers.1.mlp.gate_proj.weight change rate: 0.0002777793270070106
model.layers.1.mlp.up_proj.weight change rate: 0.00029789318796247244
model.layers.1.mlp.down_proj.weight change rate: 1.352483650407521e-05
model

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.791522026062012


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007190704345703125
model.layers.0.self_attn.k_proj.weight change rate: 0.0007171630859375
model.layers.0.self_attn.v_proj.weight change rate: 0.00078582763671875
model.layers.0.self_attn.o_proj.weight change rate: 0.0005674362182617188
model.layers.0.mlp.gate_proj.weight change rate: 4.300204454921186e-05
model.layers.0.mlp.up_proj.weight change rate: 3.398548506083898e-05
model.layers.0.mlp.down_proj.weight change rate: 5.583329766523093e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0007658004760742188
model.layers.1.self_attn.k_proj.weight change rate: 0.0008392333984375
model.layers.1.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.1.self_attn.o_proj.weight change rate: 0.0005235671997070312
model.layers.1.mlp.gate_proj.weight change rate: 0.00031315197702497244
model.layers.1.mlp.up_proj.weight change rate: 1.2831254935008474e-05
model.layers.1.mlp.down_proj.weight change rate: 0.00094708532560

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007915496826171875
model.layers.0.self_attn.k_proj.weight change rate: 0.0006561279296875
model.layers.0.self_attn.v_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.o_proj.weight change rate: 0.00057220458984375
model.layers.0.mlp.gate_proj.weight change rate: 0.0007313815294764936
model.layers.0.mlp.up_proj.weight change rate: 2.843683432729449e-05
model.layers.0.mlp.down_proj.weight change rate: 6.831776408944279e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0006017684936523438
model.layers.1.self_attn.k_proj.weight change rate: 0.00075531005859375
model.layers.1.self_attn.v_proj.weight change rate: 0.000732421875
model.layers.1.self_attn.o_proj.weight change rate: 0.0007686614990234375
model.layers.1.mlp.gate_proj.weight change rate: 2.913041680585593e-05
model.layers.1.mlp.up_proj.weight change rate: 0.0005271218251436949
model.layers.1.mlp.down_proj.weight change rate: 0.0003599687188398093
mod

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00079345703125
model.layers.0.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.0.self_attn.v_proj.weight change rate: 0.00070953369140625
model.layers.0.self_attn.o_proj.weight change rate: 0.0007781982421875
model.layers.0.mlp.gate_proj.weight change rate: 5.964799493085593e-05
model.layers.0.mlp.up_proj.weight change rate: 0.0009571422706358135
model.layers.0.mlp.down_proj.weight change rate: 8.357655315194279e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.0005712509155273438
model.layers.1.self_attn.k_proj.weight change rate: 0.0006561279296875
model.layers.1.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.1.self_attn.o_proj.weight change rate: 0.00070953369140625
model.layers.1.mlp.gate_proj.weight change rate: 0.0005690834950655699
model.layers.1.mlp.up_proj.weight change rate: 2.704967300815042e-05
model.layers.1.mlp.down_proj.weight change rate: 2.531571954023093e-05
model.l

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007839202880859375
model.layers.0.self_attn.k_proj.weight change rate: 0.000701904296875
model.layers.0.self_attn.v_proj.weight change rate: 0.000640869140625
model.layers.0.self_attn.o_proj.weight change rate: 0.0005817413330078125
model.layers.0.mlp.gate_proj.weight change rate: 0.0008229342638514936
model.layers.0.mlp.up_proj.weight change rate: 0.000540993467438966
model.layers.0.mlp.down_proj.weight change rate: 8.912520570447668e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.000820159912109375
model.layers.1.self_attn.k_proj.weight change rate: 0.00080108642578125
model.layers.1.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.1.self_attn.o_proj.weight change rate: 0.000522613525390625
model.layers.1.mlp.gate_proj.weight change rate: 3.849376298603602e-05
model.layers.1.mlp.up_proj.weight change rate: 3.329190440126695e-05
model.layers.1.mlp.down_proj.weight change rate: 3.259832374169491e-05
m

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000804901123046875
model.layers.0.self_attn.k_proj.weight change rate: 0.00077056884765625
model.layers.0.self_attn.v_proj.weight change rate: 0.000640869140625
model.layers.0.self_attn.o_proj.weight change rate: 0.0007333755493164062
model.layers.0.mlp.gate_proj.weight change rate: 0.0006061900639906526
model.layers.0.mlp.up_proj.weight change rate: 0.0005080483388155699
model.layers.0.mlp.down_proj.weight change rate: 0.0009058172581717372
model.layers.1.self_attn.q_proj.weight change rate: 0.00058746337890625
model.layers.1.self_attn.k_proj.weight change rate: 0.00070953369140625
model.layers.1.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.1.self_attn.o_proj.weight change rate: 0.000507354736328125
model.layers.1.mlp.gate_proj.weight change rate: 0.000678322569001466
model.layers.1.mlp.up_proj.weight change rate: 4.5082786527927965e-05
model.layers.1.mlp.down_proj.weight change rate: 4.092129893251695e-05


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.763656616210938


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007686614990234375
model.layers.0.self_attn.k_proj.weight change rate: 0.00069427490234375
model.layers.0.self_attn.v_proj.weight change rate: 0.000640869140625
model.layers.0.self_attn.o_proj.weight change rate: 0.0006895065307617188
model.layers.0.mlp.gate_proj.weight change rate: 0.000629425048828125
model.layers.0.mlp.up_proj.weight change rate: 0.0005174116813577712
model.layers.0.mlp.down_proj.weight change rate: 0.00012623181100934744
model.layers.1.self_attn.q_proj.weight change rate: 0.0005826950073242188
model.layers.1.self_attn.k_proj.weight change rate: 0.00067901611328125
model.layers.1.self_attn.v_proj.weight change rate: 0.00072479248046875
model.layers.1.self_attn.o_proj.weight change rate: 0.0007877349853515625
model.layers.1.mlp.gate_proj.weight change rate: 5.201860039960593e-05
model.layers.1.mlp.up_proj.weight change rate: 0.0009450045763514936
model.layers.1.mlp.down_proj.weight change rate: 4.092129893251695e

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007925033569335938
model.layers.0.self_attn.k_proj.weight change rate: 0.00063323974609375
model.layers.0.self_attn.v_proj.weight change rate: 0.00066375732421875
model.layers.0.self_attn.o_proj.weight change rate: 0.000614166259765625
model.layers.0.mlp.gate_proj.weight change rate: 0.000682137266267091
model.layers.0.mlp.up_proj.weight change rate: 8.0108642578125e-05
model.layers.0.mlp.down_proj.weight change rate: 0.00012796575902029872
model.layers.1.self_attn.q_proj.weight change rate: 0.0008115768432617188
model.layers.1.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.1.self_attn.v_proj.weight change rate: 0.00051116943359375
model.layers.1.self_attn.o_proj.weight change rate: 0.000522613525390625
model.layers.1.mlp.gate_proj.weight change rate: 0.0008250150131061673
model.layers.1.mlp.up_proj.weight change rate: 0.0007747303461655974
model.layers.1.mlp.down_proj.weight change rate: 0.0009456981788389385


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007495880126953125
model.layers.0.self_attn.k_proj.weight change rate: 0.00074005126953125
model.layers.0.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.0005750656127929688
model.layers.0.mlp.gate_proj.weight change rate: 0.00012345747381914407
model.layers.0.mlp.up_proj.weight change rate: 0.0008829290745779872
model.layers.0.mlp.down_proj.weight change rate: 0.0008655895362608135
model.layers.1.self_attn.q_proj.weight change rate: 0.0005950927734375
model.layers.1.self_attn.k_proj.weight change rate: 0.0006103515625
model.layers.1.self_attn.v_proj.weight change rate: 0.00072479248046875
model.layers.1.self_attn.o_proj.weight change rate: 0.0007638931274414062
model.layers.1.mlp.gate_proj.weight change rate: 0.0005961331771686673
model.layers.1.mlp.up_proj.weight change rate: 8.114901720546186e-05
model.layers.1.mlp.down_proj.weight change rate: 5.791404328192584e-05
m

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007839202880859375
model.layers.0.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.0.self_attn.v_proj.weight change rate: 0.000732421875
model.layers.0.self_attn.o_proj.weight change rate: 0.0005741119384765625
model.layers.0.mlp.gate_proj.weight change rate: 0.00013871626288164407
model.layers.0.mlp.up_proj.weight change rate: 9.744818089529872e-05
model.layers.0.mlp.down_proj.weight change rate: 0.0008440884994342923
model.layers.1.self_attn.q_proj.weight change rate: 0.0005731582641601562
model.layers.1.self_attn.k_proj.weight change rate: 0.00081634521484375
model.layers.1.self_attn.v_proj.weight change rate: 0.00061798095703125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007333755493164062
model.layers.1.mlp.gate_proj.weight change rate: 9.259310900233686e-05
model.layers.1.mlp.up_proj.weight change rate: 8.877841173671186e-05
model.layers.1.mlp.down_proj.weight change rate: 0.0009238503407686949


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007276535034179688
model.layers.0.self_attn.k_proj.weight change rate: 0.00060272216796875
model.layers.0.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.0008020401000976562
model.layers.0.mlp.gate_proj.weight change rate: 0.0008770336280576885
model.layers.0.mlp.up_proj.weight change rate: 9.432706428924575e-05
model.layers.0.mlp.down_proj.weight change rate: 0.00019767068442888558
model.layers.1.self_attn.q_proj.weight change rate: 0.000827789306640625
model.layers.1.self_attn.k_proj.weight change rate: 0.00067901611328125
model.layers.1.self_attn.v_proj.weight change rate: 0.00063323974609375
model.layers.1.self_attn.o_proj.weight change rate: 0.0006952285766601562
model.layers.1.mlp.gate_proj.weight change rate: 0.0009002685546875
model.layers.1.mlp.up_proj.weight change rate: 0.00011166659533046186
model.layers.1.mlp.down_proj.weight change rate: 0.00010091608419315

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.698882102966309


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007877349853515625
model.layers.0.self_attn.k_proj.weight change rate: 0.00060272216796875
model.layers.0.self_attn.v_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.o_proj.weight change rate: 0.0006837844848632812
model.layers.0.mlp.gate_proj.weight change rate: 0.00018033114611171186
model.layers.0.mlp.up_proj.weight change rate: 0.000888130918610841
model.layers.0.mlp.down_proj.weight change rate: 0.0008024736307561398
model.layers.1.self_attn.q_proj.weight change rate: 0.0007648468017578125
model.layers.1.self_attn.k_proj.weight change rate: 0.00069427490234375
model.layers.1.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.1.self_attn.o_proj.weight change rate: 0.0005388259887695312
model.layers.1.mlp.gate_proj.weight change rate: 0.00012033636448904872
model.layers.1.mlp.up_proj.weight change rate: 0.00012449784844648093
model.layers.1.mlp.down_proj.weight change rate: 0.000106811523

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007162094116210938
model.layers.0.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.0.self_attn.v_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.o_proj.weight change rate: 0.0006742477416992188
model.layers.0.mlp.gate_proj.weight change rate: 0.0001848394313128665
model.layers.0.mlp.up_proj.weight change rate: 0.0001407970121363178
model.layers.0.mlp.down_proj.weight change rate: 0.0007688348996452987
model.layers.1.self_attn.q_proj.weight change rate: 0.0005941390991210938
model.layers.1.self_attn.k_proj.weight change rate: 0.00078582763671875
model.layers.1.self_attn.v_proj.weight change rate: 0.000732421875
model.layers.1.self_attn.o_proj.weight change rate: 0.0007467269897460938
model.layers.1.mlp.gate_proj.weight change rate: 0.0008662830805405974
model.layers.1.mlp.up_proj.weight change rate: 0.00013559515355154872
model.layers.1.mlp.down_proj.weight change rate: 0.00013282083091326058

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000705718994140625
model.layers.0.self_attn.k_proj.weight change rate: 0.000701904296875
model.layers.0.self_attn.v_proj.weight change rate: 0.00064849853515625
model.layers.0.self_attn.o_proj.weight change rate: 0.00061798095703125
model.layers.0.mlp.gate_proj.weight change rate: 0.00021396984811872244
model.layers.0.mlp.up_proj.weight change rate: 0.000853798643220216
model.layers.0.mlp.down_proj.weight change rate: 0.0007729963981546462
model.layers.1.self_attn.q_proj.weight change rate: 0.0008106231689453125
model.layers.1.self_attn.k_proj.weight change rate: 0.00064849853515625
model.layers.1.self_attn.v_proj.weight change rate: 0.00067138671875
model.layers.1.self_attn.o_proj.weight change rate: 0.0005197525024414062
model.layers.1.mlp.gate_proj.weight change rate: 0.00014807961997576058
model.layers.1.mlp.up_proj.weight change rate: 0.0008697509765625
model.layers.1.mlp.down_proj.weight change rate: 0.0008603876340202987
mode

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000728607177734375
model.layers.0.self_attn.k_proj.weight change rate: 0.0006256103515625
model.layers.0.self_attn.v_proj.weight change rate: 0.00067138671875
model.layers.0.self_attn.o_proj.weight change rate: 0.0006885528564453125
model.layers.0.mlp.gate_proj.weight change rate: 0.0008603876340202987
model.layers.0.mlp.up_proj.weight change rate: 0.00018241189536638558
model.layers.0.mlp.down_proj.weight change rate: 0.0003218217461835593
model.layers.1.self_attn.q_proj.weight change rate: 0.0007486343383789062
model.layers.1.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.1.self_attn.v_proj.weight change rate: 0.000518798828125
model.layers.1.self_attn.o_proj.weight change rate: 0.0005130767822265625
model.layers.1.mlp.gate_proj.weight change rate: 0.0001636851957300678
model.layers.1.mlp.up_proj.weight change rate: 0.00019385598716326058
model.layers.1.mlp.down_proj.weight change rate: 0.0008555325912311673

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007944107055664062
model.layers.0.self_attn.k_proj.weight change rate: 0.00060272216796875
model.layers.0.self_attn.v_proj.weight change rate: 0.000518798828125
model.layers.0.self_attn.o_proj.weight change rate: 0.0005731582641601562
model.layers.0.mlp.gate_proj.weight change rate: 0.00029477206408046186
model.layers.0.mlp.up_proj.weight change rate: 0.0008322976063936949
model.layers.0.mlp.down_proj.weight change rate: 0.00031904742354527116
model.layers.1.self_attn.q_proj.weight change rate: 0.0007619857788085938
model.layers.1.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.1.self_attn.v_proj.weight change rate: 0.000701904296875
model.layers.1.self_attn.o_proj.weight change rate: 0.0005159378051757812
model.layers.1.mlp.gate_proj.weight change rate: 0.000843048095703125
model.layers.1.mlp.up_proj.weight change rate: 0.00020079179375898093
model.layers.1.mlp.down_proj.weight change rate: 0.000827789306640625

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.730141639709473


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007314682006835938
model.layers.0.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.0.self_attn.v_proj.weight change rate: 0.00052642822265625
model.layers.0.self_attn.o_proj.weight change rate: 0.0007143020629882812
model.layers.0.mlp.gate_proj.weight change rate: 0.00025974621530622244
model.layers.0.mlp.up_proj.weight change rate: 0.00022541393991559744
model.layers.0.mlp.down_proj.weight change rate: 0.0007941506337374449
model.layers.1.self_attn.q_proj.weight change rate: 0.0005931854248046875
model.layers.1.self_attn.k_proj.weight change rate: 0.00079345703125
model.layers.1.self_attn.v_proj.weight change rate: 0.00063323974609375
model.layers.1.self_attn.o_proj.weight change rate: 0.0005207061767578125
model.layers.1.mlp.gate_proj.weight change rate: 0.0007993524777702987
model.layers.1.mlp.up_proj.weight change rate: 0.0002288818359375
model.layers.1.mlp.down_proj.weight change rate: 0.00021431663481052965

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0008039474487304688
model.layers.0.self_attn.k_proj.weight change rate: 0.0007171630859375
model.layers.0.self_attn.v_proj.weight change rate: 0.00052642822265625
model.layers.0.self_attn.o_proj.weight change rate: 0.0006561279296875
model.layers.0.mlp.gate_proj.weight change rate: 0.000838539854157716
model.layers.0.mlp.up_proj.weight change rate: 0.000812530517578125
model.layers.0.mlp.down_proj.weight change rate: 0.00045949764898978174
model.layers.1.self_attn.q_proj.weight change rate: 0.0007724761962890625
model.layers.1.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.1.self_attn.v_proj.weight change rate: 0.00051116943359375
model.layers.1.self_attn.o_proj.weight change rate: 0.0008306503295898438
model.layers.1.mlp.gate_proj.weight change rate: 0.0007650202023796737
model.layers.1.mlp.up_proj.weight change rate: 0.0008409674046561122
model.layers.1.mlp.down_proj.weight change rate: 0.0007983121322467923

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007925033569335938
model.layers.0.self_attn.k_proj.weight change rate: 0.0006256103515625
model.layers.0.self_attn.v_proj.weight change rate: 0.00080108642578125
model.layers.0.self_attn.o_proj.weight change rate: 0.0005893707275390625
model.layers.0.mlp.gate_proj.weight change rate: 0.0007882551872171462
model.layers.0.mlp.up_proj.weight change rate: 0.0007868680404499173
model.layers.0.mlp.down_proj.weight change rate: 0.0004074790340382606
model.layers.1.self_attn.q_proj.weight change rate: 0.000591278076171875
model.layers.1.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.v_proj.weight change rate: 0.00066375732421875
model.layers.1.self_attn.o_proj.weight change rate: 0.0007457733154296875
model.layers.1.mlp.gate_proj.weight change rate: 0.0007483742665499449
model.layers.1.mlp.up_proj.weight change rate: 0.0007740368018858135
model.layers.1.mlp.down_proj.weight change rate: 0.000252463622018694

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007963180541992188
model.layers.0.self_attn.k_proj.weight change rate: 0.00074005126953125
model.layers.0.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.0.self_attn.o_proj.weight change rate: 0.0005779266357421875
model.layers.0.mlp.gate_proj.weight change rate: 0.0007625927100889385
model.layers.0.mlp.up_proj.weight change rate: 0.000785134092438966
model.layers.0.mlp.down_proj.weight change rate: 0.0004484003293327987
model.layers.1.self_attn.q_proj.weight change rate: 0.0005950927734375
model.layers.1.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.1.self_attn.v_proj.weight change rate: 0.0006866455078125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007343292236328125
model.layers.1.mlp.gate_proj.weight change rate: 0.00031766024767421186
model.layers.1.mlp.up_proj.weight change rate: 0.0003311850887257606
model.layers.1.mlp.down_proj.weight change rate: 0.000289570220047608


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007200241088867188
model.layers.0.self_attn.k_proj.weight change rate: 0.00066375732421875
model.layers.0.self_attn.v_proj.weight change rate: 0.00083160400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.0005655288696289062
model.layers.0.mlp.gate_proj.weight change rate: 0.0007563504623249173
model.layers.0.mlp.up_proj.weight change rate: 0.0003516457509249449
model.layers.0.mlp.down_proj.weight change rate: 0.000811836973298341
model.layers.1.self_attn.q_proj.weight change rate: 0.0008058547973632812
model.layers.1.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.1.self_attn.v_proj.weight change rate: 0.00051116943359375
model.layers.1.self_attn.o_proj.weight change rate: 0.0007114410400390625
model.layers.1.mlp.gate_proj.weight change rate: 0.00032910433947108686
model.layers.1.mlp.up_proj.weight change rate: 0.000766060606110841
model.layers.1.mlp.down_proj.weight change rate: 0.00078964233398437

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.631528854370117


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007801055908203125
model.layers.0.self_attn.k_proj.weight change rate: 0.00080108642578125
model.layers.0.self_attn.v_proj.weight change rate: 0.00052642822265625
model.layers.0.self_attn.o_proj.weight change rate: 0.00055694580078125
model.layers.0.mlp.gate_proj.weight change rate: 0.0005125566385686398
model.layers.0.mlp.up_proj.weight change rate: 0.00032459606882184744
model.layers.0.mlp.down_proj.weight change rate: 0.0005018060910515487
model.layers.1.self_attn.q_proj.weight change rate: 0.0007476806640625
model.layers.1.self_attn.k_proj.weight change rate: 0.000732421875
model.layers.1.self_attn.v_proj.weight change rate: 0.00072479248046875
model.layers.1.self_attn.o_proj.weight change rate: 0.000530242919921875
model.layers.1.mlp.gate_proj.weight change rate: 0.0003540732723195106
model.layers.1.mlp.up_proj.weight change rate: 0.0007358898292295635
model.layers.1.mlp.down_proj.weight change rate: 0.0004012368153780699
mode

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00078582763671875
model.layers.0.self_attn.k_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.0005636215209960938
model.layers.0.mlp.gate_proj.weight change rate: 0.0007674477528780699
model.layers.0.mlp.up_proj.weight change rate: 0.00035511364694684744
model.layers.0.mlp.down_proj.weight change rate: 0.0007875615847297013
model.layers.1.self_attn.q_proj.weight change rate: 0.000751495361328125
model.layers.1.self_attn.k_proj.weight change rate: 0.0008087158203125
model.layers.1.self_attn.v_proj.weight change rate: 0.00072479248046875
model.layers.1.self_attn.o_proj.weight change rate: 0.000766754150390625
model.layers.1.mlp.gate_proj.weight change rate: 0.0003828569024335593
model.layers.1.mlp.up_proj.weight change rate: 0.0007143887924030423
model.layers.1.mlp.down_proj.weight change rate: 0.0007525357650592923

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007429122924804688
model.layers.0.self_attn.k_proj.weight change rate: 0.00075531005859375
model.layers.0.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008640289306640625
model.layers.0.mlp.gate_proj.weight change rate: 0.0007462935172952712
model.layers.0.mlp.up_proj.weight change rate: 0.000819466367829591
model.layers.0.mlp.down_proj.weight change rate: 0.0007365833735093474
model.layers.1.self_attn.q_proj.weight change rate: 0.0006151199340820312
model.layers.1.self_attn.k_proj.weight change rate: 0.00067138671875
model.layers.1.self_attn.v_proj.weight change rate: 0.000518798828125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007276535034179688
model.layers.1.mlp.gate_proj.weight change rate: 0.000812530517578125
model.layers.1.mlp.up_proj.weight change rate: 0.0005052739870734513
model.layers.1.mlp.down_proj.weight change rate: 0.0004598444211296737
m

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007143020629882812
model.layers.0.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.0.self_attn.v_proj.weight change rate: 0.000762939453125
model.layers.0.self_attn.o_proj.weight change rate: 0.0007076263427734375
model.layers.0.mlp.gate_proj.weight change rate: 0.0005070079350844026
model.layers.0.mlp.up_proj.weight change rate: 0.0007504550158046186
model.layers.0.mlp.down_proj.weight change rate: 0.0005031932378187776
model.layers.1.self_attn.q_proj.weight change rate: 0.0005846023559570312
model.layers.1.self_attn.k_proj.weight change rate: 0.000732421875
model.layers.1.self_attn.v_proj.weight change rate: 0.00051116943359375
model.layers.1.self_attn.o_proj.weight change rate: 0.0005216598510742188
model.layers.1.mlp.gate_proj.weight change rate: 0.0007372769759967923
model.layers.1.mlp.up_proj.weight change rate: 0.00045048107858747244
model.layers.1.mlp.down_proj.weight change rate: 0.0004431984852999449


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007905960083007812
model.layers.0.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.0.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.000576019287109375
model.layers.0.mlp.gate_proj.weight change rate: 0.0005049272440373898
model.layers.0.mlp.up_proj.weight change rate: 0.0007365833735093474
model.layers.0.mlp.down_proj.weight change rate: 0.0007528825663030148
model.layers.1.self_attn.q_proj.weight change rate: 0.0005826950073242188
model.layers.1.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.1.self_attn.v_proj.weight change rate: 0.00074005126953125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007801055908203125
model.layers.1.mlp.gate_proj.weight change rate: 0.0005024996935389936
model.layers.1.mlp.up_proj.weight change rate: 0.0004875876766163856
model.layers.1.mlp.down_proj.weight change rate: 0.00076189910760149

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.687273025512695


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007266998291015625
model.layers.0.self_attn.k_proj.weight change rate: 0.00074005126953125
model.layers.0.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.0.self_attn.o_proj.weight change rate: 0.000743865966796875
model.layers.0.mlp.gate_proj.weight change rate: 0.000502846494782716
model.layers.0.mlp.up_proj.weight change rate: 0.0005056207883171737
model.layers.0.mlp.down_proj.weight change rate: 0.0007154291379265487
model.layers.1.self_attn.q_proj.weight change rate: 0.0008420944213867188
model.layers.1.self_attn.k_proj.weight change rate: 0.00066375732421875
model.layers.1.self_attn.v_proj.weight change rate: 0.00069427490234375
model.layers.1.self_attn.o_proj.weight change rate: 0.0007076263427734375
model.layers.1.mlp.gate_proj.weight change rate: 0.0005052739870734513
model.layers.1.mlp.up_proj.weight change rate: 0.0005014592898078263
model.layers.1.mlp.down_proj.weight change rate: 0.0007164695416577

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007352828979492188
model.layers.0.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.v_proj.weight change rate: 0.00075531005859375
model.layers.0.self_attn.o_proj.weight change rate: 0.0005779266357421875
model.layers.0.mlp.gate_proj.weight change rate: 0.0008073286735452712
model.layers.0.mlp.up_proj.weight change rate: 0.0005038868403062224
model.layers.0.mlp.down_proj.weight change rate: 0.0007032915018498898
model.layers.1.self_attn.q_proj.weight change rate: 0.0006046295166015625
model.layers.1.self_attn.k_proj.weight change rate: 0.00081634521484375
model.layers.1.self_attn.v_proj.weight change rate: 0.00069427490234375
model.layers.1.self_attn.o_proj.weight change rate: 0.000701904296875
model.layers.1.mlp.gate_proj.weight change rate: 0.0005018060910515487
model.layers.1.mlp.up_proj.weight change rate: 0.0007986589334905148
model.layers.1.mlp.down_proj.weight change rate: 0.0005035400390625
mo

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007314682006835938
model.layers.0.self_attn.k_proj.weight change rate: 0.00070953369140625
model.layers.0.self_attn.v_proj.weight change rate: 0.0007171630859375
model.layers.0.self_attn.o_proj.weight change rate: 0.0005588531494140625
model.layers.0.mlp.gate_proj.weight change rate: 0.0005063143908046186
model.layers.0.mlp.up_proj.weight change rate: 0.0007792386459186673
model.layers.0.mlp.down_proj.weight change rate: 0.0006953153060749173
model.layers.1.self_attn.q_proj.weight change rate: 0.0008001327514648438
model.layers.1.self_attn.k_proj.weight change rate: 0.00066375732421875
model.layers.1.self_attn.v_proj.weight change rate: 0.0006866455078125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007085800170898438
model.layers.1.mlp.gate_proj.weight change rate: 0.0007976185297593474
model.layers.1.mlp.up_proj.weight change rate: 0.0007258328841999173
model.layers.1.mlp.down_proj.weight change rate: 0.0005021528922952

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007171630859375
model.layers.0.self_attn.k_proj.weight change rate: 0.00070953369140625
model.layers.0.self_attn.v_proj.weight change rate: 0.000518798828125
model.layers.0.self_attn.o_proj.weight change rate: 0.0007658004760742188
model.layers.0.mlp.gate_proj.weight change rate: 0.0007823597406968474
model.layers.0.mlp.up_proj.weight change rate: 0.000506661192048341
model.layers.0.mlp.down_proj.weight change rate: 0.0005045804427936673
model.layers.1.self_attn.q_proj.weight change rate: 0.0007753372192382812
model.layers.1.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.1.self_attn.v_proj.weight change rate: 0.0006866455078125
model.layers.1.self_attn.o_proj.weight change rate: 0.0005331039428710938
model.layers.1.mlp.gate_proj.weight change rate: 0.0007195906364358962
model.layers.1.mlp.up_proj.weight change rate: 0.0007022510981187224
model.layers.1.mlp.down_proj.weight change rate: 0.0007643266580998898
mod

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007104873657226562
model.layers.0.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.0.self_attn.v_proj.weight change rate: 0.0007781982421875
model.layers.0.self_attn.o_proj.weight change rate: 0.0006914138793945312
model.layers.0.mlp.gate_proj.weight change rate: 0.000506661192048341
model.layers.0.mlp.up_proj.weight change rate: 0.0005042336415499449
model.layers.0.mlp.down_proj.weight change rate: 0.0005011125467717648
model.layers.1.self_attn.q_proj.weight change rate: 0.0007476806640625
model.layers.1.self_attn.k_proj.weight change rate: 0.0008087158203125
model.layers.1.self_attn.v_proj.weight change rate: 0.0005340576171875
model.layers.1.self_attn.o_proj.weight change rate: 0.0005207061767578125
model.layers.1.mlp.gate_proj.weight change rate: 0.0005024996935389936
model.layers.1.mlp.up_proj.weight change rate: 0.000690460205078125
model.layers.1.mlp.down_proj.weight change rate: 0.000701904296875
model.

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.658602714538574


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006742477416992188
model.layers.0.self_attn.k_proj.weight change rate: 0.00060272216796875
model.layers.0.self_attn.v_proj.weight change rate: 0.00070953369140625
model.layers.0.self_attn.o_proj.weight change rate: 0.0006160736083984375
model.layers.0.mlp.gate_proj.weight change rate: 0.0007605119608342648
model.layers.0.mlp.up_proj.weight change rate: 0.0005031932378187776
model.layers.0.mlp.down_proj.weight change rate: 0.0007771578966639936
model.layers.1.self_attn.q_proj.weight change rate: 0.0007524490356445312
model.layers.1.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.1.self_attn.v_proj.weight change rate: 0.000732421875
model.layers.1.self_attn.o_proj.weight change rate: 0.0007915496826171875
model.layers.1.mlp.gate_proj.weight change rate: 0.0007473339210264385
model.layers.1.mlp.up_proj.weight change rate: 0.0005035400390625
model.layers.1.mlp.down_proj.weight change rate: 0.0006932345568202436
mod

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007944107055664062
model.layers.0.self_attn.k_proj.weight change rate: 0.00054931640625
model.layers.0.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.0005731582641601562
model.layers.0.mlp.gate_proj.weight change rate: 0.0007473339210264385
model.layers.0.mlp.up_proj.weight change rate: 0.0005031932378187776
model.layers.0.mlp.down_proj.weight change rate: 0.0007379705202765763
model.layers.1.self_attn.q_proj.weight change rate: 0.0006132125854492188
model.layers.1.self_attn.k_proj.weight change rate: 0.00074005126953125
model.layers.1.self_attn.v_proj.weight change rate: 0.0007171630859375
model.layers.1.self_attn.o_proj.weight change rate: 0.0007467269897460938
model.layers.1.mlp.gate_proj.weight change rate: 0.000502846494782716
model.layers.1.mlp.up_proj.weight change rate: 0.0007306879269890487
model.layers.1.mlp.down_proj.weight change rate: 0.000690460205078125
m

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007677078247070312
model.layers.0.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.0.self_attn.v_proj.weight change rate: 0.00051116943359375
model.layers.0.self_attn.o_proj.weight change rate: 0.00075531005859375
model.layers.0.mlp.gate_proj.weight change rate: 0.0007251392817124724
model.layers.0.mlp.up_proj.weight change rate: 0.0005031932378187776
model.layers.0.mlp.down_proj.weight change rate: 0.0005024996935389936
model.layers.1.self_attn.q_proj.weight change rate: 0.0005922317504882812
model.layers.1.self_attn.k_proj.weight change rate: 0.00066375732421875
model.layers.1.self_attn.v_proj.weight change rate: 0.00052642822265625
model.layers.1.self_attn.o_proj.weight change rate: 0.0007085800170898438
model.layers.1.mlp.gate_proj.weight change rate: 0.0005021528922952712
model.layers.1.mlp.up_proj.weight change rate: 0.0006994768045842648
model.layers.1.mlp.down_proj.weight change rate: 0.0006741610704921

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007171630859375
model.layers.0.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.0.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.0005731582641601562
model.layers.0.mlp.gate_proj.weight change rate: 0.0005056207883171737
model.layers.0.mlp.up_proj.weight change rate: 0.0005024996935389936
model.layers.0.mlp.down_proj.weight change rate: 0.000758431211579591
model.layers.1.self_attn.q_proj.weight change rate: 0.0008058547973632812
model.layers.1.self_attn.k_proj.weight change rate: 0.00066375732421875
model.layers.1.self_attn.v_proj.weight change rate: 0.000732421875
model.layers.1.self_attn.o_proj.weight change rate: 0.0005331039428710938
model.layers.1.mlp.gate_proj.weight change rate: 0.000766754150390625
model.layers.1.mlp.up_proj.weight change rate: 0.0006835244130343199
model.layers.1.mlp.down_proj.weight change rate: 0.0005018060910515487
model.l

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007762908935546875
model.layers.0.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.v_proj.weight change rate: 0.000518798828125
model.layers.0.self_attn.o_proj.weight change rate: 0.0007419586181640625
model.layers.0.mlp.gate_proj.weight change rate: 0.0005031932378187776
model.layers.0.mlp.up_proj.weight change rate: 0.0005031932378187776
model.layers.0.mlp.down_proj.weight change rate: 0.0005031932378187776
model.layers.1.self_attn.q_proj.weight change rate: 0.0007467269897460938
model.layers.1.self_attn.k_proj.weight change rate: 0.0008087158203125
model.layers.1.self_attn.v_proj.weight change rate: 0.00067901611328125
model.layers.1.self_attn.o_proj.weight change rate: 0.0005168914794921875
model.layers.1.mlp.gate_proj.weight change rate: 0.0005049272440373898
model.layers.1.mlp.up_proj.weight change rate: 0.0005080483388155699
model.layers.1.mlp.down_proj.weight change rate: 0.000501459289807826

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.66895580291748


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007562637329101562
model.layers.0.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.v_proj.weight change rate: 0.00081634521484375
model.layers.0.self_attn.o_proj.weight change rate: 0.0005712509155273438
model.layers.0.mlp.gate_proj.weight change rate: 0.000502846494782716
model.layers.0.mlp.up_proj.weight change rate: 0.0008565729367546737
model.layers.0.mlp.down_proj.weight change rate: 0.000758431211579591
model.layers.1.self_attn.q_proj.weight change rate: 0.0005817413330078125
model.layers.1.self_attn.k_proj.weight change rate: 0.00079345703125
model.layers.1.self_attn.v_proj.weight change rate: 0.00066375732421875
model.layers.1.self_attn.o_proj.weight change rate: 0.0005083084106445312
model.layers.1.mlp.gate_proj.weight change rate: 0.00075531005859375
model.layers.1.mlp.up_proj.weight change rate: 0.0005011125467717648
model.layers.1.mlp.down_proj.weight change rate: 0.0007615523063577712
mo

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007696151733398438
model.layers.0.self_attn.k_proj.weight change rate: 0.00064849853515625
model.layers.0.self_attn.v_proj.weight change rate: 0.00069427490234375
model.layers.0.self_attn.o_proj.weight change rate: 0.0007171630859375
model.layers.0.mlp.gate_proj.weight change rate: 0.000774383544921875
model.layers.0.mlp.up_proj.weight change rate: 0.0007716092513874173
model.layers.0.mlp.down_proj.weight change rate: 0.000720284238923341
model.layers.1.self_attn.q_proj.weight change rate: 0.0007810592651367188
model.layers.1.self_attn.k_proj.weight change rate: 0.0006561279296875
model.layers.1.self_attn.v_proj.weight change rate: 0.000518798828125
model.layers.1.self_attn.o_proj.weight change rate: 0.0008192062377929688
model.layers.1.mlp.gate_proj.weight change rate: 0.0005024996935389936
model.layers.1.mlp.up_proj.weight change rate: 0.0005014592898078263
model.layers.1.mlp.down_proj.weight change rate: 0.0007119612419046462
mo

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007610321044921875
model.layers.0.self_attn.k_proj.weight change rate: 0.00066375732421875
model.layers.0.self_attn.v_proj.weight change rate: 0.00061798095703125
model.layers.0.self_attn.o_proj.weight change rate: 0.0005655288696289062
model.layers.0.mlp.gate_proj.weight change rate: 0.0005052739870734513
model.layers.0.mlp.up_proj.weight change rate: 0.0005042336415499449
model.layers.0.mlp.down_proj.weight change rate: 0.0005031932378187776
model.layers.1.self_attn.q_proj.weight change rate: 0.0007467269897460938
model.layers.1.self_attn.k_proj.weight change rate: 0.000579833984375
model.layers.1.self_attn.v_proj.weight change rate: 0.0007171630859375
model.layers.1.self_attn.o_proj.weight change rate: 0.0005292892456054688
model.layers.1.mlp.gate_proj.weight change rate: 0.0005021528922952712
model.layers.1.mlp.up_proj.weight change rate: 0.0005021528922952712
model.layers.1.mlp.down_proj.weight change rate: 0.00050145928980782

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007028579711914062
model.layers.0.self_attn.k_proj.weight change rate: 0.0007171630859375
model.layers.0.self_attn.v_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.o_proj.weight change rate: 0.0007038116455078125
model.layers.0.mlp.gate_proj.weight change rate: 0.0005042336415499449
model.layers.0.mlp.up_proj.weight change rate: 0.000502846494782716
model.layers.0.mlp.down_proj.weight change rate: 0.0007657138048671186
model.layers.1.self_attn.q_proj.weight change rate: 0.000728607177734375
model.layers.1.self_attn.k_proj.weight change rate: 0.0008087158203125
model.layers.1.self_attn.v_proj.weight change rate: 0.000701904296875
model.layers.1.self_attn.o_proj.weight change rate: 0.0005178451538085938
model.layers.1.mlp.gate_proj.weight change rate: 0.0005018060910515487
model.layers.1.mlp.up_proj.weight change rate: 0.0005014592898078263
model.layers.1.mlp.down_proj.weight change rate: 0.0005014592898078263


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00075531005859375
model.layers.0.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.v_proj.weight change rate: 0.000518798828125
model.layers.0.self_attn.o_proj.weight change rate: 0.0005779266357421875
model.layers.0.mlp.gate_proj.weight change rate: 0.0005035400390625
model.layers.0.mlp.up_proj.weight change rate: 0.0007886019884608686
model.layers.0.mlp.down_proj.weight change rate: 0.0007136951899155974
model.layers.1.self_attn.q_proj.weight change rate: 0.0006122589111328125
model.layers.1.self_attn.k_proj.weight change rate: 0.00064849853515625
model.layers.1.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.1.self_attn.o_proj.weight change rate: 0.0005130767822265625
model.layers.1.mlp.gate_proj.weight change rate: 0.0008000460802577436
model.layers.1.mlp.up_proj.weight change rate: 0.0005000721430405974
model.layers.1.mlp.down_proj.weight change rate: 0.0007632862543687224
mod

In [14]:
ppl_eval_score = eval_ppl_naive(dequantized_model, eval_data).item()
print("wikitext2 ppl (naive)", ppl_eval_score)

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.60940170288086
