In [1]:
%env CUDA_VISIBLE_DEVICES=1
%env TRANSFORMERS_CACHE=/mnt/LLM/hub
%env HF_HOME=/mnt/LLM/hub
%env OMP_NUM_THREADS=16

import os
import sys
sys.path.insert(0, '..')

import time
import random
from tqdm.auto import trange, tqdm
import numpy as np
import ipynbname  # pip install ipynbname

import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers

import ipynbname  # pip install ipynbname
import wandb

from src.aq import QuantizedWeight, QuantizedLinear
from src.modelutils import get_model
from src.datautils import get_loaders


torch.set_num_threads(16)
torch.backends.cudnn.allow_tf32 = False
torch.backends.cuda.matmul.allow_tf32 = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# wandb.login(relogin=True)

env: CUDA_VISIBLE_DEVICES=1
env: TRANSFORMERS_CACHE=/mnt/LLM/hub
env: HF_HOME=/mnt/LLM/hub
env: OMP_NUM_THREADS=16




In [2]:
class args:
    base_model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
    quant_model = "/extra_disk_1/jheuristic/tinyllama-3t-2x8g8/"
    dtype = 'bfloat16'
    model_seqlen = 1024  # can be 2048 for 1.1B, 4096-8192 for larger models
    device_map = 'auto'
    
    dataset = 'pajama'
    nsamples = 256
    seed = 42
    beam_size = 4
    stochastic_rounding_tau = 0.0
    
    code_lr = 1e-3
    code_lr_plateau_scale = 0.5
    code_betas = (0.0, 0.95)
    delta_decay = 1.0
    codebook_lr = 1e-5
    codebook_betas = (0.9, 0.95)
    codebook_grad_accumulation_steps = 8
    
    
    autocast_dtype = torch.bfloat16  # bfloat16 or None (not using grad scaler!)
    training_dtype = torch.float32
    gradient_checkpointing = False
    devices = [device]

In [3]:
train_data = get_loaders(
    args.dataset,
    nsamples=args.nsamples * 10,
    seed=args.seed,
    model_path=args.base_model,
    seqlen=args.model_seqlen,
)

base_model = get_model(args.base_model, None, args.dtype, args.device_map)
if not args.device_map:
    base_model = base_model.to(device)
    
quantized_model = get_model(args.base_model, args.quant_model, args.dtype, args.device_map)
if not args.device_map:
    quantized_model = quantized_model.to(device)
quantized_model = quantized_model.to(args.training_dtype)

Loading red_pajama from togethercomputer/RedPajama-Data-1T-Sample


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
                                                                                                                                                                                                                                                                                          

Loaded data from pajama; len(data)=2560 sequences
Loading pretrained model ...
Model loaded sucсessfully ...
Initializing model with random weights...
Loading quantized model ...
Model loaded sucсessfully ...


In [4]:
from src.pv_ops import create_dequantized_model
dequantized_model, master_parameters = create_dequantized_model(
    quantized_model, reuse_non_quantized=True, dequantized_dtype=args.autocast_dtype
)
for param in dequantized_model.parameters():
    param.data = param.data.to(args.autocast_dtype)

In [6]:
from src.pv_ops import StraightThroughAdamW

optimizer = StraightThroughAdamW(
    named_dequantized_params=dict(dequantized_model.named_parameters()),
    named_master_params=master_parameters,
    update_codes=dict(lr=args.code_lr, betas=args.code_betas),
    max_code_change_per_step=1e-3,
    beam_size=args.beam_size,
    stochastic_rounding_tau=args.stochastic_rounding_tau,
    dequantized_dtype=torch.float32,
)

In [7]:

if args.gradient_checkpointing:
    quantized_model.gradient_checkpointing_enable()
    quantized_model.enable_input_require_grads()
    for module in quantized_model.modules():
        if isinstance(module, QuantizedLinear):
            module.use_checkpoint = True
    dequantized_model.gradient_checkpointing_enable()
    dequantized_model.enable_input_require_grads()


In [8]:
def _run_one_step(args, base_model, dequantized_model, optimizer, train_data, **kwargs):
    optimizer.zero_grad(set_to_none=True)
    with tqdm(train_data, desc="V step") as progress:

        total_loss = 0.0
        for i, batch in enumerate(progress):
            batch = torch.as_tensor(batch, device=device)
            with torch.no_grad():
                teacher_logits = base_model(batch).logits
            student_logits = dequantized_model(batch).logits  # forward accumulates XTX statistics
            loss = kl_div(student_logits, teacher_logits)
            (loss / len(train_data)).backward()  # backward accumulates gradient
            total_loss = loss.item() / (i + 1) + total_loss * i / (i + 1)
            progress.desc = f"V step: accumulating gradients, loss = {total_loss:.9f}"
            del student_logits, teacher_logits, loss
    optimizer.step(**kwargs)
    optimizer.zero_grad(set_to_none=True)  # reset statistics for the next step
    
    return total_loss


def kl_div(student_hiddens, teacher_hiddens):
    C = student_hiddens.shape[-1]  # num classes
    return F.kl_div(
        input=F.log_softmax(student_hiddens.view(-1, C), dim=-1),
        target=F.log_softmax(teacher_hiddens.view(-1, C), dim=-1),
        log_target=True,
        reduction="batchmean",
    )


In [9]:
eval_data = get_loaders(
    'wikitext2',
    seed=args.seed,
    model_path=args.base_model,
    seqlen=args.model_seqlen,
    eval_mode=True,
)

@torch.inference_mode()
def eval_ppl_naive(model, eval_data):
    eval_inps = [
        eval_data[:, start: start + args.model_seqlen] for start in range(0, eval_data.shape[1], args.model_seqlen)
    ]
    total_tokens = 0
    nlls = []
    for input_ids in tqdm(eval_inps):
        input_ids = input_ids.to(device)
        lm_logits = model(input_ids).logits

        shift_logits = lm_logits[:, :-1, :].contiguous()
        shift_labels = input_ids[:, 1:]
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        neg_log_likelihood = loss.float() * args.model_seqlen
        nlls.append(neg_log_likelihood)
        total_tokens += shift_labels.numel()
    ppl = torch.exp(torch.stack(nlls).sum() / total_tokens)
    return ppl

Loaded data from wikitext2; len(data)=1 sequences


In [10]:
os.environ["WANDB_NOTEBOOK_NAME"] = os.path.join(os.getcwd(), ipynbname.name() + ".ipynb")
wandb.init(
    project="aqml-full-finetuning",
    config = {
        k: v for k, v
        in args.__dict__.items()
        if not k.startswith("__")
    },
    settings=wandb.Settings(code_dir="."),
    name=ipynbname.name(),
)

[34m[1mwandb[0m: Currently logged in as: [33mdenismazur8[0m ([33mrouting-experiments[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
POINTER = 0
def next_train_data():
    global POINTER
    batch = []
    for i in range(args.nsamples):
        batch.append(train_data[POINTER % len(train_data)])
        POINTER += 1
    return batch

In [12]:
last_ppl_eval_score = None
for i in range(100):
    print("STEP", i)
    if i % 5 == 0:
        ppl_eval_score = eval_ppl_naive(dequantized_model, eval_data).item()
        last_ppl_eval_score = ppl_eval_score
        print("wikitext2 ppl (naive)", ppl_eval_score)

    v_step_train_loss = _run_one_step(args, base_model, dequantized_model, optimizer, next_train_data())
    print("v_step_train_loss", v_step_train_loss)
    state = {
        "ppl_eval_score": ppl_eval_score,
        "v_step_train_loss": v_step_train_loss,
        "step": i,
    }
    wandb.log(state, step=i, commit=True)


STEP 0


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 18.942625045776367


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 6.103515625e-05
model.layers.0.self_attn.k_proj.weight change rate: 0.0
model.layers.0.self_attn.v_proj.weight change rate: 9.918212890625e-05
model.layers.0.self_attn.o_proj.weight change rate: 8.58306884765625e-06
model.layers.0.mlp.gate_proj.weight change rate: 1.3871626833861228e-06
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 0.0
model.layers.1.self_attn.k_proj.weight change rate: 0.0
model.layers.1.self_attn.v_proj.weight change rate: 1.52587890625e-05
model.layers.1.self_attn.o_proj.weight change rate: 0.0
model.layers.1.mlp.gate_proj.weight change rate: 0.0
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 1.3871626833861228e-06
model.layers.2.self_attn.q_proj.weight change rate: 0.0
model.layers.2.self_attn.k_proj.weight change rate: 0.0
model.layers.2.self_attn.v_pro

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007038116455078125
model.layers.0.self_attn.k_proj.weight change rate: 0.00020599365234375
model.layers.0.self_attn.v_proj.weight change rate: 0.00067138671875
model.layers.0.self_attn.o_proj.weight change rate: 0.0006237030029296875
model.layers.0.mlp.gate_proj.weight change rate: 5.8954414271283895e-05
model.layers.0.mlp.up_proj.weight change rate: 0.0
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 1.1444091796875e-05
model.layers.1.self_attn.k_proj.weight change rate: 0.0
model.layers.1.self_attn.v_proj.weight change rate: 0.0006866455078125
model.layers.1.self_attn.o_proj.weight change rate: 1.9073486328125e-06
model.layers.1.mlp.gate_proj.weight change rate: 1.733953286020551e-05
model.layers.1.mlp.up_proj.weight change rate: 2.9823997465427965e-05
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weight change rate: 2.09808349609375e-

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006113052368164062
model.layers.0.self_attn.k_proj.weight change rate: 0.00054931640625
model.layers.0.self_attn.v_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.o_proj.weight change rate: 0.0005950927734375
model.layers.0.mlp.gate_proj.weight change rate: 0.0006176341557875276
model.layers.0.mlp.up_proj.weight change rate: 0.0004938298952765763
model.layers.0.mlp.down_proj.weight change rate: 0.0
model.layers.1.self_attn.q_proj.weight change rate: 3.337860107421875e-05
model.layers.1.self_attn.k_proj.weight change rate: 0.00014495849609375
model.layers.1.self_attn.v_proj.weight change rate: 0.00049591064453125
model.layers.1.self_attn.o_proj.weight change rate: 0.0009546279907226562
model.layers.1.mlp.gate_proj.weight change rate: 0.00017062101687770337
model.layers.1.mlp.up_proj.weight change rate: 0.000270496733719483
model.layers.1.mlp.down_proj.weight change rate: 0.0
model.layers.2.self_attn.q_proj.weigh

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006103515625
model.layers.0.self_attn.k_proj.weight change rate: 0.00072479248046875
model.layers.0.self_attn.v_proj.weight change rate: 0.00066375732421875
model.layers.0.self_attn.o_proj.weight change rate: 0.0006866455078125
model.layers.0.mlp.gate_proj.weight change rate: 1.9420276657911018e-05
model.layers.0.mlp.up_proj.weight change rate: 0.0008489435422234237
model.layers.0.mlp.down_proj.weight change rate: 3.467906708465307e-07
model.layers.1.self_attn.q_proj.weight change rate: 0.0003337860107421875
model.layers.1.self_attn.k_proj.weight change rate: 0.0001983642578125
model.layers.1.self_attn.v_proj.weight change rate: 0.00051116943359375
model.layers.1.self_attn.o_proj.weight change rate: 0.0005216598510742188
model.layers.1.mlp.gate_proj.weight change rate: 6.935813416930614e-07
model.layers.1.mlp.up_proj.weight change rate: 0.0
model.layers.1.mlp.down_proj.weight change rate: 0.0008912520133890212
model.layers.2.self_a

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007476806640625
model.layers.0.self_attn.k_proj.weight change rate: 0.0006103515625
model.layers.0.self_attn.v_proj.weight change rate: 0.00070953369140625
model.layers.0.self_attn.o_proj.weight change rate: 0.000598907470703125
model.layers.0.mlp.gate_proj.weight change rate: 0.0009619973134249449
model.layers.0.mlp.up_proj.weight change rate: 3.398548506083898e-05
model.layers.0.mlp.down_proj.weight change rate: 0.0009831516072154045
model.layers.1.self_attn.q_proj.weight change rate: 0.00030612945556640625
model.layers.1.self_attn.k_proj.weight change rate: 9.1552734375e-05
model.layers.1.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.1.self_attn.o_proj.weight change rate: 0.000820159912109375
model.layers.1.mlp.gate_proj.weight change rate: 0.000602028623688966
model.layers.1.mlp.up_proj.weight change rate: 4.1614880501583684e-06
model.layers.1.mlp.down_proj.weight change rate: 0.0008562261937186122
model.

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 14.192126274108887


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007343292236328125
model.layers.0.self_attn.k_proj.weight change rate: 0.00077056884765625
model.layers.0.self_attn.v_proj.weight change rate: 0.00058746337890625
model.layers.0.self_attn.o_proj.weight change rate: 0.000591278076171875
model.layers.0.mlp.gate_proj.weight change rate: 0.0008950667106546462
model.layers.0.mlp.up_proj.weight change rate: 0.0009432706283405423
model.layers.0.mlp.down_proj.weight change rate: 7.42132033337839e-05
model.layers.1.self_attn.q_proj.weight change rate: 0.00010395050048828125
model.layers.1.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.1.self_attn.v_proj.weight change rate: 0.00051116943359375
model.layers.1.self_attn.o_proj.weight change rate: 0.0007600784301757812
model.layers.1.mlp.gate_proj.weight change rate: 4.855069164477754e-06
model.layers.1.mlp.up_proj.weight change rate: 0.000988006591796875
model.layers.1.mlp.down_proj.weight change rate: 1.3871626833861228e

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006170272827148438
model.layers.0.self_attn.k_proj.weight change rate: 0.0005645751953125
model.layers.0.self_attn.v_proj.weight change rate: 0.00055694580078125
model.layers.0.self_attn.o_proj.weight change rate: 0.0005807876586914062
model.layers.0.mlp.gate_proj.weight change rate: 0.000327717192703858
model.layers.0.mlp.up_proj.weight change rate: 0.00019558993517421186
model.layers.0.mlp.down_proj.weight change rate: 0.00022992221056483686
model.layers.1.self_attn.q_proj.weight change rate: 0.0007505416870117188
model.layers.1.self_attn.k_proj.weight change rate: 0.0003814697265625
model.layers.1.self_attn.v_proj.weight change rate: 0.0005035400390625
model.layers.1.self_attn.o_proj.weight change rate: 0.0007429122924804688
model.layers.1.mlp.gate_proj.weight change rate: 1.8379905668552965e-05
model.layers.1.mlp.up_proj.weight change rate: 4.924427412333898e-05
model.layers.1.mlp.down_proj.weight change rate: 5.375255568651482

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007801055908203125
model.layers.0.self_attn.k_proj.weight change rate: 0.00072479248046875
model.layers.0.self_attn.v_proj.weight change rate: 0.00074005126953125
model.layers.0.self_attn.o_proj.weight change rate: 0.0005846023559570312
model.layers.0.mlp.gate_proj.weight change rate: 0.0004608847957570106
model.layers.0.mlp.up_proj.weight change rate: 0.000842354551423341
model.layers.0.mlp.down_proj.weight change rate: 0.0003943009942304343
model.layers.1.self_attn.q_proj.weight change rate: 0.0005617141723632812
model.layers.1.self_attn.k_proj.weight change rate: 0.00083160400390625
model.layers.1.self_attn.v_proj.weight change rate: 0.00081634521484375
model.layers.1.self_attn.o_proj.weight change rate: 0.000553131103515625
model.layers.1.mlp.gate_proj.weight change rate: 3.259832374169491e-05
model.layers.1.mlp.up_proj.weight change rate: 0.0009054704569280148
model.layers.1.mlp.down_proj.weight change rate: 0.0001231106871273

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006275177001953125
model.layers.0.self_attn.k_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.v_proj.weight change rate: 0.0005950927734375
model.layers.0.self_attn.o_proj.weight change rate: 0.0005865097045898438
model.layers.0.mlp.gate_proj.weight change rate: 0.0008399270009249449
model.layers.0.mlp.up_proj.weight change rate: 0.0005090886843390763
model.layers.0.mlp.down_proj.weight change rate: 0.0005021528922952712
model.layers.1.self_attn.q_proj.weight change rate: 0.00018596649169921875
model.layers.1.self_attn.k_proj.weight change rate: 0.00052642822265625
model.layers.1.self_attn.v_proj.weight change rate: 0.0007171630859375
model.layers.1.self_attn.o_proj.weight change rate: 0.0008068084716796875
model.layers.1.mlp.gate_proj.weight change rate: 5.964799493085593e-05
model.layers.1.mlp.up_proj.weight change rate: 0.00018345226999372244
model.layers.1.mlp.down_proj.weight change rate: 0.00022125244140

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00064849853515625
model.layers.0.self_attn.k_proj.weight change rate: 0.00069427490234375
model.layers.0.self_attn.v_proj.weight change rate: 0.00054931640625
model.layers.0.self_attn.o_proj.weight change rate: 0.0007534027099609375
model.layers.0.mlp.gate_proj.weight change rate: 0.0005077015375718474
model.layers.0.mlp.up_proj.weight change rate: 0.0005063143908046186
model.layers.0.mlp.down_proj.weight change rate: 0.0008295233128592372
model.layers.1.self_attn.q_proj.weight change rate: 0.0002384185791015625
model.layers.1.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.v_proj.weight change rate: 0.000732421875
model.layers.1.self_attn.o_proj.weight change rate: 0.000553131103515625
model.layers.1.mlp.gate_proj.weight change rate: 0.0009047768544405699
model.layers.1.mlp.up_proj.weight change rate: 0.0003159262996632606
model.layers.1.mlp.down_proj.weight change rate: 0.0008180792210623622
model.l

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.30947208404541


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006322860717773438
model.layers.0.self_attn.k_proj.weight change rate: 0.00063323974609375
model.layers.0.self_attn.v_proj.weight change rate: 0.0006103515625
model.layers.0.self_attn.o_proj.weight change rate: 0.0007123947143554688
model.layers.0.mlp.gate_proj.weight change rate: 0.0005070079350844026
model.layers.0.mlp.up_proj.weight change rate: 0.0005035400390625
model.layers.0.mlp.down_proj.weight change rate: 0.0007462935172952712
model.layers.1.self_attn.q_proj.weight change rate: 0.0008478164672851562
model.layers.1.self_attn.k_proj.weight change rate: 0.0005340576171875
model.layers.1.self_attn.v_proj.weight change rate: 0.00055694580078125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007944107055664062
model.layers.1.mlp.gate_proj.weight change rate: 0.0008548389887437224
model.layers.1.mlp.up_proj.weight change rate: 0.0007920698844827712
model.layers.1.mlp.down_proj.weight change rate: 0.0005031932378187776
mo

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00083160400390625
model.layers.0.self_attn.k_proj.weight change rate: 0.0005645751953125
model.layers.0.self_attn.v_proj.weight change rate: 0.0008087158203125
model.layers.0.self_attn.o_proj.weight change rate: 0.0006933212280273438
model.layers.0.mlp.gate_proj.weight change rate: 0.0005090886843390763
model.layers.0.mlp.up_proj.weight change rate: 0.0005045804427936673
model.layers.0.mlp.down_proj.weight change rate: 0.000502846494782716
model.layers.1.self_attn.q_proj.weight change rate: 0.0007810592651367188
model.layers.1.self_attn.k_proj.weight change rate: 0.0005340576171875
model.layers.1.self_attn.v_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007600784301757812
model.layers.1.mlp.gate_proj.weight change rate: 0.0007924166857264936
model.layers.1.mlp.up_proj.weight change rate: 0.0005021528922952712
model.layers.1.mlp.down_proj.weight change rate: 0.00074005126953125
mode

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006513595581054688
model.layers.0.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.0.self_attn.v_proj.weight change rate: 0.0006561279296875
model.layers.0.self_attn.o_proj.weight change rate: 0.000705718994140625
model.layers.0.mlp.gate_proj.weight change rate: 0.0008153048693202436
model.layers.0.mlp.up_proj.weight change rate: 0.0005052739870734513
model.layers.0.mlp.down_proj.weight change rate: 0.0007785450434312224
model.layers.1.self_attn.q_proj.weight change rate: 0.00045490264892578125
model.layers.1.self_attn.k_proj.weight change rate: 0.000823974609375
model.layers.1.self_attn.v_proj.weight change rate: 0.0007476806640625
model.layers.1.self_attn.o_proj.weight change rate: 0.0005588531494140625
model.layers.1.mlp.gate_proj.weight change rate: 0.0005059675895608962
model.layers.1.mlp.up_proj.weight change rate: 0.0007532293093390763
model.layers.1.mlp.down_proj.weight change rate: 0.000716816342901438

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006303787231445312
model.layers.0.self_attn.k_proj.weight change rate: 0.0006103515625
model.layers.0.self_attn.v_proj.weight change rate: 0.0006561279296875
model.layers.0.self_attn.o_proj.weight change rate: 0.0005970001220703125
model.layers.0.mlp.gate_proj.weight change rate: 0.0005059675895608962
model.layers.0.mlp.up_proj.weight change rate: 0.00084686279296875
model.layers.0.mlp.down_proj.weight change rate: 0.000728607177734375
model.layers.1.self_attn.q_proj.weight change rate: 0.000453948974609375
model.layers.1.self_attn.k_proj.weight change rate: 0.00075531005859375
model.layers.1.self_attn.v_proj.weight change rate: 0.0007476806640625
model.layers.1.self_attn.o_proj.weight change rate: 0.0007867813110351562
model.layers.1.mlp.gate_proj.weight change rate: 0.0005049272440373898
model.layers.1.mlp.up_proj.weight change rate: 0.0005035400390625
model.layers.1.mlp.down_proj.weight change rate: 0.000502846494782716
model.la

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006170272827148438
model.layers.0.self_attn.k_proj.weight change rate: 0.000579833984375
model.layers.0.self_attn.v_proj.weight change rate: 0.00077056884765625
model.layers.0.self_attn.o_proj.weight change rate: 0.0007495880126953125
model.layers.0.mlp.gate_proj.weight change rate: 0.0008246682118624449
model.layers.0.mlp.up_proj.weight change rate: 0.0007480274653062224
model.layers.0.mlp.down_proj.weight change rate: 0.0007310347282327712
model.layers.1.self_attn.q_proj.weight change rate: 0.0008382797241210938
model.layers.1.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.1.self_attn.v_proj.weight change rate: 0.000701904296875
model.layers.1.self_attn.o_proj.weight change rate: 0.000553131103515625
model.layers.1.mlp.gate_proj.weight change rate: 0.0008028204319998622
model.layers.1.mlp.up_proj.weight change rate: 0.0005056207883171737
model.layers.1.mlp.down_proj.weight change rate: 0.0007771578966639936

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.11144733428955


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0008630752563476562
model.layers.0.self_attn.k_proj.weight change rate: 0.00057220458984375
model.layers.0.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.0.self_attn.o_proj.weight change rate: 0.0006256103515625
model.layers.0.mlp.gate_proj.weight change rate: 0.0005049272440373898
model.layers.0.mlp.up_proj.weight change rate: 0.0005059675895608962
model.layers.0.mlp.down_proj.weight change rate: 0.0005031932378187776
model.layers.1.self_attn.q_proj.weight change rate: 0.0007753372192382812
model.layers.1.self_attn.k_proj.weight change rate: 0.000518798828125
model.layers.1.self_attn.v_proj.weight change rate: 0.00057220458984375
model.layers.1.self_attn.o_proj.weight change rate: 0.0005626678466796875
model.layers.1.mlp.gate_proj.weight change rate: 0.0007348494254983962
model.layers.1.mlp.up_proj.weight change rate: 0.0008191195665858686
model.layers.1.mlp.down_proj.weight change rate: 0.0007254860829561949
mo

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007543563842773438
model.layers.0.self_attn.k_proj.weight change rate: 0.0006103515625
model.layers.0.self_attn.v_proj.weight change rate: 0.000732421875
model.layers.0.self_attn.o_proj.weight change rate: 0.0006151199340820312
model.layers.0.mlp.gate_proj.weight change rate: 0.0008069818723015487
model.layers.0.mlp.up_proj.weight change rate: 0.0007938038324937224
model.layers.0.mlp.down_proj.weight change rate: 0.0005042336415499449
model.layers.1.self_attn.q_proj.weight change rate: 0.0007457733154296875
model.layers.1.self_attn.k_proj.weight change rate: 0.000823974609375
model.layers.1.self_attn.v_proj.weight change rate: 0.00055694580078125
model.layers.1.self_attn.o_proj.weight change rate: 0.0005540847778320312
model.layers.1.mlp.gate_proj.weight change rate: 0.0007098804926499724
model.layers.1.mlp.up_proj.weight change rate: 0.0005038868403062224
model.layers.1.mlp.down_proj.weight change rate: 0.0005018060910515487
model

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006437301635742188
model.layers.0.self_attn.k_proj.weight change rate: 0.00054931640625
model.layers.0.self_attn.v_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.o_proj.weight change rate: 0.00074005126953125
model.layers.0.mlp.gate_proj.weight change rate: 0.0005097822868265212
model.layers.0.mlp.up_proj.weight change rate: 0.0007372769759967923
model.layers.0.mlp.down_proj.weight change rate: 0.0005101290880702436
model.layers.1.self_attn.q_proj.weight change rate: 0.00054931640625
model.layers.1.self_attn.k_proj.weight change rate: 0.00054931640625
model.layers.1.self_attn.v_proj.weight change rate: 0.00077056884765625
model.layers.1.self_attn.o_proj.weight change rate: 0.0005626678466796875
model.layers.1.mlp.gate_proj.weight change rate: 0.0007143887924030423
model.layers.1.mlp.up_proj.weight change rate: 0.0008062883280217648
model.layers.1.mlp.down_proj.weight change rate: 0.0005059675895608962
model.l

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006284713745117188
model.layers.0.self_attn.k_proj.weight change rate: 0.00087738037109375
model.layers.0.self_attn.v_proj.weight change rate: 0.0007171630859375
model.layers.0.self_attn.o_proj.weight change rate: 0.0007390975952148438
model.layers.0.mlp.gate_proj.weight change rate: 0.0005156777333468199
model.layers.0.mlp.up_proj.weight change rate: 0.0005108226323500276
model.layers.0.mlp.down_proj.weight change rate: 0.00051116943359375
model.layers.1.self_attn.q_proj.weight change rate: 0.0008134841918945312
model.layers.1.self_attn.k_proj.weight change rate: 0.00081634521484375
model.layers.1.self_attn.v_proj.weight change rate: 0.00057220458984375
model.layers.1.self_attn.o_proj.weight change rate: 0.0005807876586914062
model.layers.1.mlp.gate_proj.weight change rate: 0.0007199374376796186
model.layers.1.mlp.up_proj.weight change rate: 0.0005087419413030148
model.layers.1.mlp.down_proj.weight change rate: 0.00050770153757184

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007677078247070312
model.layers.0.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.0.self_attn.v_proj.weight change rate: 0.00078582763671875
model.layers.0.self_attn.o_proj.weight change rate: 0.0006589889526367188
model.layers.0.mlp.gate_proj.weight change rate: 0.000518798828125
model.layers.0.mlp.up_proj.weight change rate: 0.0007962313829921186
model.layers.0.mlp.down_proj.weight change rate: 0.0008638555882498622
model.layers.1.self_attn.q_proj.weight change rate: 0.0007638931274414062
model.layers.1.self_attn.k_proj.weight change rate: 0.00055694580078125
model.layers.1.self_attn.v_proj.weight change rate: 0.00064849853515625
model.layers.1.self_attn.o_proj.weight change rate: 0.0005855560302734375
model.layers.1.mlp.gate_proj.weight change rate: 0.0007220181869342923
model.layers.1.mlp.up_proj.weight change rate: 0.000510475889313966
model.layers.1.mlp.down_proj.weight change rate: 0.0005115162348374724

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.94009780883789


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006589889526367188
model.layers.0.self_attn.k_proj.weight change rate: 0.0008544921875
model.layers.0.self_attn.v_proj.weight change rate: 0.0007476806640625
model.layers.0.self_attn.o_proj.weight change rate: 0.0007715225219726562
model.layers.0.mlp.gate_proj.weight change rate: 0.0008697509765625
model.layers.0.mlp.up_proj.weight change rate: 0.0005177584826014936
model.layers.0.mlp.down_proj.weight change rate: 0.0005118630360811949
model.layers.1.self_attn.q_proj.weight change rate: 0.0005502700805664062
model.layers.1.self_attn.k_proj.weight change rate: 0.000579833984375
model.layers.1.self_attn.v_proj.weight change rate: 0.00060272216796875
model.layers.1.self_attn.o_proj.weight change rate: 0.0008306503295898438
model.layers.1.mlp.gate_proj.weight change rate: 0.000728607177734375
model.layers.1.mlp.up_proj.weight change rate: 0.000849983945954591
model.layers.1.mlp.down_proj.weight change rate: 0.0005135969840921462
model.

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000762939453125
model.layers.0.self_attn.k_proj.weight change rate: 0.00057220458984375
model.layers.0.self_attn.v_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.o_proj.weight change rate: 0.00064849853515625
model.layers.0.mlp.gate_proj.weight change rate: 0.0007806257926858962
model.layers.0.mlp.up_proj.weight change rate: 0.0008187727653421462
model.layers.0.mlp.down_proj.weight change rate: 0.0008170388173311949
model.layers.1.self_attn.q_proj.weight change rate: 0.0005369186401367188
model.layers.1.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.1.self_attn.v_proj.weight change rate: 0.0006103515625
model.layers.1.self_attn.o_proj.weight change rate: 0.0005807876586914062
model.layers.1.mlp.gate_proj.weight change rate: 0.000720977783203125
model.layers.1.mlp.up_proj.weight change rate: 0.0007723027956672013
model.layers.1.mlp.down_proj.weight change rate: 0.0005177584826014936
model.

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006618499755859375
model.layers.0.self_attn.k_proj.weight change rate: 0.00069427490234375
model.layers.0.self_attn.v_proj.weight change rate: 0.0007781982421875
model.layers.0.self_attn.o_proj.weight change rate: 0.0006589889526367188
model.layers.0.mlp.gate_proj.weight change rate: 0.0007480274653062224
model.layers.0.mlp.up_proj.weight change rate: 0.0007535761105827987
model.layers.0.mlp.down_proj.weight change rate: 0.0007560036610811949
model.layers.1.self_attn.q_proj.weight change rate: 0.0007982254028320312
model.layers.1.self_attn.k_proj.weight change rate: 0.00092315673828125
model.layers.1.self_attn.v_proj.weight change rate: 0.00061798095703125
model.layers.1.self_attn.o_proj.weight change rate: 0.0008087158203125
model.layers.1.mlp.gate_proj.weight change rate: 0.0007216713856905699
model.layers.1.mlp.up_proj.weight change rate: 0.0007442127680405974
model.layers.1.mlp.down_proj.weight change rate: 0.000897494261153042

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00075531005859375
model.layers.0.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.0.self_attn.v_proj.weight change rate: 0.00083160400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.0006732940673828125
model.layers.0.mlp.gate_proj.weight change rate: 0.0005292025743983686
model.layers.0.mlp.up_proj.weight change rate: 0.0005236539291217923
model.layers.0.mlp.down_proj.weight change rate: 0.0005201860330998898
model.layers.1.self_attn.q_proj.weight change rate: 0.0005445480346679688
model.layers.1.self_attn.k_proj.weight change rate: 0.00078582763671875
model.layers.1.self_attn.v_proj.weight change rate: 0.0006256103515625
model.layers.1.self_attn.o_proj.weight change rate: 0.0007686614990234375
model.layers.1.mlp.gate_proj.weight change rate: 0.0007345026242546737
model.layers.1.mlp.up_proj.weight change rate: 0.0007459467160515487
model.layers.1.mlp.down_proj.weight change rate: 0.00078894878970459

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000652313232421875
model.layers.0.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.0.self_attn.v_proj.weight change rate: 0.00061798095703125
model.layers.0.self_attn.o_proj.weight change rate: 0.0006418228149414062
model.layers.0.mlp.gate_proj.weight change rate: 0.000533364072907716
model.layers.0.mlp.up_proj.weight change rate: 0.0005215731798671186
model.layers.0.mlp.down_proj.weight change rate: 0.000522613525390625
model.layers.1.self_attn.q_proj.weight change rate: 0.0007963180541992188
model.layers.1.self_attn.k_proj.weight change rate: 0.00077056884765625
model.layers.1.self_attn.v_proj.weight change rate: 0.0006561279296875
model.layers.1.self_attn.o_proj.weight change rate: 0.0005960464477539062
model.layers.1.mlp.gate_proj.weight change rate: 0.0007178566884249449
model.layers.1.mlp.up_proj.weight change rate: 0.0005146373296156526
model.layers.1.mlp.down_proj.weight change rate: 0.000521919981110841


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.97278118133545


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007610321044921875
model.layers.0.self_attn.k_proj.weight change rate: 0.00075531005859375
model.layers.0.self_attn.v_proj.weight change rate: 0.00066375732421875
model.layers.0.self_attn.o_proj.weight change rate: 0.0006847381591796875
model.layers.0.mlp.gate_proj.weight change rate: 0.0008513710927218199
model.layers.0.mlp.up_proj.weight change rate: 0.0008479031966999173
model.layers.0.mlp.down_proj.weight change rate: 0.0008381930529139936
model.layers.1.self_attn.q_proj.weight change rate: 0.0005474090576171875
model.layers.1.self_attn.k_proj.weight change rate: 0.00069427490234375
model.layers.1.self_attn.v_proj.weight change rate: 0.00069427490234375
model.layers.1.self_attn.o_proj.weight change rate: 0.0006008148193359375
model.layers.1.mlp.gate_proj.weight change rate: 0.000507354736328125
model.layers.1.mlp.up_proj.weight change rate: 0.0007903359364718199
model.layers.1.mlp.down_proj.weight change rate: 0.000813570921309

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007190704345703125
model.layers.0.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.0.self_attn.v_proj.weight change rate: 0.00087738037109375
model.layers.0.self_attn.o_proj.weight change rate: 0.0008106231689453125
model.layers.0.mlp.gate_proj.weight change rate: 0.0005350980209186673
model.layers.0.mlp.up_proj.weight change rate: 0.0007636330556124449
model.layers.0.mlp.down_proj.weight change rate: 0.0005260814214125276
model.layers.1.self_attn.q_proj.weight change rate: 0.0005340576171875
model.layers.1.self_attn.k_proj.weight change rate: 0.00074005126953125
model.layers.1.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.1.self_attn.o_proj.weight change rate: 0.0006008148193359375
model.layers.1.mlp.gate_proj.weight change rate: 0.0007799321901984513
model.layers.1.mlp.up_proj.weight change rate: 0.0005208795773796737
model.layers.1.mlp.down_proj.weight change rate: 0.0007823597406968474


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006780624389648438
model.layers.0.self_attn.k_proj.weight change rate: 0.000579833984375
model.layers.0.self_attn.v_proj.weight change rate: 0.00066375732421875
model.layers.0.self_attn.o_proj.weight change rate: 0.0008106231689453125
model.layers.0.mlp.gate_proj.weight change rate: 0.0008111433708108962
model.layers.0.mlp.up_proj.weight change rate: 0.0007636330556124449
model.layers.0.mlp.down_proj.weight change rate: 0.0005350980209186673
model.layers.1.self_attn.q_proj.weight change rate: 0.0008192062377929688
model.layers.1.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.1.self_attn.v_proj.weight change rate: 0.00064849853515625
model.layers.1.self_attn.o_proj.weight change rate: 0.00060272216796875
model.layers.1.mlp.gate_proj.weight change rate: 0.0007345026242546737
model.layers.1.mlp.up_proj.weight change rate: 0.000800392881501466
model.layers.1.mlp.down_proj.weight change rate: 0.0005298961186781526


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007581710815429688
model.layers.0.self_attn.k_proj.weight change rate: 0.00054168701171875
model.layers.0.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.0.self_attn.o_proj.weight change rate: 0.0007915496826171875
model.layers.0.mlp.gate_proj.weight change rate: 0.0007646734011359513
model.layers.0.mlp.up_proj.weight change rate: 0.000544808164704591
model.layers.0.mlp.down_proj.weight change rate: 0.000827095762360841
model.layers.1.self_attn.q_proj.weight change rate: 0.0005521774291992188
model.layers.1.self_attn.k_proj.weight change rate: 0.0007171630859375
model.layers.1.self_attn.v_proj.weight change rate: 0.00070953369140625
model.layers.1.self_attn.o_proj.weight change rate: 0.000823974609375
model.layers.1.mlp.gate_proj.weight change rate: 0.000732421875
model.layers.1.mlp.up_proj.weight change rate: 0.0005285089719109237
model.layers.1.mlp.down_proj.weight change rate: 0.0005337108159437776
model.layer

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007276535034179688
model.layers.0.self_attn.k_proj.weight change rate: 0.00070953369140625
model.layers.0.self_attn.v_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.o_proj.weight change rate: 0.0006866455078125
model.layers.0.mlp.gate_proj.weight change rate: 0.0007577376090921462
model.layers.0.mlp.up_proj.weight change rate: 0.0007785450434312224
model.layers.0.mlp.down_proj.weight change rate: 0.000769875303376466
model.layers.1.self_attn.q_proj.weight change rate: 0.0005331039428710938
model.layers.1.self_attn.k_proj.weight change rate: 0.0006256103515625
model.layers.1.self_attn.v_proj.weight change rate: 0.00083160400390625
model.layers.1.self_attn.o_proj.weight change rate: 0.0007925033569335938
model.layers.1.mlp.gate_proj.weight change rate: 0.0005135969840921462
model.layers.1.mlp.up_proj.weight change rate: 0.000808022276032716
model.layers.1.mlp.down_proj.weight change rate: 0.0005441145622171462


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.019956588745117


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006666183471679688
model.layers.0.self_attn.k_proj.weight change rate: 0.00074005126953125
model.layers.0.self_attn.v_proj.weight change rate: 0.00061798095703125
model.layers.0.self_attn.o_proj.weight change rate: 0.0007781982421875
model.layers.0.mlp.gate_proj.weight change rate: 0.0005527843022719026
model.layers.0.mlp.up_proj.weight change rate: 0.0005434209597297013
model.layers.0.mlp.down_proj.weight change rate: 0.0005451549077406526
model.layers.1.self_attn.q_proj.weight change rate: 0.0005426406860351562
model.layers.1.self_attn.k_proj.weight change rate: 0.00083160400390625
model.layers.1.self_attn.v_proj.weight change rate: 0.0006561279296875
model.layers.1.self_attn.o_proj.weight change rate: 0.0006351470947265625
model.layers.1.mlp.gate_proj.weight change rate: 0.0005201860330998898
model.layers.1.mlp.up_proj.weight change rate: 0.0007643266580998898
model.layers.1.mlp.down_proj.weight change rate: 0.000830910459626466

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006570816040039062
model.layers.0.self_attn.k_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.v_proj.weight change rate: 0.00072479248046875
model.layers.0.self_attn.o_proj.weight change rate: 0.0006923675537109375
model.layers.0.mlp.gate_proj.weight change rate: 0.0007931102882139385
model.layers.0.mlp.up_proj.weight change rate: 0.000553131103515625
model.layers.0.mlp.down_proj.weight change rate: 0.0005382191156968474
model.layers.1.self_attn.q_proj.weight change rate: 0.00052642822265625
model.layers.1.self_attn.k_proj.weight change rate: 0.00057220458984375
model.layers.1.self_attn.v_proj.weight change rate: 0.000640869140625
model.layers.1.self_attn.o_proj.weight change rate: 0.0008144378662109375
model.layers.1.mlp.gate_proj.weight change rate: 0.0008166920160874724
model.layers.1.mlp.up_proj.weight change rate: 0.0007497614133171737
model.layers.1.mlp.down_proj.weight change rate: 0.000540993467438966


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006589889526367188
model.layers.0.self_attn.k_proj.weight change rate: 0.00080108642578125
model.layers.0.self_attn.v_proj.weight change rate: 0.000823974609375
model.layers.0.self_attn.o_proj.weight change rate: 0.0006952285766601562
model.layers.0.mlp.gate_proj.weight change rate: 0.0005520907579921186
model.layers.0.mlp.up_proj.weight change rate: 0.0005500100087374449
model.layers.0.mlp.down_proj.weight change rate: 0.0005534779047593474
model.layers.1.self_attn.q_proj.weight change rate: 0.000537872314453125
model.layers.1.self_attn.k_proj.weight change rate: 0.0008392333984375
model.layers.1.self_attn.v_proj.weight change rate: 0.00077056884765625
model.layers.1.self_attn.o_proj.weight change rate: 0.0006561279296875
model.layers.1.mlp.gate_proj.weight change rate: 0.0007695285021327436
model.layers.1.mlp.up_proj.weight change rate: 0.0005357915651984513
model.layers.1.mlp.down_proj.weight change rate: 0.0007976185297593474
m

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006275177001953125
model.layers.0.self_attn.k_proj.weight change rate: 0.00074005126953125
model.layers.0.self_attn.v_proj.weight change rate: 0.0006561279296875
model.layers.0.self_attn.o_proj.weight change rate: 0.0008106231689453125
model.layers.0.mlp.gate_proj.weight change rate: 0.0005597201525233686
model.layers.0.mlp.up_proj.weight change rate: 0.0005673495470546186
model.layers.0.mlp.down_proj.weight change rate: 0.0005517439567483962
model.layers.1.self_attn.q_proj.weight change rate: 0.0008869171142578125
model.layers.1.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.1.self_attn.o_proj.weight change rate: 0.0008230209350585938
model.layers.1.mlp.gate_proj.weight change rate: 0.0007560036610811949
model.layers.1.mlp.up_proj.weight change rate: 0.0005482760607264936
model.layers.1.mlp.down_proj.weight change rate: 0.0007664074073545635


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006322860717773438
model.layers.0.self_attn.k_proj.weight change rate: 0.0007781982421875
model.layers.0.self_attn.v_proj.weight change rate: 0.0007781982421875
model.layers.0.self_attn.o_proj.weight change rate: 0.0007295608520507812
model.layers.0.mlp.gate_proj.weight change rate: 0.000827789306640625
model.layers.0.mlp.up_proj.weight change rate: 0.0005624944460578263
model.layers.0.mlp.down_proj.weight change rate: 0.0008624683832749724
model.layers.1.self_attn.q_proj.weight change rate: 0.00054931640625
model.layers.1.self_attn.k_proj.weight change rate: 0.000701904296875
model.layers.1.self_attn.v_proj.weight change rate: 0.0008392333984375
model.layers.1.self_attn.o_proj.weight change rate: 0.0006380081176757812
model.layers.1.mlp.gate_proj.weight change rate: 0.0007539229118265212
model.layers.1.mlp.up_proj.weight change rate: 0.0008107965695671737
model.layers.1.mlp.down_proj.weight change rate: 0.0005482760607264936
model

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.965863227844238


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006275177001953125
model.layers.0.self_attn.k_proj.weight change rate: 0.0006561279296875
model.layers.0.self_attn.v_proj.weight change rate: 0.00081634521484375
model.layers.0.self_attn.o_proj.weight change rate: 0.0007009506225585938
model.layers.0.mlp.gate_proj.weight change rate: 0.0005659624002873898
model.layers.0.mlp.up_proj.weight change rate: 0.0005656155990436673
model.layers.0.mlp.down_proj.weight change rate: 0.0007768110954202712
model.layers.1.self_attn.q_proj.weight change rate: 0.000537872314453125
model.layers.1.self_attn.k_proj.weight change rate: 0.000701904296875
model.layers.1.self_attn.v_proj.weight change rate: 0.00070953369140625
model.layers.1.self_attn.o_proj.weight change rate: 0.0008020401000976562
model.layers.1.mlp.gate_proj.weight change rate: 0.000533364072907716
model.layers.1.mlp.up_proj.weight change rate: 0.0005482760607264936
model.layers.1.mlp.down_proj.weight change rate: 0.0005500100087374449

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0008325576782226562
model.layers.0.self_attn.k_proj.weight change rate: 0.00087738037109375
model.layers.0.self_attn.v_proj.weight change rate: 0.00077056884765625
model.layers.0.self_attn.o_proj.weight change rate: 0.000820159912109375
model.layers.0.mlp.gate_proj.weight change rate: 0.0005763660883530974
model.layers.0.mlp.up_proj.weight change rate: 0.0008770336280576885
model.layers.0.mlp.down_proj.weight change rate: 0.0007719560526311398
model.layers.1.self_attn.q_proj.weight change rate: 0.00086212158203125
model.layers.1.self_attn.k_proj.weight change rate: 0.0007781982421875
model.layers.1.self_attn.v_proj.weight change rate: 0.00078582763671875
model.layers.1.self_attn.o_proj.weight change rate: 0.0006322860717773438
model.layers.1.mlp.gate_proj.weight change rate: 0.0005337108159437776
model.layers.1.mlp.up_proj.weight change rate: 0.00080108642578125
model.layers.1.mlp.down_proj.weight change rate: 0.0008166920160874724


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006418228149414062
model.layers.0.self_attn.k_proj.weight change rate: 0.000579833984375
model.layers.0.self_attn.v_proj.weight change rate: 0.00086212158203125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008096694946289062
model.layers.0.mlp.gate_proj.weight change rate: 0.000579833984375
model.layers.0.mlp.up_proj.weight change rate: 0.0005628412473015487
model.layers.0.mlp.down_proj.weight change rate: 0.0005565989995375276
model.layers.1.self_attn.q_proj.weight change rate: 0.000545501708984375
model.layers.1.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.1.self_attn.v_proj.weight change rate: 0.00086212158203125
model.layers.1.self_attn.o_proj.weight change rate: 0.0008115768432617188
model.layers.1.mlp.gate_proj.weight change rate: 0.0007931102882139385
model.layers.1.mlp.up_proj.weight change rate: 0.00075531005859375
model.layers.1.mlp.down_proj.weight change rate: 0.0005659624002873898
mode

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007638931274414062
model.layers.0.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.0.self_attn.v_proj.weight change rate: 0.00080108642578125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008420944213867188
model.layers.0.mlp.gate_proj.weight change rate: 0.0008378462516702712
model.layers.0.mlp.up_proj.weight change rate: 0.0005579862045124173
model.layers.0.mlp.down_proj.weight change rate: 0.0007951909792609513
model.layers.1.self_attn.q_proj.weight change rate: 0.0005369186401367188
model.layers.1.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.v_proj.weight change rate: 0.00080108642578125
model.layers.1.self_attn.o_proj.weight change rate: 0.0006389617919921875
model.layers.1.mlp.gate_proj.weight change rate: 0.0005316301248967648
model.layers.1.mlp.up_proj.weight change rate: 0.0007729963981546462
model.layers.1.mlp.down_proj.weight change rate: 0.0007993524777702

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007190704345703125
model.layers.0.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.0.self_attn.v_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008153915405273438
model.layers.0.mlp.gate_proj.weight change rate: 0.0005784468376077712
model.layers.0.mlp.up_proj.weight change rate: 0.0008246682118624449
model.layers.0.mlp.down_proj.weight change rate: 0.0005652687977999449
model.layers.1.self_attn.q_proj.weight change rate: 0.0008554458618164062
model.layers.1.self_attn.k_proj.weight change rate: 0.00074005126953125
model.layers.1.self_attn.v_proj.weight change rate: 0.00066375732421875
model.layers.1.self_attn.o_proj.weight change rate: 0.0006198883056640625
model.layers.1.mlp.gate_proj.weight change rate: 0.0008007396827451885
model.layers.1.mlp.up_proj.weight change rate: 0.00075531005859375
model.layers.1.mlp.down_proj.weight change rate: 0.00078409368870779

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.869410514831543


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007238388061523438
model.layers.0.self_attn.k_proj.weight change rate: 0.0005645751953125
model.layers.0.self_attn.v_proj.weight change rate: 0.00072479248046875
model.layers.0.self_attn.o_proj.weight change rate: 0.00083160400390625
model.layers.0.mlp.gate_proj.weight change rate: 0.0007976185297593474
model.layers.0.mlp.up_proj.weight change rate: 0.0005718577886000276
model.layers.0.mlp.down_proj.weight change rate: 0.0005697770975530148
model.layers.1.self_attn.q_proj.weight change rate: 0.0007982254028320312
model.layers.1.self_attn.k_proj.weight change rate: 0.000579833984375
model.layers.1.self_attn.v_proj.weight change rate: 0.00081634521484375
model.layers.1.self_attn.o_proj.weight change rate: 0.0006227493286132812
model.layers.1.mlp.gate_proj.weight change rate: 0.0007566972635686398
model.layers.1.mlp.up_proj.weight change rate: 0.0005465421127155423
model.layers.1.mlp.down_proj.weight change rate: 0.0007750771474093199

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00067138671875
model.layers.0.self_attn.k_proj.weight change rate: 0.0006103515625
model.layers.0.self_attn.v_proj.weight change rate: 0.00063323974609375
model.layers.0.self_attn.o_proj.weight change rate: 0.0008230209350585938
model.layers.0.mlp.gate_proj.weight change rate: 0.0005763660883530974
model.layers.0.mlp.up_proj.weight change rate: 0.0008090626215562224
model.layers.0.mlp.down_proj.weight change rate: 0.0005652687977999449
model.layers.1.self_attn.q_proj.weight change rate: 0.0005435943603515625
model.layers.1.self_attn.k_proj.weight change rate: 0.00074005126953125
model.layers.1.self_attn.v_proj.weight change rate: 0.00061798095703125
model.layers.1.self_attn.o_proj.weight change rate: 0.0006170272827148438
model.layers.1.mlp.gate_proj.weight change rate: 0.0005465421127155423
model.layers.1.mlp.up_proj.weight change rate: 0.0007840936887077987
model.layers.1.mlp.down_proj.weight change rate: 0.0007695285021327436
mod

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006570816040039062
model.layers.0.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.0.self_attn.v_proj.weight change rate: 0.00070953369140625
model.layers.0.self_attn.o_proj.weight change rate: 0.0008296966552734375
model.layers.0.mlp.gate_proj.weight change rate: 0.0005756724858656526
model.layers.0.mlp.up_proj.weight change rate: 0.0007806257926858962
model.layers.0.mlp.down_proj.weight change rate: 0.0008236278663389385
model.layers.1.self_attn.q_proj.weight change rate: 0.0005617141723632812
model.layers.1.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.1.self_attn.v_proj.weight change rate: 0.0008392333984375
model.layers.1.self_attn.o_proj.weight change rate: 0.0006160736083984375
model.layers.1.mlp.gate_proj.weight change rate: 0.0007861744379624724
model.layers.1.mlp.up_proj.weight change rate: 0.0005552118527702987
model.layers.1.mlp.down_proj.weight change rate: 0.000561454100534

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000766754150390625
model.layers.0.self_attn.k_proj.weight change rate: 0.00055694580078125
model.layers.0.self_attn.v_proj.weight change rate: 0.0008392333984375
model.layers.0.self_attn.o_proj.weight change rate: 0.0007076263427734375
model.layers.0.mlp.gate_proj.weight change rate: 0.0008159984718076885
model.layers.0.mlp.up_proj.weight change rate: 0.0007834001444280148
model.layers.0.mlp.down_proj.weight change rate: 0.0007729963981546462
model.layers.1.self_attn.q_proj.weight change rate: 0.0005474090576171875
model.layers.1.self_attn.k_proj.weight change rate: 0.00077056884765625
model.layers.1.self_attn.v_proj.weight change rate: 0.0006866455078125
model.layers.1.self_attn.o_proj.weight change rate: 0.0008230209350585938
model.layers.1.mlp.gate_proj.weight change rate: 0.0005444613634608686
model.layers.1.mlp.up_proj.weight change rate: 0.0005628412473015487
model.layers.1.mlp.down_proj.weight change rate: 0.00056977709755301

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007200241088867188
model.layers.0.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.0.self_attn.v_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.o_proj.weight change rate: 0.00072479248046875
model.layers.0.mlp.gate_proj.weight change rate: 0.0005697770975530148
model.layers.0.mlp.up_proj.weight change rate: 0.000769875303376466
model.layers.0.mlp.down_proj.weight change rate: 0.0005742853390984237
model.layers.1.self_attn.q_proj.weight change rate: 0.0005474090576171875
model.layers.1.self_attn.k_proj.weight change rate: 0.00054931640625
model.layers.1.self_attn.v_proj.weight change rate: 0.00080108642578125
model.layers.1.self_attn.o_proj.weight change rate: 0.0006208419799804688
model.layers.1.mlp.gate_proj.weight change rate: 0.0007910295389592648
model.layers.1.mlp.up_proj.weight change rate: 0.0008052479242905974
model.layers.1.mlp.down_proj.weight change rate: 0.000796578184235841
mod

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.782340049743652


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006504058837890625
model.layers.0.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.0.self_attn.v_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.o_proj.weight change rate: 0.0007305145263671875
model.layers.0.mlp.gate_proj.weight change rate: 0.0008017800282686949
model.layers.0.mlp.up_proj.weight change rate: 0.0007691817008890212
model.layers.0.mlp.down_proj.weight change rate: 0.00079345703125
model.layers.1.self_attn.q_proj.weight change rate: 0.0005512237548828125
model.layers.1.self_attn.k_proj.weight change rate: 0.00054168701171875
model.layers.1.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.1.self_attn.o_proj.weight change rate: 0.0008106231689453125
model.layers.1.mlp.gate_proj.weight change rate: 0.0007587780128233135
model.layers.1.mlp.up_proj.weight change rate: 0.0005635348497889936
model.layers.1.mlp.down_proj.weight change rate: 0.0007834001444280148
mod

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006504058837890625
model.layers.0.self_attn.k_proj.weight change rate: 0.0006103515625
model.layers.0.self_attn.v_proj.weight change rate: 0.000640869140625
model.layers.0.self_attn.o_proj.weight change rate: 0.0007181167602539062
model.layers.0.mlp.gate_proj.weight change rate: 0.0007719560526311398
model.layers.0.mlp.up_proj.weight change rate: 0.0005728981923311949
model.layers.0.mlp.down_proj.weight change rate: 0.0007754239486530423
model.layers.1.self_attn.q_proj.weight change rate: 0.0005397796630859375
model.layers.1.self_attn.k_proj.weight change rate: 0.0005645751953125
model.layers.1.self_attn.v_proj.weight change rate: 0.0008087158203125
model.layers.1.self_attn.o_proj.weight change rate: 0.00080108642578125
model.layers.1.mlp.gate_proj.weight change rate: 0.0007646734011359513
model.layers.1.mlp.up_proj.weight change rate: 0.0008014332270249724
model.layers.1.mlp.down_proj.weight change rate: 0.0007771578966639936
mode

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006227493286132812
model.layers.0.self_attn.k_proj.weight change rate: 0.0005645751953125
model.layers.0.self_attn.v_proj.weight change rate: 0.000701904296875
model.layers.0.self_attn.o_proj.weight change rate: 0.0008306503295898438
model.layers.0.mlp.gate_proj.weight change rate: 0.000766754150390625
model.layers.0.mlp.up_proj.weight change rate: 0.0007847872911952436
model.layers.0.mlp.down_proj.weight change rate: 0.000579140440095216
model.layers.1.self_attn.q_proj.weight change rate: 0.0008878707885742188
model.layers.1.self_attn.k_proj.weight change rate: 0.00080108642578125
model.layers.1.self_attn.v_proj.weight change rate: 0.0006866455078125
model.layers.1.self_attn.o_proj.weight change rate: 0.0006456375122070312
model.layers.1.mlp.gate_proj.weight change rate: 0.0005583330057561398
model.layers.1.mlp.up_proj.weight change rate: 0.0005687366938218474
model.layers.1.mlp.down_proj.weight change rate: 0.000582955137360841
m

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006256103515625
model.layers.0.self_attn.k_proj.weight change rate: 0.00060272216796875
model.layers.0.self_attn.v_proj.weight change rate: 0.00072479248046875
model.layers.0.self_attn.o_proj.weight change rate: 0.000743865966796875
model.layers.0.mlp.gate_proj.weight change rate: 0.0007844404899515212
model.layers.0.mlp.up_proj.weight change rate: 0.0007587780128233135
model.layers.0.mlp.down_proj.weight change rate: 0.0005725513910874724
model.layers.1.self_attn.q_proj.weight change rate: 0.0005483627319335938
model.layers.1.self_attn.k_proj.weight change rate: 0.0007781982421875
model.layers.1.self_attn.v_proj.weight change rate: 0.000640869140625
model.layers.1.self_attn.o_proj.weight change rate: 0.0008020401000976562
model.layers.1.mlp.gate_proj.weight change rate: 0.0007781982421875
model.layers.1.mlp.up_proj.weight change rate: 0.0007962313829921186
model.layers.1.mlp.down_proj.weight change rate: 0.0008014332270249724
mode

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007867813110351562
model.layers.0.self_attn.k_proj.weight change rate: 0.00078582763671875
model.layers.0.self_attn.v_proj.weight change rate: 0.0008697509765625
model.layers.0.self_attn.o_proj.weight change rate: 0.0007295608520507812
model.layers.0.mlp.gate_proj.weight change rate: 0.0007761174929328263
model.layers.0.mlp.up_proj.weight change rate: 0.000571511045563966
model.layers.0.mlp.down_proj.weight change rate: 0.0008055947255343199
model.layers.1.self_attn.q_proj.weight change rate: 0.0008535385131835938
model.layers.1.self_attn.k_proj.weight change rate: 0.0005645751953125
model.layers.1.self_attn.v_proj.weight change rate: 0.000640869140625
model.layers.1.self_attn.o_proj.weight change rate: 0.0006418228149414062
model.layers.1.mlp.gate_proj.weight change rate: 0.0005527843022719026
model.layers.1.mlp.up_proj.weight change rate: 0.0005756724858656526
model.layers.1.mlp.down_proj.weight change rate: 0.0005735917948186398

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.840359687805176


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007495880126953125
model.layers.0.self_attn.k_proj.weight change rate: 0.00074005126953125
model.layers.0.self_attn.v_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.o_proj.weight change rate: 0.000728607177734375
model.layers.0.mlp.gate_proj.weight change rate: 0.0007681413553655148
model.layers.0.mlp.up_proj.weight change rate: 0.0007768110954202712
model.layers.0.mlp.down_proj.weight change rate: 0.0005777532351203263
model.layers.1.self_attn.q_proj.weight change rate: 0.00054931640625
model.layers.1.self_attn.k_proj.weight change rate: 0.00057220458984375
model.layers.1.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.1.self_attn.o_proj.weight change rate: 0.0006170272827148438
model.layers.1.mlp.gate_proj.weight change rate: 0.0005420338129624724
model.layers.1.mlp.up_proj.weight change rate: 0.0005725513910874724
model.layers.1.mlp.down_proj.weight change rate: 0.000800392881501466
mode

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006799697875976562
model.layers.0.self_attn.k_proj.weight change rate: 0.00066375732421875
model.layers.0.self_attn.v_proj.weight change rate: 0.00083160400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.0008363723754882812
model.layers.0.mlp.gate_proj.weight change rate: 0.0007861744379624724
model.layers.0.mlp.up_proj.weight change rate: 0.0007764642941765487
model.layers.0.mlp.down_proj.weight change rate: 0.00078582763671875
model.layers.1.self_attn.q_proj.weight change rate: 0.0005512237548828125
model.layers.1.self_attn.k_proj.weight change rate: 0.00081634521484375
model.layers.1.self_attn.v_proj.weight change rate: 0.000701904296875
model.layers.1.self_attn.o_proj.weight change rate: 0.00063323974609375
model.layers.1.mlp.gate_proj.weight change rate: 0.0005565989995375276
model.layers.1.mlp.up_proj.weight change rate: 0.0008139177225530148
model.layers.1.mlp.down_proj.weight change rate: 0.000788948789704591
m

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006427764892578125
model.layers.0.self_attn.k_proj.weight change rate: 0.0006561279296875
model.layers.0.self_attn.v_proj.weight change rate: 0.0009307861328125
model.layers.0.self_attn.o_proj.weight change rate: 0.0007696151733398438
model.layers.0.mlp.gate_proj.weight change rate: 0.00080108642578125
model.layers.0.mlp.up_proj.weight change rate: 0.0007920698844827712
model.layers.0.mlp.down_proj.weight change rate: 0.000583648681640625
model.layers.1.self_attn.q_proj.weight change rate: 0.0005512237548828125
model.layers.1.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.1.self_attn.v_proj.weight change rate: 0.00084686279296875
model.layers.1.self_attn.o_proj.weight change rate: 0.0008325576782226562
model.layers.1.mlp.gate_proj.weight change rate: 0.0005784468376077712
model.layers.1.mlp.up_proj.weight change rate: 0.0005891973269172013
model.layers.1.mlp.down_proj.weight change rate: 0.0005819147336296737


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006771087646484375
model.layers.0.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.0.self_attn.v_proj.weight change rate: 0.00083160400390625
model.layers.0.self_attn.o_proj.weight change rate: 0.000850677490234375
model.layers.0.mlp.gate_proj.weight change rate: 0.0006377480458468199
model.layers.0.mlp.up_proj.weight change rate: 0.00081634521484375
model.layers.0.mlp.down_proj.weight change rate: 0.0005846890853717923
model.layers.1.self_attn.q_proj.weight change rate: 0.0008420944213867188
model.layers.1.self_attn.k_proj.weight change rate: 0.00057220458984375
model.layers.1.self_attn.v_proj.weight change rate: 0.0009002685546875
model.layers.1.self_attn.o_proj.weight change rate: 0.000675201416015625
model.layers.1.mlp.gate_proj.weight change rate: 0.0008274425636045635
model.layers.1.mlp.up_proj.weight change rate: 0.000808022276032716
model.layers.1.mlp.down_proj.weight change rate: 0.0008076754747889936


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006742477416992188
model.layers.0.self_attn.k_proj.weight change rate: 0.00063323974609375
model.layers.0.self_attn.v_proj.weight change rate: 0.000885009765625
model.layers.0.self_attn.o_proj.weight change rate: 0.0007829666137695312
model.layers.0.mlp.gate_proj.weight change rate: 0.0008184259640984237
model.layers.0.mlp.up_proj.weight change rate: 0.0006349736941047013
model.layers.0.mlp.down_proj.weight change rate: 0.0005843422841280699
model.layers.1.self_attn.q_proj.weight change rate: 0.00057220458984375
model.layers.1.self_attn.k_proj.weight change rate: 0.0008087158203125
model.layers.1.self_attn.v_proj.weight change rate: 0.00070953369140625
model.layers.1.self_attn.o_proj.weight change rate: 0.0006704330444335938
model.layers.1.mlp.gate_proj.weight change rate: 0.000769875303376466
model.layers.1.mlp.up_proj.weight change rate: 0.000582955137360841
model.layers.1.mlp.down_proj.weight change rate: 0.0007865212392061949
m

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.789929389953613


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0008192062377929688
model.layers.0.self_attn.k_proj.weight change rate: 0.000640869140625
model.layers.0.self_attn.v_proj.weight change rate: 0.00089263916015625
model.layers.0.self_attn.o_proj.weight change rate: 0.0008554458618164062
model.layers.0.mlp.gate_proj.weight change rate: 0.000815651670563966
model.layers.0.mlp.up_proj.weight change rate: 0.0008208535145968199
model.layers.0.mlp.down_proj.weight change rate: 0.000594399229157716
model.layers.1.self_attn.q_proj.weight change rate: 0.000576019287109375
model.layers.1.self_attn.k_proj.weight change rate: 0.0006256103515625
model.layers.1.self_attn.v_proj.weight change rate: 0.0008392333984375
model.layers.1.self_attn.o_proj.weight change rate: 0.0008563995361328125
model.layers.1.mlp.gate_proj.weight change rate: 0.0005690834950655699
model.layers.1.mlp.up_proj.weight change rate: 0.0008014332270249724
model.layers.1.mlp.down_proj.weight change rate: 0.000781319395173341
mo

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007658004760742188
model.layers.0.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.0.self_attn.v_proj.weight change rate: 0.0008697509765625
model.layers.0.self_attn.o_proj.weight change rate: 0.0008573532104492188
model.layers.0.mlp.gate_proj.weight change rate: 0.0006453774403780699
model.layers.0.mlp.up_proj.weight change rate: 0.0008146112668327987
model.layers.0.mlp.down_proj.weight change rate: 0.0005898909294046462
model.layers.1.self_attn.q_proj.weight change rate: 0.0005645751953125
model.layers.1.self_attn.k_proj.weight change rate: 0.00083160400390625
model.layers.1.self_attn.v_proj.weight change rate: 0.0007781982421875
model.layers.1.self_attn.o_proj.weight change rate: 0.0008258819580078125
model.layers.1.mlp.gate_proj.weight change rate: 0.0007979653310030699
model.layers.1.mlp.up_proj.weight change rate: 0.0005826083361171186
model.layers.1.mlp.down_proj.weight change rate: 0.0007868680404499173


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007419586181640625
model.layers.0.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.0.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.0.self_attn.o_proj.weight change rate: 0.000858306884765625
model.layers.0.mlp.gate_proj.weight change rate: 0.000827789306640625
model.layers.0.mlp.up_proj.weight change rate: 0.0006360140978358686
model.layers.0.mlp.down_proj.weight change rate: 0.0008673234842717648
model.layers.1.self_attn.q_proj.weight change rate: 0.0005636215209960938
model.layers.1.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.1.self_attn.v_proj.weight change rate: 0.00066375732421875
model.layers.1.self_attn.o_proj.weight change rate: 0.0006666183471679688
model.layers.1.mlp.gate_proj.weight change rate: 0.0007788918446749449
model.layers.1.mlp.up_proj.weight change rate: 0.0007969249854795635
model.layers.1.mlp.down_proj.weight change rate: 0.0005954395746812224

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000751495361328125
model.layers.0.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.0.self_attn.v_proj.weight change rate: 0.00080108642578125
model.layers.0.self_attn.o_proj.weight change rate: 0.0007619857788085938
model.layers.0.mlp.gate_proj.weight change rate: 0.0008191195665858686
model.layers.0.mlp.up_proj.weight change rate: 0.0008302168571390212
model.layers.0.mlp.down_proj.weight change rate: 0.0005971735226921737
model.layers.1.self_attn.q_proj.weight change rate: 0.0005655288696289062
model.layers.1.self_attn.k_proj.weight change rate: 0.00080108642578125
model.layers.1.self_attn.v_proj.weight change rate: 0.0008392333984375
model.layers.1.self_attn.o_proj.weight change rate: 0.0006732940673828125
model.layers.1.mlp.gate_proj.weight change rate: 0.000777504697907716
model.layers.1.mlp.up_proj.weight change rate: 0.0005878101801499724
model.layers.1.mlp.down_proj.weight change rate: 0.00059405242791399

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007600784301757812
model.layers.0.self_attn.k_proj.weight change rate: 0.00078582763671875
model.layers.0.self_attn.v_proj.weight change rate: 0.000823974609375
model.layers.0.self_attn.o_proj.weight change rate: 0.0008392333984375
model.layers.0.mlp.gate_proj.weight change rate: 0.0008094094227999449
model.layers.0.mlp.up_proj.weight change rate: 0.0008149580680765212
model.layers.0.mlp.down_proj.weight change rate: 0.0006002946756780148
model.layers.1.self_attn.q_proj.weight change rate: 0.000949859619140625
model.layers.1.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.1.self_attn.v_proj.weight change rate: 0.000823974609375
model.layers.1.self_attn.o_proj.weight change rate: 0.0006780624389648438
model.layers.1.mlp.gate_proj.weight change rate: 0.0007723027956672013
model.layers.1.mlp.up_proj.weight change rate: 0.0005864230333827436
model.layers.1.mlp.down_proj.weight change rate: 0.0008083690772764385
mo

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.882414817810059


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00075531005859375
model.layers.0.self_attn.k_proj.weight change rate: 0.00075531005859375
model.layers.0.self_attn.v_proj.weight change rate: 0.000823974609375
model.layers.0.self_attn.o_proj.weight change rate: 0.0008363723754882812
model.layers.0.mlp.gate_proj.weight change rate: 0.0006356672965921462
model.layers.0.mlp.up_proj.weight change rate: 0.0006280379020608962
model.layers.0.mlp.down_proj.weight change rate: 0.0008159984718076885
model.layers.1.self_attn.q_proj.weight change rate: 0.00055694580078125
model.layers.1.self_attn.k_proj.weight change rate: 0.0006256103515625
model.layers.1.self_attn.v_proj.weight change rate: 0.0008544921875
model.layers.1.self_attn.o_proj.weight change rate: 0.0008478164672851562
model.layers.1.mlp.gate_proj.weight change rate: 0.0007695285021327436
model.layers.1.mlp.up_proj.weight change rate: 0.0008128773188218474
model.layers.1.mlp.down_proj.weight change rate: 0.0005950927734375
model.la

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007410049438476562
model.layers.0.self_attn.k_proj.weight change rate: 0.00067138671875
model.layers.0.self_attn.v_proj.weight change rate: 0.0009613037109375
model.layers.0.self_attn.o_proj.weight change rate: 0.00078582763671875
model.layers.0.mlp.gate_proj.weight change rate: 0.0006186745595186949
model.layers.0.mlp.up_proj.weight change rate: 0.000621795654296875
model.layers.0.mlp.down_proj.weight change rate: 0.0005919716786593199
model.layers.1.self_attn.q_proj.weight change rate: 0.0005435943603515625
model.layers.1.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.1.self_attn.v_proj.weight change rate: 0.00072479248046875
model.layers.1.self_attn.o_proj.weight change rate: 0.0008563995361328125
model.layers.1.mlp.gate_proj.weight change rate: 0.0005767128895968199
model.layers.1.mlp.up_proj.weight change rate: 0.0005895441281609237
model.layers.1.mlp.down_proj.weight change rate: 0.0006016818224452436
m

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007410049438476562
model.layers.0.self_attn.k_proj.weight change rate: 0.000701904296875
model.layers.0.self_attn.v_proj.weight change rate: 0.0008392333984375
model.layers.0.self_attn.o_proj.weight change rate: 0.0008382797241210938
model.layers.0.mlp.gate_proj.weight change rate: 0.0006443370948545635
model.layers.0.mlp.up_proj.weight change rate: 0.0006394819938577712
model.layers.0.mlp.down_proj.weight change rate: 0.0008191195665858686
model.layers.1.self_attn.q_proj.weight change rate: 0.0009107589721679688
model.layers.1.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.1.self_attn.v_proj.weight change rate: 0.00083160400390625
model.layers.1.self_attn.o_proj.weight change rate: 0.0006742477416992188
model.layers.1.mlp.gate_proj.weight change rate: 0.0007865212392061949
model.layers.1.mlp.up_proj.weight change rate: 0.000792763486970216
model.layers.1.mlp.down_proj.weight change rate: 0.000817385618574917

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000762939453125
model.layers.0.self_attn.k_proj.weight change rate: 0.00067138671875
model.layers.0.self_attn.v_proj.weight change rate: 0.00084686279296875
model.layers.0.self_attn.o_proj.weight change rate: 0.0008373260498046875
model.layers.0.mlp.gate_proj.weight change rate: 0.0008381930529139936
model.layers.0.mlp.up_proj.weight change rate: 0.0008350719581358135
model.layers.0.mlp.down_proj.weight change rate: 0.0006086176144890487
model.layers.1.self_attn.q_proj.weight change rate: 0.0005598068237304688
model.layers.1.self_attn.k_proj.weight change rate: 0.0006103515625
model.layers.1.self_attn.v_proj.weight change rate: 0.00083160400390625
model.layers.1.self_attn.o_proj.weight change rate: 0.0006799697875976562
model.layers.1.mlp.gate_proj.weight change rate: 0.000782012939453125
model.layers.1.mlp.up_proj.weight change rate: 0.0005940524279139936
model.layers.1.mlp.down_proj.weight change rate: 0.0008121837745420635
model.

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.v_proj.weight change rate: 0.0008697509765625
model.layers.0.self_attn.o_proj.weight change rate: 0.0007953643798828125
model.layers.0.mlp.gate_proj.weight change rate: 0.0008159984718076885
model.layers.0.mlp.up_proj.weight change rate: 0.0006301186513155699
model.layers.0.mlp.down_proj.weight change rate: 0.0006037625716999173
model.layers.1.self_attn.q_proj.weight change rate: 0.0005578994750976562
model.layers.1.self_attn.k_proj.weight change rate: 0.000946044921875
model.layers.1.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.1.self_attn.o_proj.weight change rate: 0.0008535385131835938
model.layers.1.mlp.gate_proj.weight change rate: 0.0007750771474093199
model.layers.1.mlp.up_proj.weight change rate: 0.0005957863759249449
model.layers.1.mlp.down_proj.weight change rate: 0.000602028623688966
mode

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.007822036743164


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006952285766601562
model.layers.0.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.0.self_attn.v_proj.weight change rate: 0.00080108642578125
model.layers.0.self_attn.o_proj.weight change rate: 0.0007829666137695312
model.layers.0.mlp.gate_proj.weight change rate: 0.0008021268295124173
model.layers.0.mlp.up_proj.weight change rate: 0.0006304654525592923
model.layers.0.mlp.down_proj.weight change rate: 0.0008205067133530974
model.layers.1.self_attn.q_proj.weight change rate: 0.0005636215209960938
model.layers.1.self_attn.k_proj.weight change rate: 0.0007781982421875
model.layers.1.self_attn.v_proj.weight change rate: 0.00067901611328125
model.layers.1.self_attn.o_proj.weight change rate: 0.00069427490234375
model.layers.1.mlp.gate_proj.weight change rate: 0.0007726495969109237
model.layers.1.mlp.up_proj.weight change rate: 0.0005978671251796186
model.layers.1.mlp.down_proj.weight change rate: 0.000805594725534319

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0008106231689453125
model.layers.0.self_attn.k_proj.weight change rate: 0.0007171630859375
model.layers.0.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008535385131835938
model.layers.0.mlp.gate_proj.weight change rate: 0.0006412159418687224
model.layers.0.mlp.up_proj.weight change rate: 0.0008319508051499724
model.layers.0.mlp.down_proj.weight change rate: 0.0008215471170842648
model.layers.1.self_attn.q_proj.weight change rate: 0.000553131103515625
model.layers.1.self_attn.k_proj.weight change rate: 0.00080108642578125
model.layers.1.self_attn.v_proj.weight change rate: 0.000885009765625
model.layers.1.self_attn.o_proj.weight change rate: 0.0008563995361328125
model.layers.1.mlp.gate_proj.weight change rate: 0.000789642333984375
model.layers.1.mlp.up_proj.weight change rate: 0.0006034157704561949
model.layers.1.mlp.down_proj.weight change rate: 0.0006023753667250276
mo

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007581710815429688
model.layers.0.self_attn.k_proj.weight change rate: 0.00064849853515625
model.layers.0.self_attn.v_proj.weight change rate: 0.00080108642578125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008630752563476562
model.layers.0.mlp.gate_proj.weight change rate: 0.0008361123036593199
model.layers.0.mlp.up_proj.weight change rate: 0.0008094094227999449
model.layers.0.mlp.down_proj.weight change rate: 0.0008208535145968199
model.layers.1.self_attn.q_proj.weight change rate: 0.0009222030639648438
model.layers.1.self_attn.k_proj.weight change rate: 0.00067901611328125
model.layers.1.self_attn.v_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.o_proj.weight change rate: 0.000835418701171875
model.layers.1.mlp.gate_proj.weight change rate: 0.0005843422841280699
model.layers.1.mlp.up_proj.weight change rate: 0.0008312572608701885
model.layers.1.mlp.down_proj.weight change rate: 0.00080698187230154

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006809234619140625
model.layers.0.self_attn.k_proj.weight change rate: 0.00074005126953125
model.layers.0.self_attn.v_proj.weight change rate: 0.0008697509765625
model.layers.0.self_attn.o_proj.weight change rate: 0.0008554458618164062
model.layers.0.mlp.gate_proj.weight change rate: 0.0006464177859015763
model.layers.0.mlp.up_proj.weight change rate: 0.0006405223975889385
model.layers.0.mlp.down_proj.weight change rate: 0.0006207553087733686
model.layers.1.self_attn.q_proj.weight change rate: 0.0008649826049804688
model.layers.1.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.v_proj.weight change rate: 0.00074005126953125
model.layers.1.self_attn.o_proj.weight change rate: 0.0008535385131835938
model.layers.1.mlp.gate_proj.weight change rate: 0.0008014332270249724
model.layers.1.mlp.up_proj.weight change rate: 0.0005971735226921737
model.layers.1.mlp.down_proj.weight change rate: 0.00061416625976562

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007686614990234375
model.layers.0.self_attn.k_proj.weight change rate: 0.0006103515625
model.layers.0.self_attn.v_proj.weight change rate: 0.0008087158203125
model.layers.0.self_attn.o_proj.weight change rate: 0.0007762908935546875
model.layers.0.mlp.gate_proj.weight change rate: 0.0006450306391343474
model.layers.0.mlp.up_proj.weight change rate: 0.00081634521484375
model.layers.0.mlp.down_proj.weight change rate: 0.0006197149050422013
model.layers.1.self_attn.q_proj.weight change rate: 0.00079345703125
model.layers.1.self_attn.k_proj.weight change rate: 0.000823974609375
model.layers.1.self_attn.v_proj.weight change rate: 0.00072479248046875
model.layers.1.self_attn.o_proj.weight change rate: 0.0008544921875
model.layers.1.mlp.gate_proj.weight change rate: 0.00079345703125
model.layers.1.mlp.up_proj.weight change rate: 0.0008090626215562224
model.layers.1.mlp.down_proj.weight change rate: 0.0006061900639906526
model.layers.2.self

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 12.982158660888672


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000659942626953125
model.layers.0.self_attn.k_proj.weight change rate: 0.00057220458984375
model.layers.0.self_attn.v_proj.weight change rate: 0.00089263916015625
model.layers.0.self_attn.o_proj.weight change rate: 0.0007953643798828125
model.layers.0.mlp.gate_proj.weight change rate: 0.0006457242416217923
model.layers.0.mlp.up_proj.weight change rate: 0.000815651670563966
model.layers.0.mlp.down_proj.weight change rate: 0.0008371527073904872
model.layers.1.self_attn.q_proj.weight change rate: 0.0008039474487304688
model.layers.1.self_attn.k_proj.weight change rate: 0.00079345703125
model.layers.1.self_attn.v_proj.weight change rate: 0.00069427490234375
model.layers.1.self_attn.o_proj.weight change rate: 0.0007047653198242188
model.layers.1.mlp.gate_proj.weight change rate: 0.0007913762819953263
model.layers.1.mlp.up_proj.weight change rate: 0.0008014332270249724
model.layers.1.mlp.down_proj.weight change rate: 0.0008288297103717923

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006570816040039062
model.layers.0.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.0.self_attn.v_proj.weight change rate: 0.0008697509765625
model.layers.0.self_attn.o_proj.weight change rate: 0.0008020401000976562
model.layers.0.mlp.gate_proj.weight change rate: 0.0008295233128592372
model.layers.0.mlp.up_proj.weight change rate: 0.0006391351926140487
model.layers.0.mlp.down_proj.weight change rate: 0.0006221424555405974
model.layers.1.self_attn.q_proj.weight change rate: 0.0005626678466796875
model.layers.1.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.v_proj.weight change rate: 0.000732421875
model.layers.1.self_attn.o_proj.weight change rate: 0.0008449554443359375
model.layers.1.mlp.gate_proj.weight change rate: 0.0007913762819953263
model.layers.1.mlp.up_proj.weight change rate: 0.0006061900639906526
model.layers.1.mlp.down_proj.weight change rate: 0.0008215471170842648
mod

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006494522094726562
model.layers.0.self_attn.k_proj.weight change rate: 0.0006103515625
model.layers.0.self_attn.v_proj.weight change rate: 0.00075531005859375
model.layers.0.self_attn.o_proj.weight change rate: 0.000865936279296875
model.layers.0.mlp.gate_proj.weight change rate: 0.0006502324831672013
model.layers.0.mlp.up_proj.weight change rate: 0.0006457242416217923
model.layers.0.mlp.down_proj.weight change rate: 0.000830910459626466
model.layers.1.self_attn.q_proj.weight change rate: 0.0005626678466796875
model.layers.1.self_attn.k_proj.weight change rate: 0.000732421875
model.layers.1.self_attn.v_proj.weight change rate: 0.000823974609375
model.layers.1.self_attn.o_proj.weight change rate: 0.0008544921875
model.layers.1.mlp.gate_proj.weight change rate: 0.0007955377805046737
model.layers.1.mlp.up_proj.weight change rate: 0.000815651670563966
model.layers.1.mlp.down_proj.weight change rate: 0.0006165938102640212
model.layers.2

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007772445678710938
model.layers.0.self_attn.k_proj.weight change rate: 0.00081634521484375
model.layers.0.self_attn.v_proj.weight change rate: 0.000885009765625
model.layers.0.self_attn.o_proj.weight change rate: 0.0008716583251953125
model.layers.0.mlp.gate_proj.weight change rate: 0.0006457242416217923
model.layers.0.mlp.up_proj.weight change rate: 0.000823281065095216
model.layers.0.mlp.down_proj.weight change rate: 0.0008166920160874724
model.layers.1.self_attn.q_proj.weight change rate: 0.0005626678466796875
model.layers.1.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.1.self_attn.v_proj.weight change rate: 0.00084686279296875
model.layers.1.self_attn.o_proj.weight change rate: 0.0007123947143554688
model.layers.1.mlp.gate_proj.weight change rate: 0.000773690000642091
model.layers.1.mlp.up_proj.weight change rate: 0.0006148598622530699
model.layers.1.mlp.down_proj.weight change rate: 0.0006061900639906526

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006647109985351562
model.layers.0.self_attn.k_proj.weight change rate: 0.0007171630859375
model.layers.0.self_attn.v_proj.weight change rate: 0.0008087158203125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008726119995117188
model.layers.0.mlp.gate_proj.weight change rate: 0.0008170388173311949
model.layers.0.mlp.up_proj.weight change rate: 0.0006471113883890212
model.layers.0.mlp.down_proj.weight change rate: 0.0006169406115077436
model.layers.1.self_attn.q_proj.weight change rate: 0.0008554458618164062
model.layers.1.self_attn.k_proj.weight change rate: 0.0007171630859375
model.layers.1.self_attn.v_proj.weight change rate: 0.000885009765625
model.layers.1.self_attn.o_proj.weight change rate: 0.0007276535034179688
model.layers.1.mlp.gate_proj.weight change rate: 0.00079345703125
model.layers.1.mlp.up_proj.weight change rate: 0.000602028623688966
model.layers.1.mlp.down_proj.weight change rate: 0.0006197149050422013
model

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.130927085876465


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00074005126953125
model.layers.0.self_attn.k_proj.weight change rate: 0.00066375732421875
model.layers.0.self_attn.v_proj.weight change rate: 0.00081634521484375
model.layers.0.self_attn.o_proj.weight change rate: 0.0008001327514648438
model.layers.0.mlp.gate_proj.weight change rate: 0.000659249082673341
model.layers.0.mlp.up_proj.weight change rate: 0.0006467645871452987
model.layers.0.mlp.down_proj.weight change rate: 0.0008250150131061673
model.layers.1.self_attn.q_proj.weight change rate: 0.0008058547973632812
model.layers.1.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.v_proj.weight change rate: 0.00078582763671875
model.layers.1.self_attn.o_proj.weight change rate: 0.0007266998291015625
model.layers.1.mlp.gate_proj.weight change rate: 0.0007903359364718199
model.layers.1.mlp.up_proj.weight change rate: 0.0008087158203125
model.layers.1.mlp.down_proj.weight change rate: 0.000613472715485841
mod

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007562637329101562
model.layers.0.self_attn.k_proj.weight change rate: 0.00060272216796875
model.layers.0.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008440017700195312
model.layers.0.mlp.gate_proj.weight change rate: 0.0008225874626077712
model.layers.0.mlp.up_proj.weight change rate: 0.0008312572608701885
model.layers.0.mlp.down_proj.weight change rate: 0.0006152066634967923
model.layers.1.self_attn.q_proj.weight change rate: 0.0005788803100585938
model.layers.1.self_attn.k_proj.weight change rate: 0.0006256103515625
model.layers.1.self_attn.v_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.o_proj.weight change rate: 0.0008649826049804688
model.layers.1.mlp.gate_proj.weight change rate: 0.0005846890853717923
model.layers.1.mlp.up_proj.weight change rate: 0.0008218939183279872
model.layers.1.mlp.down_proj.weight change rate: 0.0006152066634967923


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007200241088867188
model.layers.0.self_attn.k_proj.weight change rate: 0.000518798828125
model.layers.0.self_attn.v_proj.weight change rate: 0.00081634521484375
model.layers.0.self_attn.o_proj.weight change rate: 0.0008087158203125
model.layers.0.mlp.gate_proj.weight change rate: 0.0008208535145968199
model.layers.0.mlp.up_proj.weight change rate: 0.0006464177859015763
model.layers.0.mlp.down_proj.weight change rate: 0.0008187727653421462
model.layers.1.self_attn.q_proj.weight change rate: 0.0008401870727539062
model.layers.1.self_attn.k_proj.weight change rate: 0.00057220458984375
model.layers.1.self_attn.v_proj.weight change rate: 0.00087738037109375
model.layers.1.self_attn.o_proj.weight change rate: 0.0008869171142578125
model.layers.1.mlp.gate_proj.weight change rate: 0.0007996992790140212
model.layers.1.mlp.up_proj.weight change rate: 0.0006159002077765763
model.layers.1.mlp.down_proj.weight change rate: 0.0008368059061467648

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006771087646484375
model.layers.0.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.v_proj.weight change rate: 0.000885009765625
model.layers.0.self_attn.o_proj.weight change rate: 0.0008478164672851562
model.layers.0.mlp.gate_proj.weight change rate: 0.0006589022814296186
model.layers.0.mlp.up_proj.weight change rate: 0.00083160400390625
model.layers.0.mlp.down_proj.weight change rate: 0.0006269975565373898
model.layers.1.self_attn.q_proj.weight change rate: 0.0005674362182617188
model.layers.1.self_attn.k_proj.weight change rate: 0.00057220458984375
model.layers.1.self_attn.v_proj.weight change rate: 0.0008087158203125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007295608520507812
model.layers.1.mlp.gate_proj.weight change rate: 0.000590584531892091
model.layers.1.mlp.up_proj.weight change rate: 0.0008229342638514936
model.layers.1.mlp.down_proj.weight change rate: 0.0008198131690733135
m

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006303787231445312
model.layers.0.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.0.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008392333984375
model.layers.0.mlp.gate_proj.weight change rate: 0.0008225874626077712
model.layers.0.mlp.up_proj.weight change rate: 0.0008180792210623622
model.layers.0.mlp.down_proj.weight change rate: 0.0008267489611171186
model.layers.1.self_attn.q_proj.weight change rate: 0.000797271728515625
model.layers.1.self_attn.k_proj.weight change rate: 0.00054168701171875
model.layers.1.self_attn.v_proj.weight change rate: 0.00084686279296875
model.layers.1.self_attn.o_proj.weight change rate: 0.0008697509765625
model.layers.1.mlp.gate_proj.weight change rate: 0.0007938038324937224
model.layers.1.mlp.up_proj.weight change rate: 0.0006186745595186949
model.layers.1.mlp.down_proj.weight change rate: 0.0008184259640984237
model

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.05428409576416


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000789642333984375
model.layers.0.self_attn.k_proj.weight change rate: 0.0005645751953125
model.layers.0.self_attn.v_proj.weight change rate: 0.00084686279296875
model.layers.0.self_attn.o_proj.weight change rate: 0.000873565673828125
model.layers.0.mlp.gate_proj.weight change rate: 0.0006557811866514385
model.layers.0.mlp.up_proj.weight change rate: 0.0006387883913703263
model.layers.0.mlp.down_proj.weight change rate: 0.0006315057980827987
model.layers.1.self_attn.q_proj.weight change rate: 0.0005626678466796875
model.layers.1.self_attn.k_proj.weight change rate: 0.00086212158203125
model.layers.1.self_attn.v_proj.weight change rate: 0.00081634521484375
model.layers.1.self_attn.o_proj.weight change rate: 0.00072479248046875
model.layers.1.mlp.gate_proj.weight change rate: 0.0005968267214484513
model.layers.1.mlp.up_proj.weight change rate: 0.0008208535145968199
model.layers.1.mlp.down_proj.weight change rate: 0.0006269975565373898

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006685256958007812
model.layers.0.self_attn.k_proj.weight change rate: 0.00079345703125
model.layers.0.self_attn.v_proj.weight change rate: 0.0008544921875
model.layers.0.self_attn.o_proj.weight change rate: 0.0008134841918945312
model.layers.0.mlp.gate_proj.weight change rate: 0.0006464177859015763
model.layers.0.mlp.up_proj.weight change rate: 0.0006443370948545635
model.layers.0.mlp.down_proj.weight change rate: 0.0008413141476921737
model.layers.1.self_attn.q_proj.weight change rate: 0.0005474090576171875
model.layers.1.self_attn.k_proj.weight change rate: 0.0006256103515625
model.layers.1.self_attn.v_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007410049438476562
model.layers.1.mlp.gate_proj.weight change rate: 0.0007955377805046737
model.layers.1.mlp.up_proj.weight change rate: 0.0008031671750359237
model.layers.1.mlp.down_proj.weight change rate: 0.0008312572608701885
mode

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006570816040039062
model.layers.0.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.0.self_attn.v_proj.weight change rate: 0.0008697509765625
model.layers.0.self_attn.o_proj.weight change rate: 0.0008525848388671875
model.layers.0.mlp.gate_proj.weight change rate: 0.0008295233128592372
model.layers.0.mlp.up_proj.weight change rate: 0.0006443370948545635
model.layers.0.mlp.down_proj.weight change rate: 0.000632546201813966
model.layers.1.self_attn.q_proj.weight change rate: 0.0008172988891601562
model.layers.1.self_attn.k_proj.weight change rate: 0.00061798095703125
model.layers.1.self_attn.v_proj.weight change rate: 0.00080108642578125
model.layers.1.self_attn.o_proj.weight change rate: 0.0008916854858398438
model.layers.1.mlp.gate_proj.weight change rate: 0.0007892955909483135
model.layers.1.mlp.up_proj.weight change rate: 0.0008069818723015487
model.layers.1.mlp.down_proj.weight change rate: 0.0008257086155936

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007696151733398438
model.layers.0.self_attn.k_proj.weight change rate: 0.00055694580078125
model.layers.0.self_attn.v_proj.weight change rate: 0.0008087158203125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008068084716796875
model.layers.0.mlp.gate_proj.weight change rate: 0.000834725156892091
model.layers.0.mlp.up_proj.weight change rate: 0.0008288297103717923
model.layers.0.mlp.down_proj.weight change rate: 0.0008302168571390212
model.layers.1.self_attn.q_proj.weight change rate: 0.0005741119384765625
model.layers.1.self_attn.k_proj.weight change rate: 0.00086212158203125
model.layers.1.self_attn.v_proj.weight change rate: 0.00087738037109375
model.layers.1.self_attn.o_proj.weight change rate: 0.0008640289306640625
model.layers.1.mlp.gate_proj.weight change rate: 0.0006054965197108686
model.layers.1.mlp.up_proj.weight change rate: 0.0008170388173311949
model.layers.1.mlp.down_proj.weight change rate: 0.0008170388173311

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006818771362304688
model.layers.0.self_attn.k_proj.weight change rate: 0.00078582763671875
model.layers.0.self_attn.v_proj.weight change rate: 0.00086212158203125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008563995361328125
model.layers.0.mlp.gate_proj.weight change rate: 0.0008246682118624449
model.layers.0.mlp.up_proj.weight change rate: 0.0008246682118624449
model.layers.0.mlp.down_proj.weight change rate: 0.0008184259640984237
model.layers.1.self_attn.q_proj.weight change rate: 0.000843048095703125
model.layers.1.self_attn.k_proj.weight change rate: 0.000579833984375
model.layers.1.self_attn.v_proj.weight change rate: 0.00086212158203125
model.layers.1.self_attn.o_proj.weight change rate: 0.00086212158203125
model.layers.1.mlp.gate_proj.weight change rate: 0.00058746337890625
model.layers.1.mlp.up_proj.weight change rate: 0.0008111433708108962
model.layers.1.mlp.down_proj.weight change rate: 0.0008153048693202436
m

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.180264472961426


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007524490356445312
model.layers.0.self_attn.k_proj.weight change rate: 0.0007476806640625
model.layers.0.self_attn.v_proj.weight change rate: 0.0009307861328125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008029937744140625
model.layers.0.mlp.gate_proj.weight change rate: 0.0006609830306842923
model.layers.0.mlp.up_proj.weight change rate: 0.0008215471170842648
model.layers.0.mlp.down_proj.weight change rate: 0.0008274425636045635
model.layers.1.self_attn.q_proj.weight change rate: 0.000576019287109375
model.layers.1.self_attn.k_proj.weight change rate: 0.000823974609375
model.layers.1.self_attn.v_proj.weight change rate: 0.00089263916015625
model.layers.1.self_attn.o_proj.weight change rate: 0.000873565673828125
model.layers.1.mlp.gate_proj.weight change rate: 0.0008128773188218474
model.layers.1.mlp.up_proj.weight change rate: 0.0008094094227999449
model.layers.1.mlp.down_proj.weight change rate: 0.000629425048828125
m

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006608963012695312
model.layers.0.self_attn.k_proj.weight change rate: 0.00060272216796875
model.layers.0.self_attn.v_proj.weight change rate: 0.000823974609375
model.layers.0.self_attn.o_proj.weight change rate: 0.0008592605590820312
model.layers.0.mlp.gate_proj.weight change rate: 0.000827789306640625
model.layers.0.mlp.up_proj.weight change rate: 0.0006481517921201885
model.layers.0.mlp.down_proj.weight change rate: 0.0006380948470905423
model.layers.1.self_attn.q_proj.weight change rate: 0.0005779266357421875
model.layers.1.self_attn.k_proj.weight change rate: 0.00072479248046875
model.layers.1.self_attn.v_proj.weight change rate: 0.00080108642578125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007314682006835938
model.layers.1.mlp.gate_proj.weight change rate: 0.0007792386459186673
model.layers.1.mlp.up_proj.weight change rate: 0.0006162470090202987
model.layers.1.mlp.down_proj.weight change rate: 0.00062699755653738

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006341934204101562
model.layers.0.self_attn.k_proj.weight change rate: 0.00075531005859375
model.layers.0.self_attn.v_proj.weight change rate: 0.0008544921875
model.layers.0.self_attn.o_proj.weight change rate: 0.0008144378662109375
model.layers.0.mlp.gate_proj.weight change rate: 0.0008388866554014385
model.layers.0.mlp.up_proj.weight change rate: 0.0008177324198186398
model.layers.0.mlp.down_proj.weight change rate: 0.0006387883913703263
model.layers.1.self_attn.q_proj.weight change rate: 0.00055694580078125
model.layers.1.self_attn.k_proj.weight change rate: 0.00069427490234375
model.layers.1.self_attn.v_proj.weight change rate: 0.00081634521484375
model.layers.1.self_attn.o_proj.weight change rate: 0.0007314682006835938
model.layers.1.mlp.gate_proj.weight change rate: 0.0006013350212015212
model.layers.1.mlp.up_proj.weight change rate: 0.0006315057980827987
model.layers.1.mlp.down_proj.weight change rate: 0.0006256103515625
mod

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000652313232421875
model.layers.0.self_attn.k_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008592605590820312
model.layers.0.mlp.gate_proj.weight change rate: 0.0006589022814296186
model.layers.0.mlp.up_proj.weight change rate: 0.000815651670563966
model.layers.0.mlp.down_proj.weight change rate: 0.0006391351926140487
model.layers.1.self_attn.q_proj.weight change rate: 0.0005521774291992188
model.layers.1.self_attn.k_proj.weight change rate: 0.000762939453125
model.layers.1.self_attn.v_proj.weight change rate: 0.0008392333984375
model.layers.1.self_attn.o_proj.weight change rate: 0.000732421875
model.layers.1.mlp.gate_proj.weight change rate: 0.0005996010731905699
model.layers.1.mlp.up_proj.weight change rate: 0.000827095762360841
model.layers.1.mlp.down_proj.weight change rate: 0.0006339333485811949
model.laye

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007801055908203125
model.layers.0.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.0.self_attn.v_proj.weight change rate: 0.00078582763671875
model.layers.0.self_attn.o_proj.weight change rate: 0.000820159912109375
model.layers.0.mlp.gate_proj.weight change rate: 0.0008208535145968199
model.layers.0.mlp.up_proj.weight change rate: 0.0006682656239718199
model.layers.0.mlp.down_proj.weight change rate: 0.0008340315544046462
model.layers.1.self_attn.q_proj.weight change rate: 0.0008420944213867188
model.layers.1.self_attn.k_proj.weight change rate: 0.00069427490234375
model.layers.1.self_attn.v_proj.weight change rate: 0.00074005126953125
model.layers.1.self_attn.o_proj.weight change rate: 0.0007266998291015625
model.layers.1.mlp.gate_proj.weight change rate: 0.000605843320954591
model.layers.1.mlp.up_proj.weight change rate: 0.0006238764035515487
model.layers.1.mlp.down_proj.weight change rate: 0.00083437835564836

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.233437538146973


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006561279296875
model.layers.0.self_attn.k_proj.weight change rate: 0.000732421875
model.layers.0.self_attn.v_proj.weight change rate: 0.00084686279296875
model.layers.0.self_attn.o_proj.weight change rate: 0.0008525848388671875
model.layers.0.mlp.gate_proj.weight change rate: 0.0008305636583827436
model.layers.0.mlp.up_proj.weight change rate: 0.0006595958839170635
model.layers.0.mlp.down_proj.weight change rate: 0.000827789306640625
model.layers.1.self_attn.q_proj.weight change rate: 0.0005626678466796875
model.layers.1.self_attn.k_proj.weight change rate: 0.00057220458984375
model.layers.1.self_attn.v_proj.weight change rate: 0.000885009765625
model.layers.1.self_attn.o_proj.weight change rate: 0.0008678436279296875
model.layers.1.mlp.gate_proj.weight change rate: 0.0008253618143498898
model.layers.1.mlp.up_proj.weight change rate: 0.0006377480458468199
model.layers.1.mlp.down_proj.weight change rate: 0.0006450306391343474
model

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006504058837890625
model.layers.0.self_attn.k_proj.weight change rate: 0.0006561279296875
model.layers.0.self_attn.v_proj.weight change rate: 0.00081634521484375
model.layers.0.self_attn.o_proj.weight change rate: 0.0008153915405273438
model.layers.0.mlp.gate_proj.weight change rate: 0.0006835244130343199
model.layers.0.mlp.up_proj.weight change rate: 0.0006561279296875
model.layers.0.mlp.down_proj.weight change rate: 0.0006384416483342648
model.layers.1.self_attn.q_proj.weight change rate: 0.0005598068237304688
model.layers.1.self_attn.k_proj.weight change rate: 0.00058746337890625
model.layers.1.self_attn.v_proj.weight change rate: 0.000885009765625
model.layers.1.self_attn.o_proj.weight change rate: 0.0007410049438476562
model.layers.1.mlp.gate_proj.weight change rate: 0.0006009882199577987
model.layers.1.mlp.up_proj.weight change rate: 0.0006353204953484237
model.layers.1.mlp.down_proj.weight change rate: 0.0006335865473374724


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00078582763671875
model.layers.0.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.v_proj.weight change rate: 0.00087738037109375
model.layers.0.self_attn.o_proj.weight change rate: 0.000850677490234375
model.layers.0.mlp.gate_proj.weight change rate: 0.0006696527707390487
model.layers.0.mlp.up_proj.weight change rate: 0.00083160400390625
model.layers.0.mlp.down_proj.weight change rate: 0.0006481517921201885
model.layers.1.self_attn.q_proj.weight change rate: 0.0005521774291992188
model.layers.1.self_attn.k_proj.weight change rate: 0.000579833984375
model.layers.1.self_attn.v_proj.weight change rate: 0.000885009765625
model.layers.1.self_attn.o_proj.weight change rate: 0.0008792877197265625
model.layers.1.mlp.gate_proj.weight change rate: 0.0008055947255343199
model.layers.1.mlp.up_proj.weight change rate: 0.0006318525993265212
model.layers.1.mlp.down_proj.weight change rate: 0.000830910459626466
model

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006551742553710938
model.layers.0.self_attn.k_proj.weight change rate: 0.00067901611328125
model.layers.0.self_attn.v_proj.weight change rate: 0.00093841552734375
model.layers.0.self_attn.o_proj.weight change rate: 0.00083160400390625
model.layers.0.mlp.gate_proj.weight change rate: 0.00083160400390625
model.layers.0.mlp.up_proj.weight change rate: 0.0008246682118624449
model.layers.0.mlp.down_proj.weight change rate: 0.0006432966911233962
model.layers.1.self_attn.q_proj.weight change rate: 0.0005779266357421875
model.layers.1.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.1.self_attn.v_proj.weight change rate: 0.00084686279296875
model.layers.1.self_attn.o_proj.weight change rate: 0.0008544921875
model.layers.1.mlp.gate_proj.weight change rate: 0.0006068836664780974
model.layers.1.mlp.up_proj.weight change rate: 0.000842354551423341
model.layers.1.mlp.down_proj.weight change rate: 0.000827789306640625
model.l

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000751495361328125
model.layers.0.self_attn.k_proj.weight change rate: 0.00070953369140625
model.layers.0.self_attn.v_proj.weight change rate: 0.0008697509765625
model.layers.0.self_attn.o_proj.weight change rate: 0.0008554458618164062
model.layers.0.mlp.gate_proj.weight change rate: 0.0006686124252155423
model.layers.0.mlp.up_proj.weight change rate: 0.0006686124252155423
model.layers.0.mlp.down_proj.weight change rate: 0.0006297718500718474
model.layers.1.self_attn.q_proj.weight change rate: 0.0008211135864257812
model.layers.1.self_attn.k_proj.weight change rate: 0.00080108642578125
model.layers.1.self_attn.v_proj.weight change rate: 0.00077056884765625
model.layers.1.self_attn.o_proj.weight change rate: 0.0008497238159179688
model.layers.1.mlp.gate_proj.weight change rate: 0.0007931102882139385
model.layers.1.mlp.up_proj.weight change rate: 0.0006353204953484237
model.layers.1.mlp.down_proj.weight change rate: 0.0008156516705639

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.302081108093262


V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006551742553710938
model.layers.0.self_attn.k_proj.weight change rate: 0.00069427490234375
model.layers.0.self_attn.v_proj.weight change rate: 0.00078582763671875
model.layers.0.self_attn.o_proj.weight change rate: 0.0008182525634765625
model.layers.0.mlp.gate_proj.weight change rate: 0.000670693174470216
model.layers.0.mlp.up_proj.weight change rate: 0.0006568215321749449
model.layers.0.mlp.down_proj.weight change rate: 0.0006467645871452987
model.layers.1.self_attn.q_proj.weight change rate: 0.0007839202880859375
model.layers.1.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.1.self_attn.v_proj.weight change rate: 0.00081634521484375
model.layers.1.self_attn.o_proj.weight change rate: 0.0008678436279296875
model.layers.1.mlp.gate_proj.weight change rate: 0.000614166259765625
model.layers.1.mlp.up_proj.weight change rate: 0.00063323974609375
model.layers.1.mlp.down_proj.weight change rate: 0.0006398287951014936

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.000652313232421875
model.layers.0.self_attn.k_proj.weight change rate: 0.0006866455078125
model.layers.0.self_attn.v_proj.weight change rate: 0.00086212158203125
model.layers.0.self_attn.o_proj.weight change rate: 0.000858306884765625
model.layers.0.mlp.gate_proj.weight change rate: 0.0006561279296875
model.layers.0.mlp.up_proj.weight change rate: 0.0006623701774515212
model.layers.0.mlp.down_proj.weight change rate: 0.0006495389388874173
model.layers.1.self_attn.q_proj.weight change rate: 0.0005655288696289062
model.layers.1.self_attn.k_proj.weight change rate: 0.00054168701171875
model.layers.1.self_attn.v_proj.weight change rate: 0.00087738037109375
model.layers.1.self_attn.o_proj.weight change rate: 0.0008678436279296875
model.layers.1.mlp.gate_proj.weight change rate: 0.0006117387674748898
model.layers.1.mlp.up_proj.weight change rate: 0.0008392333984375
model.layers.1.mlp.down_proj.weight change rate: 0.0006391351926140487
mod

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.00077056884765625
model.layers.0.self_attn.k_proj.weight change rate: 0.0006256103515625
model.layers.0.self_attn.v_proj.weight change rate: 0.0008544921875
model.layers.0.self_attn.o_proj.weight change rate: 0.0008087158203125
model.layers.0.mlp.gate_proj.weight change rate: 0.0006696527707390487
model.layers.0.mlp.up_proj.weight change rate: 0.0008260553586296737
model.layers.0.mlp.down_proj.weight change rate: 0.0006471113883890212
model.layers.1.self_attn.q_proj.weight change rate: 0.0005626678466796875
model.layers.1.self_attn.k_proj.weight change rate: 0.0008392333984375
model.layers.1.self_attn.v_proj.weight change rate: 0.0007781982421875
model.layers.1.self_attn.o_proj.weight change rate: 0.0007486343383789062
model.layers.1.mlp.gate_proj.weight change rate: 0.0008187727653421462
model.layers.1.mlp.up_proj.weight change rate: 0.0008312572608701885
model.layers.1.mlp.down_proj.weight change rate: 0.0008378462516702712
model.

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0006608963012695312
model.layers.0.self_attn.k_proj.weight change rate: 0.0007171630859375
model.layers.0.self_attn.v_proj.weight change rate: 0.00079345703125
model.layers.0.self_attn.o_proj.weight change rate: 0.0008687973022460938
model.layers.0.mlp.gate_proj.weight change rate: 0.0008257086155936122
model.layers.0.mlp.up_proj.weight change rate: 0.0006582086789421737
model.layers.0.mlp.down_proj.weight change rate: 0.0006360140978358686
model.layers.1.self_attn.q_proj.weight change rate: 0.0005435943603515625
model.layers.1.self_attn.k_proj.weight change rate: 0.00072479248046875
model.layers.1.self_attn.v_proj.weight change rate: 0.0008544921875
model.layers.1.self_attn.o_proj.weight change rate: 0.0007181167602539062
model.layers.1.mlp.gate_proj.weight change rate: 0.0008017800282686949
model.layers.1.mlp.up_proj.weight change rate: 0.0006360140978358686
model.layers.1.mlp.down_proj.weight change rate: 0.0006512728868983686
mo

V step:   0%|          | 0/256 [00:00<?, ?it/s]

model.layers.0.self_attn.q_proj.weight change rate: 0.0007658004760742188
model.layers.0.self_attn.k_proj.weight change rate: 0.00070953369140625
model.layers.0.self_attn.v_proj.weight change rate: 0.0008392333984375
model.layers.0.self_attn.o_proj.weight change rate: 0.0008611679077148438
model.layers.0.mlp.gate_proj.weight change rate: 0.0008180792210623622
model.layers.0.mlp.up_proj.weight change rate: 0.0006606362294405699
model.layers.0.mlp.down_proj.weight change rate: 0.000640869140625
model.layers.1.self_attn.q_proj.weight change rate: 0.0008382797241210938
model.layers.1.self_attn.k_proj.weight change rate: 0.0005950927734375
model.layers.1.self_attn.v_proj.weight change rate: 0.00087738037109375
model.layers.1.self_attn.o_proj.weight change rate: 0.0008869171142578125
model.layers.1.mlp.gate_proj.weight change rate: 0.0008114901720546186
model.layers.1.mlp.up_proj.weight change rate: 0.0008371527073904872
model.layers.1.mlp.down_proj.weight change rate: 0.0008298700558952987


In [14]:
ppl_eval_score = eval_ppl_naive(dequantized_model, eval_data).item()
print("wikitext2 ppl (naive)", ppl_eval_score)

  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 13.23874568939209
