In [1]:
%env CUDA_VISIBLE_DEVICES=6
%env TRANSFORMERS_CACHE=/mnt/LLM/hub
%env HF_HOME=/mnt/LLM/
%env OMP_NUM_THREADS=16
%load_ext autoreload
%autoreload 2

import os
import sys
sys.path.insert(0, '..')

import time
import random
from tqdm.auto import trange, tqdm
import numpy as np
import ipynbname  # pip install ipynbname

import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers

from src.aq import QuantizedWeight, QuantizedLinear
from src.modelutils import get_model
from src.datautils import get_loaders
from convert_legacy_model_format import load_quantized_model_with_old_pickle


torch.set_num_threads(16)
torch.backends.cudnn.allow_tf32 = False
torch.backends.cuda.matmul.allow_tf32 = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


env: CUDA_VISIBLE_DEVICES=6
env: TRANSFORMERS_CACHE=/mnt/LLM/hub
env: HF_HOME=/mnt/LLM/
env: OMP_NUM_THREADS=16




In [2]:
class args:
    base_model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
    quant_model = "/extra_disk_1/vahe1994/AQ/tinyllama-3t-1x12g4/"
    dtype = 'bfloat16'
    model_seqlen = 1024  # can be 2048 for 1.1B, 4096-8192 for larger models
    device_map = 'auto'
    
    dataset = 'pajama'
    step_nsamples = 256
    total_nsamples = 2560
    seed = 42
    beam_size = 1                    # <-- todo when training single-codebook models, use beam size 1
    stochastic_rounding_tau = 0.0    # keep it at 0
    max_code_change_per_step = 1.0   # was 1e-2 for subspace PV
    code_trust_ratio=None            # 0.1 - 0.01
    entropy_reg=0.002                 
    code_selection_temperature=100   # <-- uniform sampling for now, TODO
    delta_decay = 0.0                # <-- this defines the use of STE
    
    code_lr = 1e-4
    code_betas = (0.0, 0.95)
    codebook_lr = 2e-5
    codebook_betas = (0.9, 0.95)
    
    autocast_dtype = torch.bfloat16  # bfloat16 or None (not using grad scaler!)
    training_dtype = torch.float32
    gradient_checkpointing = False
    devices = [device]

In [3]:
quant_name = args.quant_model.split("/")[-2]

In [4]:
import inspect

def props(obj):
    pr = {}
    for name in dir(obj):
        value = getattr(obj, name)
        if not name.startswith('__') and not inspect.ismethod(value):
            pr[name] = value
    return pr

In [5]:
a= props(args)

In [6]:
import wandb

os.environ["WANDB_NOTEBOOK_NAME"] = os.path.join(os.getcwd(), ipynbname.name() + ".ipynb")

# start a new wandb run to track this script
run = wandb.init(
    # set the wandb project where this run will be logged
    dir=os.getcwd(),
    project="Entropy_PV",
    entity = "rock-and-roll",
    save_code=True,
    name = f"{ipynbname.name()}_PV_entropy_{quant_name}_{args.entropy_reg=}_{args.code_lr=}_{args.max_code_change_per_step=}_{args.code_trust_ratio=}",
    settings=wandb.Settings(code_dir="."),
    config=a
)

[34m[1mwandb[0m: Currently logged in as: [33mvahe1994[0m ([33mrock-and-roll[0m). Use [1m`wandb login --relogin`[0m to force relogin




In [7]:
# train_data = get_loaders(
#     args.dataset,
#     nsamples=args.total_nsamples,
#     seed=args.seed,
#     model_path=args.base_model,
#     seqlen=args.model_seqlen,
# )
train_data = torch.load("/extra_disk_1/vahe1994/AQ/red_paj_samples.pt")

In [8]:
base_model = get_model(args.base_model, None, args.dtype, args.device_map)
if not args.device_map:
    base_model = base_model.to(device)

quantized_model = load_quantized_model_with_old_pickle(
    args.base_model, args.quant_model, dtype=args.dtype, device_map=args.device_map)
if not args.device_map:
    quantized_model = quantized_model.to(device)
quantized_model = quantized_model.to(args.training_dtype)

Loading pretrained model ...
Model loaded sucсessfully ...
Loading pretrained model ...
Model loaded sucсessfully ...
Initializing model with random weights...
Loading quantized model ...
Model loaded sucсessfully ...
found 154 quantized weight matrices




In [9]:
from src.pv_utils import create_dequantized_model
dequantized_model, master_parameters = create_dequantized_model(
    quantized_model, reuse_non_quantized=True, dequantized_dtype=args.autocast_dtype
)
for param in dequantized_model.parameters():
    param.data = param.data.to(args.autocast_dtype)
    
if args.gradient_checkpointing:
    quantized_model.gradient_checkpointing_enable()
    quantized_model.enable_input_require_grads()
    for module in quantized_model.modules():
        if isinstance(module, QuantizedLinear):
            module.use_checkpoint = True
    dequantized_model.gradient_checkpointing_enable()
    dequantized_model.enable_input_require_grads()


In [10]:
from src.pv_optimizer import StraightThroughAdamW

optimizer = StraightThroughAdamW(
    named_dequantized_params=dict(dequantized_model.named_parameters()),
    named_quantized_params=master_parameters,
    
    update_codes=dict(lr=args.code_lr, betas=args.code_betas),
    update_codebooks_and_scales=dict(lr=args.codebook_lr, betas=args.codebook_betas),
    update_non_quantized_parameters=dict(lr=args.codebook_lr, betas=args.codebook_betas),
    code_trust_ratio=args.code_trust_ratio,
    beam_size=args.beam_size,
    max_code_change_per_step=args.max_code_change_per_step,
    stochastic_rounding_tau=args.stochastic_rounding_tau,
    straight_through_buffer_dtype=torch.float32,
    entropy_reg=args.entropy_reg,
    code_selection_temperature=args.code_selection_temperature,
    wandb = run
)

In [11]:
POINTER = 0
def next_train_data():
    global POINTER
    batch = []
    for i in range(args.step_nsamples):
        batch.append(train_data[POINTER % len(train_data)])
        POINTER += 1
    return batch


def _run_one_step(args, base_model, dequantized_model, optimizer, train_data, **kwargs):
    optimizer.zero_grad(set_to_none=True)
    with tqdm(train_data, desc="V step") as progress:

        total_loss = 0.0
        for i, batch in enumerate(progress):
            batch = torch.as_tensor(batch, device=device)
            with torch.no_grad():
                teacher_logits = base_model(batch).logits
            student_logits = dequantized_model(batch).logits  # forward accumulates XTX statistics
            loss = kl_div(student_logits, teacher_logits)
            (loss / len(train_data)).backward()  # backward accumulates gradient
            total_loss = loss.item() / (i + 1) + total_loss * i / (i + 1)
            progress.desc = f"V step: accumulating gradients, loss = {total_loss:.9f}"
            del student_logits, teacher_logits, loss
    optimizer.step(**kwargs)
    optimizer.zero_grad(set_to_none=True)  # reset statistics for the next step
    return total_loss


def kl_div(student_hiddens, teacher_hiddens):
    C = student_hiddens.shape[-1]  # num classes
    return F.kl_div(
        input=F.log_softmax(student_hiddens.view(-1, C), dim=-1),
        target=F.log_softmax(teacher_hiddens.view(-1, C), dim=-1),
        log_target=True,
        reduction="batchmean",
    )


In [12]:
eval_data = get_loaders(
    'wikitext2',
    seed=args.seed,
    model_path=args.base_model,
    seqlen=args.model_seqlen,
    eval_mode=True,
)

@torch.inference_mode()
def eval_ppl_naive(model, eval_data):
    eval_inps = [
        eval_data[:, start: start + args.model_seqlen] for start in range(0, eval_data.shape[1], args.model_seqlen)
    ]
    total_tokens = 0
    nlls = []
    for input_ids in tqdm(eval_inps):
        input_ids = input_ids.to(device)
        lm_logits = model(input_ids).logits

        shift_logits = lm_logits[:, :-1, :].contiguous()
        shift_labels = input_ids[:, 1:]
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        neg_log_likelihood = loss.float() * args.model_seqlen
        nlls.append(neg_log_likelihood)
        total_tokens += shift_labels.numel()
    ppl = torch.exp(torch.stack(nlls).sum() / total_tokens)
    return ppl

Loaded data from wikitext2; len(data)=1 sequences


In [13]:
# optimizer.verbose = True
for i in range(5000):
    print("STEP", i)
    if i % 5 == 0:
        ppl_eval_score = eval_ppl_naive(dequantized_model, eval_data).item()
        print("wikitext2 ppl (naive)", ppl_eval_score)
#         run.log({"wikitext2 ppl (naive)": ppl_eval_score},step=i)
    v_step_train_loss = _run_one_step(args, base_model, dequantized_model, optimizer, next_train_data())
    print("train loss:", v_step_train_loss)
    run.log({"train loss": v_step_train_loss, "wikitext2 ppl (naive)": ppl_eval_score},step=i+1)


STEP 0


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.341968536376953


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.836497523567893
train loss: 0.0682287216186524
STEP 1


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.787444031083739
train loss: 0.06627416610717766
STEP 2


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.753761260540454
train loss: 0.06787681579589842
STEP 3


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.734296464300774
train loss: 0.07246971130371098
STEP 4


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.716225630277163
train loss: 0.06946063041687006
STEP 5


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.300655364990234


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.700725673081038
train loss: 0.06352281570434572
STEP 6


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.686971193784244
train loss: 0.06632280349731456
STEP 7


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.673471724832213
train loss: 0.06321048736572271
STEP 8


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.661000268799919
train loss: 0.0703411102294923
STEP 9


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.650068264502984
train loss: 0.06414985656738288
STEP 10


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.294114112854004


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.634480996565385
train loss: 0.058072566986084
STEP 11


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.622811949098265
train loss: 0.058865308761596714
STEP 12


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.6108430577563
train loss: 0.062080860137939405
STEP 13


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.603673882298656
train loss: 0.06548571586608896
STEP 14


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.59579092341584
train loss: 0.06406593322753908
STEP 15


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.303272247314453


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.587629784237254
train loss: 0.05910801887512206
STEP 16


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.580306396855937
train loss: 0.06219053268432604
STEP 17


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.572256356090694
train loss: 0.059674501419067334
STEP 18


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.565696157418289
train loss: 0.06643199920654295
STEP 19


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.55884525683019
train loss: 0.06110024452209475
STEP 20


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.302400588989258


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.548418521881104
train loss: 0.05575084686279293
STEP 21


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.539850414573372
train loss: 0.05610799789428714
STEP 22


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.531354182726377
train loss: 0.059133529663085944
STEP 23


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.525535547888124
train loss: 0.06176781654357912
STEP 24


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.519564317418384
train loss: 0.06077098846435554
STEP 25


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.312439918518066


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.512186216069507
train loss: 0.05625820159912107
STEP 26


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.505987896547689
train loss: 0.05909013748168944
STEP 27


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.498767878328051
train loss: 0.05707573890686038
STEP 28


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.493102539669383
train loss: 0.06327104568481445
STEP 29


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.486827196238877
train loss: 0.05847620964050294
STEP 30


  0%|          | 0/334 [00:00<?, ?it/s]

AVG entropy(not correct): 11.456333508739224
train loss: 0.05920791625976563
STEP 34


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.450537658357002
train loss: 0.058113336563110345
STEP 35


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.336711883544922


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.44328309498824
train loss: 0.05430722236633298
STEP 36


V step:   0%|          | 0/256 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



AVG entropy(not correct): 11.357478454515531
train loss: 0.05385541915893554
STEP 48


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.351000914325962
train loss: 0.05883693695068356
STEP 49


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.34366753039422
train loss: 0.05533051490783696
STEP 50


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.372032165527344


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.333583671551246
train loss: 0.05228805541992186
STEP 51


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.324795269346856
train loss: 0.05220472812652595
STEP 52


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.316801412539048
train loss: 0.054686307907104485
STEP 53


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.310864566982566
train loss: 0.055930852890014655
STEP 54


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.304773949183426
train loss: 0.05504751205444334
STEP 55


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.383028030395508


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.296050408443847
train loss: 0.05217099189758303
STEP 56


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.2895771373402
train loss: 0.05405092239379883
STEP 57


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.281719813099155
train loss: 0.05297446250915526
STEP 58


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.275588824377431
train loss: 0.05759191513061526
STEP 59


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.268344926369654
train loss: 0.054498195648193345
STEP 60


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.380387306213379


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.259831697135777
train loss: 0.05198383331298829
STEP 61


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.251745244125267
train loss: 0.05179178714752198
STEP 62


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.244373007254167
train loss: 0.05412602424621584
STEP 63


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.238860014197114
train loss: 0.05507659912109381
STEP 64


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.233458646706172
train loss: 0.054129123687744106
STEP 65


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.38632869720459


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.22573727059674
train loss: 0.0517113208770752
STEP 66


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.219516343110568
train loss: 0.05326318740844722
STEP 67


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.211917981698916
train loss: 0.05244588851928711
STEP 68


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.206292184916409
train loss: 0.05673456192016598
STEP 69


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.199693912035459
train loss: 0.05378937721252441
STEP 70


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.38632869720459


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.191277806635027
train loss: 0.051712036132812486
STEP 71


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.183617436266564
train loss: 0.05144119262695309
STEP 72


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.177017184821041
train loss: 0.053826332092285115
STEP 73


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.171734112423735
train loss: 0.054486989974975586
STEP 74


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.16693785593107
train loss: 0.053730964660644545
STEP 75


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.380826950073242


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.160753623231665
train loss: 0.051517248153686516
STEP 76


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.155190456997264
train loss: 0.052867889404296854
STEP 77


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.148332495193976
train loss: 0.05214166641235354
STEP 78


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.143033811798343
train loss: 0.056145668029785156
STEP 79


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.136988630542508
train loss: 0.05347609519958501
STEP 80


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.387870788574219


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.130043782197037
train loss: 0.05164766311645509
STEP 81


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.12335298897384
train loss: 0.05125093460083002
STEP 82


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.117137740184734
train loss: 0.053523540496826144
STEP 83


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.112061056997868
train loss: 0.054040908813476556
STEP 84


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.094268875462669
train loss: 0.052485227584838895
STEP 87


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.087587050803297
train loss: 0.052059650421142516
STEP 88


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.082476738211396
train loss: 0.05571031570434573
STEP 89


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.076538381638464
train loss: 0.0532722473144531
STEP 90


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.407485008239746


V step:   0%|          | 0/256 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



AVG entropy(not correct): 11.0294001187597
train loss: 0.052009582519531236
STEP 98


V step:   0%|          | 0/256 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



AVG entropy(not correct): 11.012532707545665
train loss: 0.05187845230102536
STEP 101


V step:   0%|          | 0/256 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



AVG entropy(not correct): 11.006412257621815
train loss: 0.05126881599426271
STEP 102


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 11.000609243070924
train loss: 0.05353665351867671
STEP 103


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.995949697184873
train loss: 0.053774595260620124
STEP 104


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.99118942564184
train loss: 0.05284261703491211
STEP 105


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.430898666381836


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.985397671724295
train loss: 0.05131173133850098
STEP 106


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.980260792490723
train loss: 0.05224490165710446
STEP 107


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.974208893714014
train loss: 0.05211257934570312
STEP 108


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.969726030315671
train loss: 0.05549669265747071
STEP 109


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.96444815706897
train loss: 0.05301833152770999
STEP 110


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.440855979919434


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.958436148507255
train loss: 0.05205154418945316
STEP 111


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.952791071944423
train loss: 0.05138242244720459
STEP 112


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.94752738731248
train loss: 0.053606033325195264
STEP 113


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.943471684858396
train loss: 0.05377602577209468
STEP 114


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.939038926904852
train loss: 0.05282044410705567
STEP 115


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.465462684631348


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.933755421406262
train loss: 0.0513637065887451
STEP 116


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.928740208025102
train loss: 0.05222773551940913
STEP 117


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.923284073154647
train loss: 0.05212020874023433
STEP 118


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.919261623125571
train loss: 0.05549812316894534
STEP 119


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.91446958308096
train loss: 0.05308294296264649
STEP 120


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.47590160369873


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.908913285701306
train loss: 0.052261829376220696
STEP 121


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.903864857825367
train loss: 0.0515758991241455
STEP 122


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.89935490372893
train loss: 0.053729772567749
STEP 123


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.895472312128389
train loss: 0.053804397583007826
STEP 124


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.882864313079166
train loss: 0.05238080024719237
STEP 127


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.878383050491284
train loss: 0.052263259887695306
STEP 128


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.8745771936782
train loss: 0.05533075332641603
STEP 129


V step:   0%|          | 0/256 [00:00<?, ?it/s]

AVG entropy(not correct): 10.870225260010013
train loss: 0.05309629440307616
STEP 130


  0%|          | 0/334 [00:00<?, ?it/s]

wikitext2 ppl (naive) 9.4810152053833


V step:   0%|          | 0/256 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [15]:
print(1)