In [1]:
import torch
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, LlamaForCausalLM, LlamaConfig
from datasets import load_dataset
import evaluate
import wandb

import numpy as np
import pandas as pd
from tqdm import tqdm
import scipy

import os
import argparse
import json
import hashlib
import random
import subprocess

import vllm
from vllm import LLM

import pickle

  from .autonotebook import tqdm as notebook_tqdm


INFO 03-28 11:29:11 [__init__.py:239] Automatically detected platform cuda.


2025-03-28 11:29:11,868	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
def correlate(tokens,kgram_index,k_max,shuffle_order):
    """Map generated tokens to the average shuffled position of matching k-grams.

    For every start position the longest context (``k_max`` tokens) is tried
    first, backing off one token at a time. The first context length for which
    ``kgram_index`` holds at least one entry whose ``'next_token'`` equals the
    token that actually follows the context contributes a single value: the
    mean of ``np.argsort(shuffle_order)`` taken at those entries' ``'idx'``.
    Positions for which no context length produces a match contribute nothing.

    Args:
        tokens: Sequence of token ids. Anything other than a tuple/str/bytes
            (e.g. a list or ndarray) is converted to a tuple so that its
            slices are hashable.
        kgram_index: Mapping from a context (a slice of ``tokens``) to an
            iterable of dicts with the keys ``'next_token'`` and ``'idx'``.
        k_max: Longest context length to try. Only start positions
            ``0 .. len(tokens) - k_max - 1`` are scanned.
        shuffle_order: 1-D ordering; ``'idx'`` values index positionally into
            its argsort.

    Returns:
        A list with at most one averaged value per scanned start position.
    """
    # Slices of a list/ndarray are unhashable and would raise TypeError on the
    # mapping lookup below; tuples, str and bytes already slice to hashables.
    if not isinstance(tokens, (tuple, str, bytes)):
        tokens = tuple(tokens)

    # np.argsort on a pandas Series returns a Series, whose [] is label-based.
    # Force a plain ndarray so info['idx'] is always a positional lookup.
    shuffled_idx = np.asarray(np.argsort(shuffle_order))

    results = []
    for pos in range(len(tokens) - k_max):
        # Longest context first; stop at the first length that yields a match.
        for k in range(k_max, 0, -1):
            prefix = tokens[pos:pos + k]
            if prefix not in kgram_index:
                continue

            actual_next_token = tokens[pos + k]
            matched = [
                shuffled_idx[info['idx']]
                for info in kgram_index[prefix]
                if info['next_token'] == actual_next_token
            ]
            if matched:
                results.append(sum(matched) / len(matched))
                break

    return results

def generate_and_correlate_text(prompts,llm,kgram_index,k,shuffle_order):
    """Generate one batch of completions and pool their ``correlate`` scores.

    ``llm.generate(prompts)`` is called once. For each returned item the token
    ids of its first output (``.outputs[0].token_ids``) are passed to
    ``correlate`` with ``k`` as the maximum context length.

    Returns:
        A single flat list with the scores of all completions, in generation
        order.
    """
    full_results = []
    for request_output in llm.generate(prompts):
        first_completion = request_output.outputs[0]
        full_results.extend(
            correlate(first_completion.token_ids, kgram_index, k, shuffle_order)
        )
    return full_results

def eval_tiny(model_path, eval_texts):
    """Return the log of the perplexity metric's scores for ``eval_texts``.

    Loads the ``"perplexity"`` metric through ``evaluate``, computes it with
    ``model_path`` as the model id (start token added), and returns the
    natural log of the ``'perplexities'`` entry of the result.
    """
    metric = evaluate.load("perplexity", module_type="metric")
    scores = metric.compute(
        predictions=eval_texts,
        model_id=model_path,
        add_start_token=True,
    )
    return np.log(scores['perplexities'])

In [3]:
# Checkpoint selection: the model under study, the control model it is
# compared against, and the epoch whose checkpoints are loaded for both.
model_id = 1
control_id = 0
model_epoch = 9

# All model artefacts (shuffle-order CSV and per-epoch checkpoints) live under
# a directory keyed by MODEL_HASH.
MODEL_HASH = 'e4c9c1ac'
df_path = f'./kgram-analysis/models/tiny_ref_model_{MODEL_HASH}/tinystories.csv'
model_path = f'./kgram-analysis/models/tiny_ref_model_{MODEL_HASH}/epoch-{model_epoch}-index-{model_id}'
control_path = f'./kgram-analysis/models/tiny_ref_model_{MODEL_HASH}/epoch-{model_epoch}-index-{control_id}'

# Pickled k-gram index; k selects the index file and is also passed to
# correlate() as the longest context length to try.
INDEX_HASH = '6f53bb4b'
k = 8
index_path = f'./kgram-analysis/indexes/kgram_index_{INDEX_HASH}/kgram_index_k{k}.pkl'

# Number of training texts to take, and prompt length. PROMPT_LEN is applied
# as a string slice further down, i.e. it counts characters, not tokens.
N_TRAIN_SAMPLES = 10000
PROMPT_LEN = 20

In [4]:
# Take the first N_TRAIN_SAMPLES training stories and drop empty ones.
dataset = load_dataset("roneneldan/TinyStories")

texts = [
    story
    for story in dataset["train"]["text"][:N_TRAIN_SAMPLES]
    if story != ""
]

In [5]:
# Rank-correlate each text's position in the epoch-`order_epoch` shuffle with
# the gap in log-perplexity between the studied model and the control model.
order_epoch = 0
df = pd.read_csv(df_path)

model_pplx = eval_tiny(model_path, texts)
ctrl_pplx = eval_tiny(control_path, texts)

stat = scipy.stats.spearmanr(
    np.argsort(df[f'order-{model_id}-epoch-{order_epoch}']),
    model_pplx - ctrl_pplx,
)
print(stat)

100%|████████████████████████████████████████████████████████████████| 625/625 [01:26<00:00,  7.21it/s]
100%|████████████████████████████████████████████████████████████████| 625/625 [01:26<00:00,  7.21it/s]


SignificanceResult(statistic=-0.36053098296530983, pvalue=1.108995454405034e-304)


In [6]:
# Load the studied checkpoint into a vLLM engine for text generation.
# No seed is passed here, so sampled generations are presumably not
# reproducible across runs — NOTE(review): confirm whether that is intended.
llm = LLM(model=model_path, task="generate")

INFO 03-28 11:32:46 [config.py:2610] Downcasting torch.float32 to torch.float16.
INFO 03-28 11:32:57 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.2) with config: model='./kgram-analysis/models/tiny_ref_model_e4c9c1ac/epoch-9-index-1', speculative_config=None, tokenizer='./kgram-analysis/models/tiny_ref_model_e4c9c1ac/epoch-9-index-1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=512, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, serv

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 15.68it/s]


INFO 03-28 11:32:59 [loader.py:447] Loading weights took 0.07 seconds
INFO 03-28 11:32:59 [model_runner.py:1146] Model loading took 0.0362 GB and 0.106923 seconds





INFO 03-28 11:32:59 [worker.py:267] Memory profiling takes 0.34 seconds
INFO 03-28 11:32:59 [worker.py:267] the current vLLM instance can use total_gpu_memory (23.64GiB) x gpu_memory_utilization (0.90) = 21.28GiB
INFO 03-28 11:32:59 [worker.py:267] model weights take 0.04GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 0.29GiB; the rest of the memory reserved for KV Cache is 20.95GiB.
INFO 03-28 11:33:00 [executor_base.py:111] # cuda blocks: 343232, # CPU blocks: 65536
INFO 03-28 11:33:00 [executor_base.py:116] Maximum concurrency for 512 tokens per request: 10726.00x
INFO 03-28 11:33:02 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_n

Capturing CUDA graph shapes: 100%|█████████████████████████████████████| 35/35 [00:16<00:00,  2.12it/s]

INFO 03-28 11:33:18 [model_runner.py:1570] Graph capturing finished in 17 secs, took 0.15 GiB
INFO 03-28 11:33:18 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 19.38 seconds





In [26]:
# NOTE: pickle.load can execute arbitrary code from the file. Only load an
# index file that was produced locally / by a trusted source.
# The context manager closes the handle that a bare open() would leak.
with open(index_path,'rb') as index_file:
    kgram_index = pickle.load(index_file)
shuffle_order = df[f'order-{model_id}-epoch-{order_epoch}']
# String slice: each prompt is the first PROMPT_LEN characters of a text.
prompts = [text[:PROMPT_LEN] for text in texts]

# Repeat generation n_reps times and average the per-repetition mean score
# (each repetition carries equal weight, regardless of how many scores it has).
n_reps = 10
avg_idx = 0
for rep in range(n_reps):
    print(f"rep: {rep}")
    results = generate_and_correlate_text(prompts,llm,kgram_index,k,shuffle_order)
    print(len(results))
    avg_idx += np.mean(results)/n_reps

print(avg_idx)

rep: 0


Processed prompts: 100%|█| 10000/10000 [00:13<00:00, 746.03it/s, est. speed input: 5068.38 toks/s, outp


1574
rep: 1


Processed prompts: 100%|█| 10000/10000 [00:13<00:00, 746.98it/s, est. speed input: 5074.83 toks/s, outp


1382
rep: 2


Processed prompts: 100%|█| 10000/10000 [00:15<00:00, 648.63it/s, est. speed input: 4406.64 toks/s, outp


1478
rep: 3


Processed prompts: 100%|█| 10000/10000 [00:13<00:00, 746.08it/s, est. speed input: 5068.69 toks/s, outp


1442
rep: 4


Processed prompts: 100%|█| 10000/10000 [00:15<00:00, 646.87it/s, est. speed input: 4394.70 toks/s, outp


1438
rep: 5


Processed prompts: 100%|█| 10000/10000 [00:15<00:00, 648.57it/s, est. speed input: 4406.19 toks/s, outp


1437
rep: 6


Processed prompts: 100%|█| 10000/10000 [00:13<00:00, 746.41it/s, est. speed input: 5070.95 toks/s, outp


1462
rep: 7


Processed prompts: 100%|█| 10000/10000 [00:15<00:00, 646.20it/s, est. speed input: 4390.11 toks/s, outp


1499
rep: 8


Processed prompts: 100%|█| 10000/10000 [00:13<00:00, 745.38it/s, est. speed input: 5063.96 toks/s, outp


1371
rep: 9


Processed prompts: 100%|█| 10000/10000 [00:13<00:00, 744.66it/s, est. speed input: 5059.02 toks/s, outp


1503
5103.451662218861


In [56]:
# Mean score of whatever `results` currently holds. The repetition loop
# rebinds `results` on every iteration, so after that loop this is the last
# repetition only, not the average over all repetitions (that is `avg_idx`).
# NOTE(review): the execution counts of these cells are out of order, so the
# displayed value may come from a different run than the loop output above.
np.mean(results)

4980.459436988101

In [22]:
# Chance baseline: mean of positions drawn uniformly from [0, 10000), the
# range matching N_TRAIN_SAMPLES. The bounds are spelled out because the first
# positional argument of np.random.uniform is `low`; passing only 10000 leaves
# `high` at its default of 1.0, which is low > high (undefined per numpy docs).
np.mean(np.random.uniform(low=0, high=10000, size=100000))

5002.20307263202