In [1]:
import torch
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, LlamaForCausalLM, LlamaConfig
from datasets import load_dataset
import evaluate
import wandb

import numpy as np
import pandas as pd
from tqdm import tqdm
import scipy

import os
import argparse
import json
import hashlib
import random
import subprocess

import vllm
from vllm import LLM, SamplingParams

import pickle

  from .autonotebook import tqdm as notebook_tqdm


INFO 04-04 02:31:25 [__init__.py:239] Automatically detected platform cuda.


2025-04-04 02:31:25,559	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
def correlate(tokens,kgram_index,k_max,shuffle_order):
    shuffled_idx = np.argsort(shuffle_order)
    results = []
    for pos in range(len(tokens)-k_max):
        k = k_max
        while k > 0:
            prefix = tuple(tokens[pos:pos+k])
            if prefix in kgram_index:
                actual_next_token = tokens[pos+k]
                shuffled_idx_avg = 0
                num_matches = 0
                for info in kgram_index[prefix]:
                    if info['next_token'] == actual_next_token:
                        num_matches += 1
                        shuffled_idx_avg += shuffled_idx[info['idx']]
                        
                if num_matches > 0:
                    shuffled_idx_avg /= num_matches
                    results.append(shuffled_idx_avg)
                    break
            k -= 1
                    
    return results

def generate_and_correlate_text(prompts,llm,kgram_index,k,shuffle_order,sampling_params):
    generated = llm.generate(prompts,sampling_params)
    full_results = []
    for i in range(len(generated)):
        tokens = generated[i].outputs[0].token_ids
        results = correlate(tokens,kgram_index,k,shuffle_order)
        
        full_results += results
        
    return full_results

def eval_tiny(model_path, eval_texts):
    perplexity = evaluate.load("perplexity", module_type="metric")
    result = perplexity.compute(model_id=model_path,
                                add_start_token=True,
                                predictions=eval_texts)
    pplx = np.log(result['perplexities'])

    return pplx

In [3]:
model_id = 1
control_id = 0
model_epoch = 9

MODEL_HASH = 'e4c9c1ac'
df_path = f'./kgram-analysis/models/tiny_ref_model_{MODEL_HASH}/tinystories.csv'
model_path = f'./kgram-analysis/models/tiny_ref_model_{MODEL_HASH}/epoch-{model_epoch}-index-{model_id}'
control_path = f'./kgram-analysis/models/tiny_ref_model_{MODEL_HASH}/epoch-{model_epoch}-index-{control_id}'

INDEX_HASH = '9f87af1f'
k = 20
index_path = f'./kgram-analysis/indexes/kgram_index_{INDEX_HASH}/kgram_index_k{k}.pkl'

N_TRAIN_SAMPLES = 10000
PROMPT_LEN = 20

In [4]:
dataset = load_dataset("roneneldan/TinyStories")

texts = dataset["train"]["text"][:N_TRAIN_SAMPLES]
texts = [item for item in texts if item != ""]

In [8]:
df_path = f'./kgram-analysis/models/tiny_ref_model_{MODEL_HASH}/tinystories.csv'
df = pd.read_csv(df_path)

In [7]:
order_epoch = 0

model_pplx = eval_tiny(model_path,texts)
ctrl_pplx = eval_tiny(control_path,texts)

df = pd.read_csv(df_path)

stat = scipy.stats.spearmanr(np.argsort(df[f'order-{model_id}-epoch-{order_epoch}']), model_pplx-ctrl_pplx)

print(stat)

  0%|                                                                          | 0/625 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.33 GiB. GPU 0 has a total capacity of 47.54 GiB of which 1.59 GiB is free. Process 3468247 has 43.02 GiB memory in use. Including non-PyTorch memory, this process has 2.92 GiB memory in use. Of the allocated memory 2.60 GiB is allocated by PyTorch, and 14.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [5]:
llm = LLM(model=model_path, task="generate")

INFO 04-04 02:31:44 [config.py:2610] Downcasting torch.float32 to torch.float16.
INFO 04-04 02:31:51 [config.py:1697] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 04-04 02:31:52 [core.py:54] Initializing a V1 LLM engine (v0.8.2) with config: model='./kgram-analysis/models/tiny_ref_model_e4c9c1ac/epoch-9-index-1', speculative_config=None, tokenizer='./kgram-analysis/models/tiny_ref_model_e4c9c1ac/epoch-9-index-1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=512, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 49.20it/s]



INFO 04-04 02:31:54 [loader.py:447] Loading weights took 0.03 seconds
INFO 04-04 02:31:54 [gpu_model_runner.py:1186] Model loading took 0.0362 GB and 0.228102 seconds
INFO 04-04 02:31:56 [backends.py:415] Using cache directory: /sailhome/rohithk/.cache/vllm/torch_compile_cache/a916b86176/rank_0_0 for vLLM's torch.compile
INFO 04-04 02:31:56 [backends.py:425] Dynamo bytecode transform time: 2.49 s
INFO 04-04 02:31:57 [backends.py:115] Directly load the compiled graph for shape None from the cache
INFO 04-04 02:31:58 [monitor.py:33] torch.compile takes 2.49 s in total
INFO 04-04 02:31:59 [kv_cache_utils.py:566] GPU KV cache size: 10,810,144 tokens
INFO 04-04 02:31:59 [kv_cache_utils.py:569] Maximum concurrency for 512 tokens per request: 21113.56x
INFO 04-04 02:32:16 [gpu_model_runner.py:1534] Graph capturing finished in 17 secs, took 0.24 GiB
INFO 04-04 02:32:16 [core.py:151] init engine (profile, create kv cache, warmup model) took 22.53 seconds


In [22]:
np.mean(np.random.uniform(10000,size=(100000)))

5002.20307263202

In [23]:
# kgram_index = pickle.load(open(index_path,'rb'))

input_texts = dataset["train"]["text"]
input_texts = [item for item in input_texts if item != ""]

shuffle_order = df[f'order-{model_id}-epoch-{order_epoch}']
prompts = [text[:PROMPT_LEN] for text in input_texts[10000:10000+100]]
sampling_params = SamplingParams(max_tokens=50)

n_reps = 10
avg_idx = 0
for rep in range(n_reps):
    print(f"rep: {rep}")
    results = generate_and_correlate_text(prompts,llm,kgram_index,k,shuffle_order,sampling_params)
    print(len(results))
    avg_idx += np.mean(results)/n_reps

print(avg_idx)

rep: 0


Processed prompts: 100%|█| 100/100 [00:00<00:00, 1188.68it/s, est. speed input: 7914.52 toks/s, output:


2611
rep: 1


Processed prompts: 100%|█| 100/100 [00:00<00:00, 788.75it/s, est. speed input: 5249.55 toks/s, output: 


2559
rep: 2


Processed prompts: 100%|█| 100/100 [00:00<00:00, 802.82it/s, est. speed input: 5343.09 toks/s, output: 


2560
rep: 3


Processed prompts: 100%|█| 100/100 [00:00<00:00, 796.73it/s, est. speed input: 5302.67 toks/s, output: 


2548
rep: 4


Processed prompts: 100%|█| 100/100 [00:00<00:00, 796.74it/s, est. speed input: 5302.62 toks/s, output: 


2582
rep: 5


Processed prompts: 100%|█| 100/100 [00:00<00:00, 772.71it/s, est. speed input: 5142.81 toks/s, output: 


2600
rep: 6


Processed prompts: 100%|█| 100/100 [00:00<00:00, 789.25it/s, est. speed input: 5252.82 toks/s, output: 


2569
rep: 7


Processed prompts: 100%|█| 100/100 [00:00<00:00, 790.48it/s, est. speed input: 5259.64 toks/s, output: 


2572
rep: 8


Processed prompts: 100%|█| 100/100 [00:00<00:00, 790.96it/s, est. speed input: 5264.29 toks/s, output: 


2596
rep: 9


Processed prompts: 100%|█| 100/100 [00:00<00:00, 794.94it/s, est. speed input: 5290.62 toks/s, output: 


2602
5049.175325085427


In [34]:
tuple([1,2,3])

(1, 2, 3)

In [19]:
len(input_texts[100:100+100])

100