# S1 — Minimal: Dense → Masked → CSR
Simple, readable baseline with robust perplexity and measured sparsity.

In [1]:
#For Google Colab
%cd /content/sparsify-min/notebooks

/content/sparsify-min/notebooks


In [2]:
import os, sys, warnings, pandas as pd, torch
from transformers import AutoModelForCausalLM, AutoTokenizer
sys.path.append('..'); sys.path.append('../src')
from src.eval.metrics import params_size_and_sparsity, eval_ppl_causal
from src.eval.utils import measure_latency_ms
from src.eval.csvlog import append_row
from src.eval.plotting import bar_plot
from src.pruning.policies import apply_global_magnitude_pruning_cpu_safe, select_prunable_linears
from src.pruning.pipeline import freeze_pruning_, convert_linear_weights_to_csr_
from src.wrappers.linear_csr import LinearCSRForward
warnings.filterwarnings('ignore', message='.*Sparse CSR tensor support is in beta state.*')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device:', device)
RESULTS_DIR = os.path.join('..','results'); CSV_PATH = os.path.join(RESULTS_DIR,'S1_minimal.csv')
os.makedirs(RESULTS_DIR, exist_ok=True)
pd.DataFrame(columns=["setup","size_mb","sparsity","latency_ms","perplexity"]).to_csv(CSV_PATH, index=False)

def load_fresh():
    """
    Load exactly one model depending on the device:
      - CUDA  -> EleutherAI/pythia-410m (fp16)
      - CPU   -> facebook/opt-125m     (fp32)
    """
    if device == "cuda":
        model_name = "EleutherAI/pythia-410m"
        torch_dtype = torch.float16
    else:
        model_name = "facebook/opt-125m"
        torch_dtype = None  # use default (fp32)
    tok = AutoTokenizer.from_pretrained(model_name)
    tok.pad_token = tok.eos_token
    kwargs = {}
    if torch_dtype is not None:
        kwargs["torch_dtype"] = torch_dtype
    mdl = AutoModelForCausalLM.from_pretrained(model_name, **kwargs).to(device).eval()
    print(f"Loaded: {model_name}")
    return mdl, tok, model_name

def latency_fn(model, tokenizer):
    def f(L=128, B=1):
        inp = torch.randint(0, tokenizer.vocab_size, (B, L), device=device)
        att = torch.ones(B, L, device=device, dtype=torch.long)
        return model(input_ids=inp, attention_mask=att, labels=inp).logits
    return f
SAMPLE_TEXTS = [
    "In a quiet valley, the river bends slowly around the last farm before the hills.",
    "Sparse pruning zeroes weights but needs a sparse kernel to speed up compute.",
    "A small batch size can distort latency because of cache and warmup effects.",
    "Causal LM perplexity is averaged per token over sliding blocks.",
    "Version 1.2.0 fixes: stability on CPU, deterministic seeds, better logging.",
    "“Hello?” — “Hi; can you hear me?” — “Loud and clear.”",
    "HTTP 429 means rate limiting; use exponential backoff with jitter.",
    "Compute follows memory: fewer bytes moved often means fewer milliseconds.",
    "Numbers: 3.14159, 2.71828, 0.57721 show up in odd places.",
    "Keep the same corpus when comparing Dense vs Masked vs CSR.",
    "If latency jumps, check power limits, thermal throttling, governors.",
    "We log mean, median, and p95 latency because tails matter.",
    "One batch isn’t enough: run multiple iterations with warmup.",
    "Tiny masking mistakes can create NaNs; clamp logits if needed.",
    "When in doubt, profile with both synthetic and real inputs."
]
# Optional: virtually increase the size
# SAMPLE_TEXTS = SAMPLE_TEXTS * 20


Device: cuda


## 1) Dense baseline

In [3]:
model, tok, name = load_fresh()
stats = params_size_and_sparsity(model)
ppl   = eval_ppl_causal(model, tok, SAMPLE_TEXTS, device)
lat   = measure_latency_ms(latency_fn(model, tok), 128, 1, warmup=3, iters=10)
append_row(CSV_PATH, setup='Dense', size_mb=stats['size_mb'], sparsity=stats['sparsity'], latency_ms=lat, perplexity=ppl)
stats, ppl, lat

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/911M [00:00<?, ?B/s]

Loaded: EleutherAI/pythia-410m


  df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)


({'nonzero': 405333594,
  'total': 405334016,
  'sparsity': 1.0411166675439176e-06,
  'size_mb': 773.11328125},
 189.25483576970882,
 22.550119899995025)

## 2) Masked pruning (30%) — dense execution

In [4]:
SP_MASK = 0.30
model, tok, name = load_fresh()
layers = select_prunable_linears(model, blacklist=("lm_head",))
apply_global_magnitude_pruning_cpu_safe(layers, amount=SP_MASK)
stats = params_size_and_sparsity(model)
ppl   = eval_ppl_causal(model, tok, SAMPLE_TEXTS, device)
lat   = measure_latency_ms(latency_fn(model, tok), 128, 1, warmup=3, iters=10)
append_row(CSV_PATH, setup=f'Masked{int(SP_MASK*100)}', size_mb=stats['size_mb'], sparsity=stats['sparsity'], latency_ms=lat, perplexity=ppl)
stats, ppl, lat

Loaded: EleutherAI/pythia-410m


({'nonzero': 405333594,
  'total': 405334016,
  'sparsity': 1.0411166675439176e-06,
  'size_mb': 773.11328125},
 353.49905823886434,
 28.417835000004743)

## 3) CSR execution (30%) — real sparse kernels

In [5]:
SP_CSR = 0.30
model, tok, name = load_fresh()
layers = select_prunable_linears(model, blacklist=("lm_head",))
apply_global_magnitude_pruning_cpu_safe(layers, amount=SP_CSR)
freeze_pruning_(layers); convert_linear_weights_to_csr_(layers)
swapped = 0
def find_parent(root, child):
    for _, mod in root.named_modules():
        for cn, cc in mod.named_children():
            if cc is child: return mod, cn
    raise RuntimeError('Parent not found')
for lin in layers:
    if swapped >= 4: break
    parent, attr = find_parent(model, lin)
    setattr(parent, attr, LinearCSRForward(lin.weight.detach(), lin.bias.detach() if lin.bias is not None else None).to(device))
    swapped += 1
stats = params_size_and_sparsity(model)
ppl   = eval_ppl_causal(model, tok, SAMPLE_TEXTS, device)
lat   = measure_latency_ms(latency_fn(model, tok), 128, 1, warmup=3, iters=10)
append_row(CSV_PATH, setup=f'CSR{int(SP_CSR*100)}', size_mb=stats['size_mb'], sparsity=stats['sparsity'], latency_ms=lat, perplexity=ppl)
stats, ppl, lat

Loaded: EleutherAI/pythia-410m


  out = torch.matmul(W, x.T).T               # [out,b] -> [b,out]


({'nonzero': 290704460,
  'total': 392741888,
  'sparsity': 0.25980785629874037,
  'size_mb': 749.095703125},
 353.55637379023193,
 24.141621600006147)

## 4) Plots

In [6]:
df = pd.read_csv(CSV_PATH); display(df)
bar_plot(df, 'setup', 'size_mb', 'Model size (MB)', 'size_vs_sparsity.png', RESULTS_DIR, y_min=700)
bar_plot(df, 'setup', 'latency_ms', 'Latency (ms / forward)', 'latency_vs_sparsity.png', RESULTS_DIR)
bar_plot(df, 'setup', 'perplexity', 'Perplexity', 'ppl_vs_sparsity.png', RESULTS_DIR)


Unnamed: 0,setup,size_mb,sparsity,latency_ms,perplexity
0,Dense,773.113281,1e-06,22.55012,189.254836
1,Masked30,773.113281,1e-06,28.417835,353.499058
2,CSR30,749.095703,0.259808,24.141622,353.556374


Saved: ../results/size_vs_sparsity.png
Saved: ../results/latency_vs_sparsity.png
Saved: ../results/ppl_vs_sparsity.png
