## Reproduction of the description length calculation

Reference
- Ayonrinde, Kola, Michael T. Pearce, and Lee Sharkey. "Interpretability as Compression: Reconsidering SAE Explanations of Neural Activations." NeurIPS 2024 Workshop on Scientific Methods for Understanding Deep Learning, https://openreview.net/pdf?id=OvmW8HnGzK

In [1]:
import math

def compute_description_length(L0, B, D):
    """Return the description length, in bits, as ``L0 * (B + log2(D))``.

    Args:
        L0: Sparsity (number of non-zero entries).
        B: Bits used for each non-zero float.
        D: Dictionary size; must be positive because ``math.log2`` is applied to it.

    Returns:
        The description length in bits.
    """
    # Each non-zero entry costs B bits plus log2(D) bits
    bits_per_nonzero = B + math.log2(D)
    return L0 * bits_per_nonzero

# GPT-2 sanity check: evaluate the formula once on a single known configuration
# (sparsity, bits per non-zero float, dictionary size)
L0, B, D = 65, 7, 25000

dl_bits = compute_description_length(L0=L0, B=B, D=D)
print(f"Description Length: {dl_bits:.2f} bits")

Description Length: 1404.63 bits


In [2]:
import pandas as pd

def load_layer_csv(filepath, layer_number):
    """Load one exported runs CSV and annotate it with layer and run settings.

    The file is read with ``pd.read_csv`` and three columns are added:
    ``layer`` (``layer_number`` on every row), plus ``activation`` and ``k``,
    both parsed from the ``Name`` column. Each ``Name`` is split on
    underscores; the third-from-last token becomes ``activation`` and the
    second-from-last token is converted to ``int`` and becomes ``k``.

    Args:
        filepath: Location of the CSV file, passed straight to ``pd.read_csv``.
        layer_number: Value stored in the new ``layer`` column.

    Returns:
        The loaded DataFrame with ``layer``, ``activation`` and ``k`` added.

    Raises:
        IndexError: If a ``Name`` has fewer than three underscore-separated tokens.
        ValueError: If the second-from-last token of a ``Name`` is not an integer.
    """
    df = pd.read_csv(filepath)
    df['layer'] = layer_number

    def parse_name(name):
        tokens = name.split('_')
        return tokens[-3], int(tokens[-2])

    # Parse each name once into a plain tuple rather than building a pd.Series
    # per row: this is cheaper, and it also works for a file that has a header
    # but no data rows (the per-row Series approach fails on that input).
    parsed = [parse_name(name) for name in df['Name']]
    df['activation'] = [activation for activation, _ in parsed]
    df['k'] = [k for _, k in parsed]
    return df

# Load the exported runs for layers 6, 7 and 8 (one CSV file per layer)
df6, df7, df8 = (
    load_layer_csv(export_path, layer_idx)
    for layer_idx, export_path in (
        (6, "wandb_export_2025-03-27T13_59_12.395-05_00.csv"),
        (7, "wandb_export_2025-03-27T13_59_41.831-05_00.csv"),
        (8, "wandb_export_2025-03-27T14_00_03.325-05_00.csv"),
    )
)

In [3]:
def get_best_l0_per_activation(df, layer_num):
    """Pick, for every activation, the row with the smallest ``l2_loss``.

    Args:
        df: DataFrame with ``activation``, ``l0_norm`` and ``l2_loss`` columns.
        layer_num: Value written into the leading ``layer`` column of the result.

    Returns:
        A DataFrame with columns ``layer``, ``activation``, ``l0_norm`` and
        ``l2_loss``, one row per activation, sorted by ``activation``. Rows keep
        their index labels from ``df``.
    """
    # Index label of the lowest-loss row within each activation group
    lowest_loss_rows = df.groupby('activation')['l2_loss'].idxmin()
    kept_columns = ['activation', 'l0_norm', 'l2_loss']
    best = df.loc[lowest_loss_rows, kept_columns].sort_values(by='activation')
    # Put the layer number in the first column
    best.insert(0, 'layer', layer_num)
    return best

# Lowest-loss run per activation for each layer, then one combined table
best_l0_df6, best_l0_df7, best_l0_df8 = (
    get_best_l0_per_activation(layer_df, layer_idx)
    for layer_idx, layer_df in ((6, df6), (7, df7), (8, df8))
)

summary_df = pd.concat(
    [best_l0_df6, best_l0_df7, best_l0_df8],
    ignore_index=True,
)
summary_df


Unnamed: 0,layer,activation,l0_norm,l2_loss
0,6,batchtopk,1024.0,0.000165
1,6,topafa,2037.287598,0.000164
2,6,topk,1023.493408,0.000174
3,7,batchtopk,2957.234619,0.000181
4,7,topafa,1930.104248,0.000195
5,7,topk,2957.234619,0.000181
6,8,batchtopk,1024.0,0.000187
7,8,topafa,2195.765137,0.000166
8,8,topk,1023.324707,0.000177


In [4]:
# Encoding parameters: bits per non-zero float, dictionary size
B, D = 7, 12288

# One description length per selected run, computed from that run's L0
summary_df['description_length'] = [
    compute_description_length(l0_value, B, D)
    for l0_value in summary_df['l0_norm']
]

# Relabel the loss and description-length columns for the report and drop L0
final_df = summary_df.rename(
    columns={
        'l2_loss': 'tolerance',
        'description_length': 'MDL(tolerance)',
    }
).drop(columns=['l0_norm'])

# table for OpenReview
from tabulate import tabulate
print(
    tabulate(final_df, headers="keys", tablefmt="github", showindex=False)
)

|   layer | activation   |   tolerance |   MDL(tolerance) |
|---------|--------------|-------------|------------------|
|       6 | batchtopk    | 0.000165076 |          21079   |
|       6 | topafa       | 0.000164111 |          41937.5 |
|       6 | topk         | 0.000173874 |          21068.6 |
|       7 | batchtopk    | 0.000181183 |          60874.6 |
|       7 | topafa       | 0.000195328 |          39731.1 |
|       7 | topk         | 0.000181183 |          60874.6 |
|       8 | batchtopk    | 0.000187346 |          21079   |
|       8 | topafa       | 0.000165902 |          45199.7 |
|       8 | topk         | 0.000176545 |          21065.1 |
