In [29]:
# 设置环境变量
import os
# os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
# 导入库
import torch
import blobfile as bf
import transformer_lens
import sparse_autoencoder
from experiments.utils import update_json_file, update_numpy_file
import pandas as pd
from datetime import datetime

In [42]:
import numpy as np
import re
# Load the model
def load_model(model_name, center_writing_weights=False):
    """Load a pretrained ``HookedTransformer`` and report where it lives.

    Args:
        model_name: Name forwarded to ``HookedTransformer.from_pretrained``.
        center_writing_weights: Forwarded unchanged to ``from_pretrained``.

    Returns:
        A ``(model, device)`` tuple, where ``device`` is the device of the
        first parameter yielded by ``model.parameters()``.
    """
    hooked_model = transformer_lens.HookedTransformer.from_pretrained(
        model_name,
        center_writing_weights=center_writing_weights,
    )
    first_parameter = next(hooked_model.parameters())
    return hooked_model, first_parameter.device

# Process the input
def process_input(model, prompt):
    """Tokenize ``prompt`` and run ``model`` on it while caching activations.

    Args:
        model: Object exposing ``to_tokens``, ``to_str_tokens`` and
            ``run_with_cache`` (a ``HookedTransformer`` in this notebook).
        prompt: The text to feed to the model.

    Returns:
        ``(tokens_id, tokens_str, activation_cache)``: the token ids, the
        per-token strings, and the cache returned by ``run_with_cache``
        (called with ``remove_batch_dim=True``).
    """
    token_ids = model.to_tokens(prompt)  # noted upstream as (1, n_tokens)
    token_strings = model.to_str_tokens(prompt)
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        # The first returned element (the logits) is not used here.
        _, cache = model.run_with_cache(token_ids, remove_batch_dim=True)
    return token_ids, token_strings, cache

# Extract activations
def get_activation(activation_cache, layer_index=6, location="resid_post_mlp"):
    """Look up one layer's activation in ``activation_cache``.

    Args:
        activation_cache: Mapping from hook name to the cached activation.
        layer_index: Block index substituted into the hook name.
        location: One of ``"mlp_post_act"``, ``"resid_delta_attn"``,
            ``"resid_post_attn"``, ``"resid_delta_mlp"``, ``"resid_post_mlp"``.

    Returns:
        The cache entry stored under ``blocks.<layer_index>.<hook>``.

    Raises:
        KeyError: If ``location`` is not one of the names above, or the
            resulting hook name is missing from the cache.
    """
    # Hook name (relative to the block) for each supported location.
    hook_suffix_by_location = {
        "mlp_post_act": "mlp.hook_post",
        "resid_delta_attn": "hook_attn_out",
        "resid_post_attn": "hook_resid_mid",
        "resid_delta_mlp": "hook_mlp_out",
        "resid_post_mlp": "hook_resid_post",
    }
    hook_suffix = hook_suffix_by_location[location]
    hook_name = f"blocks.{layer_index}.{hook_suffix}"
    return activation_cache[hook_name]

# Load the autoencoder
def load_autoencoder(location, layer_index, device):
    """Load the ``v5_32k`` sparse autoencoder for one layer/location.

    Args:
        location: Location name passed to ``sparse_autoencoder.paths.v5_32k``.
        layer_index: Layer index passed to ``sparse_autoencoder.paths.v5_32k``.
        device: Device the autoencoder is moved to via ``.to(device)``.

    Returns:
        The ``Autoencoder`` built from the downloaded state dict.
    """
    checkpoint_path = sparse_autoencoder.paths.v5_32k(location, layer_index)
    with bf.BlobFile(checkpoint_path, mode="rb") as checkpoint_file:
        weights = torch.load(checkpoint_file)
    # The state dict is fully in memory here, so the file can already be closed.
    sae = sparse_autoencoder.Autoencoder.from_state_dict(weights)
    sae.to(device)
    return sae

# Encode and decode the activation tensor
def encode_decode(autoencoder, input_tensor):
    """Round-trip ``input_tensor`` through ``autoencoder`` without gradients.

    Args:
        autoencoder: Object with ``encode(x) -> (latents, info)`` and
            ``decode(latents, info)``.
        input_tensor: Activations to encode.

    Returns:
        ``(latent_activations, reconstructed_activations)``.
    """
    with torch.no_grad():
        encoded = autoencoder.encode(input_tensor)
        latents, encode_info = encoded
        # ``encode_info`` is whatever ``encode`` returned alongside the latents;
        # it is passed straight back to ``decode``.
        reconstruction = autoencoder.decode(latents, encode_info)
    return latents, reconstruction

# Compute the reconstruction error
def calculate_normalized_mse(input_tensor, reconstructed_activations):
    """Return the per-row squared error normalized by the input's energy.

    For each row, this is ``sum((reconstructed - input)**2) / sum(input**2)``,
    with both sums taken over ``dim=1``. Nothing is printed; the caller
    receives the tensor of ratios.
    """
    residual = reconstructed_activations - input_tensor
    squared_error = residual.pow(2).sum(dim=1)
    input_energy = input_tensor.pow(2).sum(dim=1)
    return squared_error / input_energy

def extract_activations(prompt, tokens, latent_activations, top_k=32, activation_threshold=3):
    """Collect, per feature, the strongest token activations for one prompt.

    Args:
        prompt: Identifier used as the per-prompt key in the result.
        tokens: Sequence of token strings; ``tokens[i]`` labels row ``i`` of
            ``latent_activations``.
        latent_activations: 2-D tensor indexed as ``[token_position, feature]``.
        top_k: Maximum number of activations kept per feature.
        activation_threshold: Only entries that are non-zero and
            ``>= activation_threshold`` are considered.

    Returns:
        ``{"Feature <j>": {prompt: {token_string: activation_value}}}`` with one
        entry per feature that has at least one qualifying activation.

    Side effects:
        Prints the total number of activations selected across all features.
    """
    activations_dict = {}
    prompt_key = prompt  # Set a different prompt identifier here if needed.

    total_activations_count = 0

    # Entries that are non-zero and reach the threshold.
    qualifying = (latent_activations != 0) & (latent_activations >= activation_threshold)

    # Visit only features with at least one qualifying entry, in ascending
    # index order, instead of looping over every feature column.
    active_features = qualifying.any(dim=0).nonzero(as_tuple=True)[0].tolist()

    for feature_index in active_features:
        feature_activations = latent_activations[:, feature_index]

        # Row (token) positions of the qualifying entries for this feature.
        token_positions = qualifying[:, feature_index].nonzero(as_tuple=True)[0]
        candidate_values = feature_activations[token_positions]

        # Keep only the top-k qualifying values.
        k = min(top_k, candidate_values.numel())
        top_k_values, top_k_indices = torch.topk(candidate_values, k)

        # ``top_k_indices`` index into the filtered tensor; map them back to
        # token positions. Searching for the value instead would fail when two
        # positions share the same activation value.
        top_token_positions = token_positions[top_k_indices]

        # NOTE(review): entries are keyed by token string, so if the same token
        # string is selected at several positions, later (smaller) values
        # overwrite earlier ones, as before. Confirm whether that is intended.
        per_token = {}
        for value, token_index in zip(top_k_values.tolist(), top_token_positions.tolist()):
            per_token[f"{tokens[token_index]}"] = value

        activations_dict[f"Feature {feature_index}"] = {prompt_key: per_token}
        total_activations_count += len(top_k_values)

    # Print the total number of activations extracted
    print(f"Total activations extracted: {total_activations_count}")

    return activations_dict


In [3]:
# Which layer / hook location to analyse. The same two values select the
# sparse-autoencoder checkpoint loaded below.
layer_index = 6
location = "resid_post_mlp"

# Load GPT-2 and the matching autoencoder onto the model's device.
model, device = load_model("gpt2")
autoencoder = load_autoencoder(location, layer_index, device)

Loaded pretrained model gpt2 into HookedTransformer


In [37]:
# Results are written to a folder named after the current date (YYYY-MM-DD).
today = f"{datetime.today():%Y-%m-%d}"
output_folder = "output/" + today
os.makedirs(output_folder, exist_ok=True)

In [38]:
# Load the prompts
prompt_file_path = 'dataset/prompt_1000/decision_feeling.parquet' 
# Read the parquet file into a DataFrame; later cells read its `prompt_id`
# and `prompt` columns.
data = pd.read_parquet(prompt_file_path)

In [39]:
# Preview the loaded prompts (the notebook renders a truncated view of the frame).
data

Unnamed: 0,prompt_id,prompt
0,1,Trait: make decisions based on feelings and va...
1,2,Trait: make decisions based on feelings and va...
2,3,Trait: make decisions based on feelings and va...
3,4,Trait: make decisions based on feelings and va...
4,5,Trait: make decisions based on feelings and va...
...,...,...
995,996,Trait: make decisions based on feelings and va...
996,997,Trait: make decisions based on feelings and va...
997,998,Trait: make decisions based on feelings and va...
998,999,Trait: make decisions based on feelings and va...


In [44]:
# Run every prompt in decision_feeling.parquet through the model and the
# sparse autoencoder, and store the strongest latent activations per prompt.
prompt_file_path = 'dataset/prompt_1000/decision_feeling.parquet'
data = pd.read_parquet(prompt_file_path)
activations = []  # not filled in this cell; kept in case another cell reads it
# The output file is the same for every prompt, so build its path once.
activations_file_path = os.path.join(output_folder, 'decision_feeling_1000_activation.json')
for index, row in data.iterrows():
    prompt_id = row['prompt_id']
    prompt = row['prompt']
    tokens_id, tokens_str, activation_cache = process_input(model, prompt)
    # Pass `location` explicitly so the activation is read from the same hook
    # the autoencoder was loaded for, instead of get_activation's default.
    activation = get_activation(activation_cache, layer_index=layer_index, location=location)
    # Encode and decode the activation tensor
    latent_activations, reconstructed_activations = encode_decode(autoencoder, activation)

    print(latent_activations.shape)
    print(activation.shape)
    print(reconstructed_activations.shape)
    non_zero_count = (latent_activations != 0).sum().item()
    print("Non-zero activation count:", non_zero_count)

    # The prompt id (not the prompt text) is used as the per-prompt key.
    activations_dict = extract_activations(prompt_id, tokens_str, latent_activations, top_k=5)

    update_json_file(activations_file_path, activations_dict)

torch.Size([173, 32768])
torch.Size([173, 768])
torch.Size([173, 768])
Non-zero activation count: 5536
Total activations extracted: 530
torch.Size([183, 32768])
torch.Size([183, 768])
torch.Size([183, 768])
Non-zero activation count: 5856
Total activations extracted: 554
torch.Size([224, 32768])
torch.Size([224, 768])
torch.Size([224, 768])
Non-zero activation count: 7168
Total activations extracted: 700
torch.Size([94, 32768])
torch.Size([94, 768])
torch.Size([94, 768])
Non-zero activation count: 3008
Total activations extracted: 330
torch.Size([216, 32768])
torch.Size([216, 768])
torch.Size([216, 768])
Non-zero activation count: 6912
Total activations extracted: 690
torch.Size([261, 32768])
torch.Size([261, 768])
torch.Size([261, 768])
Non-zero activation count: 8352
Total activations extracted: 782
torch.Size([52, 32768])
torch.Size([52, 768])
torch.Size([52, 768])
Non-zero activation count: 1664
Total activations extracted: 180
torch.Size([119, 32768])
torch.Size([119, 768])
torch.

In [47]:
# Set up paths
prompt_folder_path = 'dataset/prompt_1000'
os.makedirs(output_folder, exist_ok=True)
# File to skip (it is processed by its own cell earlier in the notebook)
excluded_file = 'decision_feeling.parquet'

# Iterate over all .parquet files; sorted so the processing order is
# deterministic (os.listdir returns entries in arbitrary order).
for file_name in sorted(os.listdir(prompt_folder_path)):
    if not file_name.endswith('.parquet') or file_name == excluded_file:
        continue

    prompt_file_path = os.path.join(prompt_folder_path, file_name)
    data = pd.read_parquet(prompt_file_path)

    # One output file per input file; the path does not depend on the prompt,
    # so build it once. splitext strips only the trailing ".parquet".
    activations_file_name = os.path.splitext(file_name)[0] + '_1000_activation.json'
    activations_file_path = os.path.join(output_folder, activations_file_name)

    # `count` restarts at 1 for every file.
    for count, (index, row) in enumerate(data.iterrows(), start=1):
        prompt_id = row['prompt_id']
        prompt = row['prompt']
        tokens_id, tokens_str, activation_cache = process_input(model, prompt)
        # Pass `location` explicitly so the activation is read from the same
        # hook the autoencoder was loaded for, instead of the default.
        activation = get_activation(activation_cache, layer_index, location=location)
        latent_activations, reconstructed_activations = encode_decode(autoencoder, activation)

        print(latent_activations.shape)
        print(activation.shape)
        print(reconstructed_activations.shape)
        non_zero_count = (latent_activations != 0).sum().item()
        print("Non-zero activation count:", non_zero_count)
        # NOTE(review): the total is hard-coded; assumes 1000 prompts per file.
        print(f"This is {count}/1000 prompt")
        activations_dict = extract_activations(prompt_id, tokens_str, latent_activations, top_k=5)

        update_json_file(activations_file_path, activations_dict)

torch.Size([149, 32768])
torch.Size([149, 768])
torch.Size([149, 768])
Non-zero activation count: 4768
This is 1/1000 prompt
Total activations extracted: 472
torch.Size([180, 32768])
torch.Size([180, 768])
torch.Size([180, 768])
Non-zero activation count: 5760
This is 2/1000 prompt
Total activations extracted: 538
torch.Size([208, 32768])
torch.Size([208, 768])
torch.Size([208, 768])
Non-zero activation count: 6656
This is 3/1000 prompt
Total activations extracted: 629
torch.Size([91, 32768])
torch.Size([91, 768])
torch.Size([91, 768])
Non-zero activation count: 2912
This is 4/1000 prompt
Total activations extracted: 311
torch.Size([184, 32768])
torch.Size([184, 768])
torch.Size([184, 768])
Non-zero activation count: 5888
This is 5/1000 prompt
Total activations extracted: 604
torch.Size([237, 32768])
torch.Size([237, 768])
torch.Size([237, 768])
Non-zero activation count: 7584
This is 6/1000 prompt
Total activations extracted: 729
torch.Size([48, 32768])
torch.Size([48, 768])
torch.Siz

In [24]:
def read_numpy_file(filename, max_rows=20000):
    """Load a pickled object from a NumPy file and print it as a DataFrame.

    Args:
        filename: Path handed to ``np.load``. The loaded array is unwrapped
            with ``.item()`` and passed to ``pd.DataFrame``.
            NOTE(review): this presumably expects a dict saved with
            ``np.save`` — confirm against the writer.
        max_rows: Maximum number of leading rows to print. Defaults to 20000,
            the limit that used to be hard-coded.

    Returns:
        None. The header line and the rows are printed; if the file is missing
        or cannot be read, a one-line message is printed instead.
    """
    import pickle  # local import: only needed for the exception type below

    # SECURITY: allow_pickle=True unpickles arbitrary objects and can execute
    # code embedded in the file. Only use this on files you produced yourself.
    try:
        data = np.load(filename, allow_pickle=True).item()
        df = pd.DataFrame(data)
        print(f"Data from {filename}:")
        print(df.head(max_rows))  # print at most the first `max_rows` rows
    except (FileNotFoundError, OSError, EOFError, pickle.UnpicklingError):
        # Recent NumPy versions report an unreadable file as
        # pickle.UnpicklingError (or EOFError for an empty file), not OSError,
        # so those are caught as well to keep the message below reachable.
        print(f"File {filename} not found or could not be read.")

# Example call
# NOTE(review): the preceding cells leave `activations_file_path` pointing at a
# "*_1000_activation.json" file, while the output recorded for this cell came
# from a ".npy" path — confirm which file this call is meant to read.
read_numpy_file(activations_file_path)

Data from output/2024-07-18\activations_fi_50.npy:
            Feature  Index SubIndex     Value
0         Feature 4      1    times  1.321357
1         Feature 6      1     hear  4.202684
2        Feature 11      1           1.179202
3        Feature 35      1      not  1.101583
4        Feature 42      1   posted  0.736671
...             ...    ...      ...       ...
19995  Feature 4875     19      own  1.515423
19996  Feature 4875     19       to  1.373387
19997  Feature 4875     19      him  1.207780
19998  Feature 4875     19       my  0.862682
19999  Feature 4875     19           0.751266

[20000 rows x 4 columns]
