In [None]:
# 设置环境变量
import os
# os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
# 导入库
import torch
import blobfile as bf
import transformer_lens
import sparse_autoencoder
from utils import extract_activations, update_json_file, update_csv_file, sort_csv_file
import pandas as pd
from datetime import datetime

In [2]:
import numpy as np
import re
# Load the model
def load_model(model_name, center_writing_weights=False):
    """Load a pretrained ``HookedTransformer`` and report where its weights live.

    Args:
        model_name: Name forwarded to ``HookedTransformer.from_pretrained``.
        center_writing_weights: Forwarded unchanged to ``from_pretrained``.

    Returns:
        A ``(model, device)`` tuple; ``device`` is taken from the first
        parameter yielded by ``model.parameters()``.
    """
    hooked_model = transformer_lens.HookedTransformer.from_pretrained(
        model_name,
        center_writing_weights=center_writing_weights,
    )
    first_parameter = next(hooked_model.parameters())
    return hooked_model, first_parameter.device

# Process the input
def process_input(model, prompt):
    """Tokenize ``prompt`` and run the model once while caching activations.

    Args:
        model: Object exposing ``to_tokens``, ``to_str_tokens`` and
            ``run_with_cache`` (a ``HookedTransformer`` in this notebook).
        prompt: Text to feed through the model.

    Returns:
        ``(tokens_id, tokens_str, activation_cache)``: the token ids, the
        string form of the tokens, and the cache returned by
        ``run_with_cache(..., remove_batch_dim=True)``.
    """
    token_ids = model.to_tokens(prompt)  # (1, n_tokens)
    token_strings = model.to_str_tokens(prompt)
    with torch.no_grad():
        # Only the cache is used by callers; the logits are discarded.
        _, cache = model.run_with_cache(token_ids, remove_batch_dim=True)
    return token_ids, token_strings, cache

# Extract the activation
def get_activation(activation_cache, layer_index=6, location="resid_post_mlp"):
    """Look up one layer's activation in a cache by a short location name.

    Args:
        activation_cache: Mapping indexed by transformer_lens hook names.
        layer_index: Block index interpolated into the hook name.
        location: One of ``"mlp_post_act"``, ``"resid_delta_attn"``,
            ``"resid_post_attn"``, ``"resid_delta_mlp"``, ``"resid_post_mlp"``.

    Returns:
        The cache entry stored under the corresponding hook name.

    Raises:
        KeyError: If ``location`` is not one of the names above, or the hook
            name is missing from ``activation_cache``.
    """
    hook_name_by_location = {
        "mlp_post_act": f"blocks.{layer_index}.mlp.hook_post",
        "resid_delta_attn": f"blocks.{layer_index}.hook_attn_out",
        "resid_post_attn": f"blocks.{layer_index}.hook_resid_mid",
        "resid_delta_mlp": f"blocks.{layer_index}.hook_mlp_out",
        "resid_post_mlp": f"blocks.{layer_index}.hook_resid_post",
    }
    hook_name = hook_name_by_location[location]
    return activation_cache[hook_name]

# Load the autoencoder
def load_autoencoder(location, layer_index, device):
    """Load the sparse autoencoder checkpoint for one hook point onto ``device``.

    Args:
        location: Location name passed to ``sparse_autoencoder.paths.v5_32k``.
        layer_index: Layer index passed to ``sparse_autoencoder.paths.v5_32k``.
        device: Device the checkpoint tensors are mapped to and the
            autoencoder is moved to.

    Returns:
        The autoencoder built by ``Autoencoder.from_state_dict`` after
        ``.to(device)`` has been called on it.
    """
    checkpoint_path = sparse_autoencoder.paths.v5_32k(location, layer_index)
    with bf.BlobFile(checkpoint_path, mode="rb") as f:
        # map_location lets a checkpoint saved on another device (e.g. CUDA)
        # deserialize on this host instead of raising.
        # NOTE(review): torch.load unpickles the file; only use trusted checkpoint sources.
        state_dict = torch.load(f, map_location=device)
    # The state dict is fully in memory here, so the file can already be closed.
    autoencoder = sparse_autoencoder.Autoencoder.from_state_dict(state_dict)
    autoencoder.to(device)
    return autoencoder

# Encode and decode the activation tensor
def encode_decode(autoencoder, input_tensor):
    """Run ``input_tensor`` through the autoencoder with gradients disabled.

    Args:
        autoencoder: Object whose ``encode`` returns ``(latents, info)`` and
            whose ``decode`` accepts those two values.
        input_tensor: Activations to encode.

    Returns:
        ``(latent_activations, reconstructed_activations)``.
    """
    with torch.no_grad():
        encoded = autoencoder.encode(input_tensor)
        latents, encode_info = encoded
        reconstruction = autoencoder.decode(latents, encode_info)
    return latents, reconstruction

# Compute the normalized reconstruction error (nothing is printed here)
def calculate_normalized_mse(input_tensor, reconstructed_activations):
    """Return the squared reconstruction error divided by the input's squared norm.

    Both sums are taken over ``dim=1``, so the result has one entry per index
    along the remaining dimension(s).

    Args:
        input_tensor: Original activations.
        reconstructed_activations: Autoencoder reconstruction of ``input_tensor``.

    Returns:
        Tensor of normalized errors.
    """
    residual = reconstructed_activations - input_tensor
    squared_error = residual.pow(2).sum(dim=1)
    input_energy = input_tensor.pow(2).sum(dim=1)
    return squared_error / input_energy

def update_numpy_file(filename, new_activations):
    """Merge new activation records into the table stored in ``filename``.

    The file holds a pickled dict of column lists (``Feature``, ``Index``,
    ``SubIndex``, ``Value``). Existing rows are loaded, the new rows are
    appended, and for each ``(Feature, Index, SubIndex)`` only the most recent
    row is kept before the file is rewritten.

    Args:
        filename: Path of the ``.npy`` file to update. ``np.save`` appends
            ``.npy`` when the name lacks that extension.
        new_activations: Nested mapping
            ``{feature: {prompt_key: {sub_index: value}}}``; ``prompt_key``
            must be convertible with ``int``.

    Raises:
        ValueError: If a ``prompt_key`` cannot be converted to ``int``.
    """
    columns = ['Feature', 'Index', 'SubIndex', 'Value']

    # Try to read the existing NumPy file; fall back to an empty table.
    try:
        # NOTE(review): allow_pickle=True unpickles the file contents, so only
        # point this at files written by this notebook.
        stored = np.load(filename, allow_pickle=True).item()
        existing_df = pd.DataFrame(stored)
    except (FileNotFoundError, OSError):
        existing_df = pd.DataFrame(columns=columns)

    # Flatten the nested mapping into one record per activation value.
    records = [
        (feature_key, int(prompt_key), sub_index, value)
        for feature_key, per_prompt in new_activations.items()
        for prompt_key, per_sub_index in per_prompt.items()
        for sub_index, value in per_sub_index.items()
    ]
    new_df = pd.DataFrame(records, columns=columns)

    # Concatenating with an empty frame is deprecated in pandas and emits a
    # FutureWarning, so only non-empty frames take part in the merge.
    frames = [frame for frame in (existing_df, new_df) if not frame.empty]
    if frames:
        updated_df = pd.concat(frames, ignore_index=True)
    else:
        updated_df = existing_df

    # Drop duplicates, keeping the newest value for each key.
    updated_df = updated_df.drop_duplicates(
        subset=['Feature', 'Index', 'SubIndex'], keep='last'
    )

    # Save the merged table back to the NumPy file as a dict of column lists.
    np.save(filename, updated_df.to_dict('list'))
    print(f"File {filename} has been updated and saved.")

def extract_feature_number(feature_str):
    """Return the first run of digits in ``feature_str`` as an ``int``.

    When the string contains no digits, ``float('inf')`` is returned instead.
    """
    # Pull out the numeric part of the feature string
    digits = re.search(r'\d+', feature_str)
    if digits is None:
        return float('inf')
    return int(digits.group())

def sort_dataframe(df):
    """Sort ``df`` in place by feature number (ascending), then ``Value`` (descending).

    The feature number is derived from the ``Feature`` column with
    ``extract_feature_number``. A temporary ``FeatureNumber`` column is added
    for the sort and removed again afterwards.

    Args:
        df: DataFrame with ``Feature`` and ``Value`` columns; it is modified.

    Returns:
        The same ``df`` object, now sorted.
    """
    sort_key = 'FeatureNumber'

    # Numeric part of the Feature column, used as the primary sort key
    df[sort_key] = df['Feature'].map(extract_feature_number)

    # Feature number ascending, then Value descending
    df.sort_values(by=[sort_key, 'Value'], ascending=[True, False], inplace=True)

    # Remove the temporary key column
    del df[sort_key]

    return df


In [4]:
# Hook point shared by the autoencoder and the activation extraction below
layer_index = 6
location = "resid_post_mlp"

# Load GPT-2, then the sparse autoencoder for that hook point on the same device
model, device = load_model("gpt2")
autoencoder = load_autoencoder(location, layer_index, device)

Loaded pretrained model gpt2 into HookedTransformer


In [5]:
# Load the CSV file
csv_file_path = 'dataset/cleaned_mbti.csv'  # Replace with the path to your CSV file
df = pd.read_csv(csv_file_path)

In [6]:
# Create today's output folder (output/YYYY-MM-DD) if it does not exist yet
today = datetime.today().strftime('%Y-%m-%d')
output_folder = f'output/{today}'
os.makedirs(output_folder, exist_ok=True)

In [21]:
# Smoke run: first 50 prompts, results merged into one .npy file
activations = []
data = df[:50]
# The output path does not depend on the row, so resolve it once up front;
# this also leaves a usable path for the next cell when `data` has no rows.
activations_file_path = os.path.join(output_folder, 'activations_fi_50.npy')
for index, row in data.iterrows():
    prompt_id = row['id']
    prompt = row['cleaned_posts']
    print(prompt)
    # Process the input and fetch the activations
    tokens_id, tokens_str, activation_cache = process_input(model, prompt)
    # Pass `location` explicitly so the activations come from the same hook
    # point the autoencoder was loaded for, rather than the function default.
    activation = get_activation(activation_cache, layer_index=layer_index, location=location)

    # Encode and decode the activation tensor
    latent_activations, reconstructed_activations = encode_decode(autoencoder, activation)

    print(latent_activations.shape)
    print(activation.shape)
    print(reconstructed_activations.shape)
    non_zero_count = (latent_activations != 0).sum().item()
    print("Non-zero activation count:", non_zero_count)

    activations_dict = extract_activations(prompt_id, tokens_str, latent_activations)

    # NOTE: the file is re-read and rewritten for every prompt, so the run
    # slows down as the file grows; results survive an interrupted run.
    update_numpy_file(activations_file_path, activations_dict)

The last thing my INFJ friend posted on his facebook before committing suicide the next day Rest in peace    ENFJ7 Sorry to hear of your distress Its only natural for a relationship to not be perfection all the time in every moment of existence Try to figure the hard times as times of growth as
torch.Size([63, 32768])
torch.Size([63, 768])
torch.Size([63, 768])
Non-zero activation count: 2016


  updated_df = pd.concat([df, new_df], ignore_index=True)


File output/2024-07-18\activations_fi_50.npy has been updated and saved.
Prozac wellbrutin at least thirty minutes of moving your legs and I dont mean moving them while sitting in your same desk chair weed in moderation maybe try edibles as a healthier alternative
torch.Size([39, 32768])
torch.Size([39, 768])
torch.Size([39, 768])
Non-zero activation count: 1248
File output/2024-07-18\activations_fi_50.npy has been updated and saved.
Basically come up with three items youve determined that each type or whichever types you want to do would more than likely use given each types cognitive functions and whatnot when left by
torch.Size([36, 32768])
torch.Size([36, 768])
torch.Size([36, 768])
Non-zero activation count: 1152
File output/2024-07-18\activations_fi_50.npy has been updated and saved.
All things in moderation  Sims is indeed a video game and a good one at that Note a good one at that is somewhat subjective in that I am not completely promoting the death of any given Sim
torch.Size

In [24]:
def read_numpy_file(filename, max_rows=20000):
    """Print the leading rows of the activations table stored in ``filename``.

    Args:
        filename: Path of a ``.npy`` file holding a pickled dict of columns.
        max_rows: Upper bound on the number of rows passed to ``DataFrame.head``
            before printing. Defaults to the previously hard-coded 20000.

    Returns:
        None. A message is printed instead when the file is missing or
        cannot be read.
    """
    # Read the data stored in the NumPy file
    try:
        # NOTE(review): allow_pickle=True unpickles the file contents, so only
        # use this on files written by this notebook.
        data = np.load(filename, allow_pickle=True).item()
    except (FileNotFoundError, OSError):
        print(f"File {filename} not found or could not be read.")
        return
    table = pd.DataFrame(data)
    print(f"Data from {filename}:")
    print(table.head(max_rows))  # print at most `max_rows` leading rows

# Example call
read_numpy_file(activations_file_path)

Data from output/2024-07-18\activations_fi_50.npy:
            Feature  Index SubIndex     Value
0         Feature 4      1    times  1.321357
1         Feature 6      1     hear  4.202684
2        Feature 11      1           1.179202
3        Feature 35      1      not  1.101583
4        Feature 42      1   posted  0.736671
...             ...    ...      ...       ...
19995  Feature 4875     19      own  1.515423
19996  Feature 4875     19       to  1.373387
19997  Feature 4875     19      him  1.207780
19998  Feature 4875     19       my  0.862682
19999  Feature 4875     19           0.751266

[20000 rows x 4 columns]


### Experiment 1: first 1k prompts

In [25]:
# Experiment 1: first 1000 prompts, results merged into one .npy file
activations = []
data = df[:1000]
# The output path does not depend on the row, so resolve it once up front;
# this also leaves a usable path for the next cell when `data` has no rows.
activations_file_path = os.path.join(output_folder, 'activations_fi_1000.npy')
for index, row in data.iterrows():
    prompt_id = row['id']
    prompt = row['cleaned_posts']
    print(prompt)
    # Process the input and fetch the activations
    tokens_id, tokens_str, activation_cache = process_input(model, prompt)
    # Pass `location` explicitly so the activations come from the same hook
    # point the autoencoder was loaded for, rather than the function default.
    activation = get_activation(activation_cache, layer_index=layer_index, location=location)

    # Encode and decode the activation tensor
    latent_activations, reconstructed_activations = encode_decode(autoencoder, activation)

    print(latent_activations.shape)
    print(activation.shape)
    print(reconstructed_activations.shape)
    non_zero_count = (latent_activations != 0).sum().item()
    print("Non-zero activation count:", non_zero_count)

    activations_dict = extract_activations(prompt_id, tokens_str, latent_activations)

    # NOTE: the file is re-read and rewritten for every prompt, so the run
    # slows down as the file grows; results survive an interrupted run.
    update_numpy_file(activations_file_path, activations_dict)


The last thing my INFJ friend posted on his facebook before committing suicide the next day Rest in peace    ENFJ7 Sorry to hear of your distress Its only natural for a relationship to not be perfection all the time in every moment of existence Try to figure the hard times as times of growth as
torch.Size([63, 32768])
torch.Size([63, 768])
torch.Size([63, 768])
Non-zero activation count: 2016


  updated_df = pd.concat([df, new_df], ignore_index=True)


File output/2024-07-18\activations_fi_1000.npy has been updated and saved.
Prozac wellbrutin at least thirty minutes of moving your legs and I dont mean moving them while sitting in your same desk chair weed in moderation maybe try edibles as a healthier alternative
torch.Size([39, 32768])
torch.Size([39, 768])
torch.Size([39, 768])
Non-zero activation count: 1248
File output/2024-07-18\activations_fi_1000.npy has been updated and saved.
Basically come up with three items youve determined that each type or whichever types you want to do would more than likely use given each types cognitive functions and whatnot when left by
torch.Size([36, 32768])
torch.Size([36, 768])
torch.Size([36, 768])
Non-zero activation count: 1152
File output/2024-07-18\activations_fi_1000.npy has been updated and saved.
All things in moderation  Sims is indeed a video game and a good one at that Note a good one at that is somewhat subjective in that I am not completely promoting the death of any given Sim
torc

In [27]:
def read_numpy_file(filename, max_rows=200000):
    """Print the leading rows of the activations table stored in ``filename``.

    This redefines the function of the same name from an earlier cell; the
    only difference is the larger default row limit.

    Args:
        filename: Path of a ``.npy`` file holding a pickled dict of columns.
        max_rows: Upper bound on the number of rows passed to ``DataFrame.head``
            before printing. Defaults to the previously hard-coded 200000.

    Returns:
        None. A message is printed instead when the file is missing or
        cannot be read.
    """
    # Read the data stored in the NumPy file
    try:
        # NOTE(review): allow_pickle=True unpickles the file contents, so only
        # use this on files written by this notebook.
        data = np.load(filename, allow_pickle=True).item()
    except (FileNotFoundError, OSError):
        print(f"File {filename} not found or could not be read.")
        return
    table = pd.DataFrame(data)
    print(f"Data from {filename}:")
    print(table.head(max_rows))  # print at most `max_rows` leading rows

# Example call
read_numpy_file(activations_file_path)

Data from output/2024-07-18\activations_fi_1000.npy:
              Feature  Index       SubIndex      Value
0           Feature 4      1          times   1.321357
1           Feature 6      1           hear   4.202684
2          Feature 11      1                  1.179202
3          Feature 35      1            not   1.101583
4          Feature 42      1         posted   0.736671
...               ...    ...            ...        ...
199995  Feature 31497    179           just   0.876745
199996  Feature 31497    179              a   0.816526
199997  Feature 31497    179           past   0.495558
199998  Feature 31538    179  <|endoftext|>  11.060105
199999  Feature 31538    179             of   2.334421

[200000 rows x 4 columns]


### Experiment 2: prompts 1k–5k

In [5]:
# Experiment 2: prompts 1000-4999, results merged into one JSON file
activations = []
data = df[1000:5000]
# Build the path from components so it resolves to the same nested folder on
# every OS (a literal backslash is only a separator on Windows), and make sure
# the folder exists before the first write.
experiment_dir = os.path.join('output', '2024-07-13')
os.makedirs(experiment_dir, exist_ok=True)
activations_file_path = os.path.join(experiment_dir, 'activations_fi_1_5k.json')
for index, row in data.iterrows():
    prompt_id = row['id']
    prompt = row['cleaned_posts']
    print(prompt)
    # Process the input and fetch the activations
    tokens_id, tokens_str, activation_cache = process_input(model, prompt)
    # Pass `location` explicitly so the activations come from the same hook
    # point the autoencoder was loaded for, rather than the function default.
    activation = get_activation(activation_cache, layer_index=layer_index, location=location)

    # Encode and decode the activation tensor
    latent_activations, reconstructed_activations = encode_decode(autoencoder, activation)

    print(latent_activations.shape)
    print(activation.shape)
    print(reconstructed_activations.shape)
    non_zero_count = (latent_activations != 0).sum().item()
    print("Non-zero activation count:", non_zero_count)

    activations_dict = extract_activations(prompt_id, tokens_str, latent_activations)

    update_json_file(activations_file_path, activations_dict)


Hey  Same as Stefan I would even add that depending on some individuals particularly introverts language learning abilities can be easier depending on some people The best way I think to
torch.Size([34, 32768])
torch.Size([34, 768])
torch.Size([34, 768])
Non-zero activation count: 1088
Hey  You must be lucky with the weather in there Nothing is a waste of time there is always something to learn out there or even on the Internet You landed there and now you get to know us
torch.Size([41, 32768])
torch.Size([41, 768])
torch.Size([41, 768])
Non-zero activation count: 1312
Welcome on board nice to meet you  I am new here myself as an INFP so I got the feeling I hope to meet new people as well   And dont worry  your English is fine
torch.Size([41, 32768])
torch.Size([41, 768])
torch.Size([41, 768])
Non-zero activation count: 1312
You are probably right I should just do what I can do and feel more confident about it It takes time for everyone to grow up from experiences but we all get there 

KeyboardInterrupt: 

### Experiment 3: prompts 5k–10k

In [1]:
# Experiment 3: prompts 5000-9999, results merged into one JSON file.
# Requires the setup cells above to have been run in this kernel first
# (`df`, `model`, `autoencoder`, `layer_index`, `location`); otherwise the
# first reference raises NameError.
activations = []
data = df[5000:10000]
# Build the path from components so it resolves to the same nested folder on
# every OS (a literal backslash is only a separator on Windows), and make sure
# the folder exists before the first write.
experiment_dir = os.path.join('output', '2024-07-13')
os.makedirs(experiment_dir, exist_ok=True)
activations_file_path = os.path.join(experiment_dir, 'activations_fi_5k_10k.json')
for index, row in data.iterrows():
    prompt_id = row['id']
    prompt = row['cleaned_posts']
    print(prompt)
    # Process the input and fetch the activations
    tokens_id, tokens_str, activation_cache = process_input(model, prompt)
    # Pass `location` explicitly so the activations come from the same hook
    # point the autoencoder was loaded for, rather than the function default.
    activation = get_activation(activation_cache, layer_index=layer_index, location=location)

    # Encode and decode the activation tensor
    latent_activations, reconstructed_activations = encode_decode(autoencoder, activation)

    print(latent_activations.shape)
    print(activation.shape)
    print(reconstructed_activations.shape)
    non_zero_count = (latent_activations != 0).sum().item()
    print("Non-zero activation count:", non_zero_count)

    activations_dict = extract_activations(prompt_id, tokens_str, latent_activations)

    update_json_file(activations_file_path, activations_dict)


NameError: name 'df' is not defined