In [1]:
# Set up the environment: make the local sparse_autoencoder checkout importable.
import os
import sys
# Raw string: the Windows path contains backslashes (\C, \R, \P, \s) that are
# invalid escape sequences in a normal string literal and trigger a warning on
# recent Python versions. The resulting path text is unchanged.
# NOTE(review): machine-specific absolute path; adjust for your checkout.
SAE_REPO_DIR = r'D:\ComputerScience\Research\PRADA\sparse_autoencoder'
# Guard so that re-running the cell does not append duplicate entries.
if SAE_REPO_DIR not in sys.path:
    sys.path.append(SAE_REPO_DIR)
# os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
# 导入库
import torch
import blobfile as bf
from experiments.utils import *
import pandas as pd
import matplotlib.pyplot as plt
from transformers import GPT2LMHeadModel, AutoTokenizer, GPT2Tokenizer, GPT2Config, set_seed, GPT2Model

In [2]:
# Download the sparse autoencoders for one hook position / layer, once per size.
position = "resid_post_mlp"
layer_index = 6
for sae_size in (32, 128):
    download_autoencoder(position, layer_index=layer_index, size=sae_size)

Downloading SAE from: az://openaipublic/sparse-autoencoder/gpt2-small/resid_post_mlp_v5_32k/autoencoders/6.pt
State dictionary saved to ../model/gpt2_sae/sae_state_32k_layer_6.pt
Downloading SAE from: az://openaipublic/sparse-autoencoder/gpt2-small/resid_post_mlp_v5_128k/autoencoders/6.pt


In [2]:
def feature_steering(autoencoder, x: torch.Tensor, feature_indices: list[int], feature_values: list[float], verbose: bool = True) -> torch.Tensor:
    """Rescale selected autoencoder latents of ``x`` and decode the result.

    Each entry of ``feature_values`` is a signed scale factor for the latent at
    the matching position in ``feature_indices``:

    * ``value > 0``  -> the latent is multiplied by ``value``;
    * ``value < 0``  -> the latent is divided by ``abs(value)``;
    * ``value == 0`` -> the latent is multiplied by zero, i.e. switched off
      (previously this case divided by zero and produced NaN/inf latents).

    Args:
        autoencoder: Object exposing ``encode(x) -> (latents, info)`` and
            ``decode(latents, info)``.
        x: Activations handed to ``autoencoder.encode``.
        feature_indices: Latent indices (along the last axis of the latents).
        feature_values: Signed scale factors, one per index.
        verbose: When True (the default) print every latent before and after
            the modification, as the original implementation always did.

    Returns:
        ``autoencoder.decode`` applied to the modified latents.

    Raises:
        AssertionError: If the two lists differ in length.
    """
    assert len(feature_indices) == len(feature_values), "Feature indices and values must have the same length."

    with torch.no_grad():
        # Encode once; `info` is passed back to decode() untouched.
        latents, info = autoencoder.encode(x)
        for index, value in zip(feature_indices, feature_values):
            if verbose:
                print("original:", latents[..., index])
            if value >= 0:
                # Amplify (or ablate when value == 0).
                latents[..., index] *= value
            else:
                # A negative value means "shrink by this factor".
                latents[..., index] /= abs(value)
            if verbose:
                print("Modified:", latents[..., index])
                print(f"Feature {index} modified with {'+' if value >= 0 else ''}{value}")
        # Decode the edited latents back into activation space.
        modified_output = autoencoder.decode(latents, info)
    return modified_output

def calculate_error(input_tensor, reconstructed_activations) -> tuple[torch.Tensor, torch.Tensor]:
    """Measure how far a reconstruction is from the original activations.

    Args:
        input_tensor: Original activations; reduced over ``dim=1``.
        reconstructed_activations: Reconstruction with a compatible shape.

    Returns:
        A ``(normalized_mse, error)`` tuple where
        ``error = input_tensor - reconstructed_activations`` and
        ``normalized_mse`` is the squared error summed over ``dim=1`` divided
        by the squared input summed over ``dim=1``. A slice whose input is
        all zeros divides by zero and therefore yields NaN or inf.
    """
    # Element-wise residual; adding it to a reconstruction restores the input.
    error = input_tensor - reconstructed_activations
    # Squaring discards the sign, so the residual can be reused here.
    normalized_mse = error.pow(2).sum(dim=1) / input_tensor.pow(2).sum(dim=1)
    return normalized_mse, error


def compare_activations(tensor1, tensor2):
    """Print summary statistics of ``tensor1 - tensor2`` and plot it as a heat map.

    Args:
        tensor1: First tensor.
        tensor2: Second tensor, subtracted from the first.

    The difference is printed in full, followed by its mean and standard
    deviation, and then rendered with ``plt.imshow``. Nothing is returned.
    """
    difference = tensor1 - tensor2
    print("Difference between tensors:\n", difference)

    # Summary statistics of the difference.
    mean_diff = torch.mean(difference)
    std_diff = torch.std(difference)
    print(f"Mean difference: {mean_diff.item()}")
    print(f"Standard deviation of difference: {std_diff.item()}")

    # Visualise the difference. detach() + cpu() make the NumPy conversion work
    # for tensors that require grad or live on an accelerator; a bare .numpy()
    # raises in both cases.
    difference_np = difference.detach().cpu().numpy()
    plt.figure(figsize=(20, 5))
    plt.imshow(difference_np, cmap='coolwarm', aspect='auto')
    plt.colorbar(label='Difference')
    plt.title('Difference between Reconstructed Activations and Modified Output')
    plt.xlabel('Feature Index')
    plt.ylabel('Sample Index')
    plt.show()

def chat_with_gpt2_logits(model, tokens_id, tokenizer):
    """Run one forward pass and decode the arg-max token at every position.

    Args:
        model: Callable model whose output exposes ``.logits``.
        tokens_id: Token ids passed straight to ``model``.
        tokenizer: Object providing ``decode(ids, skip_special_tokens=...)``.

    Returns:
        ``(response, logits)``: the decoded arg-max ids of the first sequence
        and the raw logits returned by the model.
    """
    model.eval()

    with torch.no_grad():
        logits = model(tokens_id).logits
        # Highest-scoring token id at each position.
        greedy_ids = logits.argmax(dim=-1)

    # Only the first sequence of the batch is turned back into text.
    response = tokenizer.decode(greedy_ids[0], skip_special_tokens=True)
    return response, logits

def chat_with_gpt2_top_k_candidates(model, tokens_id, tokenizer, top_k=10, max_steps=10):
    """Report the ``top_k`` highest-logit tokens at every position.

    Args:
        model: Callable model whose output exposes ``.logits``.
        tokens_id: Token ids passed straight to ``model``.
        tokenizer: Object providing ``decode(list_of_ids)``.
        top_k: Number of candidates kept per position.
        max_steps: Maximum number of positions printed (the return value is
            never truncated). Defaults to the previously hard-coded 10.

    Returns:
        ``(top_k_tokens, top_k_logits)``. ``top_k_tokens[step][rank]`` is the
        decoded candidate for position ``step`` of the first sequence;
        ``top_k_logits`` is the full ``torch.topk`` values tensor.
        The earlier implementation decoded only position 0, so
        ``top_k_tokens[0]`` is unchanged.
    """
    model.eval()

    with torch.no_grad():
        logits = model(tokens_id).logits
        # Keep the top_k logits (and their vocabulary ids) along the last axis.
        top_k_logits, top_k_indices = torch.topk(logits, k=top_k, dim=-1)

    # Decode every (position, rank) pair of the first sequence exactly once.
    top_k_tokens = [
        [tokenizer.decode([int(token_id)]) for token_id in position_ids]
        for position_ids in top_k_indices[0]
    ]

    # Print at most `max_steps` positions to keep the cell output readable.
    for step in range(min(max_steps, len(top_k_tokens))):
        print(f"Step {step + 1}:")
        for i, token in enumerate(top_k_tokens[step]):
            logit = top_k_logits[0, step, i].item()
            print(f"  Candidate {i + 1}: {token} (Logit: {logit})")
        print("\n")

    return top_k_tokens, top_k_logits

In [3]:
# Load GPT-2; the helper returns the model, its tokenizer and the device in use.
model, auto_tokenizer, device = load_model_hf("gpt2")
# Transformer layer whose activations are inspected and steered below.
layer_index = 6
# Load the locally saved autoencoder for that layer.
# NOTE(review): the third argument (128) is presumably the dictionary size,
# matching the size=128 download earlier — confirm against the helper.
autoencoder = load_autoencoder_from_local(layer_index, device, 128)
# Fix the random seed before any generation.
set_seed(123)

# Activation Reconstruction

In [1]:
"""
prompt = "Are you introverted?"
feature_indices = [53912]
feature_values = [10] 
tokens_id, tokens_str, activation_cache = process_input_hf(model, auto_tokenizer, prompt)
print("Tokens ID (AutoTokenizer):", tokens_id)
print("Tokens String (AutoTokenizer):", tokens_str)
print(len(activation_cache))
activation = get_activation_hf(activation_cache, layer_index)
print(f"resid_post_mlp for layer {layer_index}:", activation.shape if activation is not None else "None")
print(activation)

latent_activations, recon_activations = encode_decode(autoencoder, activation)
mse_error, error = calculate_error(activation, recon_activations)

modified_recon_activations = feature_steering(autoencoder, activation, feature_indices, feature_values)
print("orginal modified_recon_activations:", modified_recon_activations)
print(modified_recon_activations.shape)

modified_recon_activations_new = modified_recon_activations + error
print("modified_recon_activations + error:", modified_recon_activations_new)

mse_error_after, error_after = calculate_error(activation, modified_recon_activations_new)
print(error_after)
print(mse_error_after)
"""

'\nprompt = "Are you introverted?"\nfeature_indices = [53912]\nfeature_values = [10] \ntokens_id, tokens_str, activation_cache = process_input_hf(model, auto_tokenizer, prompt)\nprint("Tokens ID (AutoTokenizer):", tokens_id)\nprint("Tokens String (AutoTokenizer):", tokens_str)\nprint(len(activation_cache))\nactivation = get_activation_hf(activation_cache, layer_index)\nprint(f"resid_post_mlp for layer {layer_index}:", activation.shape if activation is not None else "None")\nprint(activation)\n\nlatent_activations, recon_activations = encode_decode(autoencoder, activation)\nmse_error, error = calculate_error(activation, recon_activations)\n\nmodified_recon_activations = feature_steering(autoencoder, activation, feature_indices, feature_values)\nprint("orginal modified_recon_activations:", modified_recon_activations)\nprint(modified_recon_activations.shape)\n\nmodified_recon_activations_new = modified_recon_activations + error\nprint("modified_recon_activations + error:", modified_recon_

In [5]:
# Extract the original layer-6 activation for the prompt.
prompt = "Are you introverted?"
tokens_id, tokens_str, activation_cache = process_input_hf(model, auto_tokenizer, prompt)
print("Tokens ID (AutoTokenizer):", tokens_id)
print("Tokens String (AutoTokenizer):", tokens_str)
# Number of cached activations (13 in the recorded output).
print(len(activation_cache))
activation = get_activation_hf(activation_cache, layer_index)
print(f"resid_post_mlp for layer {layer_index}:", activation.shape if activation is not None else "None")
print(activation)
# Baseline: no steering here. The "modified" activation is just another name for
# the original tensor, so the error terms printed below are all zero.
modified_recon_activations_new = activation
mse_error_after, error_after = calculate_error(activation, modified_recon_activations_new)
print(error_after)
print(mse_error_after)

Tokens ID (AutoTokenizer): tensor([[ 8491,   345, 18951, 13658,    30]])
Tokens String (AutoTokenizer): ['Are', 'Ġyou', 'Ġintro', 'verted', '?']
13
resid_post_mlp for layer 6: torch.Size([5, 768])
tensor([[ 0.9184,  0.1396,  0.4812,  ..., -1.7562, -0.2046,  0.3966],
        [-2.3731,  0.7637, -1.3836,  ..., -1.3632, -2.3260,  0.6030],
        [ 2.1773, -3.8999, -2.5548,  ..., -4.0262,  0.6047, -0.9181],
        [ 3.8940,  0.6625, -2.7102,  ..., -0.2636,  4.2834,  0.9351],
        [ 0.0863,  0.7447, -0.9693,  ..., -0.6241, -1.3438,  3.7102]])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([0., 0., 0., 0., 0.])


### Original Output

In [None]:
def chat_with_gpt2(model, tokens_id, tokenizer=None, max_length=100):
    """Generate a continuation with ``model.generate`` and decode it to text.

    Args:
        model: Model exposing ``generate(ids, max_length=..., pad_token_id=...)``.
        tokens_id: Prompt token ids.
        tokenizer: Tokenizer used for the pad id and for decoding. Defaults to
            the notebook-level ``auto_tokenizer`` so existing two-argument
            calls behave exactly as before.
        max_length: Value forwarded to ``generate`` as ``max_length``
            (previously hard-coded to 100).

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    if tokenizer is None:
        # Backward-compatible fallback to the global tokenizer.
        tokenizer = auto_tokenizer
    with torch.no_grad():
        # The EOS id is passed as the pad id, as in the original call.
        outputs = model.generate(tokens_id, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Baseline (unsteered) behaviour of the model on the prompt tokens.
# 1) Free-running generation decoded to text.
response = chat_with_gpt2(model, tokens_id)
# 2) Single forward pass: arg-max token at each position plus the raw logits.
response_l, logits = chat_with_gpt2_logits(model, tokens_id, auto_tokenizer)
# 3) Print the top candidates per position (return value intentionally unused).
chat_with_gpt2_top_k_candidates(model, tokens_id, auto_tokenizer)
print("Original Output:", response)
print("logits type Output:", response_l)

Step 1:
  Candidate 1:  the (Logit: -29.919225692749023)
  Candidate 2:  a (Logit: -30.52708625793457)
  Candidate 3:  to (Logit: -30.872114181518555)
  Candidate 4: , (Logit: -31.00540542602539)
  Candidate 5: 
 (Logit: -31.133556365966797)
  Candidate 6:  you (Logit: -31.3023681640625)
  Candidate 7: . (Logit: -31.325265884399414)
  Candidate 8:  in (Logit: -31.418594360351562)
  Candidate 9:  that (Logit: -31.46228790283203)
  Candidate 10:  it (Logit: -31.574438095092773)


Step 2:
  Candidate 1:  a (Logit: -119.24163055419922)
  Candidate 2:  sure (Logit: -119.3289794921875)
  Candidate 3:  going (Logit: -119.41527557373047)
  Candidate 4:  ready (Logit: -119.50484466552734)
  Candidate 5:  looking (Logit: -120.21627807617188)
  Candidate 6:  still (Logit: -120.2265396118164)
  Candidate 7:  interested (Logit: -120.4596176147461)
  Candidate 8:  using (Logit: -120.46378326416016)
  Candidate 9:  in (Logit: -120.65441131591797)
  Candidate 10:  worried (Logit: -120.6690444946289)



### Controlled Output

In [7]:
# Re-check the error between the original activation and the tensor that will be
# injected into the model.
mse_error_after, error_after = calculate_error(activation, modified_recon_activations_new)
print(mse_error_after)
# Add a leading axis of size 1 (the recorded shape goes from [5, 768] to [1, 5, 768]).
modified_activations = modified_recon_activations_new.unsqueeze(0)
print(modified_activations.shape)
print(modified_activations)
print(tokens_id)
print(tokens_str)

tensor([0., 0., 0., 0., 0.])
torch.Size([1, 5, 768])
tensor([[[ 0.9184,  0.1396,  0.4812,  ..., -1.7562, -0.2046,  0.3966],
         [-2.3731,  0.7637, -1.3836,  ..., -1.3632, -2.3260,  0.6030],
         [ 2.1773, -3.8999, -2.5548,  ..., -4.0262,  0.6047, -0.9181],
         [ 3.8940,  0.6625, -2.7102,  ..., -0.2636,  4.2834,  0.9351],
         [ 0.0863,  0.7447, -0.9693,  ..., -0.6241, -1.3438,  3.7102]]])
tensor([[ 8491,   345, 18951, 13658,    30]])
['Are', 'Ġyou', 'Ġintro', 'verted', '?']


In [8]:
from transformers.modeling_outputs import CausalLMOutputWithPast
from torch import nn
class ModifiedGPT2Model(GPT2LMHeadModel):
    """GPT-2 LM head model whose upper blocks can be re-run from injected activations.

    After ``set_modified_output(output, layer_to_modify)`` every forward pass
    still runs the regular transformer (for the debug prints), then feeds
    ``output`` through blocks ``layer_to_modify .. last``, applies the model's
    own final layer norm and projects to vocabulary logits.

    Fixes relative to the previous version:

    * the final norm is the pretrained ``self.transformer.ln_f``; a freshly
      constructed ``nn.LayerNorm`` carries untrained weights and distorts the
      logits;
    * the unused extra ``GPT2Model`` (``self.temp``) is gone, so no spare set
      of parameters or state-dict keys is created;
    * hidden states are read by attribute instead of by tuple position, which
      shifts when optional outputs are absent.
    """

    def __init__(self, config):
        super().__init__(config)
        # Replacement activations fed into block `layer_to_modify`; None disables steering.
        self.modified_output = None
        # Same default as set_modified_output(), so the attribute always exists.
        self.layer_to_modify = 6

    def set_modified_output(self, output, layer_to_modify=6):
        """Store the activations to inject and the first block that consumes them."""
        self.modified_output = output
        self.layer_to_modify = layer_to_modify

    def forward(self, input_ids, output_hidden_states=True, **kwargs):
        """Return logits, re-computed from the injected activations when they are set.

        ``output_hidden_states`` is accepted for interface compatibility; hidden
        states are always requested because the debug prints need them.
        """
        # Force the dataclass-style output so fields can be read by name.
        kwargs["return_dict"] = True
        output = self.transformer(input_ids, output_hidden_states=True, **kwargs)
        hidden_states = output.last_hidden_state
        all_hidden_states = output.hidden_states
        logits_before = self.lm_head(hidden_states)
        print("original final state:", hidden_states)
        print("before shape:", logits_before.shape)
        print("before:", logits_before)
        # Re-run the remaining blocks from the injected activations.
        if self.modified_output is not None:
            modified_states = self.modified_output
            for i in range(self.layer_to_modify, len(self.transformer.h)):
                print(f"Layer {i} original activation:", all_hidden_states[i])
                layer_module = self.transformer.h[i]
                print(f"Layer {i} modified activation:", modified_states)
                layer_outputs = layer_module(modified_states, attention_mask=None)
                modified_states = layer_outputs[0]
                print(f"Layer {i+1} pre-view activation:", modified_states)
            # The output of the last block becomes the final hidden state.
            hidden_states = modified_states
            # [-1] is the last recorded hidden state (index 12 for a 12-block model).
            print("final state original after ln_f: ", all_hidden_states[-1])
            print("final state modified before ln_f: ", hidden_states)
            hidden_states = self.transformer.ln_f(hidden_states)
            print("final state modified after ln_f: ", hidden_states)
        # Project the final hidden state to vocabulary logits.
        logits = self.lm_head(hidden_states)
        return CausalLMOutputWithPast(logits=logits)

In [28]:
from transformers.modeling_outputs import CausalLMOutputWithPast
class ModifiedGPT2Model(GPT2LMHeadModel):
    """GPT-2 LM head model that steers autoencoder features at one layer on every forward pass.

    Each call to ``forward`` runs the regular transformer, takes the hidden
    state at index ``layer_to_modify``, rescales the configured autoencoder
    features (adding the reconstruction error back so only the steered
    features change), re-runs blocks ``layer_to_modify .. last`` on the result
    and returns the new logits.

    Relies on the notebook-level ``autoencoder``, ``encode_decode``,
    ``calculate_error`` and ``feature_steering``.

    Fixes relative to the previous version:

    * the final norm is the pretrained ``self.transformer.ln_f`` rather than
      the ``ln_f`` of a throwaway, randomly initialised ``GPT2Model``;
    * the default ``feature_indices`` / ``feature_values`` lists are copied,
      so instances never share or mutate the default objects;
    * every sequence in the batch is steered, not only element 0;
    * hidden states are read by attribute instead of by tuple position.

    NOTE(review): ``forward`` returns logits only (no ``past_key_values``), so
    generation presumably recomputes the full sequence at each step — confirm
    against the installed transformers version.
    """

    def __init__(self, config, layer_to_modify=6, feature_indices = [53912], feature_values = [10]):
        super().__init__(config)
        self.modified_output = None  # steered activations from the latest forward pass
        # Copy the lists: the defaults are shared objects across all calls.
        self.feature_indices = list(feature_indices)
        self.feature_values = list(feature_values)
        self.layer_to_modify = layer_to_modify

    def set_modified_output(self, hidden_list):
        """Compute and store the steered activations for ``hidden_list[self.layer_to_modify]``."""
        layer_activation = hidden_list[self.layer_to_modify]
        # Collapse all leading axes into rows so the downstream helpers see a 2-D tensor.
        activation = layer_activation.reshape(-1, layer_activation.shape[-1])
        _, recon_activations = encode_decode(autoencoder, activation)
        # Residual that the autoencoder fails to reconstruct.
        _, error = calculate_error(activation, recon_activations)
        modified_recon_activations = feature_steering(autoencoder, activation, self.feature_indices, self.feature_values)
        # Adding the residual back keeps everything but the steered features intact.
        modified_recon_activations_new = modified_recon_activations + error
        self.modified_output = modified_recon_activations_new.reshape(layer_activation.shape)
        print(self.modified_output.shape)

    def forward(self, input_ids, output_hidden_states=True, **kwargs):
        """Return logits computed from the steered activations.

        ``output_hidden_states`` is accepted for interface compatibility; hidden
        states are always requested because steering needs them.
        """
        # Force the dataclass-style output so fields can be read by name.
        kwargs["return_dict"] = True
        output = self.transformer(input_ids, output_hidden_states=True, **kwargs)
        self.set_modified_output(output.hidden_states)
        # Re-run the remaining blocks starting from the steered activations.
        modified_states = self.modified_output
        for i in range(self.layer_to_modify, len(self.transformer.h)):
            layer_outputs = self.transformer.h[i](modified_states, attention_mask=None)
            modified_states = layer_outputs[0]
        # Final norm with the pretrained weights, then projection to the vocabulary.
        final_hidden_states = self.transformer.ln_f(modified_states)
        logits = self.lm_head(final_hidden_states)
        return CausalLMOutputWithPast(logits=logits)

In [34]:
"""
set_seed(123)
custom_model = ModifiedGPT2Model(model.config)
custom_model.load_state_dict(model.state_dict())  # 复制权重
"""
# Re-seed so the steered generation is comparable across runs.
set_seed(123)
# Build the steering model from the pretrained 'gpt2' checkpoint.
# NOTE(review): layer_to_modify / feature_indices / feature_values are presumably
# forwarded by from_pretrained to ModifiedGPT2Model.__init__ — confirm.
custom_model = ModifiedGPT2Model.from_pretrained('gpt2', output_hidden_states=True, layer_to_modify=6, feature_indices = [53912], feature_values = [10])
inputs = auto_tokenizer.encode("Are you introverted?", return_tensors="pt")
# Short generation (max_length=10); the EOS id is passed as the pad id.
generated_text_ids = custom_model.generate(inputs, max_length=10, pad_token_id=auto_tokenizer.eos_token_id)
generated_text = auto_tokenizer.decode(generated_text_ids[0], skip_special_tokens=True)

print(generated_text)

original: tensor([0.0000, 0.0000, 4.2954, 7.9834, 0.0000])
Modified: tensor([ 0.0000,  0.0000, 42.9540, 79.8337,  0.0000])
Feature 53912 modified with +10
torch.Size([1, 5, 768])
original: tensor([0.0000, 0.0000, 4.2954, 7.9834, 0.0000, 0.0000])
Modified: tensor([ 0.0000,  0.0000, 42.9540, 79.8337,  0.0000,  0.0000])
Feature 53912 modified with +10
torch.Size([1, 6, 768])
original: tensor([0.0000, 0.0000, 4.2954, 7.9834, 0.0000, 0.0000, 0.0000])
Modified: tensor([ 0.0000,  0.0000, 42.9540, 79.8337,  0.0000,  0.0000,  0.0000])
Feature 53912 modified with +10
torch.Size([1, 7, 768])
original: tensor([0.0000, 0.0000, 4.2954, 7.9834, 0.0000, 0.0000, 0.0000, 0.0000])
Modified: tensor([ 0.0000,  0.0000, 42.9540, 79.8336,  0.0000,  0.0000,  0.0000,  0.0000])
Feature 53912 modified with +10
torch.Size([1, 8, 768])
original: tensor([0.0000, 0.0000, 4.2954, 7.9834, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000])
Modified: tensor([ 0.0000,  0.0000, 42.9540, 79.8336,  0.0000,  0.0000,  0.0000,  0.0000,
 

In [65]:
# Same single-pass diagnostics as for the baseline model, now on the steering model.
generated_text_l, logits = chat_with_gpt2_logits(custom_model, inputs, auto_tokenizer)
# Print the top candidates per position (return value intentionally unused).
chat_with_gpt2_top_k_candidates(custom_model, inputs, auto_tokenizer)
print("logits type Steered Output:", generated_text_l)

modified_output is setted
this is layer 6: tensor([[[ 9.1928e+02,  1.3977e+02,  4.8171e+02,  ..., -1.7579e+03,
          -2.0484e+02,  3.9704e+02],
         [-2.3754e+03,  7.6445e+02, -1.3850e+03,  ..., -1.3646e+03,
          -2.3283e+03,  6.0361e+02],
         [ 4.7753e+08, -3.5279e+08, -5.1653e+06,  ..., -7.6302e+07,
           1.4349e+09,  6.0882e+08],
         [ 9.0576e+08, -6.6914e+08, -9.7951e+06,  ..., -1.4472e+08,
           2.7217e+09,  1.1548e+09],
         [ 8.6380e+01,  7.4543e+02, -9.7030e+02,  ..., -6.2472e+02,
          -1.3452e+03,  3.7139e+03]]])
this is layer 7: tensor([[[ 9.1925e+02,  1.3994e+02,  4.8160e+02,  ..., -1.7579e+03,
          -2.0491e+02,  3.9693e+02],
         [-2.3752e+03,  7.6522e+02, -1.3857e+03,  ..., -1.3639e+03,
          -2.3272e+03,  6.0387e+02],
         [ 4.7753e+08, -3.5279e+08, -5.1653e+06,  ..., -7.6302e+07,
           1.4349e+09,  6.0882e+08],
         [ 9.0576e+08, -6.6914e+08, -9.7951e+06,  ..., -1.4472e+08,
           2.7217e+09,  1.1548