In [1]:
import os
import sys

# Work from the project root and make its packages importable.
ROOT_DIR = '/data/kebl6672/dpo-toxic-general'
os.chdir(ROOT_DIR)
# Guard the append so re-running this cell does not stack duplicate sys.path entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [2]:
"""
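Logit-lens analysis: project toxicity probe vectors and MLP value vectors onto the vocabulary and list the top tokens.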
"""

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns
from fancy_einsum import einsum
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformer_lens import HookedTransformer
# from toxicity.figures.fig_utils import load_hooked
# from constants import MODEL_DIR

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
import sys

# Work from the project root and make its packages importable.
ROOT_DIR = '/data/kebl6672/dpo-toxic-general'
os.chdir(ROOT_DIR)
# Guard the append so re-running this cell does not stack duplicate sys.path entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [4]:
ROOT_DIR = "/data/kebl6672/dpo-toxic-general"

# Preferred GPU for this analysis. Fall back to CPU when that GPU index does not
# exist, so later `.to(device)` calls do not fail on smaller machines.
_GPU_INDEX = 3
device = torch.device(
    f"cuda:{_GPU_INDEX}"
    if torch.cuda.is_available() and torch.cuda.device_count() > _GPU_INDEX
    else "cpu"
)

In [6]:
# Checkpoint to analyse. Alternatives listed by the author:
# "gpt2-medium", "mistralai/Mistral-7B-Instruct-v0.1", "meta-llama/Llama-3.1-8B", "google/gemma-2-2b"
MODEL_NAME = "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Use the shared `device` rather than a second hard-coded device string, so the
# model and the probes moved with `.to(device)` always end up on the same device.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)

### Logit lens

In [None]:
# Fetch the MLP output-projection ("value") vector that belongs to one neuron
def extract_value_vector(model, layer_idx, neuron_idx):
    """
    Return the slice of a layer's MLP output weights selected by one neuron index.

    Args:
        model: Object exposing ``blocks[layer_idx].mlp.W_out``.
        layer_idx (int): Index into ``model.blocks``.
        neuron_idx (int): Index along the first axis of ``W_out``.

    Returns:
        ``W_out[neuron_idx]``. Presumably of shape (d_model,), assuming ``W_out``
        is laid out as (d_mlp, d_model) -- TODO confirm against the model class.
    """
    mlp_out_weights = model.blocks[layer_idx].mlp.W_out
    return mlp_out_weights[neuron_idx]


# Demo: pull out the value vector of layer 19, neuron 770.
# NOTE(review): extract_value_vector reads `model.blocks[...].mlp.W_out`; confirm that
# the `model` in scope exposes that attribute layout before running this cell.
layer_idx, neuron_idx = 19, 770
most_toxic_value_vector = extract_value_vector(
    model, layer_idx=layer_idx, neuron_idx=neuron_idx
)

print("Extracted vector shape:", most_toxic_value_vector.shape)


In [7]:
def get_top_tokens(vector, W_U, tokenizer, top_k=10):
    """
    Projects a feature vector onto the vocabulary space using the model's unembedding matrix
    and returns the top-k tokens by softmax probability.

    Args:
        vector (torch.Tensor): The probe vector, shape (d_model,) or (1, d_model).
        W_U (torch.Tensor): The unembedding matrix, shape (vocab_size, d_model).
        tokenizer: Object with a ``decode(list_of_ids)`` method.
        top_k (int): Number of top tokens to return. Values larger than the
            vocabulary size are clamped to the vocabulary size.

    Returns:
        list of tuples: [(token, probability), ...] in descending probability order.

    Raises:
        ValueError: If ``vector`` is not 1-D after dropping a leading singleton dim.
    """
    # Ensure vector shape is (d_model,)
    if vector.dim() == 2:
        vector = vector.squeeze(0)  # (1, d_model) -> (d_model,)
    if vector.dim() != 1:
        # A multi-row input would otherwise yield a 2-D top-k and garbled decoding.
        raise ValueError(
            f"vector must have shape (d_model,) or (1, d_model), got {tuple(vector.shape)}"
        )

    # Pure inspection: no autograd graph is needed even when W_U is a trainable parameter.
    with torch.no_grad():
        # Project the vector onto vocab space
        vocab_projection = torch.matmul(vector, W_U.T)  # Shape: (vocab_size,)

        # Convert to probabilities using softmax
        probs = torch.softmax(vocab_projection, dim=-1)

        # Clamp so that asking for more tokens than exist returns the whole vocabulary
        k = min(int(top_k), probs.shape[-1])
        top_probs, top_indices = torch.topk(probs, k)

    # Decode from plain Python ints rather than 0-d tensors
    top_tokens = [tokenizer.decode([idx]) for idx in top_indices.tolist()]

    # Return as a list of tuples (token, probability)
    return list(zip(top_tokens, top_probs.tolist()))

In [None]:
# Function to project a vector to vocab space and get top 10 tokens
# def get_top_tokens(vector, W_U, tokenizer, top_k=10):
#     # Project the vector onto vocab space
#     vocab_projection = torch.matmul(vector, W_U)  # Shape: (d_vocab,)

#     # Convert to probabilities using softmax
#     probs = torch.softmax(vocab_projection, dim=-1)

#     # Get top K token indices and probabilities
#     top_probs, top_indices = torch.topk(probs, top_k)

#     # Decode tokens
#     top_tokens = [tokenizer.decode([idx]) for idx in top_indices]

#     # Return as a list of tuples (token, probability)
#     return list(zip(top_tokens, top_probs.tolist()))

In [7]:
# load the toxicity probe
# toxic_probe = torch.load(os.path.join(ROOT_DIR, "checkpoints/gpt2_probe.pt")).to(device)
# map_location restores the checkpoint directly onto `device` instead of the device it
# was saved from, which may not exist on this machine.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
# NOTE(review): the probe's width must match the unembedding of the loaded `model`
# for the projection below to work -- confirm MODEL_NAME matches this probe.
toxic_probe = torch.load(
    os.path.join(ROOT_DIR, "checkpoints/llama_probe.pt"), map_location=device
).to(device)

In [9]:
# Get the top 30 tokens of the toxic probe
# (the variable keeps its historical name even though it holds more than 10 entries)
top_10_tokens = get_top_tokens(toxic_probe, model.lm_head.weight, tokenizer, top_k=30)

# Print results; the header reports how many tokens were actually returned
print(f"Top {len(top_10_tokens)} Tokens Projected from Toxic Probe:")
for token, prob in top_10_tokens:
    print(f"Token: {token}, Probability: {prob:.9f}")

Top 10 Tokens Projected from Toxic Probe:
Token: ển, Probability: 0.000016479
Token: kommen, Probability: 0.000016409
Token:  FUCK, Probability: 0.000016168
Token: iyah, Probability: 0.000015594
Token: 털, Probability: 0.000015316
Token: ̆, Probability: 0.000015202
Token: fuck, Probability: 0.000015128
Token: dirty, Probability: 0.000014995
Token: arton, Probability: 0.000014574
Token: ToBounds, Probability: 0.000014554
Token:  Rudd, Probability: 0.000014531
Token: ên, Probability: 0.000014486
Token:  порт, Probability: 0.000014437
Token: arel, Probability: 0.000014410
Token:  Кра, Probability: 0.000014410
Token: ….

, Probability: 0.000014278
Token: gart, Probability: 0.000014239
Token: ityEngine, Probability: 0.000014229
Token: .Slf, Probability: 0.000014215
Token: NetMessage, Probability: 0.000014211
Token:  xsi, Probability: 0.000014122
Token: ToProps, Probability: 0.000014106
Token: ーニ, Probability: 0.000014053
Token: kees, Probability: 0.000014033
Token: venir, Probability: 0.0000

In [8]:
# load the toxicity probe
# map_location restores the checkpoint directly onto `device` instead of the device it
# was saved from, which may not exist on this machine.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
toxic_probe = torch.load(
    os.path.join(ROOT_DIR, "checkpoints/gpt2_probe.pt"), map_location=device
).to(device)

In [9]:
# Get the top 20 tokens of the toxic probe
# (the variable keeps its historical name even though it holds more than 10 entries)
top_10_tokens = get_top_tokens(toxic_probe, model.lm_head.weight, tokenizer, top_k=20)

# Print results; the header reports how many tokens were actually returned
print(f"Top {len(top_10_tokens)} Tokens Projected from Toxic Probe:")
for token, prob in top_10_tokens:
    print(f"Token: {token}, Probability: {prob:.9f}")

Top 10 Tokens Projected from Toxic Probe:
Token: Fuck, Probability: 0.156536207
Token:  FUCK, Probability: 0.107314922
Token: fuck, Probability: 0.058655694
Token:  Fuck, Probability: 0.047995940
Token:  fucking, Probability: 0.046780676
Token:  cunt, Probability: 0.022722524
Token:  fuck, Probability: 0.021766283
Token:  Shit, Probability: 0.021676421
Token: shit, Probability: 0.016469916
Token:  goddamn, Probability: 0.013456282
Token:  asshole, Probability: 0.013199502
Token:  fucked, Probability: 0.012850781
Token:  Godd, Probability: 0.009093617
Token:  whore, Probability: 0.007361848
Token:  godd, Probability: 0.005862250
Token:  fuckin, Probability: 0.003880905
Token:  shit, Probability: 0.003843054
Token:  worthless, Probability: 0.003734149
Token:  bitch, Probability: 0.003228735
Token:  retarded, Probability: 0.002757493


In [11]:
# load the toxicity probe
# toxic_probe = torch.load(os.path.join(ROOT_DIR, "checkpoints/gpt2_probe.pt")).to(device)
# map_location restores the checkpoint directly onto `device` instead of the device it
# was saved from, which may not exist on this machine.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
# NOTE(review): the probe's width must match the unembedding of the loaded `model`
# for the projection below to work -- confirm MODEL_NAME matches this probe.
toxic_probe = torch.load(
    os.path.join(ROOT_DIR, "checkpoints/mistral_probe.pt"), map_location=device
).to(device)

In [13]:
# Get the top 20 tokens of the toxic probe
# (the variable keeps its historical name even though it holds more than 10 entries)
top_10_tokens = get_top_tokens(toxic_probe, model.lm_head.weight, tokenizer, top_k=20)

# Print results; the header reports how many tokens were actually returned
print(f"Top {len(top_10_tokens)} Tokens Projected from Toxic Probe:")
for token, prob in top_10_tokens:
    print(f"Token: {token}, Probability: {prob:.9f}")

Top 10 Tokens Projected from Toxic Probe:
Token: shit, Probability: 0.000034220
Token: shit, Probability: 0.000034109
Token: fuck, Probability: 0.000033883
Token: Fuck, Probability: 0.000033876
Token: fucking, Probability: 0.000033648
Token: assh, Probability: 0.000033392
Token: bullshit, Probability: 0.000033335
Token: fucked, Probability: 0.000033323
Token: asshole, Probability: 0.000033312
Token: upid, Probability: 0.000033289
Token: dick, Probability: 0.000033108
Token: shitty, Probability: 0.000033086
Token: biologie, Probability: 0.000033049
Token: idiot, Probability: 0.000033038
Token: irtual, Probability: 0.000033001
Token: crap, Probability: 0.000032951
Token: bast, Probability: 0.000032911
Token: cí, Probability: 0.000032899
Token: Checked, Probability: 0.000032885
Token: ﬁ, Probability: 0.000032863


In [None]:
# Get the top 20 tokens of the toxic probe
# (the variable keeps its historical name even though it holds more than 10 entries)
top_10_tokens = get_top_tokens(toxic_probe, model.lm_head.weight, tokenizer, top_k=20)

# Print results; the header reports how many tokens were actually returned
print(f"Top {len(top_10_tokens)} Tokens Projected from Toxic Probe:")
for token, prob in top_10_tokens:
    print(f"Token: {token}, Probability: {prob:.4f}")

In [None]:
# Get the top 20 tokens of the toxic probe
# (the variable keeps its historical name even though it holds more than 10 entries)
top_10_tokens = get_top_tokens(toxic_probe, model.lm_head.weight, tokenizer, top_k=20)

# Print results; the header reports how many tokens were actually returned
print(f"Top {len(top_10_tokens)} Tokens Projected from Toxic Probe:")
for token, prob in top_10_tokens:
    print(f"Token: {token}, Probability: {prob:.4f}")

In [None]:
# Get top 10 tokens of the most toxic value vector.
# get_top_tokens takes the unembedding as (vocab_size, d_model) and transposes it
# internally. `model.W_U` is the TransformerLens accessor, documented there as
# (d_model, d_vocab), so pass its transpose to match that contract.
# NOTE(review): this cell needs `model` to expose `W_U` and `tokenizer` -- confirm the
# `model` in scope is a HookedTransformer rather than the Hugging Face model.
top_10_tokens = get_top_tokens(most_toxic_value_vector, model.W_U.T, model.tokenizer, top_k=10)

# Print results
for token, prob in top_10_tokens:
    print(f"Token: {token}, Probability: {prob}")

In [None]:
# Read the per-neuron metrics table; the path is relative to the current working directory
metrics_csv_path = './all_neuron_metrics.csv'
df = pd.read_csv(metrics_csv_path)

In [None]:
def get_kth_row_layer_neuron_index(df, k):
    """
    Return the (layer_idx, neuron_idx) of the row ranked k-th by cosine similarity.

    Args:
        df (pd.DataFrame): Must contain the columns 'cosine_similarity',
            'layer_idx' and 'neuron_idx'.
        k (int): 1-based rank; 1 selects the row with the highest cosine similarity.

    Returns:
        tuple[int, int]: (layer_idx, neuron_idx) of the selected row.

    Raises:
        IndexError: If k is outside 1..len(df).
    """
    # Reject k <= 0 explicitly: k - 1 would become a negative position and .iloc
    # would silently count from the bottom of the ranking instead of failing.
    if not 1 <= k <= len(df):
        raise IndexError(f"k must be between 1 and {len(df)} (got {k})")

    # Sort the DataFrame by cosine similarity in descending order
    sorted_df = df.sort_values(by='cosine_similarity', ascending=False)

    # Get the k-th row (convert k to 0-based index)
    kth_row = sorted_df.iloc[k - 1]

    return int(kth_row['layer_idx']), int(kth_row['neuron_idx'])

In [None]:
# Look up which neuron sits at a given cosine-similarity rank
k = 129  # rank, counted from 1
layer_idx, neuron_idx = get_kth_row_layer_neuron_index(df=df, k=k)

# Report the (layer, neuron) pair
print("Layer Index: {}, Neuron Index: {}".format(layer_idx, neuron_idx))

In [None]:
# Example usage
k = 128  # Specify the k value (the cosine similarity rank index)
layer_idx, neuron_idx = get_kth_row_layer_neuron_index(df, k)

# Extract the corresponding value vector
value_vector = extract_value_vector(model, layer_idx, neuron_idx)

# Get top 10 tokens of the value vector.
# get_top_tokens takes the unembedding as (vocab_size, d_model) and transposes it
# internally. `model.W_U` is the TransformerLens accessor, documented there as
# (d_model, d_vocab), so pass its transpose to match that contract.
# NOTE(review): this cell needs `model` to expose `blocks`, `W_U` and `tokenizer` --
# confirm the `model` in scope is a HookedTransformer rather than the Hugging Face model.
top_10_tokens = get_top_tokens(value_vector, model.W_U.T, model.tokenizer, top_k=10)

# Print results
for token, prob in top_10_tokens:
    print(f"Token: {token}, Probability: {prob}")