In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import root_scalar

In [2]:
# csv_files = ["/data/kebl6672/dpo-toxic-general/toxicity/mistral_neuron_cossims.csv", "/data/kebl6672/dpo-toxic-general/toxicity/mistral_neuron_projections.csv", "/data/kebl6672/dpo-toxic-general/toxicity/mistral_dpo_neuron_cossims.csv", "/data/kebl6672/dpo-toxic-general/toxicity/mistral_dpo_neuron_projections.csv"]

# # Read all CSV files into dataframes
# dfs = [pd.read_csv(file) for file in csv_files]

# # Merge dataframes on 'layer_idx' and 'neuron_idx'
# merged_df = dfs[0]
# for df in dfs[1:]:
#     merged_df = pd.merge(merged_df, df, on=['layer_idx', 'neuron_idx'], how='outer')

# # Save the merged dataframe to a new CSV file
# merged_df.to_csv("/data/kebl6672/dpo-toxic-general/toxicity/mistral_all_neuron_metrics.csv", index=False)

# # Display the first few rows
# print(merged_df.head())

   layer_idx  neuron_idx  pt_cosine_similarity  pt_projection_value  \
0          0           0              0.022415        -6.556511e-07   
1          0           1             -0.020233        -7.152557e-07   
2          0           2             -0.004261        -5.364418e-07   
3          0           3              0.013405        -8.344650e-07   
4          0           4             -0.019363         4.768372e-07   

   pt_activation_value  dpo_cosine_similarity  dpo_projection_value  \
0            -0.000160               0.022430         -2.205372e-06   
1             0.000187              -0.020126         -1.132488e-06   
2             0.000738              -0.004234         -4.172325e-07   
3            -0.000350               0.013634         -9.536743e-07   
4            -0.000139              -0.019455          2.384186e-07   

   dpo_activation_value  
0             -0.000520  
1              0.000304  
2              0.000550  
3             -0.000379  
4             -0

In [2]:
# Use old version!
df = pd.read_csv('/data/kebl6672/dpo-toxic-general/toxicity/llama3_all_neuron_metrics.csv')

In [3]:
df = df.replace([np.inf, -np.inf], np.nan).dropna()

In [4]:
# df['activation_diff'] = df['dpo_activation_value'] - df['pt_activation_value']
# df['projection_diff'] = df['dpo_projection_value'] - df['pt_projection_value']

In [4]:
df.head()

Unnamed: 0,layer_idx,neuron_idx,dpo_cosine_similarity,dpo_projection_value,dpo_activation_value,pt_cosine_similarity,pt_projection_value,pt_activation_value
0,0,0,0.008034,-0.002918,-0.000184,0.008034,-0.002989,-0.000189
1,0,1,-0.000513,-9.7e-05,9.5e-05,-0.000513,-0.000133,0.00013
2,0,2,0.002647,-0.001558,-0.000323,0.002647,-0.001447,-0.0003
3,0,3,-0.044128,0.002867,-3.4e-05,-0.044128,0.010277,-0.000121
4,0,4,0.021164,-0.008995,-0.000219,0.021164,-0.007141,-0.000174


##### Patch to DPO activations

In [12]:
def get_dpo_activations(df, targets):
    """
    Extract (layer_idx, neuron_idx, dpo_activation_value) for each (layer_idx, neuron_idx) in targets.
    """
    results = []
    for layer_idx, neuron_idx in targets:
        row = df[(df['layer_idx'] == layer_idx) & (df['neuron_idx'] == neuron_idx)]
        if not row.empty:
            act = row['dpo_activation_value'].values[0]
            results.append((layer_idx, neuron_idx, act))
    return results


In [8]:
target_neurons = [(22, 1061), (2, 8896), (14, 2292), (15, 2454), (22, 5047), (20, 1349), (27, 5067), (13, 6053), (14, 11281), (19,4689), (10, 6660), (6, 7364), (1, 13026), (4, 1609)]
output_list = get_dpo_activations(df, target_neurons)
print(output_list)

[(22, 1061, -0.0312347412109375), (2, 8896, -0.0001000761985778), (14, 2292, -0.0052947998046875), (15, 2454, -0.0001598596572875), (22, 5047, 0.031829833984375), (20, 1349, -0.0015907287597656), (27, 5067, -0.0014009475708007), (13, 6053, -0.0014142990112304), (14, 11281, -0.042633056640625), (19, 4689, 0.0055198669433593), (10, 6660, -0.0087432861328125), (6, 7364, 0.0004408359527587), (1, 13026, -0.0009560585021972), (4, 1609, -0.001169204711914)]


In [None]:
df_subset = pd.DataFrame(output_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_subset.to_csv("mistral_toxic_neuron_configs.csv", index=False)

##### Extract all neuron indexes from each neuron group

In [10]:
# Patch all uparrow ones
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['projection_diff'] < 0) 
]

print(tuples_list)

[(0, 0, 0.0235748291015625), (0, 2, 0.0062141418457031)]


In [35]:
df_subset = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_subset.to_csv("mistral_patch_four_neuron_configs.csv", index=False)

In [22]:
# Patch AP
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) # AP
]

print(len(tuples_list))

48306


In [24]:
# Patch AN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) # AN
]

print(len(tuples_list))

70928


In [26]:
# Patch TN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) # TN  
]

print(len(tuples_list))

48371


In [28]:
# Patch TP
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) # TP    
]

print(len(tuples_list))

70830


In [30]:
# Patch TP + AN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) or # AN
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) # TP    
]

print(len(tuples_list))

141758


In [32]:
# Patch TP + AN + TN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) or # AN
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) or # TN
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) # TP    
]

print(len(tuples_list))

190129


In [34]:
# Patch all four groups
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) or # AP
       (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) or # AN
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) or # TN
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) # TP    
]

print(len(tuples_list))


238435


##### Tuning-free group identification

In [6]:
# Halve TP 
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']/2)
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0) # TP    
]
print(len(tuples_list))

60673


In [7]:
print(tuples_list[:5])

[(0, 2, 0.0024471282958984), (0, 10, 0.00921630859375), (0, 27, 0.00556564331054685), (0, 30, 0.0032367706298828), (0, 37, 0.0070533752441406)]


In [8]:
df_subset = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_subset.to_csv("llama3_1.15_two_0.85_two_free_neuron_configs.csv", index=False)

In [14]:
# Halve AN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']/2)
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0) # AN  
]
print(len(tuples_list))

115029


In [16]:
# Halve TP + AN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']/2)
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0) or # AN
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0) # TP    
]
print(len(tuples_list))

229702


In [18]:
# 1.5* AP 
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']*1.5)
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] > 0) # AP
]
print(len(tuples_list))

114557


In [20]:
# 1.5* TN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']*1.5)
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] < 0) # TN  
]
print(len(tuples_list))

114473


In [7]:
# 1.5* (AP + TN)
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']*1.5)
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] > 0) or # AP
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] < 0) # TN  
]
print(len(tuples_list))

225815


In [9]:
# 0.95* for TP and AN, 1.05* for AP and TN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] * 0.85) 
    if (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0) or  # TP
       (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0)    # AN
    else (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] * 1.15)  
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] > 0) or  # AP
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] < 0) or  # TN
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0) or  # TP
       (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0)    # AN
]

print(len(tuples_list))

454174


In [48]:
# 0.75* for TP and AN, 1.25* for AP and TN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] * 0.75) 
    if (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0) or  # TP
       (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0)    # AN
    else (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] * 1.25)  
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] > 0) or  # AP
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] < 0) or  # TN
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0) or  # TP
       (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0)    # AN
]

print(len(tuples_list))

458728


In [24]:
# Halve for TP and AN, 1.5* for AP and TN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] / 2)  # Halve activation
    if (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0) or  # TP
       (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0)    # AN
    else (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] * 1.5)  # 1.5*activation
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] > 0) or  # AP
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] < 0) or  # TN
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0) or  # TP
       (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0)    # AN
]

print(len(tuples_list))


458732
