In [72]:
import pandas as pd
import numpy as np
from scipy.optimize import root_scalar

In [3]:
csv_files = ["/data/kebl6672/dpo-toxic-general/toxicity/gemma2_dpo_neuron_cossims.csv", "/data/kebl6672/dpo-toxic-general/toxicity/gemma2_dpo_neuron_projections.csv", "/data/kebl6672/dpo-toxic-general/toxicity/gemma2_neuron_cossims.csv", "/data/kebl6672/dpo-toxic-general/toxicity/gemma2_neuron_projections.csv"]

# Read all CSV files into dataframes
dfs = [pd.read_csv(file) for file in csv_files]

# Merge dataframes on 'layer_idx' and 'neuron_idx'
merged_df = dfs[0]
for df in dfs[1:]:
    merged_df = pd.merge(merged_df, df, on=['layer_idx', 'neuron_idx'], how='outer')

# Save the merged dataframe to a new CSV file
merged_df.to_csv("/data/kebl6672/dpo-toxic-general/toxicity/gemma2_all_neuron_metrics.csv", index=False)

# Display the first few rows
print(merged_df.head())

   layer_idx  neuron_idx  dpo_cosine_similarity  dpo_projection_value  \
0          0           0               0.000140              0.000001   
1          0           1               0.006405             -0.000042   
2          0           2               0.002937              0.000007   
3          0           3              -0.000340              0.000001   
4          0           4              -0.048767              0.000382   

   dpo_activation_value  pt_cosine_similarity  pt_projection_value  \
0              0.023575             -0.000066        -5.960464e-07   
1             -0.016129              0.006180        -2.896786e-05   
2              0.006214              0.002983         5.602837e-06   
3             -0.007721             -0.000586         4.172325e-07   
4             -0.022491             -0.048981         3.707409e-04   

   pt_activation_value  
0             0.024582  
1            -0.011658  
2             0.004894  
3            -0.001814  
4            -0

In [76]:
df = pd.read_csv('/data/kebl6672/dpo-toxic-general/toxicity/llama3_all_neuron_metrics.csv')

In [77]:
df = df.replace([np.inf, -np.inf], np.nan).dropna()

In [78]:
df.head()

Unnamed: 0,layer_idx,neuron_idx,dpo_cosine_similarity,dpo_projection_value,dpo_activation_value,pt_cosine_similarity,pt_projection_value,pt_activation_value
0,0,0,0.008034,-0.002918,-0.000184,0.008034,-0.002989,-0.000189
1,0,1,-0.000513,-9.7e-05,9.5e-05,-0.000513,-0.000133,0.00013
2,0,2,0.002647,-0.001558,-0.000323,0.002647,-0.001447,-0.0003
3,0,3,-0.044128,0.002867,-3.4e-05,-0.044128,0.010277,-0.000121
4,0,4,0.021164,-0.008995,-0.000219,0.021164,-0.007141,-0.000174


In [79]:
df['activation_diff'] = df['pt_activation_value'] - df['dpo_activation_value']

In [80]:
df['projection_diff'] = df['pt_projection_value'] - df['dpo_projection_value']

In [81]:
df.head()

Unnamed: 0,layer_idx,neuron_idx,dpo_cosine_similarity,dpo_projection_value,dpo_activation_value,pt_cosine_similarity,pt_projection_value,pt_activation_value,activation_diff,projection_diff
0,0,0,0.008034,-0.002918,-0.000184,0.008034,-0.002989,-0.000189,-5e-06,-7.1e-05
1,0,1,-0.000513,-9.7e-05,9.5e-05,-0.000513,-0.000133,0.00013,3.5e-05,-3.6e-05
2,0,2,0.002647,-0.001558,-0.000323,0.002647,-0.001447,-0.0003,2.3e-05,0.000112
3,0,3,-0.044128,0.002867,-3.4e-05,-0.044128,0.010277,-0.000121,-8.7e-05,0.00741
4,0,4,0.021164,-0.008995,-0.000219,0.021164,-0.007141,-0.000174,4.5e-05,0.001854


#### Extract all neuron indexes from each neuron group

In [21]:
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] > 0 and row['activation_diff'] < 0) or # AP+
       (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0 and row['activation_diff'] > 0) or # AN-
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] < 0 and row['activation_diff'] > 0) or # TN+
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0 and row['activation_diff'] > 0) # TP-    
]

print(len(tuples_list))


206973


In [41]:
# all neurons increase projection
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']/2)
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] > 0 and row['activation_diff'] > 0) or # AP-
       (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0 and row['activation_diff'] < 0) or # AN+
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] < 0 and row['activation_diff'] < 0) or # TN-
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0 and row['activation_diff'] < 0) # TP+    
]

print(len(tuples_list))

251479


In [86]:
# halve all
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']/2)
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] > 0 and row['activation_diff'] < 0) or # AP+
       (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0 and row['activation_diff'] > 0) or # AN-
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] < 0 and row['activation_diff'] > 0) or # TN+
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0 and row['activation_diff'] > 0) # TP-    
]

print(len(tuples_list))

220368


In [87]:
df_all = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_all.to_csv("llama3_all_neuron_configs_new_halve.csv", index=False)

In [82]:
# halve TP + AN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']/2)
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0 and row['activation_diff'] > 0) or # AN-
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0 and row['activation_diff'] > 0) # TP-    
]
print(len(tuples_list))

113851


In [83]:
df_all = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_all.to_csv("llama3_tp_an_neuron_configs_new_halve.csv", index=False)

In [84]:
# halve for TP- and AN-, double for AP+ and TN+
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] / 2)  # Halve activation
    if (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0 and row['activation_diff'] > 0) or  # TP-
       (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0 and row['activation_diff'] > 0)    # AN-
    else (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] * 2)  # Double activation
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] > 0 and row['activation_diff'] < 0) or  # AP+
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] < 0 and row['activation_diff'] > 0) or  # TN+
       (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0 and row['activation_diff'] > 0) or  # TP-
       (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0 and row['activation_diff'] > 0)    # AN-
]

print(len(tuples_list))


220368


In [85]:
df_all = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_all.to_csv("llama3_all_neuron_configs_new_halve_double.csv", index=False)

In [25]:
# Extract the AN- group
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0 and row['activation_diff'] > 0)]

print(len(tuples_list))

45929


In [26]:
df_an = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_an.to_csv("mistral_an_neuron_configs_new.csv", index=False)

In [33]:
print(tuples_list)

[(0, 18, -0.0018739700317382), (0, 32, -0.0001578330993652), (0, 43, -0.0006842613220214), (0, 64, -0.0006475448608398), (0, 81, -0.0004515647888183), (0, 83, -9.959936141967772e-05), (0, 86, -0.0003638267517089), (0, 90, -8.386373519897461e-05), (0, 104, -0.0001453161239624), (0, 114, -0.0002387762069702), (0, 117, -0.0009379386901855), (0, 125, -0.0014629364013671), (0, 159, -8.499622344970703e-05), (0, 172, -0.0008306503295898), (0, 182, -9.846687316894533e-05), (0, 203, -0.0003395080566406), (0, 213, -0.0002046823501586), (0, 216, -0.0001355409622192), (0, 228, -0.0003902912139892), (0, 245, -0.0001225471496582), (0, 247, -0.0007686614990234), (0, 248, -0.0003342628479003), (0, 252, -0.0003221035003662), (0, 257, -0.0009145736694335), (0, 260, -0.0001466274261474), (0, 263, -0.0003068447113037), (0, 271, -0.0007119178771972), (0, 312, -0.0007128715515136), (0, 318, -0.0002249479293823), (0, 332, -0.0002996921539306), (0, 339, -0.0001092553138732), (0, 344, -0.0016613006591796), (0,

In [27]:
# Extract the AP+ group
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] > 0 and row['activation_diff'] < 0)]

print(len(tuples_list))

46137


In [28]:
df_ap = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_ap.to_csv("mistral_ap_neuron_configs_new.csv", index=False)

In [35]:
print(tuples_list)

[(0, 21, 0.0003507137298583), (0, 23, 0.0001548528671264), (0, 33, 0.0002021789550781), (0, 47, 0.0001067519187927), (0, 48, 0.0004661083221435), (0, 55, 0.0004446506500244), (0, 58, 0.0003542900085449), (0, 66, 0.0006113052368164), (0, 76, 0.0003077983856201), (0, 85, 8.958578109741211e-05), (0, 94, 0.0007510185241699), (0, 96, 0.0007696151733398), (0, 115, 5.120038986206055e-05), (0, 126, 0.0003495216369628), (0, 144, 0.0004963874816894), (0, 153, 0.0005321502685546), (0, 156, 0.0003557205200195), (0, 168, 0.000136375427246), (0, 176, 0.0006585121154785), (0, 177, 0.0007252693176269), (0, 191, 8.33272933959961e-05), (0, 199, 0.0005393028259277), (0, 200, 3.1888484954833984e-05), (0, 201, 0.0003814697265625), (0, 206, 0.0010404586791992), (0, 221, 0.0002009868621826), (0, 226, 0.000179648399353), (0, 235, 4.45246696472168e-05), (0, 239, 0.0004534721374511), (0, 244, 0.0002046823501586), (0, 249, 0.000112771987915), (0, 250, 0.0009074211120605), (0, 261, 0.0007457733154296), (0, 269, 0

In [29]:
# Extract the TP- group
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0 and row['activation_diff'] > 0)]

print(len(tuples_list))

68693


In [30]:
df_tp = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_tp.to_csv("mistral_tp_neuron_configs_new.csv", index=False)

In [25]:
print(tuples_list)

[(0, 7, -9.894371032714844e-06), (0, 12, 1.132488250732422e-05), (0, 13, 1.52587890625e-05), (0, 15, 0.0001988410949707), (0, 24, 0.000182032585144), (0, 31, -5.316734313964844e-05), (0, 61, 0.0003390312194824), (0, 63, 0.0008010864257812), (0, 77, 0.0001560449600219), (0, 93, 0.000307559967041), (0, 95, 0.000109851360321), (0, 102, 0.0003790855407714), (0, 105, 0.0003511905670166), (0, 110, -8.511543273925781e-05), (0, 112, 0.0001122951507568), (0, 113, 0.0007381439208984), (0, 124, 0.0001659393310546), (0, 158, 0.0009984970092773), (0, 162, 0.0006337165832519), (0, 163, 0.0002453327178955), (0, 166, 9.822845458984376e-05), (0, 167, 0.0012941360473632), (0, 173, 0.0002682209014892), (0, 174, 0.0003798007965087), (0, 179, 0.0005526542663574), (0, 187, -3.4928321838378906e-05), (0, 195, 0.0006394386291503), (0, 198, 0.0006060600280761), (0, 211, 0.0002162456512451), (0, 231, 0.0007181167602539), (0, 232, 0.0002789497375488), (0, 234, 0.0001323223114013), (0, 237, 0.0003585815429687), (0

In [31]:
# Extract the TN+ group
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] < 0 and row['activation_diff'] > 0)]

print(len(tuples_list))

46214


In [32]:
df_tn = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_tn.to_csv("mistral_tn_neuron_configs_new.csv", index=False)

In [39]:
print(tuples_list)

[(0, 2, -0.0003228187561035), (0, 4, -0.0002192258834838), (0, 19, -0.0002131462097167), (0, 29, -0.0001440048217773), (0, 40, -0.0001341104507446), (0, 49, -0.0003478527069091), (0, 56, -2.4974346160888672e-05), (0, 68, -0.0004262924194335), (0, 74, -0.0001358985900878), (0, 79, -0.0014076232910156), (0, 87, -0.0007758140563964), (0, 88, -0.0013313293457031), (0, 89, -0.0017709732055664), (0, 91, -0.00022554397583), (0, 101, -0.0003256797790527), (0, 103, -0.0007815361022949), (0, 106, -0.0017099380493164), (0, 108, -0.0014858245849609), (0, 121, -0.0005221366882324), (0, 128, -0.000333547592163), (0, 129, -0.0009236335754394), (0, 137, -1.8656253814697266e-05), (0, 139, -0.0002349615097045), (0, 145, -0.0002213716506958), (0, 160, -0.0014591217041015), (0, 164, -0.0007510185241699), (0, 180, -0.0001261234283447), (0, 193, -0.000533103942871), (0, 202, -0.0007076263427734), (0, 205, -0.0005712509155273), (0, 219, -0.0003786087036132), (0, 230, -0.0001287460327148), (0, 233, -0.0005583

In [88]:
# Extract the TP- and AN- group
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cosine_similarity'] > 0 and row['pt_activation_value'] > 0 and row['activation_diff'] > 0) or 
    (row['pt_cosine_similarity'] < 0 and row['pt_activation_value'] < 0 and row['activation_diff'] > 0)]

print(len(tuples_list))

113851


In [None]:
df_all = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_all.to_csv("llama3_tp_an_neuron_configs_new.csv", index=False)