In [1]:
import pandas as pd
import numpy as np
import scipy.optimize as opt
from scipy.special import erf, erfinv

In [2]:
def _gelu(x):
    """Exact (erf-based) GELU: x * Phi(x), with Phi the standard normal CDF."""
    return 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))


def _gelu_slope(x):
    """First derivative of the exact GELU: Phi(x) + x * phi(x)."""
    return 0.5 * (1.0 + erf(x / np.sqrt(2.0))) + x * np.exp(-0.5 * x * x) / np.sqrt(2.0 * np.pi)


# GELU is not injective: it decreases up to its minimiser (about x = -0.75,
# where GELU is about -0.17) and increases afterwards. Only the increasing
# branch is inverted, so the minimiser is the lower end of every root bracket.
_GELU_ARGMIN = opt.brentq(_gelu_slope, -1.0, -0.5)
_GELU_MIN = _gelu(_GELU_ARGMIN)


def _inverse_gelu_scalar(y):
    """Invert GELU for one float on the increasing branch; NaN when no solution exists."""
    if np.isnan(y) or y < _GELU_MIN:
        return np.nan
    if np.isinf(y):
        return np.inf
    # GELU(x) >= x - 0.17 for x >= 0, so max(y, 0) + 1 always lies above the root.
    upper = max(y, 0.0) + 1.0
    return opt.brentq(lambda x: _gelu(x) - y, _GELU_ARGMIN, upper)


def inverse_gelu(y):
    """Numerically invert the GELU activation function.

    GELU(x) = x * Phi(x) has no closed-form inverse, so the pre-activation is
    recovered by root finding (Brent's method). The previous closed form,
    sqrt(2) * erfinv(2y - 1), is the inverse of the normal CDF Phi, not of
    GELU, and evaluates to NaN for every negative activation.

    Parameters
    ----------
    y : float or array-like of float
        Post-GELU activation value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        The x >= argmin(GELU) (about -0.75) such that GELU(x) == y, with the
        same shape as ``y``. Entries below the GELU minimum (about -0.17) have
        no pre-image and yield NaN. For y in (-0.17, 0) a second solution
        exists on the decreasing branch; it is never returned.

    Notes
    -----
    NOTE(review): this inverts the exact erf-based GELU. If the activations
    were produced by the tanh approximation of GELU, the recovered values
    differ slightly -- confirm which variant the model uses.
    """
    values = np.asarray(y, dtype=float)
    solved = np.vectorize(_inverse_gelu_scalar, otypes=[float])(values)
    # Hand scalars back as scalars so Series.apply(inverse_gelu) keeps producing floats.
    return solved[()] if solved.ndim == 0 else solved

#### Compute pre-GELU activations

In [3]:
# Read the neuron metrics CSV into df; the path is kept in one named constant.
METRICS_CSV_PATH = '/data/kebl6672/dpo-toxic-general/toxicity/gpt2_all_neuron_metrics.csv'
df = pd.read_csv(METRICS_CSV_PATH)

In [4]:
# Preview the first five rows of df.
df.head(n=5)

Unnamed: 0,layer_idx,neuron_idx,pt_projection,dpo_projection,pt_activation,dpo_activation,pt_cossim,projection_diff,activation_diff,cossim_rank
0,0,0,-0.004434,-0.004821,-0.086628,-0.094152,0.020808,0.0003874512,0.007524,23717
1,0,1,0.000381,0.000381,-0.010077,-0.010096,-0.015356,7.888138e-08,1.9e-05,66618
2,0,2,0.008536,0.008656,-0.065236,-0.066241,-0.044885,-0.0001195893,0.001005,90704
3,0,3,0.00426,0.004129,-0.081713,-0.079245,-0.015817,0.0001311275,-0.002468,67173
4,0,4,3e-06,-3e-06,-0.073461,-0.079985,-1.8e-05,6.465025e-06,0.006524,47563


In [4]:
# Values of the 'dpo_activation' column, i.e. the inputs handed to inverse_gelu below.
dpo_gelu_activation = df['dpo_activation']
# Store inverse_gelu of every value in a new column (written once; no chained assignment).
df['dpo_pregelu_activation'] = dpo_gelu_activation.apply(inverse_gelu)

#### Extract all neuron indexes from each neuron group

In [23]:
# Collect (layer_idx, neuron_idx, dpo_pregelu_activation) for every row that
# falls in one of the four groups:
#   AP+: pt_cossim < 0, pt_activation > 0, activation_diff < 0
#   AN-: pt_cossim < 0, pt_activation < 0, activation_diff > 0
#   TN+: pt_cossim > 0, pt_activation < 0, activation_diff > 0
#   TP-: pt_cossim > 0, pt_activation > 0, activation_diff > 0
# NOTE(review): assuming activation_diff = pt_activation - dpo_activation
# (consistent with the df.head() preview), the +/- suffix of AP+, TN+ and TP-
# matches a growing/shrinking activation magnitude, whereas AN- selects rows
# whose magnitude grows -- confirm the AN condition is the intended one.
cossim, pt_act, act_diff = df['pt_cossim'], df['pt_activation'], df['activation_diff']
group_mask = (
    ((cossim < 0) & (pt_act > 0) & (act_diff < 0))      # AP+
    | ((cossim < 0) & (pt_act < 0) & (act_diff > 0))    # AN-
    | ((cossim > 0) & (pt_act < 0) & (act_diff > 0))    # TN+
    | ((cossim > 0) & (pt_act > 0) & (act_diff > 0))    # TP-
)

# Boolean-mask selection keeps df's row order and avoids a Python-level
# df.iterrows() loop over the whole frame.
selected = df.loc[group_mask]
tuples_list = list(zip(
    selected['layer_idx'].astype(int).tolist(),
    selected['neuron_idx'].astype(int).tolist(),
    selected['dpo_pregelu_activation'].tolist(),
))

print(len(tuples_list))


42193


In [6]:
# Halve the activations: collect (layer_idx, neuron_idx, pt_activation / 2)
# for every row in one of the four groups:
#   AP+: pt_cossim < 0, pt_activation > 0, activation_diff < 0
#   AN-: pt_cossim < 0, pt_activation < 0, activation_diff > 0
#   TN+: pt_cossim > 0, pt_activation < 0, activation_diff > 0
#   TP-: pt_cossim > 0, pt_activation > 0, activation_diff > 0
cossim, pt_act, act_diff = df['pt_cossim'], df['pt_activation'], df['activation_diff']
group_mask = (
    ((cossim < 0) & (pt_act > 0) & (act_diff < 0))      # AP+
    | ((cossim < 0) & (pt_act < 0) & (act_diff > 0))    # AN-
    | ((cossim > 0) & (pt_act < 0) & (act_diff > 0))    # TN+
    | ((cossim > 0) & (pt_act > 0) & (act_diff > 0))    # TP-
)

# Vectorised selection (same rows, same order as iterating df row by row).
selected = df.loc[group_mask]
tuples_list = list(zip(
    selected['layer_idx'].astype(int).tolist(),
    selected['neuron_idx'].astype(int).tolist(),
    (selected['pt_activation'] / 2).tolist(),
))

print(len(tuples_list))


42193


In [6]:
# Halve TP- and AN-: collect (layer_idx, neuron_idx, pt_activation / 2) for
#   AN-: pt_cossim < 0, pt_activation < 0, activation_diff > 0
#   TP-: pt_cossim > 0, pt_activation > 0, activation_diff > 0
cossim, pt_act, act_diff = df['pt_cossim'], df['pt_activation'], df['activation_diff']
an_minus = (cossim < 0) & (pt_act < 0) & (act_diff > 0)
tp_minus = (cossim > 0) & (pt_act > 0) & (act_diff > 0)

# Vectorised selection (same rows, same order as iterating df row by row).
selected = df.loc[an_minus | tp_minus]
tuples_list = list(zip(
    selected['layer_idx'].astype(int).tolist(),
    selected['neuron_idx'].astype(int).tolist(),
    (selected['pt_activation'] / 2).tolist(),
))
print(len(tuples_list))

22611


In [5]:
# Halve pt_activation for TP- and AN-, double it for AP+ and TN+; rows in
# none of the four groups are left out.
#   AP+: pt_cossim < 0, pt_activation > 0, activation_diff < 0
#   TN+: pt_cossim > 0, pt_activation < 0, activation_diff > 0
#   TP-: pt_cossim > 0, pt_activation > 0, activation_diff > 0
#   AN-: pt_cossim < 0, pt_activation < 0, activation_diff > 0
cossim, pt_act, act_diff = df['pt_cossim'], df['pt_activation'], df['activation_diff']
ap_plus = (cossim < 0) & (pt_act > 0) & (act_diff < 0)
tn_plus = (cossim > 0) & (pt_act < 0) & (act_diff > 0)
tp_minus = (cossim > 0) & (pt_act > 0) & (act_diff > 0)
an_minus = (cossim < 0) & (pt_act < 0) & (act_diff > 0)

halve_mask = tp_minus | an_minus
in_any_group = halve_mask | ap_plus | tn_plus

# Halved value where the row is TP-/AN-, doubled value everywhere else; the
# in_any_group filter below then drops rows that belong to no group.
scaled_activation = pd.Series(
    np.where(halve_mask, pt_act / 2, pt_act * 2),
    index=df.index,
)

tuples_list = list(zip(
    df.loc[in_any_group, 'layer_idx'].astype(int).tolist(),
    df.loc[in_any_group, 'neuron_idx'].astype(int).tolist(),
    scaled_activation[in_any_group].tolist(),
))

print(len(tuples_list))


42193


In [6]:
# Print the complete tuples_list (every entry, not a preview).
print(tuples_list)

[(0, 0, -0.173255390563346), (0, 1, -0.00503853646455325), (0, 2, -0.03261823735044005), (0, 4, -0.0367302527459162), (0, 5, -0.04359761276989815), (0, 6, -0.02625255288412325), (0, 7, -0.1359273449405448), (0, 8, -0.01319770824074595), (0, 10, -0.1383247102473584), (0, 15, -0.02638525727476965), (0, 17, -0.1020660855219716), (0, 18, -0.0276075786107009), (0, 19, -0.0226236162088462), (0, 20, -0.00458627120549625), (0, 22, -0.2112011134745898), (0, 23, -0.0242209698190345), (0, 25, -0.02630940380973615), (0, 26, -0.0550874579413293), (0, 27, -0.05491605310000295), (0, 30, -0.0921130825936926), (0, 32, -0.0427783623499931), (0, 33, -0.124588132844423), (0, 34, -0.0192365136196206), (0, 36, -0.039027351375001), (0, 39, -0.1346070013804552), (0, 41, -0.1008706710006448), (0, 42, -0.0313475465174304), (0, 43, -0.0214700761689528), (0, 44, -0.128786219933203), (0, 49, 0.00167541638376825), (0, 51, -0.02324386983192575), (0, 52, -0.139107723279766), (0, 55, -0.04149726401158495), (0, 56, -0.

In [16]:
# "all neurons increase projection" (author's label): collect
# (layer_idx, neuron_idx, dpo_activation) for
#   TN-: pt_cossim > 0, pt_activation < 0, activation_diff < 0
#   TP+: pt_cossim > 0, pt_activation > 0, activation_diff < 0
# Two further groups are currently disabled; their conditions would be
#   AP-: pt_cossim < 0, pt_activation > 0, activation_diff > 0
#   AN+: pt_cossim < 0, pt_activation < 0, activation_diff < 0
# NOTE(review): assuming activation_diff = pt_activation - dpo_activation,
# TN-/TP+/AP- follow a shrinking(-)/growing(+) activation magnitude while
# AN+ would select shrinking magnitudes -- confirm before re-enabling it.
cossim, pt_act, act_diff = df['pt_cossim'], df['pt_activation'], df['activation_diff']
tn_minus = (cossim > 0) & (pt_act < 0) & (act_diff < 0)
tp_plus = (cossim > 0) & (pt_act > 0) & (act_diff < 0)

# Vectorised selection (same rows, same order as iterating df row by row).
selected = df.loc[tn_minus | tp_plus]
tuples_list = list(zip(
    selected['layer_idx'].astype(int).tolist(),
    selected['neuron_idx'].astype(int).tolist(),
    selected['dpo_activation'].tolist(),
))

print(len(tuples_list))

25794


In [17]:
# Print the complete tuples_list (every entry, not a preview).
print(tuples_list)

[(0, 9, -0.0619570690327876), (0, 11, -0.0581610388355745), (0, 13, -0.079669278259664), (0, 14, -0.0969817372814998), (0, 16, -0.1001537531851882), (0, 24, -0.0813034633863275), (0, 31, -0.0621115844029986), (0, 37, -0.0822854678337078), (0, 40, -0.071011383546437), (0, 45, -0.0457430399805004), (0, 46, -0.0641464497428736), (0, 47, -0.0614900859154393), (0, 58, -0.0795069313927872), (0, 63, -0.0581956180669933), (0, 65, -0.0950745779834488), (0, 66, -0.0591252123473707), (0, 78, -0.0256990562696898), (0, 85, -0.0937701192184311), (0, 87, 0.0017238566260325), (0, 88, -0.0614973711739177), (0, 89, -0.0616796850186361), (0, 91, -0.05382565192578), (0, 97, -0.0724091734867154), (0, 104, -0.0617632974929695), (0, 107, -0.0523906274717352), (0, 109, -0.071617529179438), (0, 121, -0.0471656505884651), (0, 125, -0.074517783015633), (0, 126, -0.0767203781499174), (0, 131, -0.031327302718668), (0, 137, -0.0869123792459621), (0, 143, -0.0631200326349633), (0, 146, -0.0733650598762148), (0, 147,

In [8]:
# Extract the AP+ group: pt_cossim < 0, pt_activation > 0, activation_diff < 0.
# Each entry is (layer_idx, neuron_idx, dpo_pregelu_activation).
ap_plus = (df['pt_cossim'] < 0) & (df['pt_activation'] > 0) & (df['activation_diff'] < 0)

# Vectorised selection (same rows, same order as iterating df row by row).
selected = df.loc[ap_plus]
tuples_list = list(zip(
    selected['layer_idx'].astype(int).tolist(),
    selected['neuron_idx'].astype(int).tolist(),
    selected['dpo_pregelu_activation'].tolist(),
))

print(len(tuples_list))

1803


In [9]:
# Print the complete tuples_list (every entry, not a preview).
print(tuples_list)

[(0, 151, 0.32057448580242787), (0, 573, 0.0414807716681572), (0, 585, 0.2847084997174404), (0, 767, 1.8369252564288712), (0, 1198, 5.756143586899447), (0, 1201, 0.1353709482819029), (0, 1276, 0.03556958153787576), (0, 1619, 2.451457894350029), (0, 1771, 0.033794952575262535), (0, 2121, 1.4197017848816544), (0, 2597, 1.7421135518461968), (0, 2939, 0.39172272796193675), (0, 3053, 0.6105780763251846), (0, 3401, 0.24635262466968263), (0, 3676, 0.45012431883723647), (0, 3969, 3.4214208836659474), (0, 4055, 1.3038349793023953), (1, 30, 0.04298325219265458), (1, 42, 0.1808266631742889), (1, 51, 0.2759534396667322), (1, 61, 0.18379916747993674), (1, 67, 0.06834303854607028), (1, 68, 0.1444626536970661), (1, 116, 0.06820161264507145), (1, 138, 0.035090394704292344), (1, 164, 0.09311001325979923), (1, 220, 0.14636272119768878), (1, 222, 0.04868485407750162), (1, 261, 0.04921787226631468), (1, 286, 0.04910092231172932), (1, 298, 0.002671826119011957), (1, 299, 0.06981195489936491), (1, 332, 0.14

In [10]:
# Extract the AN- group: pt_cossim < 0, pt_activation < 0, activation_diff > 0.
# Each entry is (layer_idx, neuron_idx, dpo_pregelu_activation).
# NOTE(review): assuming activation_diff = pt_activation - dpo_activation,
# these rows have a *growing* activation magnitude, unlike the "-" in TP-
# (shrinking magnitude) -- confirm the sign of the activation_diff test.
an_minus = (df['pt_cossim'] < 0) & (df['pt_activation'] < 0) & (df['activation_diff'] > 0)

# Vectorised selection (same rows, same order as iterating df row by row).
selected = df.loc[an_minus]
tuples_list = list(zip(
    selected['layer_idx'].astype(int).tolist(),
    selected['neuron_idx'].astype(int).tolist(),
    selected['dpo_pregelu_activation'].tolist(),
))

print(len(tuples_list))

18650


In [11]:
# Print the complete tuples_list (every entry, not a preview).
print(tuples_list)

[(0, 1, -0.02052735401498967), (0, 2, -0.15048227647177534), (0, 4, -0.18800731724807293), (0, 5, -0.24553627875608142), (0, 6, -0.12660189935442784), (0, 8, -0.060822373839585615), (0, 15, -0.12143598410295564), (0, 18, -0.12524690053225523), (0, 20, -0.03126176347447341), (0, 23, -0.11664557281219168), (0, 25, -0.11975942107625151), (0, 26, -0.29008096609707873), (0, 27, -0.28765884714106527), (0, 32, -0.23630463520245995), (0, 34, -0.11522444813712526), (0, 36, -0.19715018218314737), (0, 42, -0.14997252347541354), (0, 51, -0.10726768436610301), (0, 55, -0.20685623781710863), (0, 56, -0.16715704783800564), (0, 68, -0.19670802972252963), (0, 71, -0.3644310477287871), (0, 73, -0.22004496945207316), (0, 75, -0.18318189505102406), (0, 76, -0.15176439824489626), (0, 81, -0.17325669947493016), (0, 93, -0.13395858010117528), (0, 95, -0.3467653924632632), (0, 99, -0.026204123910879035), (0, 100, -0.17035655521245682), (0, 110, -0.11409251856980027), (0, 117, -0.19588013093996665), (0, 120, -

In [12]:
# Extract the TN+ group: pt_cossim > 0, pt_activation < 0, activation_diff > 0.
# Each entry is (layer_idx, neuron_idx, dpo_pregelu_activation).
tn_plus = (df['pt_cossim'] > 0) & (df['pt_activation'] < 0) & (df['activation_diff'] > 0)

# Vectorised selection (same rows, same order as iterating df row by row).
selected = df.loc[tn_plus]
tuples_list = list(zip(
    selected['layer_idx'].astype(int).tolist(),
    selected['neuron_idx'].astype(int).tolist(),
    selected['dpo_pregelu_activation'].tolist(),
))

print(len(tuples_list))

17779


In [13]:
# Print the complete tuples_list (every entry, not a preview).
print(tuples_list)

[(0, 0, -0.2302231817519518), (0, 7, -0.16444559843036385), (0, 10, -0.16013311379869205), (0, 17, -0.1130412431049659), (0, 19, -0.11566866532691565), (0, 22, -0.28852742635595946), (0, 30, -0.11844690305640676), (0, 33, -0.14265723187289606), (0, 39, -0.1957150109157087), (0, 41, -0.12086917020333544), (0, 43, -0.032290287430440726), (0, 44, -0.1684253630488014), (0, 52, -0.16818923566063212), (0, 57, -0.2112900996904529), (0, 59, -0.09168553992423424), (0, 61, -0.15717157213680527), (0, 62, -0.10008124540420542), (0, 67, -0.09854987399748628), (0, 70, -0.24355696688454326), (0, 72, -0.15384923341233142), (0, 74, -0.11889603525952161), (0, 77, -0.253201054113273), (0, 82, -0.22475608241280154), (0, 83, -0.3451420702286576), (0, 90, -0.11466217825098587), (0, 92, -0.12206981432759502), (0, 102, -0.1699164256531861), (0, 105, -0.13984385100877586), (0, 106, -0.22522191234368502), (0, 108, -0.23070088220955512), (0, 111, -0.3276075557903345), (0, 112, -0.0877357460531419), (0, 113, -0.1

In [17]:
# Extract the TP- group: pt_cossim > 0, pt_activation > 0, activation_diff > 0.
# Each entry is (layer_idx, neuron_idx, dpo_pregelu_activation).
tp_minus = (df['pt_cossim'] > 0) & (df['pt_activation'] > 0) & (df['activation_diff'] > 0)

# Vectorised selection (same rows, same order as iterating df row by row).
selected = df.loc[tp_minus]
tuples_list = list(zip(
    selected['layer_idx'].astype(int).tolist(),
    selected['neuron_idx'].astype(int).tolist(),
    selected['dpo_pregelu_activation'].tolist(),
))

print(len(tuples_list))

3961


In [18]:
# Print the complete tuples_list (every entry, not a preview).
print(tuples_list)

[(0, 49, -0.03860640062988243), (0, 190, -0.06682422075622044), (0, 224, -0.024788514064628724), (0, 236, 1.135464758473831), (0, 289, 0.07496718787992478), (0, 361, 0.05426151601886997), (0, 598, 0.4284241124420239), (0, 739, -0.08064028537461958), (0, 783, 0.1846315088504679), (0, 891, -0.008100184622615202), (0, 1036, 0.09339957143115682), (0, 1105, 0.08565159674796204), (0, 1149, 0.08752224193552155), (0, 1254, 0.020509910437633453), (0, 1268, 0.17741685386992082), (0, 1275, 0.6121585132233669), (0, 1469, 0.0016794516422290327), (0, 1604, 0.1474937948430844), (0, 1644, -0.044582039569770124), (0, 1665, 0.1163441319768452), (0, 1680, 0.5325868771552782), (0, 1850, 0.22350130986402247), (0, 1920, -0.04045074824947983), (0, 1958, -0.04717174091162083), (0, 2274, -0.060648014152716076), (0, 2275, -0.04125795933731063), (0, 2352, -0.024648923979238926), (0, 2469, 0.05332039713864008), (0, 2595, 0.32270653745576683), (0, 2687, 0.23802199068794488), (0, 2816, 0.05720367047449895), (0, 285

In [19]:
# Extract the TP- and AN- groups together; each entry is
# (layer_idx, neuron_idx, dpo_pregelu_activation).
#   AN-: pt_cossim < 0, pt_activation < 0, activation_diff > 0
#   TP-: pt_cossim > 0, pt_activation > 0, activation_diff > 0
cossim, pt_act, act_diff = df['pt_cossim'], df['pt_activation'], df['activation_diff']
an_minus = (cossim < 0) & (pt_act < 0) & (act_diff > 0)
tp_minus = (cossim > 0) & (pt_act > 0) & (act_diff > 0)

# Vectorised selection (same rows, same order as iterating df row by row).
selected = df.loc[an_minus | tp_minus]
tuples_list = list(zip(
    selected['layer_idx'].astype(int).tolist(),
    selected['neuron_idx'].astype(int).tolist(),
    selected['dpo_pregelu_activation'].tolist(),
))

print(len(tuples_list))

22611


In [20]:
# Print the complete tuples_list (every entry, not a preview).
print(tuples_list)

[(0, 1, -0.02052735401498967), (0, 2, -0.15048227647177534), (0, 4, -0.18800731724807293), (0, 5, -0.24553627875608142), (0, 6, -0.12660189935442784), (0, 8, -0.060822373839585615), (0, 15, -0.12143598410295564), (0, 18, -0.12524690053225523), (0, 20, -0.03126176347447341), (0, 23, -0.11664557281219168), (0, 25, -0.11975942107625151), (0, 26, -0.29008096609707873), (0, 27, -0.28765884714106527), (0, 32, -0.23630463520245995), (0, 34, -0.11522444813712526), (0, 36, -0.19715018218314737), (0, 42, -0.14997252347541354), (0, 49, -0.03860640062988243), (0, 51, -0.10726768436610301), (0, 55, -0.20685623781710863), (0, 56, -0.16715704783800564), (0, 68, -0.19670802972252963), (0, 71, -0.3644310477287871), (0, 73, -0.22004496945207316), (0, 75, -0.18318189505102406), (0, 76, -0.15176439824489626), (0, 81, -0.17325669947493016), (0, 93, -0.13395858010117528), (0, 95, -0.3467653924632632), (0, 99, -0.026204123910879035), (0, 100, -0.17035655521245682), (0, 110, -0.11409251856980027), (0, 117, -0

In [21]:
# Extract the AN-, TN+ and TP- groups together (AP+ is left out); each entry
# is (layer_idx, neuron_idx, dpo_pregelu_activation).
#   AN-: pt_cossim < 0, pt_activation < 0, activation_diff > 0
#   TN+: pt_cossim > 0, pt_activation < 0, activation_diff > 0
#   TP-: pt_cossim > 0, pt_activation > 0, activation_diff > 0
cossim, pt_act, act_diff = df['pt_cossim'], df['pt_activation'], df['activation_diff']
group_mask = (
    ((cossim < 0) & (pt_act < 0) & (act_diff > 0))      # AN-
    | ((cossim > 0) & (pt_act < 0) & (act_diff > 0))    # TN+
    | ((cossim > 0) & (pt_act > 0) & (act_diff > 0))    # TP-
)

# Vectorised selection (same rows, same order as iterating df row by row).
selected = df.loc[group_mask]
tuples_list = list(zip(
    selected['layer_idx'].astype(int).tolist(),
    selected['neuron_idx'].astype(int).tolist(),
    selected['dpo_pregelu_activation'].tolist(),
))

print(len(tuples_list))

40390


In [22]:
# Print the complete tuples_list (every entry, not a preview).
print(tuples_list)

[(0, 0, -0.2302231817519518), (0, 1, -0.02052735401498967), (0, 2, -0.15048227647177534), (0, 4, -0.18800731724807293), (0, 5, -0.24553627875608142), (0, 6, -0.12660189935442784), (0, 7, -0.16444559843036385), (0, 8, -0.060822373839585615), (0, 10, -0.16013311379869205), (0, 15, -0.12143598410295564), (0, 17, -0.1130412431049659), (0, 18, -0.12524690053225523), (0, 19, -0.11566866532691565), (0, 20, -0.03126176347447341), (0, 22, -0.28852742635595946), (0, 23, -0.11664557281219168), (0, 25, -0.11975942107625151), (0, 26, -0.29008096609707873), (0, 27, -0.28765884714106527), (0, 30, -0.11844690305640676), (0, 32, -0.23630463520245995), (0, 33, -0.14265723187289606), (0, 34, -0.11522444813712526), (0, 36, -0.19715018218314737), (0, 39, -0.1957150109157087), (0, 41, -0.12086917020333544), (0, 42, -0.14997252347541354), (0, 43, -0.032290287430440726), (0, 44, -0.1684253630488014), (0, 49, -0.03860640062988243), (0, 51, -0.10726768436610301), (0, 52, -0.16818923566063212), (0, 55, -0.206856

#### Get the activations for top 128 toxic neurons / 36 positively activated toxic neurons

In [21]:
# Keep the 128 rows of df with the largest pt_cossim.
top_128_rows = df.nlargest(128, 'pt_cossim')

# Number of rows selected.
total_filtered_rows = top_128_rows.shape[0]

# No activation-sign filter is applied in this cell, so the label mentions
# only the cosine-similarity ranking.
print("Total number of rows with highest cosine similarity:", total_filtered_rows)

# One plain (layer_idx, neuron_idx, dpo_pregelu_activation) tuple per selected row.
filtered_tuples = list(top_128_rows[['layer_idx', 'neuron_idx', 'dpo_pregelu_activation']].itertuples(index=False, name=None))

print("List of tuples (layer_index, neuron_index, dpo_pregelu_activation):")
print(filtered_tuples)

Total number of rows with highest cosine similarity and positive GPT-2 activation: 128
List of tuples (layer_index, neuron_index, dpo_pregelu_activation):
[(19, 770, -0.016977283051654416), (12, 771, 0.04969537966124989), (18, 2669, 0.0038321316381261807), (13, 668, -0.07058810171775307), (16, 255, -0.0011058073715099835), (12, 882, -0.113394102185773), (19, 1438, 0.15294501710233885), (9, 545, -0.10686375994200416), (8, 2854, -0.05395217817731824), (3, 3680, -0.015262110350101077), (14, 1958, -0.1336680829902643), (7, 1735, -0.11794289513106787), (13, 2258, -0.10931066076252646), (11, 1550, -0.11230842916543096), (3, 704, -0.10765209018216775), (10, 3477, -0.09382688046268217), (13, 1023, -0.09818235291497254), (13, 253, -0.12699634360404508), (10, 2936, -0.15094208805809126), (0, 2352, -0.024648923979238926), (7, 1916, -0.13753848921096778), (3, 3742, -0.038883108764911044), (11, 2844, -0.19994687566480335), (11, 4021, -0.06456283217355124), (11, 175, -0.03322151300842863), (19, 3341

In [15]:
# Keep the 60 rows of df with the largest pt_cossim.
top_60_rows = df.nlargest(60, 'pt_cossim')

# Number of rows selected.
total_filtered_rows = top_60_rows.shape[0]

# No activation-sign filter is applied in this cell, so the label mentions
# only the cosine-similarity ranking.
print("Total number of rows with highest cosine similarity:", total_filtered_rows)

# One plain (layer_idx, neuron_idx, dpo_pregelu_activation) tuple per selected row.
filtered_tuples = list(top_60_rows[['layer_idx', 'neuron_idx', 'dpo_pregelu_activation']].itertuples(index=False, name=None))

print("List of tuples (layer_index, neuron_index, dpo_pregelu_activation):")
print(filtered_tuples)

Total number of rows with highest cosine similarity and positive GPT-2 activation: 60
List of tuples (layer_index, neuron_index, dpo_pregelu_activation):
[(19, 770, -0.016977283051654416), (12, 771, 0.04969537966124989), (18, 2669, 0.0038321316381261807), (13, 668, -0.07058810171775307), (16, 255, -0.0011058073715099835), (12, 882, -0.113394102185773), (19, 1438, 0.15294501710233885), (9, 545, -0.10686375994200416), (8, 2854, -0.05395217817731824), (3, 3680, -0.015262110350101077), (14, 1958, -0.1336680829902643), (7, 1735, -0.11794289513106787), (13, 2258, -0.10931066076252646), (11, 1550, -0.11230842916543096), (3, 704, -0.10765209018216775), (10, 3477, -0.09382688046268217), (13, 1023, -0.09818235291497254), (13, 253, -0.12699634360404508), (10, 2936, -0.15094208805809126), (0, 2352, -0.024648923979238926), (7, 1916, -0.13753848921096778), (3, 3742, -0.038883108764911044), (11, 2844, -0.19994687566480335), (11, 4021, -0.06456283217355124), (11, 175, -0.03322151300842863), (19, 3341,

In [17]:
# Rank rows by pt_cossim and keep the 128 largest.
top_128_rows = df.nlargest(128, 'pt_cossim')

# Of those, retain only the rows whose pt_activation is strictly positive.
filtered_rows = top_128_rows.loc[top_128_rows['pt_activation'].gt(0)]

# Row count after both filters.
total_filtered_rows = len(filtered_rows)

print("Total number of rows with highest cosine similarity and positive GPT-2 activation:", total_filtered_rows)

# One plain (layer_idx, neuron_idx, dpo_pregelu_activation) tuple per remaining row.
tuple_columns = ['layer_idx', 'neuron_idx', 'dpo_pregelu_activation']
filtered_tuples = list(filtered_rows[tuple_columns].itertuples(index=False, name=None))

print("List of tuples (layer_index, neuron_index, dpo_pregelu_activation):")
print(filtered_tuples)


Total number of rows with highest cosine similarity and positive GPT-2 activation: 36
List of tuples (layer_index, neuron_index, dpo_pregelu_activation):
[(19, 770, -0.016977283051654416), (12, 771, 0.04969537966124989), (18, 2669, 0.0038321316381261807), (13, 668, -0.07058810171775307), (16, 255, -0.0011058073715099835), (12, 882, -0.113394102185773), (19, 1438, 0.15294501710233885), (8, 2854, -0.05395217817731824), (3, 3680, -0.015262110350101077), (14, 1958, -0.1336680829902643), (13, 2258, -0.10931066076252646), (11, 1550, -0.11230842916543096), (10, 3477, -0.09382688046268217), (0, 2352, -0.024648923979238926), (3, 3742, -0.038883108764911044), (11, 4021, -0.06456283217355124), (11, 175, -0.03322151300842863), (19, 3341, -0.0400893763734337), (16, 603, -0.0830782511479893), (11, 2617, -0.08237272546877626), (8, 3200, 0.0812912976275755), (19, 2312, -0.07821941281372657), (20, 3210, 0.04564807678187968), (12, 3413, -0.09751140549980601), (6, 3972, 0.2822371293946999), (0, 3393, 0.1