In [1]:
from pandas import json_normalize
import pandas as pd
import json

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from tqdm.notebook import tqdm


def flattened_idx_to_layer_head(flattened_idx, num_heads):
    """Split a flat head index into its ``(layer, head)`` pair.

    Heads are numbered layer by layer with ``num_heads`` heads per layer, so
    the quotient is the layer and the remainder is the head inside that layer.
    """
    layer, head = divmod(flattened_idx, num_heads)
    return layer, head

def layer_head_to_flattened_idx(layer, head, num_heads):
    """Return the flat index of ``head`` in ``layer`` with ``num_heads`` heads per layer."""
    # All heads of the preceding layers come first in the flat ordering.
    layer_offset = layer * num_heads
    return layer_offset + head

def get_top_heads(train_idxs, val_idxs, separated_activations, separated_labels, num_layers, num_heads, seed, num_to_intervene, use_random_dir=False, specific_heads = None):
    """Train one probe per attention head and pick the heads to intervene on.

    Args:
        train_idxs, val_idxs: indices forwarded to ``train_probes`` to select
            the training and validation groups.
        separated_activations, separated_labels: per-group activations and
            labels, forwarded to ``train_probes``.
        num_layers, num_heads: layer count and heads per layer.
        seed: random seed forwarded to ``train_probes``.
        num_to_intervene: how many heads to return when ranking or sampling.
        use_random_dir: if true, the selection is replaced by
            ``num_to_intervene`` heads drawn at random without replacement
            (this also overrides ``specific_heads``).
        specific_heads: optional iterable of ``(layer, head)`` pairs to use
            instead of ranking heads by validation accuracy.

    Returns:
        ``(top_heads, probes)`` where ``top_heads`` is a list of
        ``(layer, head)`` pairs and ``probes`` is the list of fitted probes
        returned by ``train_probes``.
    """
    probes, all_head_accs_np, _train_accs = train_probes(seed, train_idxs, val_idxs, separated_activations, separated_labels, num_layers=num_layers, num_heads=num_heads)
    # One validation accuracy per head, in flat (layer-major) order; the
    # reshape also checks that the count matches num_layers * num_heads.
    flat_accs = all_head_accs_np.reshape(num_layers * num_heads)

    if specific_heads is not None:
        # Return (layer, head) pairs like every other branch: the consumer
        # get_interventions_dict unpacks each entry as `layer, head`, which
        # fails on flat integer indices.
        top_heads = [(head[0], head[1]) for head in specific_heads]
    else:
        # Flat indices of the best heads, highest validation accuracy first.
        top_accs = np.argsort(flat_accs)[::-1][:num_to_intervene]
        print(top_accs)
        top_heads = [flattened_idx_to_layer_head(idx, num_heads) for idx in top_accs]

    if use_random_dir:
        # overwrite top heads with random heads, no replacement
        random_idxs = np.random.choice(num_heads*num_layers, num_heads*num_layers, replace=False)
        top_heads = [flattened_idx_to_layer_head(idx, num_heads) for idx in random_idxs[:num_to_intervene]]

    return top_heads, probes

def get_interventions_dict(top_heads, probes, tuning_activations, num_heads, use_center_of_mass, use_random_dir, com_directions): 
    """Build the per-layer intervention table for the selected heads.

    Args:
        top_heads: iterable of ``(layer, head)`` pairs.
        probes: fitted probes indexed by flat head index; ``coef_`` is used as
            the direction when neither flag below is set.
        tuning_activations: array indexed as ``[sample, layer, head, feature]``.
        num_heads: heads per layer, used to compute flat head indices.
        use_center_of_mass: take the direction from ``com_directions``.
        use_random_dir: draw the direction from a standard normal instead
            (only consulted when ``use_center_of_mass`` is false).
        com_directions: directions indexed by flat head index.

    Returns:
        Dict mapping ``"model.layers.{layer}.self_attn.head_out"`` to a list of
        ``(head, unit_direction, proj_val_std)`` tuples sorted by head index.
    """
    interventions = {}
    for layer, head in top_heads:
        if use_center_of_mass:
            direction = com_directions[layer_head_to_flattened_idx(layer, head, num_heads)]
        elif use_random_dir:
            # Match the feature width of the activations rather than assuming 128.
            direction = np.random.normal(size=(tuning_activations.shape[-1],))
        else:
            direction = probes[layer_head_to_flattened_idx(layer, head, num_heads)].coef_
        # NOTE: an all-zero direction has norm 0 and turns into NaNs here.
        direction = direction / np.linalg.norm(direction)
        activations = tuning_activations[:, layer, head, :]  # samples x features
        proj_vals = activations @ direction.T
        # Spread of the activations along the direction.
        proj_val_std = np.std(proj_vals)
        layer_key = f"model.layers.{layer}.self_attn.head_out"
        interventions.setdefault(layer_key, []).append((head, direction.squeeze(), proj_val_std))

    # Order each layer's entries by head index.
    for entries in interventions.values():
        entries.sort(key=lambda entry: entry[0])

    return interventions

In [2]:
# Load the records; later cells read the `req_id`, `predict`, `attentions`
# and `correct` columns from this frame.
df = pd.read_json("requirements_data/dataframe_open_chat_cot_moon_06022024_attentions_gt.json")

df = df.reset_index(drop=True)
# Binary label: 0 when the prediction is "yes", 1 for anything else.
correct = [0 if value == "yes" else 1 for value in df.predict.values]
# Item assignment creates or overwrites the column. Attribute assignment
# (`df.correct = ...`) only works when the column already exists; otherwise
# pandas sets a plain Python attribute and no column is added.
df["correct"] = correct

In [20]:
import numpy as np 

index_dic = {}               # req_id -> row labels of that requirement in df
separated_activations = []   # one (n_rows, 32, 32, dim) array per requirement
separated_labels = []        # one list of 0/1 labels per requirement
reqs_order = []              # req_ids in the order the lists above were filled

# Per-row activation layout used by the reshape below.
dim = 128
activation_shape = (32, 32, dim)

for req_id in df['req_id'].unique():
    # Filter this requirement's rows once and reuse the slice for the index,
    # the activations and the labels.
    req_rows = df[df['req_id'] == req_id]

    index_dic[req_id] = list(req_rows.index)

    # Each `attentions` entry is a mapping; its values are collected into an
    # array and reshaped to one activation_shape block per row.
    activations = np.array([list(sample.values()) for sample in req_rows.attentions.values])
    activations = np.reshape(activations, (len(req_rows),) + activation_shape)

    temp_labels = [1 if label == True else 0 for label in req_rows['correct'].values]
    separated_labels.append(temp_labels)
    separated_activations.append(activations)
    reqs_order.append(req_id)

# Positional indices 0..n_requirements-1, used later to build the folds.
number_of_examples = np.arange(len(reqs_order))


from sklearn.linear_model import LogisticRegression

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.svm import SVC
def train_probes(seed, train_set_idxs, val_set_idxs, separated_head_wise_activations, separated_labels, num_layers, num_heads,c=100):
    """Fit one logistic-regression probe per (layer, head) and score it.

    Args:
        seed: ``random_state`` passed to every ``LogisticRegression``.
        train_set_idxs, val_set_idxs: indices selecting which entries of
            ``separated_head_wise_activations`` / ``separated_labels`` form
            the training and validation sets.
        separated_head_wise_activations: sequence of arrays indexed as
            ``[sample, layer, head, feature]``; the selected ones are
            concatenated along the sample axis.
        separated_labels: sequence of per-sample label sequences aligned with
            the activations.
        num_layers, num_heads: number of layers and heads per layer to probe.
        c: regularisation setting. ``0`` selects scikit-learn's default
            ``LogisticRegression``; any other value is used as ``C`` for an
            elastic-net model (``l1_ratio=0.5``, ``saga`` solver).

    Returns:
        ``(probes, all_head_accs_np, train_accs_np)``: the fitted classifiers
        in flat layer-major order, and arrays with the validation and training
        accuracy of each probe in the same order.
    """
    all_X_train = np.concatenate([separated_head_wise_activations[i] for i in train_set_idxs], axis = 0)
    print(all_X_train.shape)
    all_X_val = np.concatenate([separated_head_wise_activations[i] for i in val_set_idxs], axis = 0)
    y_train = np.concatenate([separated_labels[i] for i in train_set_idxs], axis = 0)
    print(y_train.shape)
    y_val = np.concatenate([separated_labels[i] for i in val_set_idxs], axis = 0)

    all_head_accs = []
    probes = []
    train_accs = []

    for layer in tqdm(range(num_layers)):
        for head in range(num_heads):
            X_train = all_X_train[:, layer, head, :]
            X_val = all_X_val[:, layer, head, :]

            if c == 0:
                # Sentinel: fall back to the library's default configuration.
                clf = LogisticRegression(random_state=seed, max_iter=10000)
            else:
                clf = LogisticRegression(random_state=seed, C=c, penalty='elasticnet', l1_ratio=0.5, max_iter=10000, solver='saga', tol=1e-4)
            clf.fit(X_train, y_train)

            train_accs.append(accuracy_score(y_train, clf.predict(X_train)))
            all_head_accs.append(accuracy_score(y_val, clf.predict(X_val)))
            probes.append(clf)

    all_head_accs_np = np.array(all_head_accs)
    train_accs_np = np.array(train_accs)
    return probes, all_head_accs_np, train_accs_np


In [25]:
from dataclasses import dataclass
import numpy as np
@dataclass
class Args:
    """Settings for the probe-training run.

    Field order is part of the interface (positional construction).
    """
    # Share of the groups held out for validation; the training share is 1 - val_ratio.
    val_ratio: float
    # Direction-selection flags; presumably forwarded to get_interventions_dict - TODO confirm.
    use_center_of_mass: bool
    use_random_dir: bool
    # Random seed for the run.
    seed: int
    # Number of attention heads.
    num_heads: int

# NOTE(review): num_heads=12 here differs from the num_heads = 32 used below;
# args.num_heads is not read in this cell - confirm which value is intended.
args = Args(val_ratio=0.5, use_center_of_mass=False, use_random_dir=False, seed=42, num_heads=12)

num_layers = 32
num_heads = 32

seed = 42

fold_results = []   # validation accuracies per head, one array per (C, fold)
fold_probes = []    # fitted probes, one list per (C, fold)

## Hyperparameters for LogisticRegression
regu = [0, 1, 10, 100]

# (layer, head) -> list of probe coefficient vectors, one appended per run.
norm_vectors = {}

# Heads whose probe directions are compared across regularisation settings.
heads = [(13, 0), (13, 11), (14, 0), (15, 5), (15, 7)]

# Row-label lists per requirement, in the same positional order as reqs_order.
index_lists = list(index_dic.values())

for c in regu:
    # Split the requirement groups into folds (a single fold uses all groups).
    num_fold = 1
    fold_idxs = np.array_split(number_of_examples, num_fold)
    for i in range(len(fold_idxs)):
        print(i)
        if num_fold == 1:
            train_idxs = np.arange(len(reqs_order))
        else:
            train_idxs = np.concatenate([fold_idxs[j] for j in range(num_fold) if j != i])

        # Re-seeding for every run makes the train/validation split identical
        # across all values of C.
        val_ratio = args.val_ratio
        rng = np.random.default_rng(seed)
        size = int(len(train_idxs) * (1 - val_ratio))
        train_set_idxs = rng.choice(train_idxs, size=size, replace=False)
        # Everything not drawn for training, in the original order.
        val_set_idxs = train_idxs[~np.isin(train_idxs, train_set_idxs)]

        if len(fold_idxs) == 1:
            test_idxs = val_set_idxs
        else:
            test_idxs = fold_idxs[i]

        print(fold_idxs)
        print(train_idxs)
        print("Train indexes:" , train_set_idxs)
        print("Validation indexes: ", val_set_idxs)

        # Rows of df belonging to the training groups.
        train_index_list = np.concatenate([index_lists[j] for j in train_set_idxs], axis=0)
        train_set = df.loc[train_index_list]

        val_idxs = val_set_idxs
        probes, all_head_accs_np, train_accs = train_probes(seed, train_set_idxs, val_idxs, separated_activations, separated_labels, num_layers=num_layers, num_heads=num_heads, c=c)
        fold_probes.append(probes)
        fold_results.append(all_head_accs_np)

        for head in heads:
            layer, h = head
            probe = probes[layer_head_to_flattened_idx(layer, h, num_heads)]
            # coef_ is iterated along its first axis, so each row is stored
            # as its own vector.
            norm_vectors.setdefault(head, []).extend(probe.coef_)
        



0
[array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39])]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39]
Train indexes: [17 34  5 31 38 25  4  1 15 18 24  2 39 22 29 32 28 10 19 16]
Validation indexes:  [ 0  3  6  7  8  9 11 12 13 14 20 21 23 26 27 30 33 35 36 37]
(116, 32, 32, 128)
(116,)


  0%|          | 0/32 [00:00<?, ?it/s]

0
[array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39])]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39]
Train indexes: [17 34  5 31 38 25  4  1 15 18 24  2 39 22 29 32 28 10 19 16]
Validation indexes:  [ 0  3  6  7  8  9 11 12 13 14 20 21 23 26 27 30 33 35 36 37]
(116, 32, 32, 128)
(116,)


  0%|          | 0/32 [00:00<?, ?it/s]

0
[array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39])]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39]
Train indexes: [17 34  5 31 38 25  4  1 15 18 24  2 39 22 29 32 28 10 19 16]
Validation indexes:  [ 0  3  6  7  8  9 11 12 13 14 20 21 23 26 27 30 33 35 36 37]
(116, 32, 32, 128)
(116,)


  0%|          | 0/32 [00:00<?, ?it/s]

0
[array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39])]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39]
Train indexes: [17 34  5 31 38 25  4  1 15 18 24  2 39 22 29 32 28 10 19 16]
Validation indexes:  [ 0  3  6  7  8  9 11 12 13 14 20 21 23 26 27 30 33 35 36 37]
(116, 32, 32, 128)
(116,)


  0%|          | 0/32 [00:00<?, ?it/s]

In [22]:
# Show the (layer, head) pair left in `head` by the last loop over `heads`.
head

(15, 7)

In [26]:
from sentence_transformers import util

for head in heads:
    print(head)
    # Stack the stored coefficient vectors into one 2-D array (one row per
    # vector) so the tensor conversion inside cos_sim does not have to walk a
    # Python list of separate ndarrays.
    embeds = np.stack(norm_vectors[head])
    # Pairwise cosine similarity between the probe directions of this head.
    print(util.cos_sim(embeds, embeds))

(13, 0)
tensor([[1.0000, 0.7732, 0.9448, 0.9245],
        [0.7732, 1.0000, 0.8546, 0.7573],
        [0.9448, 0.8546, 1.0000, 0.9721],
        [0.9245, 0.7573, 0.9721, 1.0000]], dtype=torch.float64)
(13, 11)
tensor([[1.0000, 0.7632, 0.9334, 0.9085],
        [0.7632, 1.0000, 0.8335, 0.7386],
        [0.9334, 0.8335, 1.0000, 0.9670],
        [0.9085, 0.7386, 0.9670, 1.0000]], dtype=torch.float64)
(14, 0)
tensor([[1.0000, 0.7538, 0.9379, 0.9143],
        [0.7538, 1.0000, 0.7669, 0.6744],
        [0.9379, 0.7669, 1.0000, 0.9748],
        [0.9143, 0.6744, 0.9748, 1.0000]], dtype=torch.float64)
(15, 5)
tensor([[1.0000, 0.6842, 0.9518, 0.9514],
        [0.6842, 1.0000, 0.7426, 0.6840],
        [0.9518, 0.7426, 1.0000, 0.9835],
        [0.9514, 0.6840, 0.9835, 1.0000]], dtype=torch.float64)
(15, 7)
tensor([[1.0000, 0.2739, 0.9410, 0.9518],
        [0.2739, 1.0000, 0.3091, 0.2580],
        [0.9410, 0.3091, 1.0000, 0.9746],
        [0.9518, 0.2580, 0.9746, 1.0000]], dtype=torch.float64)


In [16]:
#list(probes[layer_head_to_flattened_idx(layer, h, num_heads)].coef_.squeeze())

In [17]:
#np.stack(np,probes[layer_head_to_flattened_idx(layer, h, num_heads)].coef_.squeeze())

In [18]:
# Display the accumulated probe coefficient vectors, keyed by (layer, head).
norm_vectors

{(13,
  0): [array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([ 0.        ,  0.        ,  0.        ,  0.        , -0.33777962,
          0.        ,  0.        ,  0.        ,  0.67747629,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         -0.69441193,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        , -0.60763223,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        , 