In [2]:
# Run the notebook with beliefs_llms env for torch

import torch as t
import numpy as np
import copy
from pathlib import Path
PATH = Path('full_results')
models = ['gpt-j', 'llama', 'llama_instruct', 'gemma', 'gemma_instruct']

## Accuracy

In [9]:
def score_accuracy(data, top_k=None):
    
    data = t.tensor(data) if not isinstance(data, t.Tensor) else data
    if top_k is not None:
        # Flatten to 1D and select top-k
        top_values = t.topk(data.flatten(), top_k).values
        score = top_values.mean().item()
    else:
        score = data.mean().item()
    return score

top_k = 5

for model in models:
    path = (PATH / model / 'ACCURACY')

    print("=============== ", model, " ===============")
    print() 
    print("RESIDUAL")
    print()

    accuracies = t.load(path / 'accuracies_residual', weights_only=False)
    print([f"{x:.3f}" for x in accuracies])
    print()

    print("Score: ", score_accuracy(accuracies, top_k=top_k))
    print()
    
    print("HEADS")
    print()
    accuracies = t.load(path / 'accuracies_heads', weights_only=False)
    for row in accuracies:
        print([f"{x:.3f}" for x in row])
    print()
    print("Score: ", score_accuracy(accuracies, top_k=top_k))
    print()

    


RESIDUAL

['0.497', '0.512', '0.515', '0.543', '0.562', '0.599', '0.651', '0.716', '0.762', '0.787', '0.827', '0.885', '0.899', '0.903', '0.898', '0.899', '0.901', '0.894', '0.889', '0.885', '0.881', '0.887', '0.885', '0.888', '0.885', '0.883', '0.876', '0.871', '0.868', '0.860', '0.868', '0.858']

Score:  0.9001723147616312

HEADS

['0.493', '0.513', '0.512', '0.509', '0.503', '0.497', '0.507', '0.503', '0.519', '0.501', '0.520', '0.517', '0.499', '0.509', '0.504', '0.505', '0.497', '0.492', '0.499', '0.498', '0.503', '0.509', '0.498', '0.501', '0.503', '0.510', '0.501', '0.499', '0.528', '0.491', '0.504', '0.499']
['0.516', '0.514', '0.515', '0.507', '0.496', '0.498', '0.501', '0.507', '0.518', '0.512', '0.504', '0.501', '0.507', '0.498', '0.524', '0.507', '0.510', '0.487', '0.510', '0.501', '0.500', '0.495', '0.512', '0.511', '0.494', '0.500', '0.496', '0.500', '0.498', '0.513', '0.507', '0.515']
['0.494', '0.508', '0.504', '0.491', '0.525', '0.523', '0.523', '0.508', '0.480', '0.4

## Use

In [13]:

models = ['gemma', 'gemma_instruct', 'llama', 'llama_instruct', 'gpt-j']

d_coeff = { 'gemma': 256/3584,                          # D_head/D_model
           'gemma_instruct': 256/3584,
           'llama': 128/4096,
           'llama_instruct': 128/4096,
           'gpt-j': 256/4096 
           }

def recover_pd(y, scale_proba=5.0, eps=1e-6):

    # WARNING: This function is necessary since in our experiment's run we logged scaled/squished results during the data collection
    # If you want to recover your own data and then check the results, make sure to change the return type of the function

    """
    Approximate recovery of tot_pd from sigmoid output y.
    Assumes y in (0, 1).
    """
    recovered_y = y
    recovered_y = t.tensor(recovered_y, dtype=t.float32)
    recovered_y = t.clamp(recovered_y, eps, 1 - eps)
    return (t.log(recovered_y / (1 - recovered_y)) / scale_proba).item()
    # return y

vectorized_stre = np.vectorize(recover_pd)

model_stats_residual = {'gemma': {'ft_og_probas': {'clean': {'result':(), 'a':0, 'k':0},'control': {'result':(), 'a':0, 'k':0}},'tf_og_probas': {'clean': {'result':(), 'a':0, 'k':0},'control': {'result':(), 'a':0, 'k':0}}},
    'gemma_instruct': {'ft_og_probas': {'clean': {'result':(), 'a':0, 'k':0},'control': {'result':(), 'a':0, 'k':0}},'tf_og_probas': {'clean': {'result':(), 'a':0, 'k':0},'control': {'result':(), 'a':0, 'k':0}}},
    'llama' : {'ft_og_probas': {'clean': {'result':(), 'a':0, 'k':0},'control': {'result':(), 'a':0, 'k':0}},'tf_og_probas': {'clean': {'result':(), 'a':0, 'k':0},'control': {'result':(), 'a':0, 'k':0}}},
    'llama_instruct': {'ft_og_probas': {'clean': {'result':(), 'a':0, 'k':0},'control': {'result':(), 'a':0, 'k':0}},'tf_og_probas': {'clean': {'result':(), 'a':0, 'k':0},'control': {'result':(), 'a':0, 'k':0}}},
    'gpt-j': {'ft_og_probas': {'clean': {'result':(), 'a':0, 'k':0},'control': {'result':(), 'a':0, 'k':0}},'tf_og_probas': {'clean': {'result':(), 'a':0, 'k':0},'control': {'result':(), 'a':0, 'k':0}}}
}

model_stats_heads = copy.deepcopy(model_stats_residual)

for model in models:

    print(model)
    print()
    print("RESIDUAL")
    checkfile = t.load(PATH / model / 'USE' / 'intervention_scores_residual', weights_only=False)
    print(checkfile.keys())
    print(checkfile['sweep'].keys())
    print(checkfile['sweep']['alphas'])
    print(checkfile['sweep']['ks'])
    print(checkfile['fixed'].keys())
    print(checkfile['fixed']['ft'])
    print(checkfile['fixed']['tf'])
    model_stats_residual[model]['ft_og_probas']['clean']['result'] = (recover_pd(checkfile['fixed']['ft']['clean'][1][0][0]), recover_pd(checkfile['fixed']['ft']['clean'][1][0][1]))
    model_stats_residual[model]['ft_og_probas']['clean']['a'] = checkfile['fixed']['ft']['alpha'][1]
    model_stats_residual[model]['ft_og_probas']['clean']['k'] = checkfile['fixed']['ft']['k'][0]
    model_stats_residual[model]['ft_og_probas']['control']['result'] = (recover_pd(checkfile['fixed']['ft']['control'][1][0][0]), recover_pd(checkfile['fixed']['ft']['control'][1][0][1]))
    model_stats_residual[model]['ft_og_probas']['control']['a'] = checkfile['fixed']['ft']['alpha_control'][1]
    model_stats_residual[model]['ft_og_probas']['control']['k'] = checkfile['fixed']['ft']['k_control'][0]
    model_stats_residual[model]['tf_og_probas']['clean']['result'] = (recover_pd(checkfile['fixed']['tf']['clean'][1][0][0]), recover_pd(checkfile['fixed']['tf']['clean'][1][0][1]))
    model_stats_residual[model]['tf_og_probas']['clean']['a'] = checkfile['fixed']['tf']['alpha'][1]
    model_stats_residual[model]['tf_og_probas']['clean']['k'] = checkfile['fixed']['tf']['k'][0]
    model_stats_residual[model]['tf_og_probas']['control']['result'] = (recover_pd(checkfile['fixed']['tf']['control'][1][0][0]), recover_pd(checkfile['fixed']['tf']['control'][1][0][1]))
    model_stats_residual[model]['tf_og_probas']['control']['a'] = checkfile['fixed']['tf']['alpha_control'][1]
    model_stats_residual[model]['tf_og_probas']['control']['k'] = checkfile['fixed']['ft']['k_control'][0]

    print()
    print("HEADS")
    checkfile = t.load(PATH / model / 'USE' / 'intervention_scores_heads', weights_only=False)
    print(checkfile.keys())
    print(checkfile['sweep'].keys())
    print(checkfile['sweep']['alphas'])
    print(checkfile['sweep']['ks'])
    print(checkfile['fixed'].keys())
    print(checkfile['fixed']['ft'])
    print(checkfile['fixed']['tf'])
    model_stats_heads[model]['ft_og_probas']['clean']['result'] = (recover_pd(checkfile['fixed']['ft']['clean'][1][0][0]), recover_pd(checkfile['fixed']['ft']['clean'][1][0][1]))
    model_stats_heads[model]['ft_og_probas']['clean']['a'] = checkfile['fixed']['ft']['alpha'][1]
    model_stats_heads[model]['ft_og_probas']['clean']['k'] = checkfile['fixed']['ft']['k'][0]
    model_stats_heads[model]['ft_og_probas']['control']['result'] = (recover_pd(checkfile['fixed']['ft']['control'][1][0][0]), recover_pd(checkfile['fixed']['ft']['control'][1][0][1]))
    model_stats_heads[model]['ft_og_probas']['control']['a'] = checkfile['fixed']['ft']['alpha_control'][1]
    model_stats_heads[model]['ft_og_probas']['control']['k'] = checkfile['fixed']['ft']['k_control'][0]
    model_stats_heads[model]['tf_og_probas']['clean']['result'] = (recover_pd(checkfile['fixed']['tf']['clean'][1][0][0]), recover_pd(checkfile['fixed']['tf']['clean'][1][0][1]))
    model_stats_heads[model]['tf_og_probas']['clean']['a'] = checkfile['fixed']['tf']['alpha'][1]
    model_stats_heads[model]['tf_og_probas']['clean']['k'] = checkfile['fixed']['tf']['k'][0]
    model_stats_heads[model]['tf_og_probas']['control']['result'] = (recover_pd(checkfile['fixed']['tf']['control'][1][0][0]), recover_pd(checkfile['fixed']['tf']['control'][1][0][1]))
    model_stats_heads[model]['tf_og_probas']['control']['a'] = checkfile['fixed']['tf']['alpha_control'][1]
    model_stats_heads[model]['tf_og_probas']['control']['k'] = checkfile['fixed']['ft']['k_control'][0]
    print(model_stats_residual[model]['ft_og_probas']['clean'] == model_stats_heads[model]['ft_og_probas']['clean'])
    print()


model_stats_residual['llama']['ft_og_probas']['control']['a'] = 9
model_stats_residual['llama']['ft_og_probas']['control']['k'] = 8
model_stats_residual['llama']['tf_og_probas']['control']['a'] = 1
model_stats_residual['llama']['tf_og_probas']['control']['k'] = 2
model_stats_residual['llama_instruct']['ft_og_probas']['control']['a'] = 3
model_stats_residual['llama_instruct']['ft_og_probas']['control']['k'] = 3
model_stats_residual['llama_instruct']['tf_og_probas']['control']['a'] = 2
model_stats_residual['llama_instruct']['tf_og_probas']['control']['k'] = 6
model_stats_heads['llama_instruct']['ft_og_probas']['control']['a'] = 6
model_stats_heads['llama_instruct']['ft_og_probas']['control']['k'] = 25
model_stats_heads['llama_instruct']['tf_og_probas']['control']['a'] = 5
model_stats_heads['llama_instruct']['tf_og_probas']['control']['k'] = 15

gemma

RESIDUAL
dict_keys(['sweep', 'fixed'])
dict_keys(['alphas', 'ks', 'ft', 'tf'])
[25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0]
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
dict_keys(['ft', 'tf'])
defaultdict(<function nested_dict at 0x000001834ACBE200>, {'alpha': [0, 31.0], 'k': [7], 'alpha_control': [0, 35.0], 'k_control': [7], 'clean': (array([[0.0464907 , 0.79244151]]), array([[0.57519531, 0.77001953]])), 'control': (array([[0.0464907 , 0.20875825]]), array([[0.57519531, 0.62011719]]))})
defaultdict(<function nested_dict at 0x000001834ACBE200>, {'alpha': [0, -33.0], 'k': [22], 'alpha_control': [0, -33.0], 'k_control': [22], 'clean': (array([[0.        , 0.19607241]]), array([[0.33813477, 0.76855469]])), 'control': (array([[0., 0.]]), array([[0.33813477, 0.390625  ]]))})

HEADS
dict_keys(['sweep', 'fixed'])
dict_keys(['alphas', 'ks', 'ft', 'tf'])
[2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, 30.0]
[1, 5, 10, 15, 20, 25, 30,

In [None]:
def stre(a, k, model_name, heads=False):
    k = k*d_coeff[model_name] if heads else k
    return np.abs(k*a)

def score(num, k=2.5):
    return 1/(1+np.exp(-k*num))

def score_e_(num, k=0.15):
    return 1/(1+np.exp(-k*(num - 1)))

for model, model_data in model_stats_residual.items():
    for block in model_data.values():
        for entry in block.values():
            entry["s"] = stre(entry["a"], entry["k"], model)
            entry["e"] = entry["result"][1] - entry["result"][0]
        block["effect"] = block["clean"]["e"]
        block["effect_"] = np.abs(( block["clean"]["e"] / np.sqrt(block["clean"]["s"]) ) / (block["control"]["e"] / np.sqrt(block["control"]["s"]) ))
        
for model, model_data in model_stats_heads.items():
    for block in model_data.values():
        for entry in block.values():
            entry["s"] = stre(entry["a"], entry["k"], model, heads=True)
            entry["e"] = entry["result"][1] - entry["result"][0]
        block["effect"] = block["clean"]["e"]
        block["effect_"] = np.abs(( block["clean"]["e"] / np.sqrt(block["clean"]["s"]) ) / (block["control"]["e"] / np.sqrt(block["control"]["s"]) ))

print("RESIDUAL")
for model, values in model_stats_residual.items():
    print(model)
    print('F->T')
    print("E", f"{values['ft_og_probas']['effect']:.3f} ")
    print("E->", f"{values['ft_og_probas']['effect_']:.3f}")
    print("Alpha, K, Alpha_Control, K_Control", values['ft_og_probas']['clean']['a'], values['ft_og_probas']['clean']['k'], values['ft_og_probas']['control']['a'], values['ft_og_probas']['control']['k'])
    print("S", np.sqrt(values['ft_og_probas']['clean']['s']))
    print('T->F')
    print("E", f"{values['tf_og_probas']['effect']:.3f}")
    print("E->", f"{values['tf_og_probas']['effect_']:.3f}")
    print("Alpha, K, Alpha_Control, K_Control", values['tf_og_probas']['clean']['a'], values['tf_og_probas']['clean']['k'], values['tf_og_probas']['control']['a'], values['tf_og_probas']['control']['k'])
    print("S", np.sqrt(values['tf_og_probas']['clean']['s']))

    print('Avg')
    print("E", f"{np.mean([values['ft_og_probas']['effect'], values['tf_og_probas']['effect']]):.3f}")
    print("E->", f"{np.mean([values['ft_og_probas']['effect_'], values['tf_og_probas']['effect_']]):.3f}")
    print()
    print('Score')
    print("E", f"{score(np.mean([values['ft_og_probas']['effect'], values['tf_og_probas']['effect']])):.3f}")
    print("E->", f"{score_e_(np.mean([values['ft_og_probas']['effect_'], values['tf_og_probas']['effect_']])):.3f}")
    print()

print("HEADS")
for model, values in model_stats_heads.items():
    print(model)
    print('F->T')
    print("E", f"{values['ft_og_probas']['effect']:.3f}")
    print("E->", f"{values['ft_og_probas']['effect_']:.3f}")
    print("Alpha, K, Alpha_Control, K_Control", values['ft_og_probas']['clean']['a'], values['ft_og_probas']['clean']['k'], values['ft_og_probas']['control']['a'], values['ft_og_probas']['control']['k'])
    print("S", np.sqrt(values['ft_og_probas']['clean']['s']))
    print('T->F')
    print("E", f"{values['tf_og_probas']['effect']:.3f}")
    print("E->", f"{values['tf_og_probas']['effect_']:.3f}")
    print("Alpha, K, Alpha_Control, K_Control", values['tf_og_probas']['clean']['a'], values['tf_og_probas']['clean']['k'], values['tf_og_probas']['control']['a'], values['tf_og_probas']['control']['k'])
    print("S", np.sqrt(values['tf_og_probas']['clean']['s']))
    print('Avg')
    print()
    print('Score')
    print("E", f"{score(np.mean([values['ft_og_probas']['effect'], values['tf_og_probas']['effect']])):.3f}")
    print("E->", f"{score_e_(np.mean([values['ft_og_probas']['effect_'], values['tf_og_probas']['effect_']])):.3f}")
    print()

RESIDUAL
gemma
F->T
E 0.181 
E-> 5.145
Alpha, K, Alpha_Control, K_Control 31.0 7 35.0 7
S 14.730919862656235
T->F
E 0.374
E-> 4.653
Alpha, K, Alpha_Control, K_Control -33.0 22 -33.0 7
S 26.94438717061496
Avg
E 0.278
E-> 4.899

Score
E 0.667
E-> 0.642
gemma_instruct
F->T
E 0.633 
E-> 7.391
Alpha, K, Alpha_Control, K_Control 30.0 16 30.0 16
S 21.908902300206645
T->F
E 1.139
E-> 16.814
Alpha, K, Alpha_Control, K_Control -30.0 26 -30.0 16
S 27.92848008753788
Avg
E 0.886
E-> 12.102

Score
E 0.902
E-> 0.841
llama
F->T
E 0.323 
E-> 7.360
Alpha, K, Alpha_Control, K_Control 9.0 1 9 8
S 3.0
T->F
E 0.055
E-> 2.556
Alpha, K, Alpha_Control, K_Control -6.0 1 1 2
S 2.449489742783178
Avg
E 0.189
E-> 4.958

Score
E 0.616
E-> 0.644
llama_instruct
F->T
E 1.183 
E-> 9.769
Alpha, K, Alpha_Control, K_Control 3.0 3 3 3
S 3.0
T->F
E 0.892
E-> 13.882
Alpha, K, Alpha_Control, K_Control -5.0 2 2 6
S 3.1622776601683795
Avg
E 1.037
E-> 11.825

Score
E 0.930
E-> 0.835
gpt-j
F->T
E 0.014 
E-> 0.879
Alpha, K, Alpha_C

## Coherence (Probabilistic)

In [None]:
model = 'llama'
llama_data_dict = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_logits_logistic_regression_mmp_self_report")
llama_data_dict_heads = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_logistic_regression_mmp_heads")

model = 'llama_instruct'
llama_instruct_data_dict_1 = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_logits_logistic_regression_mmp")
llama_instruct_data_dict_2 = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_self_report")
llama_instruct_data_dict_heads = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_logistic_regression_mmp_heads")
llama_instruct_data_dict = {**llama_instruct_data_dict_1, **llama_instruct_data_dict_2}

model = 'gpt-j'
gpt_j_data_dict_1 = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_logits_logistic_regression_mmp")
gpt_j_data_dict_2 = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_self_report")
gpt_j_data_dict_heads = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_logistic_regression_mmp_heads")
gpt_j_data_dict = {**gpt_j_data_dict_1, **gpt_j_data_dict_2}

model = 'gemma'
gemma_data_dict_1 = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_logistic_regression_mmp_logits")
gemma_data_dict_2 = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_self_report")
gemma_data_dict_heads = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_logistic_regression_mmp_heads")
gemma_data_dict = {**gemma_data_dict_1, **gemma_data_dict_2}

model = 'gemma_instruct'
gemma_instruct_data_dict_1 = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_logits_logistic_regression_mmp")
gemma_instruct_data_dict_2 = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_self_report")
gemma_instruct_data_dict_heads = t.load(PATH / model / 'COHERENCE' / f"coherence_scoresnegorand_logistic_regression_mmp_heads")
gemma_instruct_data_dict = {**gemma_instruct_data_dict_1, **gemma_instruct_data_dict_2}

models = ['llama', 'llama_instruct', 'gpt-j', 'gemma', 'gemma_instruct']

from collections import OrderedDict

key_order = ['logistic_regression', 'mmp', 'self_report', 'logits']

def make_ordered_custom(d, order):
    """
    Recursively convert nested dicts into OrderedDicts
    following a custom key order.
    """
    if isinstance(d, dict):
        od = OrderedDict()
        for k in order:
            if k in d:
                od[k] = make_ordered_custom(d[k], order)
        # Add any keys not in the custom order at the end
        for k in d:
            if k not in order:
                od[k] = make_ordered_custom(d[k], order)
        return od
    elif isinstance(d, (list, tuple)):
        return type(d)(make_ordered_custom(x, order) if isinstance(x, dict) else x for x in d)
    else:
        return d

# Example: convert your data
dicts = [llama_data_dict, llama_instruct_data_dict, gpt_j_data_dict, gemma_data_dict, gemma_instruct_data_dict]
dicts_heads = [llama_data_dict_heads, llama_instruct_data_dict_heads, gpt_j_data_dict_heads, gemma_data_dict_heads, gemma_instruct_data_dict_heads]
data_ordered = [make_ordered_custom(d, key_order) for d in dicts]
data_ordered_heads = [make_ordered_custom(d, key_order) for d in dicts_heads]

for heads, full in zip(data_ordered_heads, data_ordered):
    for k, v in full.items():
        heads.setdefault(k, v)

print("RESIDUAL")

for model, value in zip(models, data_ordered):
    print(model, "\n",  value)
print("HEADS")

for model, value in zip(models, data_ordered_heads):
    print(model, "\n",  value)

logreg_data = [x['logistic_regression'] for x in data_ordered]
logreg_data_heads = [x['logistic_regression'] for x in data_ordered_heads]

def score_coherence_2(data):

    diffs = [v[0]-v[1] for v in data.values()]
   
    return np.mean(diffs) + 0.5

model_scores_coherence_2 = []

for model in logreg_data:
    model_scores_coherence_2.append(score_coherence_2(model))

model_scores_coherence_2.append(0.5)
models = ['llama', 'llama_instruct', 'gpt-j', 'gemma', 'gemma_instruct', 'baselines']

llama_coherence_2 = score_coherence_2(logreg_data[0])
llama_instruct_coherence_2 = score_coherence_2(logreg_data[1])
gpt_j_coherence_2 = score_coherence_2(logreg_data[2])
gemma_coherence_2 = score_coherence_2(logreg_data[3])
gemma_instruct_coherence_2 = score_coherence_2(logreg_data[4])
print("RESIDUALS\n")
for model, score in zip(models, model_scores_coherence_2):
    print(model, score-0.5)

print("\nHEADS\n")
model_scores_coherence_2_heads = []

for model in logreg_data_heads:
    model_scores_coherence_2_heads.append(score_coherence_2(model))

model_scores_coherence_2_heads.append(0.5)
models = ['llama', 'llama_instruct', 'gpt-j', 'gemma', 'gemma_instruct', 'baselines']

llama_coherence_2_heads = score_coherence_2(logreg_data_heads[0])
llama_instruct_coherence_2_heads = score_coherence_2(logreg_data_heads[1])
gpt_j_coherence_2_heads = score_coherence_2(logreg_data_heads[2])
gemma_coherence_2_heads = score_coherence_2(logreg_data_heads[3])
gemma_instruct_coherence_2_heads = score_coherence_2(logreg_data_heads[4])

print()
print("Scores")
print()

for model, score in zip(models, model_scores_coherence_2_heads):
    print(model, score)


RESIDUAL
llama 
 OrderedDict([('logistic_regression', OrderedDict([('neg', (tensor(0.8406, dtype=torch.float64), tensor(0.7642, dtype=torch.float64))), ('or', (tensor(0.7754), 0.5)), ('and', (tensor(0.8359), 0.5))])), ('mmp', OrderedDict([('neg', (tensor(0.6961), tensor(0.6841))), ('or', (tensor(0.8477), 0.5)), ('and', (tensor(0.9546), 0.5))])), ('self_report', OrderedDict([('neg', (tensor(0.8284), tensor(0.7898))), ('or', (tensor(0.4741), 0.5)), ('and', (tensor(0.9272), 0.5))])), ('logits', OrderedDict([('neg', (tensor(0.8965, dtype=torch.float64), tensor(0.8724, dtype=torch.float64))), ('or', (tensor(0.7261), 0.5)), ('and', (tensor(0.3882), 0.5))]))])
llama_instruct 
 OrderedDict([('logistic_regression', OrderedDict([('neg', (tensor(0.7324, dtype=torch.float64), tensor(0.7080, dtype=torch.float64))), ('or', (tensor(0.7925), 0.5)), ('and', (tensor(0.8516), 0.5))])), ('mmp', OrderedDict([('neg', (tensor(0.7510), tensor(0.7137))), ('or', (tensor(0.8091), 0.5)), ('and', (tensor(0.8496), 

## Coherence (Logic) & Uniformity

In [None]:
models = ['llama', 'llama_instruct', 'gpt-j', 'gemma', 'gemma_instruct']
unif_types = ['logic', 'logicheads', 'domain', 'domainheads']
np.set_printoptions(threshold=np.inf, linewidth=np.inf)

# Container for all matrices
matrices = {}

for model in models:
    matrices[model] = {}
    for UNIF_TYPE in unif_types:
        if UNIF_TYPE in ['domain', 'domainheads']:
            x_ticks = ['All_Datasets', 'Common_Claims', 'CC+Cities', 'cities', 'CC+Companies', 'companies', 
                       'CC+Sp_en', 'Sp_en', 'CC+Larger_Than', 'Larger_than', 'CC+Counterfact', 'Counterfact']
            y_ticks = ['cities', 'Common_Claims', 'companies', 'Sp_En', 'Larger_Than', 'Counterfact']

        elif UNIF_TYPE in ['logic', 'logicheads']:
            x_ticks = ['All_Datasets', 'Common_Claims', 'CC+C+D+N', 'CC+C+D', 'CC+C+N', 'CC+D+N', 
                       'CC+C', 'CC+D', 'CC+N', 'Only_Conj', 'Only_Disj', 'Only_neg']
            y_ticks = ['cities', 'neg_cities', 'conj_cities', 'disj_cities', 'Common_Claims', 'Conj_Common_Claims',
                       'Disj_Common_Claims', 'Neg_Common_Claims', 'companies', 'Sp_en', 'Neg_Sp_En', 
                       'Larger_Than', 'Smaller_Than', 'Counterfact']
        else:
            raise ValueError("UNIF_TYPE not recognized")

        # Load the data
        data_sweep = t.load(PATH / model / 'UNIFORMITY' / f"uniformity_uniformity{UNIF_TYPE}")
        data_array = np.array([[data_sweep[row][col][0] for col in sorted(data_sweep[row])] 
                               for row in sorted(data_sweep)]).T

        # Store the full matrix
        matrices[model][UNIF_TYPE] = data_array

        print(model, UNIF_TYPE, data_array)

llama logic [[0.9701087  0.60597826 0.63586957 0.81793478 0.53804348 0.5923913  0.625      0.85326087 0.5298913  0.62228261 0.56793478 0.16576087]
 [0.94021739 0.52173913 0.54619565 0.55163043 0.50815217 0.5326087  0.52445652 0.53804348 0.50815217 0.52173913 0.44293478 0.48913043]
 [0.63586957 0.67663043 0.56793478 0.74456522 0.50815217 0.51358696 0.53532609 0.64130435 0.51902174 0.49728261 0.62228261 0.49184783]
 [0.57608696 0.50543478 0.5625     0.58967391 0.45380435 0.50543478 0.48913043 0.58423913 0.55434783 0.48641304 0.56521739 0.51358696]
 [0.57065217 0.6576087  0.60507246 0.72644928 0.49818841 0.58514493 0.63586957 0.73731884 0.45018116 0.58152174 0.61594203 0.2201087 ]
 [0.66032609 0.68931159 0.62137681 0.64855072 0.68025362 0.58152174 0.69474638 0.64402174 0.6576087  0.70471014 0.34963768 0.36865942]
 [0.39492754 0.47282609 0.47463768 0.56431159 0.33514493 0.48188406 0.44836957 0.59963768 0.33061594 0.39311594 0.7182971  0.36503623]
 [0.35778986 0.29347826 0.29438406 0.230978

In [20]:
def score_uniformity(matrix):
    matrix = np.array(matrix)
    
    # Overall score
    overall_score = np.mean(matrix) - np.std(matrix)
    
    # First column score
    first_col = matrix[:, 0]
    first_col_score = np.mean(first_col) - np.std(first_col)
    
    return overall_score, first_col_score

print()
print("=========== LOGIC ===========")
print()
llama_coherence_1 = score_uniformity(matrices['llama']['logic']) 
llama_instruct_coherence_1 = score_uniformity(matrices['llama_instruct']['logic']) 
gpt_j_coherence_1 = score_uniformity(matrices['gpt-j']['logic']) 
gemma_coherence_1 = score_uniformity(matrices['gemma']['logic']) 
gemma_instruct_coherence_1 = score_uniformity(matrices['gemma_instruct']['logic']) 
llama_coherence_1_heads = score_uniformity(matrices['llama']['logicheads']) 
llama_instruct_coherence_1_heads = score_uniformity(matrices['llama_instruct']['logicheads']) 
gpt_j_coherence_1_heads = score_uniformity(matrices['gpt-j']['logicheads']) 
gemma_coherence_1_heads = score_uniformity(matrices['gemma']['logicheads']) 
gemma_instruct_coherence_1_heads = score_uniformity(matrices['gemma_instruct']['logicheads']) 


coherence_scores = {
    "llama": llama_coherence_1,
    "llama_instruct": llama_instruct_coherence_1,
    "gpt_j": gpt_j_coherence_1,
    "gemma": gemma_coherence_1,
    "gemma_instruct": gemma_instruct_coherence_1
}

coherence_scores_heads = {
    "llama": llama_coherence_1_heads,
    "llama_instruct": llama_instruct_coherence_1_heads,
    "gpt_j": gpt_j_coherence_1_heads,
    "gemma": gemma_coherence_1_heads,
    "gemma_instruct": gemma_instruct_coherence_1_heads
}

print("RESIDUALS")
for model, score in coherence_scores.items():
    print(model, "\ngeneral", score[0], "all_datasets", score[1])

print()
print("HEADS")
for model, score in coherence_scores_heads.items():
    print(model, "\ngeneral", score[0], "all_datasets", score[1])

print()
print("=========== DOMAIN ===========")
print()

llama_domain = score_uniformity(matrices['llama']['domain']) 
llama_instruct_domain = score_uniformity(matrices['llama_instruct']['domain']) 
gpt_j_domain = score_uniformity(matrices['gpt-j']['domain']) 
gemma_domain = score_uniformity(matrices['gemma']['domain']) 
gemma_instruct_domain = score_uniformity(matrices['gemma_instruct']['domain']) 
llama_domain_heads = score_uniformity(matrices['llama']['domainheads']) 
llama_instruct_domain_heads = score_uniformity(matrices['llama_instruct']['domainheads']) 
gpt_j_domain_heads = score_uniformity(matrices['gpt-j']['domainheads']) 
gemma_domain_heads = score_uniformity(matrices['gemma']['domainheads']) 
gemma_instruct_domain_heads = score_uniformity(matrices['gemma_instruct']['domainheads']) 


domain_scores = {
    "llama": llama_domain,
    "llama_instruct": llama_instruct_domain,
    "gpt_j": gpt_j_domain,
    "gemma": gemma_domain,
    "gemma_instruct": gemma_instruct_domain
}

domain_scores_heads = {
    "llama": llama_domain_heads,
    "llama_instruct": llama_instruct_domain_heads,
    "gpt_j": gpt_j_domain_heads,
    "gemma": gemma_domain_heads,
    "gemma_instruct": gemma_instruct_domain_heads
}

print("RESIDUALS")
for model, score in domain_scores.items():
    print(model, "\ngeneral", score[0], "all_datasets", score[1])

print()
print("HEADS")
for model, score in domain_scores_heads.items():
    print(model, "\ngeneral", score[0], "all_datasets", score[1])



RESIDUALS
llama 
general 0.4031002976693211 all_datasets 0.5318043842582401
llama_instruct 
general 0.36961754379750555 all_datasets 0.5648162507072334
gpt_j 
general 0.38601121711885966 all_datasets 0.4877322158634759
gemma 
general 0.3874629960165311 all_datasets 0.5869364175167491
gemma_instruct 
general 0.36699448959868824 all_datasets 0.6014197270555243

HEADS
llama 
general 0.4074889749893479 all_datasets 0.5432705314110398
llama_instruct 
general 0.39484853083649335 all_datasets 0.5911866524633791
gpt_j 
general 0.4123788788411172 all_datasets 0.4440703676487687
gemma 
general 0.39009399405761847 all_datasets 0.6161804678767203
gemma_instruct 
general 0.4020422087115511 all_datasets 0.6160866238689355


RESIDUALS
llama 
general 0.5060460867301433 all_datasets 0.747667177367418
llama_instruct 
general 0.6273841045124473 all_datasets 0.7440517758617988
gpt_j 
general 0.5057649197070717 all_datasets 0.6697445155165004
gemma 
general 0.5757640961829853 all_datasets 0.7630370675660