In [None]:
import sys

# Extend the module search path BEFORE the project-local import below. In the
# original cell this append ran after `from gpt2_model import *`, so it could
# not help that import on a fresh kernel (Restart & Run All).
# NOTE(review): assumes gpt2_model may live in the parent directory — confirm.
sys.path.append('..')

# The star import stays ahead of the explicit imports so that any name it
# exports is still overridden by the explicit imports that follow, as before.
from gpt2_model import *
from transformers import AutoTokenizer
import os
import json
import tqdm
import numpy as np
import pickle
import torch
import torch.nn.functional as F
from torch.func import jacrev
from copy import deepcopy

# Base location under which the per-block LRE artefacts are stored; the main
# loop builds `{external_path}\gpt2\lres\{block}` from it.
external_path=''

In [None]:
def estimate_lre(folder,filename,sample_size,block,model,tokenizer,external_path,target_obj_toks):
    prompts_file=open(f'relation_data/text/{folder}/{filename}.txt','r')
    prompts_no_object_file=open(f'../relation_data/text_no_object/{folder}/{filename}.txt','r')
    indices_file=open(f'relation_data/index/{folder}/{filename}.txt','r')
    prompts=[prompt.split('\n')[0] for prompt in prompts_file.readlines()]
    prompts_no_object=[prompt.split('\n')[0] for prompt in prompts_no_object_file.readlines()]
    indices=[[int(line.split(',')[0]),int(line.split(',')[1]),int(line.split(',')[2])] for line in indices_file.readlines()]
    prompts_file.close()
    prompts_no_object_file.close()
    indices_file.close()

    for n in range(5):
        sample_positions=np.sort(np.random.randint(low=0,high=len(prompts),size=sample_size,dtype=int))
        s_c_o=[]

        for rel_pos,abs_pos in enumerate(sample_positions):
            prefix_indices=np.delete(sample_positions,rel_pos)
            prefix_text=' '.join([prompts[i]+'.' for i in prefix_indices])
            end_sub_token_pos=len(tokenizer(prefix_text).input_ids)+indices[abs_pos][0]
            text=prefix_text+' '+prompts_no_object[abs_pos]
            beg_obj_token_pos=len(tokenizer(text).input_ids)
            s_c_o.append([end_sub_token_pos,text,[beg_obj_token_pos,indices[abs_pos][-1]]])
        
        jac_size=len(target_obj_toks)
        jacobian=torch.zeros((jac_size,768))
        bias=torch.zeros(jac_size)
        count=np.zeros(sample_size)
        for k,triple in enumerate(s_c_o):
            sub_act=model.forward_with_activation_return(tokenizer(triple[1],return_tensors='pt').input_ids,block,triple[0])[1]
            obj_idx_logits=model.forward(tokenizer(triple[1],return_tensors='pt').input_ids)[0][0,-1,:]
            if obj_idx_logits.argmax()==triple[-1][-1]:
                func = lambda x:model.F(tokenizer(triple[1],return_tensors='pt').input_ids,block,triple[0],x,-1)[target_obj_toks]  

                ft_jacobian=jacrev(func)(sub_act)
                jacobian+=ft_jacobian.squeeze(1)
                bias+=obj_idx_logits[target_obj_toks]-jacobian@sub_act.squeeze(0)
                count[k]=1
        sum_count=np.sum(count)
        if sum_count!=0:
            jacobian=jacobian/sum_count
            bias=bias/sum_count
            torch.save(jacobian,f'{external_path}\\{folder}\\{filename}_jacobian.pt')
            torch.save(bias,f'{external_path}\\{folder}\\{filename}_bias.pt')
            return s_c_o, count
    return 'None'

In [None]:
def faithfulness(folder,filename,external_path,model,tokenizer,num_supporting_prompts,target_obj_toks,block,top_k=1,beta=1):

    index_file=open(f'relation_data/index/{folder}/{filename}.txt','r')
    index_file_lines=index_file.readlines()
    end_sub_idxs=[int(line.split(',')[0]) for line in index_file_lines]
    beg_obj_toks=[int(line.split(',')[-1]) for line in index_file_lines]
    index_file.close()

    prompts_file=open(f'relation_data/text_no_object/{folder}/{filename}.txt','r')
    prompts=[line.split('\n')[0] for line in prompts_file]
    prompts_file.close()

    prompts_with_object_file=open(f'relation_data/text/{folder}/{filename}.txt','r')
    prompts_with_object=[line.split('\n')[0] for line in prompts_with_object_file]
    prompts_with_object_file.close()
    N=len(prompts_with_object)

    jacobian=torch.load(f'{external_path}\\{folder}\\{filename}_jacobian.pt')
    bias=torch.load(f'{external_path}\\{folder}\\{filename}_bias.pt')

    count=0
    accuracy=0
    for n,prompt_token in enumerate(prompts):
        support_prompts_idx=np.random.choice(np.delete(np.arange(N),n),num_supporting_prompts)

        prefix_text=' '.join([prompts_with_object[i]+'.' for i in support_prompts_idx])
        end_sub_token_pos=len(tokenizer(prefix_text).input_ids)+end_sub_idxs[n]

        text=prefix_text+' '+prompts[n]
        prompt_token=tokenizer(text,return_tensors='pt').input_ids
        beg_obj_tok_idx=len(prompt_token)

        beg_obj_idx_logits=model.forward(prompt_token)[0][0,-1,:]
        if beg_obj_idx_logits.argmax()==beg_obj_toks[n]:
            count+=1
            sub_act=model.forward_with_activation_return(prompt_token,block,end_sub_token_pos)[1]
            lre=beta*jacobian@sub_act.squeeze(0)+bias
            beg_obj_idx_logits_with_lre=deepcopy(beg_obj_idx_logits.detach())
            beg_obj_idx_logits_with_lre[target_obj_toks]=lre.detach()
            if beg_obj_toks[n] in torch.argsort(beg_obj_idx_logits_with_lre,descending=True)[:top_k]:
                accuracy+=1
    if count!=0:
        return accuracy/count
    return 'None'

In [None]:
# Load the pretrained 'gpt2' weights through GPT / GPTConfig (not imported explicitly in
# this notebook; presumably provided by `from gpt2_model import *`).
# NOTE(review): GPT(GPTConfig) is constructed from the config *class* and from_pretrained
# is then called on that instance; if from_pretrained builds its own model the first
# instance is discarded — confirm against gpt2_model.
gpt2=GPT(GPTConfig).from_pretrained('gpt2')
# Hugging Face tokenizer for the "gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
# Experiment bookkeeping: selects the `experiment_<n>` results directory.
experiment_no=9

# LRE estimation: number of prompts drawn per estimation attempt.
sample_size=8

# Faithfulness evaluation: few-shot examples placed before each prompt, size of the
# top-logit set that must contain the object token, and scale of the Jacobian term.
num_supporting_prompts=7
top_k=1
beta=16

In [None]:
# Relation files with this many lines or more are left out of the experiment.
MAX_PROMPT_LINES=250

# Map every relation folder to the (extension-less) names of its usable prompt files.
directories={}
for folder in os.listdir('relation_data/text'):
    folder_path=os.path.join('relation_data/text',folder)
    # Skip stray files and hidden entries (e.g. '.ipynb_checkpoints', '.DS_Store'),
    # which would otherwise break the nested listing or add bogus relations.
    if folder.startswith('.') or not os.path.isdir(folder_path):
        continue
    files=[]
    for filename in os.listdir(folder_path):
        # Later cells always open f'{name}.txt', so only .txt files are usable. splitext
        # keeps dotted names intact, where split('.')[0] would truncate them.
        stem,extension=os.path.splitext(filename)
        if extension!='.txt':
            continue
        with open(os.path.join(folder_path,filename),'r') as file:
            num_lines=sum(1 for _ in file)
        if num_lines<MAX_PROMPT_LINES:
            files.append(stem)
    directories[folder]=files

In [None]:
# Record the experiment settings the first time this experiment number is used.
# NOTE(review): results are written under '../faithfulness' and indices are read from
# '../relation_data', while other cells use 'faithfulness/...' and 'relation_data/...'
# without the '../' prefix — confirm the intended working directory.
experiment_dir=f'../faithfulness/experiment_{experiment_no}'
if not os.path.exists(experiment_dir):
    os.makedirs(experiment_dir)  # also creates '../faithfulness' when it is missing
    with open(f'{experiment_dir}/experiment_details.txt','w') as experiment_file:
        experiment_file.write(f'num_supporting_prompts={num_supporting_prompts}\nbeta={beta}\ntop_k={top_k}')

for block_number in range(1,10+1):

    faithfulness_dict={}
    updated_external_path=f'{external_path}\\gpt2\\lres\\{block_number}'

    # A missing block directory means no LRE has been estimated for this block yet;
    # otherwise the saved Jacobians/biases and their log files are reused.
    get_jacobian=not os.path.exists(updated_external_path)
    if get_jacobian:
        os.mkdir(updated_external_path)
        for folder in directories.keys():
            os.mkdir(f'{updated_external_path}\\{folder}')

    K=len(directories)
    for k,folder in enumerate(directories):

        log_path=f'{updated_external_path}\\{folder}\\estimating_jacobian.txt'
        cached_outcomes={}
        file=open(log_path,'w' if get_jacobian else 'r')
        try:
            if get_jacobian:
                file.write(f'Block:{block_number}\nSamples:{sample_size}\n')
            else:
                # Log format: two header lines, then '<filename>: <outcome>' per relation.
                # Looking results up by name (instead of by line position) survives
                # reordered listings and outcomes that wrap over several lines.
                for line in file.readlines()[2:]:
                    name,_,result=line.partition(': ')
                    cached_outcomes[name]=result.strip()

            N=len(directories[folder])
            progress_bar=tqdm.tqdm(enumerate(directories[folder]),total=N)
            for n,filename in progress_bar:
                progress_bar.set_description(f'Block {block_number}...processing {folder}/{filename} ({n+1}/{N})/({k+1}/{K})...')

                # Distinct object token ids of this relation, in ascending order.
                with open(f'../relation_data/index/{folder}/{filename}.txt','r') as indices:
                    target_obj_toks=sorted({int(line.split(',')[-1]) for line in indices})

                relation=f'{folder}/{filename}'
                if get_jacobian:
                    outcome=estimate_lre(folder,filename,sample_size,block_number,gpt2,tokenizer,updated_external_path,target_obj_toks)
                    if outcome=='None':
                        file.write(f'{filename}: None\n')
                        faithfulness_dict[relation]='None'
                        continue
                    file.write(f'{filename}: {outcome[0]} {outcome[1]}\n')
                elif cached_outcomes.get(filename,'None')=='None':
                    # No LRE was saved for this relation (the old check compared the whole
                    # log line with 'None', never matched, and then tried to load a missing file).
                    faithfulness_dict[relation]='None'
                    continue

                # Same top_k/beta on fresh and cached runs; the fresh-run call used to
                # fall back to the defaults (top_k=1, beta=1) regardless of the settings.
                faithfulness_dict[relation]=faithfulness(folder,filename,updated_external_path,gpt2,tokenizer,num_supporting_prompts,target_obj_toks,block_number,top_k,beta)
        finally:
            file.close()

    with open(f'{experiment_dir}/faithfulness_{block_number}','wb') as faithfulness_file:
        pickle.dump(faithfulness_dict,faithfulness_file)

In [None]:
# Combine the per-block result files of one experiment.
#   accuracies_dict[relation]     -> (accuracies in ascending block order, block of the best accuracy)
#   max_accuracies_dict[relation] -> (best accuracy, block of the best accuracy)
# A stored 'None' (no estimate for that block) counts as accuracy 0.
accuracies_dict={}
max_accuracies_dict={}
results_dir=f'faithfulness/experiment_{experiment_no}'

block_files=[]
for file in os.listdir(results_dir):
    if 'png' in file or 'txt' in file:
        continue
    block_label=file.split('_')[-1].split('.')[0]
    # Only 'faithfulness_<block number>' files hold per-block results. This skips the
    # 'faithfulness_combined' file written at the end of this cell, which otherwise
    # makes a second run of the cell fail on int('combined').
    if not block_label.isdigit():
        continue
    block_files.append((int(block_label),file))

# Ascending block order makes the result independent of the directory listing order;
# on ties the lowest block is kept as the best one.
for block,file in sorted(block_files):
    # pickle is only safe for files produced by this notebook; do not load untrusted files.
    with open(f'{results_dir}/{file}','rb') as faithfulness_file:
        accuracies=pickle.load(faithfulness_file)

    for relation,acc in accuracies.items():
        score=0 if acc=='None' else acc
        if relation not in accuracies_dict:
            accuracies_dict[relation]=([score],block)
            max_accuracies_dict[relation]=(score,block)
            continue

        history,best_block=accuracies_dict[relation]
        if acc!='None' and acc>max(history):
            best_block=block
            max_accuracies_dict[relation]=(acc,block)
        accuracies_dict[relation]=(history+[score],best_block)

with open(f'{results_dir}/faithfulness_combined','wb') as max_accuracies_file:
    pickle.dump(max_accuracies_dict,max_accuracies_file)

In [None]:
# NOTE(review): `plt` is not imported in the visible part of this notebook; presumably it
# comes in through `from gpt2_model import *` — confirm, or import matplotlib.pyplot explicitly.

# Read the saved settings as key=value pairs so the title does not depend on the line
# order of the details file and non-integer values (e.g. a fractional beta) are accepted.
experiment_settings={}
with open(f'faithfulness/experiment_{experiment_no}/experiment_details.txt','r') as experiment_file:
    for line in experiment_file:
        key,separator,value=line.strip().partition('=')
        if not separator:
            continue
        try:
            experiment_settings[key]=int(value)
        except ValueError:
            experiment_settings[key]=float(value)
# Kept for backward compatibility: the parsed values in file order.
experiment_parameters=list(experiment_settings.values())

fig,ax=plt.subplots()

fig.set_figwidth(10)

# Order the relations by their best accuracy, highest first.
neg_heights=np.array([-max(heights) for heights,block in accuracies_dict.values()])
relation_labels=np.array([label for label in accuracies_dict.keys()])
sorted_indices=np.argsort(neg_heights)

xlocs=[]
xlabs=[]
for x,relation in enumerate(relation_labels[sorted_indices]):
    heights,max_block=accuracies_dict[relation]
    # Tallest bar first, so the shorter bars of the other blocks are drawn on top of it.
    for height in sorted(heights,reverse=True):
        ax.bar(x,height,color='white',edgecolor='black')

    xlocs.append(x)
    xlabs.append(f'{relation} ({max_block})')

ax.set_ylim((0,1.05))
ax.set_xticks(xlocs,xlabs,rotation='vertical')
ax.set_xlabel('Relation (block with the best accuracy)')
ax.set_ylabel('Faithfulness')
ax.set_title(f'Beta={experiment_settings["beta"]}, Top {experiment_settings["top_k"]}')
plt.savefig(f'faithfulness/experiment_{experiment_no}/faithfulness_plot.png',bbox_inches='tight')
plt.show()