# Collect Normalized Attention
- We extracted the normalized attentions with the following code.

In [13]:
import argparse
import warnings
import os
warnings.filterwarnings(action='ignore')
from tqdm.auto import tqdm
import pickle

import nltk.translate.bleu_score as bleu
import numpy as np
import torch
from diff_match_patch import diff_match_patch

import torch.nn.functional as F
from transformers import T5ForConditionalGeneration
from transformers import RobertaTokenizer
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers import set_seed,BatchEncoding
set_seed(42)

from torch.utils.data import DataLoader
from datasets import load_dataset, DatasetDict
from typing import Any, DefaultDict, List, Dict

In [15]:
# Run configuration shared by the cells below.
args = argparse.Namespace(
    **{
        # The extraction loop indexes element 0 of every batch.
        "batch_size": 1,  # do not change
        # Model/dataset variant; names containing "noabstract" are read from local text files.
        "size": "small_noabstract",
        # Generation length limit handed to `model.generate`.
        "max_length": 256,
        "device": "cuda",
        # Which split to process.
        "target_dataset": "val",
        # Which attention's value vectors to read: "encoder", "decoder" or "cross".
        "trgatt": "cross",
    }
)

# Load APR Model
# The tokenizer and the model are both restored from the same checkpoint directory.
load_model_dir = f"APR Models/codet5base_wild{args.size}"
device = args.device

tokenizer = RobertaTokenizer.from_pretrained(load_model_dir)

# `output_attentions=True` is set at load time; the extraction loop later reads
# `cross_attentions` from the generation output.
model_load_options = {"return_dict": True, "output_attentions": True}
model = T5ForConditionalGeneration.from_pretrained(load_model_dir, **model_load_options)

model.to(args.device)
model.eval()  # inference only

# Make the embedding table match the tokenizer's vocabulary size.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

print(f"Loaded from directory {load_model_dir}")

#  Load dataset
def _read_nonempty_lines(path):
    """Return the lines of the UTF-8 text file at *path*, skipping empty lines."""
    with open(path, "r", encoding='utf8') as f:
        return [line for line in f.read().split('\n') if line != '']


def _load_parallel_split(directory, split):
    """Read `<split>.buggy-fixed.buggy/.fixed` from *directory* as a dict of two lists.

    NOTE(review): empty lines are dropped from each file independently, so the
    two lists are only aligned if neither file contains blank lines - confirm.
    """
    prefix = f"{directory}/{split}.buggy-fixed"
    return {
        'buggy': _read_nonempty_lines(f"{prefix}.buggy"),
        'fixed': _read_nonempty_lines(f"{prefix}.fixed"),
    }


if 'noabstract' in args.size:
    # Local plain-text variant: only the validation and test splits are read.
    datasetdir = f"Dataset/{args.size}"
    val_data = _load_parallel_split(datasetdir, "val")
    test_data = _load_parallel_split(datasetdir, "test")

    available_splits = {'val': val_data, 'test': test_data}
    if args.target_dataset not in available_splits:
        # Previously `inputs`/`labels` silently stayed bound to the test split.
        raise ValueError(
            f"args.target_dataset={args.target_dataset!r} is not available for "
            f"size {args.size!r}; expected one of {sorted(available_splits)}"
        )
    inputs = available_splits[args.target_dataset]['buggy']
    labels = available_splits[args.target_dataset]['fixed']

else:
    dataset = load_dataset("code_x_glue_cc_code_refinement",args.size)
    train_data = dataset.data['train']
    val_data = dataset.data['validation']
    test_data = dataset.data['test']
    print(f"#train: {train_data.num_rows}, #val: {val_data.num_rows}, #test: {test_data.num_rows}")
    print(val_data.column_names)

    available_splits = {'test': test_data, 'val': val_data, 'train': train_data}
    if args.target_dataset not in available_splits:
        # Fail loudly instead of printing a hint and leaving `inputs`/`labels` undefined.
        raise ValueError(
            f"CHECK THE ARGS.TARGET_DATASET: got {args.target_dataset!r}, "
            f"expected one of {sorted(available_splits)}"
        )
    selected_split = available_splits[args.target_dataset]
    # Columns are converted to plain Python lists before tokenization.
    inputs = selected_split['buggy'].to_pylist()
    labels = selected_split['fixed'].to_pylist()
    print(f"load {args.target_dataset}")

Loaded from directory APR Models/codet5base_wildsmall_noabstract


# 1. Utils

In [4]:
def get_model_config(model_dir=None):
    """Return ``(num_heads, num_layers, d_kv)`` from a checkpoint's configuration.

    Only the configuration file is read; the model weights are not loaded a
    second time just to look up three integers.

    Args:
        model_dir: Checkpoint directory. Defaults to the module-level
            ``load_model_dir`` when omitted.

    Returns:
        A tuple ``(number_of_heads, number_of_layers, attention_dimsize)``.
    """
    from transformers import T5Config  # local import: only the config class is needed here

    config = T5Config.from_pretrained(load_model_dir if model_dir is None else model_dir)
    return config.num_heads, config.num_layers, config.d_kv


number_of_heads, number_of_layers, attention_dim = get_model_config()
print("number of head:", number_of_heads, "number of layer:", number_of_layers, "attention dim:", attention_dim)

number of head: 12 number of layer: 12 attention dim: 64


In [5]:
class BugFixDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Each item holds every field of ``encodings`` for one row as a tensor, plus
    ``labels`` (the target ``input_ids`` as a long tensor) and ``idx`` (the
    caller-supplied identifier of that row).
    """

    def __init__(self, encodings: BatchEncoding, targets: BatchEncoding, idxs):
        self.idxs = idxs
        self.encodings = encodings
        self.target_encodings = targets

    def __len__(self) -> int:
        # The number of rows is taken from the source-side token ids.
        return len(self.encodings["input_ids"])

    def __getitem__(self, index: int) -> Dict[str, Any]:
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[index])

        target_ids = self.target_encodings["input_ids"][index]
        sample["labels"] = torch.tensor(target_ids, dtype=torch.long)
        sample["idx"] = self.idxs[index]
        return sample

    
def create_dataset(
    idxs: List[int],
    inputs: List[str],
    labels: List[str],
    tokenizer: PreTrainedTokenizer,
    pad_truncate: bool,
    max_length=None,
) -> BugFixDataset:
    """Tokenize the source and target texts and wrap them in a `BugFixDataset`.

    Args:
        idxs: Identifier of each example, stored alongside it in the dataset.
        inputs: Source-side texts.
        labels: Target-side texts.
        tokenizer: Callable tokenizer applied to both text lists.
        pad_truncate: Passed as both the `truncation` and `padding` flag.
        max_length: Forwarded to the tokenizer as `max_length`.

    Returns:
        A `BugFixDataset` built from the two encodings and `idxs`.
    """
    # Both sides are tokenized with exactly the same settings.
    tokenize_options = {
        "truncation": pad_truncate,
        "padding": pad_truncate,
        "max_length": max_length,
    }
    source_encodings = tokenizer(inputs, **tokenize_options)
    target_encodings = tokenizer(labels, **tokenize_options)
    return BugFixDataset(source_encodings, target_encodings, idxs)

# Extract value vectors and attention maps to build the normalized attention map
- Results are saved in the ExtractedAttentions folder.
- They are the same as the files already saved in the Attentions folder (which we extracted in advance with the following code).

In [6]:
def process_beam_map(Amap, layer, head, beam_indices):
    """Stack one attention row per generation step, following the given beams.

    For every step `s`, the row `Amap[s][layer][beam_indices[s]][head][0]` is
    selected; the rows are stacked along a new leading dimension.

    Args:
        Amap: Attention maps indexed as `[step][layer][beam][head][0]`.
        layer: Layer index to read.
        head: Head index to read.
        beam_indices: Beam index to follow at each generation step.

    Returns:
        Tensor with one row per entry of `beam_indices`.
    """
    rows = [
        Amap[step][layer][beam][head][0]
        for step, beam in enumerate(beam_indices)  # output generation step
    ]
    return torch.stack(rows)

In [16]:
# Select the split to process; any other target keeps the `inputs`/`labels`
# chosen when the dataset was loaded.
if args.target_dataset=="val":
    inputs = val_data['buggy']
    labels = val_data['fixed']
elif args.target_dataset=="test":
    inputs = test_data['buggy']
    labels = test_data['fixed']

# Hub-loaded splits expose their columns through `.to_pylist()` (as used when
# the dataset is first loaded); the plain-text splits are already lists.
if hasattr(inputs, "to_pylist"):
    inputs = inputs.to_pylist()
if hasattr(labels, "to_pylist"):
    labels = labels.to_pylist()

# Derive the output folder from the model variant (same naming as the model
# directory) and make sure it exists before any pickle is written into it.
resultsavedir = f"ExtractedAttentions/codet5base_wild{args.size}"
os.makedirs(resultsavedir, exist_ok=True)

# Every (layer, head) combination, in layer-major order.
lhpairs = [(l,h) for l in range(number_of_layers) for h in range(number_of_heads)]

# Dataloader
# Each example carries its position in the split so results can be saved per example.
idxs = np.arange(len(inputs))
dataset = create_dataset(idxs, inputs, labels, tokenizer, pad_truncate=True)
dataloader = DataLoader(dataset,batch_size=args.batch_size,shuffle=False,drop_last=False)

# The bar is advanced manually, once per processed batch.
pbar = tqdm(total=len(dataloader))
torch.cuda.empty_cache()

# The loop reads element 0 of every batch, so a larger batch size would
# silently drop all other samples.
if args.batch_size != 1:
    raise ValueError(f"args.batch_size must be 1, got {args.batch_size!r}")

# Attention type -> accessor returning that attention module for a layer index.
attention_module_getters = {
    "encoder": lambda layer: model.encoder.block[layer].layer[0].SelfAttention,
    "decoder": lambda layer: model.decoder.block[layer].layer[0].SelfAttention,
    "cross": lambda layer: model.decoder.block[layer].layer[1].EncDecAttention,
}
if args.trgatt not in attention_module_getters:
    # Fail before generating anything instead of printing a hint and then
    # hitting an undefined (or stale) `vvecs`.
    raise ValueError(
        f"Check args.trgatt: got {args.trgatt!r}, expected one of {sorted(attention_module_getters)}"
    )
get_attention_module = attention_module_getters[args.trgatt]

for batch in dataloader:
    batch = {key:val.to(args.device) for key, val in batch.items()}
    # 1. batch info
    input_ids = batch['input_ids']
    answer_ids = batch['labels']
    idxs = batch['idx'].to('cpu').numpy()

    # 2. model output (beam search with 5 beams)
    key_outputs = model.generate(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'],
                                num_beams=5, max_length=args.max_length, output_scores=True, return_dict_in_generate=True)
    outputs = key_outputs['sequences'][:,1:] # remove start token

    # 3. Value vectors saved during generation
    # NOTE(review): `.vvecs` is read from the attention modules; this presumably
    # relies on a model implementation that stores the value vectors there - confirm.
    # Original author's note on the shape: (batchsize, #head, #token, 64).
    l2vvecs = {
        layer: get_attention_module(layer).vvecs.detach()
        for layer in range(number_of_layers)
    }

    # 4. Beam indices of the returned sequence, without the last entry
    beam_indices = key_outputs.beam_indices[0]
    beam_indices = beam_indices[:-1] # delete eos token

    # 5. Organize I/O tokens: drop id 0 (presumably padding - confirm), then the
    # first and last remaining ids.
    input_id = [x.item() for x in input_ids[0] if x.item()!=0][1:-1] # Remove eos, sos
    output_id = [x.item() for x in outputs[0] if x.item()!=0][1:-1]
    answer_id = [x.item() for x in answer_ids[0] if x.item()!=0][1:-1]
    em = int(output_id == answer_id)  # 1 when the generated ids equal the reference ids

    # 6. Make vector
    results = []
    total, changed = 0,0
    for layer, head in lhpairs:
        vvec = l2vvecs[layer][0][head] # Value vector of input tokens. All the same regardless of the beam. (0 is the place of the selected beam)
        vvec = vvec[1:len(input_id)+1]  # keep only the positions of `input_id`
        vnorms = torch.norm(vvec, dim=1)

        att_map = process_beam_map(key_outputs['cross_attentions'], layer, head, beam_indices) # Map stacked according to beam index
        att_map = att_map[1:len(output_id)+1,1:len(input_id)+1]

        # Normalized attention: each attention weight scaled by the norm of
        # the value vector of the attended input token.
        norm_map = att_map*vnorms
        att_map = att_map.detach().to('cpu').numpy()
        norm_map = norm_map.detach().to('cpu').numpy()

        # Count the rows whose most-attended input position changes after
        # the norm weighting.
        for row, norm_row in zip(att_map, norm_map):
            total+=1
            rank = np.argsort(row)[::-1]
            normrank = np.argsort(norm_row)[::-1]
            if rank[0]!=normrank[0]:
                changed+=1
        # NOTE(review): `vnorms` is stored as a torch tensor, unlike the two
        # maps which are converted to numpy - confirm that consumers expect this.
        results.append([idxs[0], em, input_id, output_id, answer_id, layer, head, vnorms, att_map, norm_map])
    print(idxs[0], em, changed, total, sep='\t')
    fileidx = idxs[0]

    # One pickle per example, named after its index in the split.
    with open(f"{resultsavedir}/{fileidx}.pkl","wb") as f:
        pickle.dump(results, f)

    del results
    torch.cuda.empty_cache()
    pbar.update(1)
    break # Only the first example is processed; remove this `break` to save all

  0%|          | 0/5835 [00:00<?, ?it/s]

0	0	3500	7776
