## GiLOT

In [1]:
import torch
import os

# Expose only GPUs 2 and 3 to this process; `cuda:0` below then refers to the
# first of the visible devices.
os.environ["CUDA_VISIBLE_DEVICES"] = "2, 3"
device = torch.device("cuda:0")

# Model identifier handed to load_model / get_template in later cells.
model_name = "llama-7b"

# Settings forwarded to interpreter.interpret_ours in the attribution cell.
beams, max_new_tokens = 20, 10

In [2]:
from utils import load_model, get_template
# load_model returns seven values; the last two are not needed in this
# notebook and are discarded.
(
    model,
    tokenizer,
    block_name,
    embedding_name,
    embed_token_name,
    _,
    _,
) = load_model(model_name)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [01:02<00:00, 31.10s/it]
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


{'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 0, 'model.layers.15': 0, 'model.layers.16': 1, 'model.layers.17': 1, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.norm': 1, 'lm_head': 1}


In [3]:
from interpreter import Interpreter
# Wrap the loaded model in the attribution interpreter used by the cells below.
# NOTE(review): `embed_token_name` is passed as both the third and the fourth
# argument, while `embedding_name` returned by load_model is not used anywhere
# in the cells visible here — confirm against Interpreter's signature whether
# the last argument was meant to be `embedding_name`.
interpreter = Interpreter(model, block_name, embed_token_name, embed_token_name)

In [4]:
from utils import load_model, get_template
# Prompt template (prefix/postfix strings) for the selected model.
template = get_template(model_name)

# `beams` and `max_new_tokens` come from the configuration cell at the top of
# the notebook. They are deliberately not re-assigned here, so there is a
# single place to tune them.

query = "Moreover, he was in dread that if he persisted in his disagreement O'Brien would twist the dial again."
input_text = f"{template['prefix']}{query.strip()}{template['postfix']}"

# Rebind the result of .to() so the device move is explicit instead of relying
# on the encoding object being mutated in place.
inputs = tokenizer(input_text, return_tensors="pt").to(device)

# Attribute the generated continuation back to the input tokens using the
# "optimal_transport" scoring mode.
attributions, probs_sequences = interpreter.interpret_ours(inputs.input_ids, beams, max_new_tokens, "optimal_transport")
print(attributions)
print(probs_sequences)

[{'token_index': 0, 'optimal_transport': tensor(0.6343, device='cuda:0', dtype=torch.float64)}, {'token_index': 1, 'optimal_transport': tensor(0.5324, device='cuda:0', dtype=torch.float64)}, {'token_index': 2, 'optimal_transport': tensor(0.0768, device='cuda:0', dtype=torch.float64)}, {'token_index': 3, 'optimal_transport': tensor(0.8737, device='cuda:0', dtype=torch.float64)}, {'token_index': 4, 'optimal_transport': tensor(0.1759, device='cuda:0', dtype=torch.float64)}, {'token_index': 5, 'optimal_transport': tensor(0.1701, device='cuda:0', dtype=torch.float64)}, {'token_index': 6, 'optimal_transport': tensor(0.4339, device='cuda:0', dtype=torch.float64)}, {'token_index': 7, 'optimal_transport': tensor(0.2333, device='cuda:0', dtype=torch.float64)}, {'token_index': 8, 'optimal_transport': tensor(0.3202, device='cuda:0', dtype=torch.float64)}, {'token_index': 9, 'optimal_transport': tensor(0.0836, device='cuda:0', dtype=torch.float64)}, {'token_index': 10, 'optimal_transport': tensor(0

### Visualizations

In [5]:
from captum.attr import visualization
import matplotlib.pyplot as plt

def show_heatmap_on_text(text, text_encoding, R_text):
    """Render per-token attribution scores as a captum text heatmap.

    The scores are normalised by their sum, ``text`` is tokenised again with
    the notebook-level ``tokenizer``, and every decoded token is displayed
    next to its score through ``captum.attr.visualization``.

    Args:
        text: The string that was attributed; it is encoded here to obtain
            the tokens shown in the heatmap.
        text_encoding: Unused. Kept in the signature so existing callers do
            not break.
        R_text: Attribution scores; any tensor-like object supporting
            ``sum()``, division and ``flatten()``.
            NOTE(review): presumably one score per token produced by
            ``tokenizer.encode(text)`` — nothing here checks that the
            lengths agree.

    Returns:
        None. The heatmap is displayed as a side effect of
        ``visualization.visualize_text``.
    """
    total = R_text.sum()
    # Dividing by a zero sum would turn every score into NaN/inf, which does
    # not yield a meaningful heatmap; in that case show the raw scores.
    if total != 0:
        text_scores = R_text / total
    else:
        text_scores = R_text
    text_scores = text_scores.flatten()

    token_ids = tokenizer.encode(text)
    # Decode the ids one at a time so each token gets its own heatmap cell.
    text_tokens_decoded = [tokenizer.decode([token_id]) for token_id in token_ids]

    # Positional fields of VisualizationDataRecord, in order: word
    # attributions, predicted probability, predicted class, true class,
    # attribution class, attribution score, raw tokens, convergence score.
    # Only the attributions and tokens carry information here; the rest are
    # placeholders.
    record = visualization.VisualizationDataRecord(text_scores, 0, 0, 0, 0, 0, text_tokens_decoded, 1)
    visualization.visualize_text([record])


In [6]:
import torch
# Gather the per-token "optimal_transport" scores into one 1-D tensor.
# NOTE(review): list position is used as the token position below; the
# 'token_index' field of each entry is not consulted.
attribution_scores = torch.stack([entry['optimal_transport'] for entry in attributions])
print(attribution_scores)

# Keep the highest-scoring half of the positions (topk returns them sorted
# from largest to smallest).
max_scores, max_indices = torch.topk(
    attribution_scores,
    k=int(0.5 * len(attribution_scores)),
    dim=0,
)
print(max_scores)
print(max_indices)

# Convert the token IDs to token strings.
tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])

# Pair each selected position's token with its attribution score.
result = [
    (tokens[position], score)
    for position, score in zip(max_indices.tolist(), max_scores.tolist())
]

# Print the results.
for rank, (token, score) in enumerate(result):
    print(f"Column {rank}: Token = {token}, Max Attribution = {score}")

show_heatmap_on_text(input_text, inputs.input_ids, attribution_scores)

tensor([0.6343, 0.5324, 0.0768, 0.8737, 0.1759, 0.1701, 0.4339, 0.2333, 0.3202,
        0.0836, 0.2051, 0.1736, 0.1752, 0.0460, 0.3330, 0.0614, 0.3419, 0.7779,
        0.8092, 0.6071, 0.5705, 0.6949, 0.8503, 0.4499, 0.3854, 0.5106, 0.4674,
        0.3939, 0.7468], device='cuda:0', dtype=torch.float64)
tensor([0.8737, 0.8503, 0.8092, 0.7779, 0.7468, 0.6949, 0.6343, 0.6071, 0.5705,
        0.5324, 0.5106, 0.4674, 0.4499, 0.4339], device='cuda:0',
       dtype=torch.float64)
tensor([ 3, 22, 18, 17, 28, 21,  0, 19, 20,  1, 25, 26, 23,  6],
       device='cuda:0')
Column 0: Token = ▁he, Max Attribution = 0.873740987708477
Column 1: Token = ▁tw, Max Attribution = 0.8503011899245474
Column 2: Token = ', Max Attribution = 0.8092123379219845
Column 3: Token = ▁O, Max Attribution = 0.7778865854996828
Column 4: Token = ., Max Attribution = 0.7467759334811783
Column 5: Token = ▁would, Max Attribution = 0.6948849747916637
Column 6: Token = <s>, Max Attribution = 0.6342742113948501
Column 7: Token =

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,0 (0.00),0.0,0.0,"#s Moreover , he was in dread that if he pers isted in his disag re ement O ' B rien would tw ist the d ial again ."
,,,,
