## GiLOT

In [1]:
import torch
import os

model_name = 'gpt2-large'
beams = 20
max_new_tokens = 10

os.environ["CUDA_VISIBLE_DEVICES"] = "1,2, 3"
device = torch.device("cuda:0")

In [2]:
from utils import load_model, get_template
model, tokenizer, block_name, embedding_name, embed_token_name, _, _ = load_model(model_name)

transformer.wte.weight
transformer.wpe.weight
transformer.h.0.ln_1.weight
transformer.h.0.ln_1.bias
transformer.h.0.attn.c_attn.weight
transformer.h.0.attn.c_attn.bias
transformer.h.0.attn.c_proj.weight
transformer.h.0.attn.c_proj.bias
transformer.h.0.ln_2.weight
transformer.h.0.ln_2.bias
transformer.h.0.mlp.c_fc.weight
transformer.h.0.mlp.c_fc.bias
transformer.h.0.mlp.c_proj.weight
transformer.h.0.mlp.c_proj.bias
transformer.h.1.ln_1.weight
transformer.h.1.ln_1.bias
transformer.h.1.attn.c_attn.weight
transformer.h.1.attn.c_attn.bias
transformer.h.1.attn.c_proj.weight
transformer.h.1.attn.c_proj.bias
transformer.h.1.ln_2.weight
transformer.h.1.ln_2.bias
transformer.h.1.mlp.c_fc.weight
transformer.h.1.mlp.c_fc.bias
transformer.h.1.mlp.c_proj.weight
transformer.h.1.mlp.c_proj.bias
transformer.h.2.ln_1.weight
transformer.h.2.ln_1.bias
transformer.h.2.attn.c_attn.weight
transformer.h.2.attn.c_attn.bias
transformer.h.2.attn.c_proj.weight
transformer.h.2.attn.c_proj.bias
transformer.h.2.ln_2

In [3]:
from interpreter import Interpreter
interpreter = Interpreter(model, block_name, embed_token_name, embed_token_name)

In [5]:
from utils import load_model, get_template
template = get_template(model_name)

beams = 20
max_new_tokens = 15

query = "The BRW Rich 200, 2014 is the 31st annual survey of the wealthiest people resident"
input_text = f"{template['prefix']}{query.strip()}{template['postfix']}"
inputs = tokenizer(input_text, return_tensors="pt")
inputs.to(device)
attributions, probs_sequences = interpreter.interpret_ours(inputs.input_ids, beams, max_new_tokens, "optimal_transport")
print(attributions)
print(probs_sequences)

[{'token_index': 0, 'optimal_transport': tensor(0.4701, device='cuda:0', dtype=torch.float64)}, {'token_index': 1, 'optimal_transport': tensor(0.2135, device='cuda:0', dtype=torch.float64)}, {'token_index': 2, 'optimal_transport': tensor(0.2023, device='cuda:0', dtype=torch.float64)}, {'token_index': 3, 'optimal_transport': tensor(0.2421, device='cuda:0', dtype=torch.float64)}, {'token_index': 4, 'optimal_transport': tensor(0.1625, device='cuda:0', dtype=torch.float64)}, {'token_index': 5, 'optimal_transport': tensor(0.0941, device='cuda:0', dtype=torch.float64)}, {'token_index': 6, 'optimal_transport': tensor(0.0822, device='cuda:0', dtype=torch.float64)}, {'token_index': 7, 'optimal_transport': tensor(0.3652, device='cuda:0', dtype=torch.float64)}, {'token_index': 8, 'optimal_transport': tensor(0.0760, device='cuda:0', dtype=torch.float64)}, {'token_index': 9, 'optimal_transport': tensor(0.1068, device='cuda:0', dtype=torch.float64)}, {'token_index': 10, 'optimal_transport': tensor(0

### Visualizations

In [6]:
from captum.attr import visualization
import matplotlib.pyplot as plt
from IPython.core.display import HTML
import os

def show_heatmap_on_text(text, text_encoding, R_text, output_html="heatmap_visualization.html"):
    # Normalize relevance scores
    text_scores = R_text / R_text.sum()
    text_scores = text_scores.flatten()
    
    # Tokenize and decode the text
    text_tokens = tokenizer.encode(text)
    text_tokens_decoded = [tokenizer.decode([a]) for a in text_tokens]
    
    # Prepare visualization data
    vis_data_records = [
        visualization.VisualizationDataRecord(
            text_scores, 0, 0, 0, 0, 0, text_tokens_decoded, 1
        )
    ]
    
    # Generate HTML visualization
    html_content = visualization.visualize_text(vis_data_records).data
    
    # Save to an HTML file
    with open(output_html, "w", encoding="utf-8") as html_file:
        html_file.write(html_content)
    print(f"Heatmap saved to {output_html}")
# Example usage:
# Assuming you have the necessary tokenizer and data ready:
# show_heatmap_on_text(text, text_encoding, R_text, tokenizer)



In [None]:
import torch

# Calculate attribution scores
attribution_scores = torch.stack([d['optimal_transport'] for d in attributions])
print("Original Attribution Scores:", attribution_scores)

# Normalize the scores using min-max normalization
min_scores = attribution_scores.min(dim=0, keepdim=True).values
max_scores = attribution_scores.max(dim=0, keepdim=True).values
normalized_scores = (attribution_scores - min_scores) / (max_scores - min_scores + 1e-8)

# Optionally, amplify differences (e.g., squaring)
amplified_scores = normalized_scores
print("Normalized and Amplified Scores:", amplified_scores)

# Retain only the top-5 scores and set others to 0
top_k = 7
values, indices = torch.topk(attribution_scores, k=top_k, dim=0)  # Get top-k scores and indices

# Create a mask to set others to 0
mask = torch.zeros_like(amplified_scores)
mask.scatter_(0, indices, values)  # Place top-k values in the mask
masked_scores = mask  # This now contains only the top-k scores
print("Masked Scores (Top-5 Only):", masked_scores)

# Convert token IDs to token strings
tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])

# Find the corresponding tokens and normalized values
result = [(tokens[idx.item()], masked_scores[i].item()) for i, idx in enumerate(indices)]

# Print results
for i, (token, score) in enumerate(result):
    print(f"Column {i}: Token = {token}, Normalized Max Attribution = {score}")

# Visualize heatmap on text with masked scores
show_heatmap_on_text(input_text, inputs.input_ids, masked_scores)


Original Attribution Scores: tensor([0.4701, 0.2135, 0.2023, 0.2421, 0.1625, 0.0941, 0.0822, 0.3652, 0.0760,
        0.1068, 0.0801, 0.1187, 0.2675, 0.1584, 0.0425, 0.4665, 0.2370, 0.6317],
       device='cuda:0', dtype=torch.float64)
Normalized and Amplified Scores: tensor([0.7257, 0.2902, 0.2713, 0.3388, 0.2037, 0.0877, 0.0674, 0.5477, 0.0569,
        0.1093, 0.0638, 0.1293, 0.3820, 0.1968, 0.0000, 0.7197, 0.3301, 1.0000],
       device='cuda:0', dtype=torch.float64)
Masked Scores (Top-5 Only): tensor([0.4701, 0.0000, 0.0000, 0.2421, 0.0000, 0.0000, 0.0000, 0.3652, 0.0000,
        0.0000, 0.0000, 0.0000, 0.2675, 0.0000, 0.0000, 0.4665, 0.2370, 0.6317],
       device='cuda:0', dtype=torch.float64)
Column 0: Token = Ġresident, Normalized Max Attribution = 0.47009131274221294
Column 1: Token = The, Normalized Max Attribution = 0.0
Column 2: Token = Ġwealthiest, Normalized Max Attribution = 0.0
Column 3: Token = Ġis, Normalized Max Attribution = 0.24207751276396713
Column 4: Token = Ġsur

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,0 (0.00),0.0,0.0,"The BR W Rich 200 , 2014 is the 31 st annual survey of the wealthiest people resident"
,,,,


Heatmap saved to heatmap_visualization.html
