In [1]:
import json
import os

# Directory holding the rollout data for this problem (relative to the working directory).
problem_dir = "math-rollouts/deepseek-r1-distill-llama-8b/temperature_0.6_top_p_0.95/correct_base_solution/problem_330"

# Sub-directory and file name of the solutions to inspect.
chunk_dir = "chunk_0"
chunk_filename = "solutions.json"

# Full path to the solutions.json file.
chunk_path = os.path.join(problem_dir, chunk_dir, chunk_filename)

try:
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(chunk_path, 'r', encoding='utf-8') as f:
        chunk_data = json.load(f)

    print(f"Content of {chunk_filename}:")
    # Pretty-print the parsed JSON for readability.
    print(json.dumps(chunk_data, indent=2))

except FileNotFoundError:
    # NOTE: chunk_data stays undefined on this path, so cells that use it will fail.
    print(f"Error: The file {chunk_path} was not found.")
except json.JSONDecodeError:
    print(f"Error: The file {chunk_path} is not a valid JSON file.")

Content of solutions.json:
[
  {
    "chunk_removed": "Alright, so I've got this math problem here that I need to solve.",
    "prefix_without_chunk": "",
    "chunk_resampled": "Alright, so I've got this math problem here: Compute this big expression with a bunch of nested parentheses.",
    "rollout": "Alright, so I've got this math problem here: Compute this big expression with a bunch of nested parentheses. It looks pretty complicated at first glance, but maybe if I break it down step by step, it'll make more sense. Let me write it out to visualize it better:\n\n3(1 + 3(1 + 3(1 + 3(1 + 3(1 + 3(1 + 3(1 + 3(1 + 3(1 + 3)))))))))\n\nHmm, okay, so it's a series of nested multiplications and additions. It seems like each layer is multiplying 3 by something, and that something is 1 plus another set of 3 multiplied by something else. It's kind of a recursive structure, isn't it? Maybe I can solve it by starting from the innermost parentheses and working my way outwards. That usually helps 

In [None]:
import torch.nn as nn

In [None]:
# Display the parsed contents of solutions.json that the first cell loaded into chunk_data.
chunk_data

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, AutoModelForCausalLM, pipeline

import torch


# Hugging Face checkpoint name; `mname` is kept as a second name for the same string.
mname = model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Or any other suitable model

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Some tokenizers (decoder models in particular) come without a pad token;
# in that case register the EOS token as the pad token.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the causal LM with attention outputs enabled.
# attn_implementation="eager" is requested explicitly: the fused SDPA / FlashAttention
# kernels do not return attention probabilities, so asking for the eager path up front
# avoids relying on an implicit fallback (see the Transformers attention documentation).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    output_attentions=True,
    attn_implementation="eager",
)


The following generation flags are not valid and may be ignored: ['output_attentions']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_attentions']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]


In [5]:
# Use the 'full_cot' field of the first solution entry as the text to analyse.
# NOTE(review): the (truncated) printout of solutions.json shows the keys
# chunk_removed, prefix_without_chunk, chunk_resampled and rollout; confirm that
# each entry also carries a 'full_cot' key.
input_text = chunk_data[0]['full_cot']

In [None]:
# Display the collected input texts.
# NOTE(review): as far as this notebook shows, all_input_texts is first assigned in a
# later cell, so this cell raises NameError when run top to bottom on a fresh kernel.
all_input_texts

In [8]:
# Tokenize the input text into PyTorch tensors.
inputs = tokenizer(input_text, return_tensors="pt")

# Forward pass; `outputs.attentions` holds one attention tensor per layer.
# This is inference only, so run under no_grad: otherwise autograd records the whole
# forward pass and keeps a graph alive behind every returned attention tensor.
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)

In [None]:
# Per-layer attention tensors returned by the forward pass.
attention_weights = outputs.attentions

# Report the tensor shape of every layer, in layer order.
print("Attention weights shape for each layer:")
for layer_idx in range(len(attention_weights)):
    print(f"Layer {layer_idx}: {attention_weights[layer_idx].shape}")

In [None]:
# Sanity-check the attention scores: for every layer and head, print the sum of each
# query row of the attention matrix.
# Layer and head counts are taken from the tensors themselves instead of being fixed.
for layer, layer_attention in enumerate(attention_weights):
    # Drop the batch dimension (assumes a single sequence per batch).
    attention_pattern = layer_attention.squeeze(0)
    for head in range(attention_pattern.shape[0]):
        # Detach from the graph and move to host memory before converting to NumPy,
        # so the check also works when the model runs on an accelerator.
        head_attention = attention_pattern[head].detach().cpu().numpy()
        sum_attention = head_attention.sum(axis=1)
        print(f"Layer {layer}, Head {head}: {sum_attention}")


In [None]:
# Per-layer attention tensors returned by the forward pass.
attention_weights = outputs.attentions

# Print the shape of the attention weights for each layer.
# (Tensor.where() cannot be called without a condition argument; the header below
# announces shapes, so the shape attribute is what gets reported.)
print("Attention weights shape for each layer:")
for i, layer_attentions in enumerate(attention_weights):
    print(f"Layer {i}: {layer_attentions.shape}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Convert the token ids of the first sequence in `inputs` back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

In [None]:

import math


In [None]:
# Layer / head whose attention map is drawn.
layer_index = 10
head_index = 0

if not layer_index < len(attention_weights):
    # Nothing to draw for a layer index past the last layer.
    print("Invalid layer index.")
else:
    # Pick the layer, drop the batch dimension, then select the head and convert
    # it to a NumPy array detached from the graph.
    layer_attention = attention_weights[layer_index]
    attention_pattern = layer_attention.squeeze(0)
    head_attention = attention_pattern[head_index].detach().numpy()

    # Heatmap of the head's attention matrix; token strings are not passed as
    # tick labels, so seaborn's default tick labels are used.
    plt.figure(figsize=(10, 8))
    sns.heatmap(head_attention, cmap='viridis')
    plt.title(f'Attention Head {head_index} in Layer {layer_index}')
    plt.xlabel('Keys')
    plt.ylabel('Queries')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

In [None]:
import numpy as np

In [None]:

def get_vertical_scores(
    avg_mat: np.ndarray,
    proximity_ignore: int = 20,
    control_depth: bool = True,
    score_type: str = "mean",
) -> np.ndarray:
    """
    Calculate vertical attention scores from an averaged attention matrix.

    For every column ``i`` the score aggregates the entries of that column in
    rows ``i + proximity_ignore`` and below; entries closer to the diagonal
    (and everything above it) are ignored. The input matrix is not modified.

    Args:
        avg_mat: Square 2-D attention matrix (rows index queries, columns index
            keys in this notebook's usage).
        proximity_ignore: Minimum row-minus-column distance an entry needs in
            order to contribute to a column's score.
        control_depth: If True, each row is replaced by the ranks of its kept
            entries divided by the number of kept entries in that row before
            the columns are aggregated.
        score_type: ``"mean"`` or ``"median"`` - how a column is aggregated.

    Returns:
        1-D array with one score per column. Columns that have no row at the
        required distance (the last ``proximity_ignore`` ones) are NaN.

    Raises:
        ValueError: If ``score_type`` is neither ``"mean"`` nor ``"median"``.
    """
    # Imported locally so the function does not depend on which notebook cells
    # have already been executed.
    import warnings

    if score_type == "mean":
        reduce_column = np.nanmean
    elif score_type == "median":
        reduce_column = np.nanmedian
    else:
        raise ValueError(f"Unknown score_type: {score_type}")

    # Work on a copy that can hold NaN: inexact dtypes are copied as they are,
    # integer / boolean input is promoted to float.
    avg_mat = np.asarray(avg_mat)
    if np.issubdtype(avg_mat.dtype, np.inexact):
        avg_mat = avg_mat.copy()
    else:
        avg_mat = avg_mat.astype(float)

    # Mask everything above the diagonal, then everything closer than
    # `proximity_ignore` rows below it.
    avg_mat[np.triu_indices_from(avg_mat, k=1)] = np.nan
    avg_mat[np.triu_indices_from(avg_mat, k=-proximity_ignore + 1)] = np.nan

    # Fully masked rows and empty column slices are expected by construction; they
    # yield NaN, and the accompanying RuntimeWarnings carry no information.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)

        if control_depth:
            from scipy import stats

            # Number of kept entries per row, used to scale ranks into (0, 1].
            per_row = np.sum(~np.isnan(avg_mat), axis=1)
            avg_mat = stats.rankdata(avg_mat, axis=1, nan_policy="omit") / per_row[:, None]

        n = avg_mat.shape[-1]
        vert_scores = [reduce_column(avg_mat[i + proximity_ignore :, i]) for i in range(n)]
    return np.array(vert_scores)


In [None]:
# Vertical scores for every head of every layer, built as a nested list
# (outer: layers, inner: heads) and stacked into one array.
# Each head's matrix is the first batch element, detached and moved to NumPy.
# Resulting shape: [num_layers, num_heads, seq_len]; the trailing entries of the
# last axis are NaN because those columns have no rows far enough below them.
all_layer_head_scores = np.array([
    [
        get_vertical_scores(
            layer_attn[0, head].detach().cpu().numpy(),
            proximity_ignore=4,
            control_depth=False,
            score_type="mean",
        )
        for head in range(layer_attn.shape[1])
    ]
    for layer_attn in attention_weights
])

print(all_layer_head_scores.shape)

In [None]:
# Vertical score at index 0 of the last axis (the first column) for every layer and head.
all_layer_head_scores[:, :, 0]

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install scipy

In [None]:
from scipy import stats
def get_3d_ar_kurtosis(layer_head_vert_scores: np.ndarray) -> np.ndarray:
    """Kurtosis of the vertical scores along the last axis of a 3-D array.

    The reduction runs over axis 2, leaving one value per position along the
    first two axes (layer and head in this notebook). Fisher's definition is
    used with the biased estimator, and NaN entries are omitted.
    """
    # NaNs are expected in the input: they come from the proximity ignorance.
    kurtosis_options = dict(axis=2, fisher=True, bias=True, nan_policy="omit")
    return stats.kurtosis(layer_head_vert_scores, **kurtosis_options)

# Kurtosis of the vertical scores, one value per [layer][head].
layer_head_kurtosis = get_3d_ar_kurtosis(layer_head_vert_scores=all_layer_head_scores)

# Show the shape first, then the full matrix.
print('Kurtosis shape:', layer_head_kurtosis.shape)
print(layer_head_kurtosis)

In [None]:
# Tick positions are derived from the matrix itself (rows = layers, columns = heads)
# rather than assuming a fixed number of layers and heads; every second one is labelled.
kurtosis_rows, kurtosis_cols = layer_head_kurtosis.shape
head_ticks = np.arange(0, kurtosis_cols, 2)
layer_ticks = np.arange(0, kurtosis_rows, 2)

plt.figure(figsize=(12, 12))
sns.heatmap(layer_head_kurtosis, cmap='viridis', annot=False, fmt=".2f")
plt.title('Kurtosis of Vertical Attention Scores by Layer and Head', fontsize=16)
plt.xlabel('Heads')
plt.ylabel('Layers')
# Heatmap cell i spans [i, i + 1] on the axis, so +0.5 centres each label on its cell.
plt.xticks(head_ticks + 0.5, head_ticks)
plt.yticks(layer_ticks + 0.5, layer_ticks)
plt.show()

In [None]:
# Go through the entire chunk

In [None]:
# Number of entries in the list loaded from solutions.json.
len(chunk_data)

In [6]:
# Number of solution entries whose text is analysed (raise it to cover more of the chunk).
NUM_ROLLOUTS = 2

# Collect the 'full_cot' text of the first NUM_ROLLOUTS entries of chunk_data.
# Slicing instead of indexing over a fixed range avoids an IndexError when the
# chunk holds fewer entries than requested.
all_input_texts = [entry['full_cot'] for entry in chunk_data[:NUM_ROLLOUTS]]

In [7]:
# One entry per input text; each entry is the per-layer attention output of the model.
all_attention_weights = []

# Process each input individually (no batching, hence no padding).
for input_text in all_input_texts:
    inputs = tokenizer(input_text, return_tensors="pt")
    # Inference only: under no_grad autograd does not record the forward pass, so the
    # stored attention tensors do not keep a computation graph alive.
    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)
    all_attention_weights.append(outputs.attentions)

: 

In [None]:
 num_layers = len(all_attention_weights[0])
    
    # Initialize a list to hold the sum of attention weights for each layer
    summed_attention_weights = [
        torch.zeros_like(all_attention_weights[0][i]) for i in range(num_layers)
    ]

    # Sum up the attention weights from all inputs
    for attention_run in all_attention_weights:
        for i in range(num_layers):
            summed_attention_weights[i] += attention_run[i]
            
    # Calculate the average by dividing the sum by the number of inputs
    num_inputs = len(all_attention_weights)
    average_attention_weights = [
        summed_layer / num_inputs for summed_layer in summed_attention_weights
    ]
    
    # Print the shape of the averaged attention weights to verify
    print("Average attention weights calculated for each layer:")
    for i, avg_attn in enumerate(average_attention_weights):
        print(f"Layer {i}: {avg_attn.shape}")