In [None]:
import os

import json


def process_problem_data(base_path):
    """
    Collect the problem statement and labelled sentences of every problem directory.

    Each immediate sub-directory of `base_path` is expected to contain a
    `problem.json` (an object whose "problem" key holds the statement) and a
    `chunks_labeled.json` (a list of objects, each with a "chunk" key). A
    directory whose files are missing, unreadable or malformed is reported
    and skipped; it never aborts the whole load.

    Args:
        base_path (str): The path to the directory containing all the problems
                         (e.g., 'math-rollouts/.../correct_base_solution').

    Returns:
        list: One dictionary per successfully loaded problem directory, with
              keys "problem_id" (directory name), "problem_statement" (str,
              "" when the "problem" key is absent) and "sentences" (list of
              chunk strings). Entries are ordered by directory name. The list
              is empty when `base_path` does not exist or nothing could be
              loaded.
    """
    all_problem_data = []

    # Check if the base path exists
    if not os.path.isdir(base_path):
        print(f"Error: The directory '{base_path}' was not found.")
        return all_problem_data

    print(f"Found problem directory: {base_path}")

    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs produce the same problem order.
    problem_dirs = sorted(
        d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))
    )

    if not problem_dirs:
        print(f"No problem directories found in '{base_path}'.")
        return all_problem_data

    print(f"Found {len(problem_dirs)} problem sub-directories.")

    # Iterate through each problem directory (e.g., problem_330, problem_1591)
    for problem_name in problem_dirs:
        problem_path = os.path.join(base_path, problem_name)
        problem_file = os.path.join(problem_path, "problem.json")
        chunks_file = os.path.join(problem_path, "chunks_labeled.json")

        # Load the problem statement. ValueError covers both JSONDecodeError
        # and UnicodeDecodeError; AttributeError covers a non-object payload.
        try:
            with open(problem_file, "r", encoding="utf-8") as f:
                problem_data = json.load(f)
            problem_text = problem_data.get("problem", "")
        except (OSError, ValueError, AttributeError) as e:
            print(f"Skipping {problem_name}: Could not load problem.json. Error: {e}")
            continue

        # Load all sentences. KeyError/TypeError cover entries without a
        # "chunk" key or a payload that is not a list of objects.
        try:
            with open(chunks_file, "r", encoding="utf-8") as f:
                chunks_data = json.load(f)
            allsentences = [chunk["chunk"] for chunk in chunks_data]
        except (OSError, ValueError, KeyError, TypeError) as e:
            print(f"Skipping {problem_name}: Could not load chunks_labeled.json. Error: {e!r}")
            continue

        all_problem_data.append(
            {
                "problem_id": problem_name,
                "problem_statement": problem_text,
                "sentences": allsentences,
            }
        )

    if not all_problem_data:
        print("No data was loaded.")

    return all_problem_data




# Load the `correct_base_solution` split. The path is relative to the
# notebook's working directory.
base_problem_dir = "deepseek-r1-distill-llama-8b/temperature_0.6_top_p_0.95/correct_base_solution"

# `correct_all_data` is a list with one dictionary per loaded problem directory.
correct_all_data = process_problem_data(base_problem_dir)

# Report how many problem directories were loaded (0 if the path is missing).
print(f"Successfully loaded data for {len(correct_all_data)} problems.")

# Load the `incorrect_base_solution` split the same way.
# NOTE: `base_problem_dir` is reassigned here, so after this cell it refers
# to the incorrect split only.
base_problem_dir = "deepseek-r1-distill-llama-8b/temperature_0.6_top_p_0.95/incorrect_base_solution"

# `incorrect_all_data` has the same structure as `correct_all_data`.
incorrect_all_data = process_problem_data(base_problem_dir)

print(f"Successfully loaded data for {len(incorrect_all_data)} problems.")

In [None]:
# Pool both splits into one list; correct-split problems come first.
all_data = correct_all_data + incorrect_all_data

In [None]:
# Preview only the first couple of entries: displaying the whole list would
# write every sentence of every problem into the notebook output.
all_data[:2]

In [None]:
import os
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, AutoModelForCausalLM, pipeline
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os
import gc
from tqdm import tqdm

In [None]:
# Hugging Face checkpoint id used for both the tokenizer and the model.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Or any other suitable model

# Alias of `model_name`; not referenced by any other cell visible in this notebook.
mname = model_name

# Notebook-level tokenizer. `get_raw_tokens` reads this global directly.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Important: Add a pad token if the tokenizer doesn't have one, especially for decoder models.
# The EOS token is reused as the pad token.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})


In [None]:
# Load the causal LM in bfloat16; `device_map="auto"` leaves weight placement to the library.
# `output_attentions=True` is requested here and again on each forward call.
# NOTE(review): some attention backends do not return attention weights. If
# `outputs.attentions` comes back empty, confirm whether the installed
# transformers version needs an eager attention implementation.
model = AutoModelForCausalLM.from_pretrained(model_name, output_attentions=True,  torch_dtype=torch.bfloat16, device_map="auto")

In [None]:
# --- Helper: Sentence boundaries using tokenizer ---
def get_raw_tokens(text, model_name=None):
    """
    Return the token ids that the notebook-level `tokenizer` produces for `text`.

    `model_name` is accepted so call sites can pass it through, but it is not
    used: the encoding always comes from the global `tokenizer`.
    """
    encoding = tokenizer(text)
    return encoding['input_ids']

def get_sentence_token_boundaries(text, sentences, model_name=None):
    """
    Locate each sentence in `text` and express its span in token counts.

    Sentences are searched for in order, each search starting where the
    previous match ended. Matching is done on copies in which a set of
    Unicode space characters is replaced by an ASCII space; if the sentence
    is not found verbatim, it is retried with surrounding whitespace stripped.

    Args:
        text (str): The full text that was (or will be) tokenized.
        sentences (list[str]): Sentences expected to occur in `text`, in order.
        model_name: Passed through unchanged to `get_raw_tokens`.

    Returns:
        list[tuple[int, int]]: One `(tokens_to_start, tokens_to_end)` pair per
        sentence, where each value is the number of tokens `get_raw_tokens`
        returns for the text prefix ending at the sentence's start / end
        character (0 for a sentence starting at character 0).

    Raises:
        ValueError: If a sentence cannot be found after the previous match.
    """
    import re

    # Every character in this class is replaced by exactly one space, so the
    # normalised string has the same length as the original and an offset in
    # the normalised text is also a valid offset into `text`. No separate
    # normalised-to-original position mapping is needed.
    unicode_space = re.compile(r"[\u00A0\u1680\u2000-\u200B\u202F\u205F\u3000\uFEFF]")

    def normalize_spaces(s):
        return unicode_space.sub(" ", s)

    text_normalized = normalize_spaces(text)

    char_positions = []
    search_start = 0
    for sentence in sentences:
        matched = normalize_spaces(sentence)
        norm_pos = text_normalized.find(matched, search_start)
        if norm_pos == -1:
            # Fall back to the sentence without leading/trailing whitespace.
            matched = matched.strip()
            norm_pos = text_normalized.find(matched, search_start)
            if norm_pos == -1:
                raise ValueError(f"Sentence not found in text: {sentence}")
        norm_end = norm_pos + len(matched)
        char_positions.append((norm_pos, norm_end))
        search_start = norm_end

    # Convert character offsets to token counts by tokenizing the prefix that
    # ends at each offset.
    token_boundaries = []
    for char_start, char_end in char_positions:
        tokens_to_start = len(get_raw_tokens(text[:char_start], model_name)) if char_start > 0 else 0
        tokens_to_end = len(get_raw_tokens(text[:char_end], model_name))
        token_boundaries.append((tokens_to_start, tokens_to_end))
    return token_boundaries

# --- Helper: Average attention over sentence boundaries ---
def _compute_averaged_matrix(matrix, sentence_boundaries):
    n = len(sentence_boundaries)
    result = np.zeros((n, n), dtype=np.float32)
    for i in range(n):
        row_start, row_end = sentence_boundaries[i]
        row_start = min(row_start, matrix.shape[0] - 1)
        row_end = min(row_end, matrix.shape[0] - 1)
        if row_start >= row_end:
            continue
        for j in range(n):
            col_start, col_end = sentence_boundaries[j]
            col_start = min(col_start, matrix.shape[1] - 1)
            col_end = min(col_end, matrix.shape[1] - 1)
            if col_start >= col_end:
                continue
            region = matrix[row_start:row_end, col_start:col_end]
            if region.size > 0:
                result[i, j] = np.mean(region)
    return result

# --- Helper: Get vertical scores (receiver heads) ---
def get_vertical_scores(avg_mat, proximity_ignore=1, control_depth=False, score_type="mean"):
    """
    Score each column of a sentence-level matrix by the entries below it.

    A copy of `avg_mat` is masked with NaN everywhere except the part of the
    lower triangle that lies at least `proximity_ignore` rows below the
    diagonal. The score of column i is then the NaN-ignoring mean or median
    of rows `i + proximity_ignore` onward in that column.

    Args:
        avg_mat (np.ndarray): Square float matrix; it is not modified.
        proximity_ignore (int): Number of diagonals (starting at the main
            diagonal) to exclude.
        control_depth (bool): If True, replace each row's unmasked values by
            their rank divided by the number of unmasked values in that row
            before scoring.
        score_type (str): "mean" or "median".

    Returns:
        np.ndarray: One score per column. Columns with no rows left below
        them yield NaN.

    Raises:
        ValueError: If `score_type` is neither "mean" nor "median".
    """
    masked = avg_mat.copy()

    # Hide everything above the main diagonal ...
    masked[np.triu_indices_from(masked, k=1)] = np.nan
    # ... and then the diagonal band of width `proximity_ignore`.
    masked[np.triu_indices_from(masked, k=1 - proximity_ignore)] = np.nan

    if control_depth:
        valid_per_row = (~np.isnan(masked)).sum(axis=1)
        ranked = stats.rankdata(masked, axis=1, nan_policy="omit")
        masked = ranked / valid_per_row[:, None]

    column_scores = []
    for col in range(masked.shape[-1]):
        below = masked[col + proximity_ignore:, col]
        if score_type == "mean":
            column_scores.append(np.nanmean(below))
        elif score_type == "median":
            column_scores.append(np.nanmedian(below))
        else:
            raise ValueError(f"Unknown score_type: {score_type}")
    return np.array(column_scores)


In [None]:
def get_3d_ar_kurtosis(all_layer_head_vert_scores):
    """
    Kurtosis of the per-sentence scores of every (layer, head).

    Args:
        all_layer_head_vert_scores: array of shape
            (num_layers, num_heads, num_sentences).

    Returns:
        Array of shape (num_layers, num_heads) holding the Fisher (excess)
        kurtosis along the sentence axis, computed with the biased estimator
        and with NaN entries omitted.
    """
    kurtosis_kwargs = dict(axis=2, fisher=True, bias=True, nan_policy="omit")
    layer_head_kurtosis = stats.kurtosis(all_layer_head_vert_scores, **kurtosis_kwargs)
    return layer_head_kurtosis

In [None]:
def _trace_vertical_scores(problem, model, tokenizer, model_name, max_sentences, max_length):
    """
    Run one reasoning trace through the model and score every (layer, head).

    Returns:
        float32 array of shape (num_layers, num_heads, num_sentences) filled
        with the output of `get_vertical_scores` for each head, or None when
        no sentence fits completely inside the tokenized window.
    """
    # Limit sentence count to avoid memory issues
    sentences = problem['sentences'][:max_sentences]

    # Create input text with problem statement first
    all_sentences = [problem['problem_statement']] + sentences
    input_text = "\n".join(all_sentences)

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length)
    seq_len = inputs["input_ids"].shape[1]

    # Boundaries are computed on the untruncated text, so done before the
    # forward pass: a sentence that cannot be located fails cheaply.
    sentence_boundaries = get_sentence_token_boundaries(input_text, all_sentences, model_name)

    # The model only sees the first `seq_len` tokens. Sentences that extend
    # past the window have no (or only partial) attention rows; keeping them
    # would add all-zero rows/columns and drag the column means towards zero.
    sentence_boundaries = [(start, end) for start, end in sentence_boundaries if end <= seq_len]
    if not sentence_boundaries:
        return None

    # Move to GPU if available
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)
    attention_weights = outputs.attentions
    if attention_weights is None or attention_weights[0] is None:
        raise RuntimeError("Model returned no attention weights")

    num_layers = len(attention_weights)
    num_heads = attention_weights[0].shape[1]
    num_sentences = len(sentence_boundaries)
    all_layer_head_vert_scores = np.zeros((num_layers, num_heads, num_sentences), dtype=np.float32)

    for layer in range(num_layers):
        # One device-to-host copy per layer instead of one per head.
        layer_attn = attention_weights[layer][0]
        if layer_attn.dtype == torch.bfloat16:
            # NumPy has no bfloat16 dtype.
            layer_attn = layer_attn.to(torch.float32)
        layer_attn = layer_attn.detach().cpu().numpy()

        for head in range(num_heads):
            avg_mat = _compute_averaged_matrix(layer_attn[head], sentence_boundaries)
            vert_scores = get_vertical_scores(
                avg_mat, proximity_ignore=1, control_depth=False, score_type="mean"
            )
            actual_len = min(len(vert_scores), num_sentences)
            all_layer_head_vert_scores[layer, head, :actual_len] = vert_scores[:actual_len]

    return all_layer_head_vert_scores


def process_all_problems_for_averaged_kurtosis_simple(
    all_data, model, tokenizer, model_name, batch_size=2, max_sentences=100, max_length=1024
):
    """
    Compute, for every (layer, head), the kurtosis of its vertical attention
    scores averaged over all reasoning traces.

    For each problem, the problem statement and its sentences are joined with
    newlines, run through `model` with attention outputs enabled, and reduced
    to one vertical score per sentence and head. Problems that raise are
    reported and skipped. The per-problem scores are truncated to the
    shortest sentence count, averaged over problems, and passed to
    `get_3d_ar_kurtosis`.

    Args:
        all_data (list[dict]): Entries with keys 'problem_id',
            'problem_statement' and 'sentences'.
        model: Model called as `model(**inputs, output_attentions=True)`.
        tokenizer: Tokenizer used to build the model inputs. Sentence
            boundaries are derived via `get_sentence_token_boundaries`, which
            relies on the notebook-level tokenizer, so both should be the
            same tokenizer.
        model_name: Forwarded to `get_sentence_token_boundaries`.
        batch_size (int): Number of problems between garbage-collection
            passes. Problems are still run through the model one at a time.
        max_sentences (int): Keep at most this many sentences per problem
            (in addition to the problem statement).
        max_length (int): Token limit passed to the tokenizer; sentences that
            do not fit entirely inside it are excluded from scoring.

    Returns:
        Array of shape (num_layers, num_heads) with one kurtosis value per
        head, or None when no problem could be processed.
    """
    all_problem_vert_scores = []
    total_problems = len(all_data)

    print(f"Processing {total_problems} problems for averaged kurtosis analysis...")

    for batch_start in tqdm(range(0, total_problems, batch_size), desc="Processing batches"):
        batch_problems = all_data[batch_start:batch_start + batch_size]

        for i, problem in enumerate(batch_problems):
            global_idx = batch_start + i
            print(f"Processing problem {global_idx+1}/{total_problems}: {problem['problem_id']}")

            try:
                problem_scores = _trace_vertical_scores(
                    problem, model, tokenizer, model_name, max_sentences, max_length
                )
            except Exception as e:
                # Best effort: one bad trace must not stop the whole run.
                print(f"  FAILED: Error processing problem {problem['problem_id']}: {e}")
                continue
            finally:
                # Model outputs went out of scope with the helper's frame;
                # release the cached GPU blocks on success and failure alike.
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            if problem_scores is None:
                print(f"  FAILED: No sentence boundaries found")
                continue

            all_problem_vert_scores.append(problem_scores)
            print(
                f"  SUCCESS: Shape {problem_scores.shape}. "
                f"Total successful: {len(all_problem_vert_scores)}"
            )

        # Force garbage collection after each batch
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print(f"Completed batch {batch_start//batch_size + 1}/{(total_problems-1)//batch_size + 1}")

    print(f"\nSuccessfully processed {len(all_problem_vert_scores)} problems")

    if not all_problem_vert_scores:
        print("No vertical scores collected!")
        return None

    print("Computing averaged kurtosis across all reasoning traces...")

    # Traces have different sentence counts; align them on the shortest one.
    min_sentences = min(scores.shape[2] for scores in all_problem_vert_scores)
    print(f"Minimum sentences across all problems: {min_sentences}")

    # Shape: (num_problems, num_layers, num_heads, min_sentences)
    stacked_scores = np.stack(
        [scores[:, :, :min_sentences] for scores in all_problem_vert_scores], axis=0
    )

    # Each trace has NaN at its last sentence (nothing lies below it). A plain
    # mean would let the shortest trace blank that position for every head,
    # so average over the traces that do have a value there.
    averaged_scores = np.nanmean(stacked_scores, axis=0)  # (num_layers, num_heads, min_sentences)

    print(f"Averaged scores shape: {averaged_scores.shape}")

    layer_head_kurtosis = get_3d_ar_kurtosis(averaged_scores)

    print(f"Kurtosis computation complete!")
    print(f"Kurtosis shape: {layer_head_kurtosis.shape}")
    print(f"Kurtosis range: {np.nanmin(layer_head_kurtosis):.3f} to {np.nanmax(layer_head_kurtosis):.3f}")
    print(f"Kurtosis mean: {np.nanmean(layer_head_kurtosis):.3f}")

    return layer_head_kurtosis

In [None]:


# Run the full analysis over both splits. The result has one kurtosis value
# per (layer, head), or is None if no problem could be processed.
# `batch_size` only controls how often garbage collection runs.
layer_head_kurtosis = process_all_problems_for_averaged_kurtosis_simple(
    all_data, model, tokenizer, model_name, batch_size=4
    )

In [None]:


import matplotlib.pyplot as plt
import numpy as np

# One kurtosis value per (layer, head), flattened for plotting.
flat_kurtosis = layer_head_kurtosis.flatten()

# A head whose kurtosis is NaN or infinite would make the histogram range
# non-finite and turn every percentile into NaN, so plot finite values only.
finite_kurtosis = flat_kurtosis[np.isfinite(flat_kurtosis)]
num_dropped = flat_kurtosis.size - finite_kurtosis.size
if num_dropped:
    print(f"Ignoring {num_dropped} heads with non-finite kurtosis.")

if finite_kurtosis.size == 0:
    print("No finite kurtosis values to plot.")
else:
    plt.figure(figsize=(10,6))
    plt.hist(finite_kurtosis, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
    plt.title('Distribution of Kurtosis Scores for All (Layer, Head) Pairs')
    plt.xlabel('Kurtosis')
    plt.ylabel('Count')
    # Plot percentiles
    for p in [50, 75, 90, 95, 99]:
        perc = np.percentile(finite_kurtosis, p)
        plt.axvline(perc, color='red', linestyle='--', label=f'{p}th percentile: {perc:.2f}')
    plt.legend()
    plt.show()



In [None]:
num_top = 20  # Number of heads to select

# np.argsort places NaN at the end of the ascending order, so simply
# reversing it would put NaN heads at the *top* of the ranking. Rank on a
# copy where NaN is replaced by -inf, then drop those entries.
flat_scores = layer_head_kurtosis.flatten()
is_valid = ~np.isnan(flat_scores)
ranking_key = np.where(is_valid, flat_scores, -np.inf)
flat_indices = np.argsort(ranking_key)[::-1]
flat_indices = flat_indices[is_valid[flat_indices]][:num_top]

layer_indices, head_indices = np.unravel_index(flat_indices, layer_head_kurtosis.shape)
# Plain ints keep the displayed list readable.
top_20_heads = [(int(layer), int(head)) for layer, head in zip(layer_indices, head_indices)]

print(f"Top {len(top_20_heads)} heads by kurtosis (layer, head):")
for i, (layer, head) in enumerate(top_20_heads):
    print(f"{i+1:2d}: Layer {layer}, Head {head}, Kurtosis: {layer_head_kurtosis[layer, head]:.3f}")

In [None]:
# Display the selected (layer, head) pairs, highest kurtosis first.
top_20_heads