In [1]:
import json
with open("data/traces.jsonl", "r") as f:
    traces = [json.loads(line) for line in f.readlines()]
import pandas as pd

df = pd.DataFrame(traces)

In [2]:
D_MODEL_ALIAS_PATH = {
    'llama-8B': r"deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    'qwen-7B': r"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    'qwen-1p5B': r"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
}

In [8]:
import torch
import gc
def mps_cleanup():
    # Drop Python refs
    gc.collect()
    # Free MPS allocator cache (CUDA equivalent is torch.cuda.empty_cache())
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()

In [3]:
import torch
from transformer_lens import HookedTransformer
torch.set_grad_enabled(False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.mps.is_available():
    device = torch.device("mps")
    
model = HookedTransformer.from_pretrained_no_processing(
    D_MODEL_ALIAS_PATH['qwen-1p5B'],
    dtype=torch.bfloat16,
    device=device
)

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!


Loaded pretrained model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B into HookedTransformer


In [5]:
qwen_sm_df = df[df["model_name"] == "qwen-1p5B"]
tc = qwen_sm_df.iloc[1].to_dict()
tc

{'model_name': 'qwen-1p5B',
 'unique_id': 'test/intermediate_algebra/1994.json',
 'problem': 'Define\n\\[p = \\sum_{k = 1}^\\infty \\frac{1}{k^2} \\quad \\text{and} \\quad q = \\sum_{k = 1}^\\infty \\frac{1}{k^3}.\\]Find a way to write\n\\[\\sum_{j = 1}^\\infty \\sum_{k = 1}^\\infty \\frac{1}{(j + k)^3}\\]in terms of $p$ and $q.$',
 'response_withR': "<｜begin▁of▁sentence｜>Define\n\\[p = \\sum_{k = 1}^\\infty \\frac{1}{k^2} \\quad \\text{and} \\quad q = \\sum_{k = 1}^\\infty \\frac{1}{k^3}.\\]Find a way to write\n\\[\\sum_{j = 1}^\\infty \\sum_{k = 1}^\\infty \\frac{1}{(j + k)^3}\\]in terms of $p$ and $q.$ Please reason step by step, and put your final answer within \\boxed{}. <think>\nOkay, so I have this problem where I need to find a way to express the double sum \\(\\sum_{j = 1}^\\infty \\sum_{k = 1}^\\infty \\frac{1}{(j + k)^3}\\) in terms of \\(p\\) and \\(q\\), where \\(p = \\sum_{k = 1}^\\infty \\frac{1}{k^2}\\) and \\(q = \\sum_{k = 1}^\\infty \\frac{1}{k^3}\\). Hmm, okay. I re

In [6]:
D_TOP_RFHS_BY_LAYER_HEAD = {
    "llama-8B": [(8, 11), (10, 31), (0, 22), (13, 6), (16, 1), (13, 18), (13, 5), (8, 9), (13, 4), (8, 1)],
    "qwen-7B": [(16, 0), (19, 15), (14, 7), (1, 1), (17, 18), (22, 7), (17, 19), (17, 14), (16, 14), (14, 0)],
    "qwen-1p5B": [(16, 2), (1, 5), (19, 1), (12, 1), (16, 11), (23, 2), (14, 3), (19, 5), (20, 9), (16, 0)]
}

d_top_head_index_by_layer_model = {}

for k, v in D_TOP_RFHS_BY_LAYER_HEAD.items():
    d_tmp = {}
    for cord in v:
        if cord[0] not in d_tmp:
            d_tmp[cord[0]] = []
        d_tmp[cord[0]].append(cord[1])
    d_top_head_index_by_layer_model[k] = d_tmp

qwen_sm_layer_heads = d_top_head_index_by_layer_model["qwen-1p5B"]

In [7]:
qwen_sm_layer_heads

{16: [2, 11, 0], 1: [5], 19: [1, 5], 12: [1], 23: [2], 14: [3], 20: [9]}

In [9]:
from torch.nn.functional import cross_entropy

def compute_loss(text):
    tokens = model.to_tokens(text)
    logits = model(tokens, return_type="logits")
    idx = model.to_str_tokens(text).index("</think>") + 6
    answer_logits = logits[0, idx-1:-1, :]
    gt = tokens[0, idx:]
    loss = cross_entropy(answer_logits, gt, reduction='mean').item()
    return loss

In [None]:
withR_loss = compute_loss(tc["response_withR"])
mps_cleanup()
withoutR_loss = compute_loss(tc["response_withoutR"])
mps_cleanup()

In [11]:
withR_loss, withoutR_loss

(0.236328125, 1.171875)

In [15]:
def get_attention_pattern_by_top_head(tokens, model, d_top_head_index_by_layer):

    ret = {}

    def _get_attention_hook(pattern, hook):
        if hook.layer() not in d_top_head_index_by_layer:
            return
        top_head_indices = d_top_head_index_by_layer[hook.layer()]
        for head_idx in top_head_indices:
            ret[f"{hook.layer()}_{head_idx}"] = pattern[0, head_idx].detach().cpu().numpy()
    
    pattern_hook_names_filter = lambda name: name.endswith("pattern")

    # Run the model with the hook
    _ = model.run_with_hooks(
        tokens,
        return_type=None,
        fwd_hooks=[(
            pattern_hook_names_filter,
            _get_attention_hook
        )]
    )
    mps_cleanup()
    return ret

In [16]:
withoutR_tokens = model.to_tokens(tc["response_withoutR"])
withR_tokens = model.to_tokens(tc["response_withR"])

attn_patterns_withR = get_attention_pattern_by_top_head(
    tokens = withR_tokens,
    model = model,
    d_top_head_index_by_layer = qwen_sm_layer_heads
)

In [28]:
def patch_attention_patterns_by_top_head(tokens, model, d_top_head_index_by_layer, attn_patterns):

    def _patch_attention_hook(pattern, hook):
        if hook.layer() not in d_top_head_index_by_layer:
            return pattern
        top_head_indices = d_top_head_index_by_layer[hook.layer()]
        for head_idx in top_head_indices:
            target = attn_patterns[f"{hook.layer()}_{head_idx}"]
            pattern[0, head_idx] = torch.as_tensor(target, device=pattern.device, dtype=pattern.dtype)
        return pattern
    
    pattern_hook_names_filter = lambda name: name.endswith("pattern")

    # Run the model with the hook
    logits = model.run_with_hooks(
        tokens,
        return_type="logits",
        fwd_hooks=[(
            pattern_hook_names_filter,
            _patch_attention_hook
        )]
    )
    return logits

In [29]:
withoutR_patched_logits = patch_attention_patterns_by_top_head(
    tokens = withoutR_tokens,
    model = model,
    d_top_head_index_by_layer = qwen_sm_layer_heads,
    attn_patterns = attn_patterns_withR
)

RuntimeError: The expanded size of the tensor (151) must match the existing size (6249) at non-singleton dimension 1.  Target sizes: [151, 151].  Tensor sizes: [6249, 6249]

In [32]:
# above error is because of different token lenghts in noCOT and COT responses. we have to align properly and patch only at the answer positions