In [1]:
import json
from tqdm.auto import tqdm
import pickle
import functools
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch as t
from threading import Thread
from io import StringIO

In [2]:
# Local (Kaggle input) directory holding the checkpoint to load; the path names
# the DeepSeek-R1 distilled Qwen 1.5B model, version 2.
model_name = "/kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-qwen-1.5b/2"

# Tokenizer loaded from that same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
# Load the causal LM from `model_name` in bfloat16 and let `device_map="auto"`
# decide weight placement.
# NOTE(review): eager attention is requested explicitly; presumably so the
# per-head attention computation stays inspectable -- confirm it is still needed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",
    device_map="auto",
    dtype=t.bfloat16,
)

2025-12-22 08:32:05.715045: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766392325.936810      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766392326.004621      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766392326.553155      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766392326.553184      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766392326.553187      55 computation_placer.cc:177] computation placer alr

In [4]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = t.device('cuda' if t.cuda.is_available() else 'cpu')

In [None]:
# Attention heads to boost, as (layer index, head index) pairs.
# The commented-out line below is a longer candidate list kept for reference;
# only the two heads in the active assignment are used.
# receiver_heads = [(15,  7),(0,  8),(0,  0), (0,  6), (21,  5), (1,  3), (14,  8), (21,  2), (15, 11), (12,  8), (7, 2), (9, 2), (6,  6), (11, 11), (6,  2), (21,  3), (14,  7), (8,  4), (11,  1), (4,  9)]
receiver_heads  = [(21, 5), (21, 2)]

In [12]:
# Width of one attention head's slice of the o_proj input.
# Prefer an explicit `head_dim` from the config when the architecture declares
# one (it is not guaranteed to equal hidden_size // num_attention_heads);
# otherwise fall back to the usual ratio.
d_head = getattr(model.config, "head_dim", None) or (
    model.config.hidden_size // model.config.num_attention_heads
)
def scale_receiver_heads_hook(module, input, head_idx, scale=2, head_dim=None):
    """Forward pre-hook that multiplies one attention head's output by `scale`.

    Meant to be registered (via ``functools.partial``) on a projection layer
    whose first positional input has the per-head outputs concatenated along
    the last dimension, i.e. shape ``(..., n_heads * head_dim)``.

    Args:
        module: The hooked module (unused; required by the hook signature).
        input: Tuple of positional arguments passed to the module's forward.
        head_idx: Index of the head whose slice is scaled.
        scale: Multiplicative factor applied to that head's slice.
        head_dim: Width of a single head. Defaults to the module-level
            ``d_head`` when omitted.

    Returns:
        A tuple to be used as the module's new positional inputs: the scaled
        copy of ``input[0]`` followed by any remaining inputs unchanged.

    Raises:
        RuntimeError: If the last dimension is not divisible by ``head_dim``.
        IndexError: If ``head_idx`` is out of range for the number of heads.
    """
    if head_dim is None:
        head_dim = d_head

    # Work on a copy so the tensor produced by the attention block is not
    # modified in place; then expose the heads as their own axis.
    per_head = input[0].clone().unflatten(-1, (-1, head_dim))
    per_head[..., head_idx, :] *= scale

    # Merge the (head, head_dim) axes back and forward any extra positional
    # inputs untouched instead of dropping them.
    return (per_head.flatten(-2), *input[1:])

In [9]:
# Load the evaluation problems: the file is JSONL, one JSON object per line.

records = []
with open("/kaggle/input/evalsdataset/evals_final.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Blank lines (e.g. a trailing newline at end of file) are not valid
        # JSON and would make json.loads raise, so skip them.
        if line.strip():
            records.append(json.loads(line))

In [10]:
# Load the reference answers that are later looked up by question index.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at a trusted dataset.

with open('/kaggle/input/answers/answers.pkl', 'rb') as answers_file:
    actual_answers = pickle.load(answers_file)

In [13]:
def generate_streaming_output(ids, max_new_tokens=500):
    """Sample a completion for `ids` and return the generated text.

    Generation runs in a worker thread and its text is consumed through a
    ``TextIteratorStreamer``; the chunks are concatenated and returned once
    generation has finished.

    Args:
        ids: Mapping of tokenizer outputs (unpacked as keyword arguments to
            ``model.generate``).
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded completion as a single string, without the prompt and
        without special tokens.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread is
            re-raised here in the calling thread.
    """
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_special_tokens=True,
        skip_prompt=True
    )

    g_kwargs = dict(
        **ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        temperature=0.6,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    worker_errors = []

    def _run_generate():
        # Runs in the worker thread. If generate() fails, the streamer would
        # never receive its stop signal and the consumer loop below would block
        # forever, so record the error and end the stream explicitly.
        try:
            model.generate(**g_kwargs)
        except Exception as exc:
            worker_errors.append(exc)
            streamer.end()

    thread = Thread(target=_run_generate)
    thread.start()

    buf = StringIO()
    for chunk in streamer:
        buf.write(chunk)

    thread.join()

    # Surface worker failures to the caller instead of returning partial text
    # as if generation had succeeded.
    if worker_errors:
        raise worker_errors[0]

    return buf.getvalue()


In [21]:
# Remove results from a previous scale-10 run so the next run starts clean
# (the results file is opened in append mode). `-f` keeps this cell from
# failing on a fresh session where the file does not exist yet.
!rm -f /kaggle/working/evals_scale_10.jsonl

In [22]:
def boost_heads_generate(num_questions: int, scale: float = 5):
    """Generate chains of thought with the receiver heads' outputs scaled.

    For every (layer, head) pair in ``receiver_heads`` a forward pre-hook is
    attached to that layer's ``self_attn.o_proj`` so the head's slice of the
    attention output is multiplied by `scale`. The first `num_questions`
    problems from ``records`` are then run through the model and one JSON line
    per problem is appended to ``evals_scale_{scale}.jsonl`` in the current
    working directory.

    Args:
        num_questions: Number of problems to process, taken from the start of
            ``records``. Values larger than ``len(records)`` are clamped.
        scale: Multiplicative boost applied to each receiver head's output.

    Returns:
        None. Results are written to the JSONL file; per-question failures are
        printed and skipped so one bad item does not abort the run.
    """
    n_items = min(num_questions, len(records))

    boosting_hooks = []
    try:
        # The hooks do not depend on the question, so install them once for
        # the whole run rather than once per iteration.
        for layer, head in receiver_heads:
            target_module = model.model.layers[layer].self_attn.o_proj
            boosting_hooks.append(
                target_module.register_forward_pre_hook(
                    functools.partial(scale_receiver_heads_hook, head_idx=head, scale=scale)
                )
            )

        # Context manager guarantees the file is closed even on interruption.
        with open(f"evals_scale_{scale}.jsonl", "a", encoding="utf-8") as out_f:
            for i in tqdm(range(n_items)):
                try:
                    problem = records[i]["problem"]

                    # NOTE(review): the DeepSeek-R1 chat template spells its
                    # role markers with full-width bars (U+FF5C); confirm these
                    # ASCII-bar markers tokenize as the intended special tokens.
                    full_prompt = (
                        "<|User|>"
                        + problem
                        + "<|Assistant|><think>"
                    )
                    input_ids = tokenizer(
                        full_prompt, return_tensors="pt", add_special_tokens=False
                    ).to(device)

                    answer_text = generate_streaming_output(input_ids, max_new_tokens=4096)

                    record = {
                        "q_id": i,
                        "problem": problem,
                        "cot": answer_text,
                        "actual_ans": actual_answers[i]
                    }

                    # Flush after every record so partial runs keep their results.
                    out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
                    out_f.flush()

                except Exception as e:
                    # Best-effort: report which question failed and continue.
                    print(f"Error occurred on question {i}: {e!r}")

    finally:
        # Always restore the unmodified model, whatever happened above.
        for hook in boosting_hooks:
            hook.remove()
        

In [None]:
# Run the first 20 problems with the receiver heads boosted by a factor of 10.
boost_heads_generate(num_questions=20, scale=10)

  0%|          | 0/20 [00:00<?, ?it/s]