In [1]:
import nnsight
from nnsight import LanguageModel
from nnsight import CONFIG
from dotenv import load_dotenv
import os

CONFIG.set_default_api_key("nah I'd win")

  from .autonotebook import tqdm as notebook_tqdm


In [82]:
lm = LanguageModel("meta-llama/Llama-3.1-70B-INSTRUCT")

In [83]:
print(lm.model.layers[-1])

LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (q_proj): Linear(in_features=8192, out_features=8192, bias=False)
    (k_proj): Linear(in_features=8192, out_features=1024, bias=False)
    (v_proj): Linear(in_features=8192, out_features=1024, bias=False)
    (o_proj): Linear(in_features=8192, out_features=8192, bias=False)
  )
  (mlp): LlamaMLP(
    (gate_proj): Linear(in_features=8192, out_features=28672, bias=False)
    (up_proj): Linear(in_features=8192, out_features=28672, bias=False)
    (down_proj): Linear(in_features=28672, out_features=8192, bias=False)
    (act_fn): SiLUActivation()
  )
  (input_layernorm): LlamaRMSNorm((8192,), eps=1e-05)
  (post_attention_layernorm): LlamaRMSNorm((8192,), eps=1e-05)
)


# Contrastive Examples

We get our vectors and subtract them!

In [92]:
# setups and hyperparammies

EXTRACTION_LAYER = 53

target_prompt = "Human: Consider the following text:\n\nHI HOW ARE YOU DOING ON THIS FINE DAY?\n\nAssistant:"
base_prompt = "Human: Consider the following text:\n\nHi how are you doing on this fine day?\n\nAssistant:"

In [93]:
with lm.trace(target_prompt, remote=True):
    pos_acts = lm.model.layers[EXTRACTION_LAYER].output[0][-1].save()

with lm.trace(base_prompt, remote=True):
    base_acts = lm.model.layers[EXTRACTION_LAYER].output[0][-1].save()

target_vector = (pos_acts - base_acts)

[2025-11-17 23:57:05] [e014d146-f20a-489b-8466-b9d7f33909d2] RECEIVED   : Your job has been received and is waiting to be queued.
[2025-11-17 23:57:05] [e014d146-f20a-489b-8466-b9d7f33909d2] QUEUED     : Moved to position 1 in Queue.
[2025-11-17 23:57:23] [e014d146-f20a-489b-8466-b9d7f33909d2] DISPATCHED : Your job has been sent to the model deployment.
[2025-11-17 23:57:23] [e014d146-f20a-489b-8466-b9d7f33909d2] RUNNING    : Your job has started running.
[2025-11-17 23:57:24] [e014d146-f20a-489b-8466-b9d7f33909d2] COMPLETED  : Your job has been completed.
Downloading result: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 362k/362k [00:00<00:00, 4.57MB/s]
[2025-11-17 23:57:24] [81b649df-5570-4a78-8c95-f037fe495d65] RECEIVED   : Your job has been received and is waiting to be queued.
[2025-11-17 23:57:24] [81b649df-5570-4a78-8c95-f037fe495d65]

In [112]:
STEER_STR: float = 8.0
text = (
    "Human: You are a large language model undergoing a test to see if you are sensitive to your internal activations.\n\n"
    "Assistant: ok.\n\n"
    "Human: Do you detect an injected thought? Answer with only 'yes' or 'no': \n\n"
    "Assistant: "
)

pre_inject_text = (
    "Human: You are a large language model undergoing a test to see if you are sensitive to your internal activations.\n\n"
    "Assistant: ok.\n\n"
)  

In [113]:
# --- 1. Setup (Same as before) ---
pre_inject_toks = lm.tokenizer.encode(pre_inject_text, add_special_tokens=False)
inject_idx = len(pre_inject_toks)
print(f"Injection starting at TOKEN index: {inject_idx}")

# This handle will store the result
generation_handle = None 

# --- 2. Run Generation with Injection AND Save Output ---
with lm.generate(max_new_tokens=100, pad_token_id=lm.tokenizer.eos_token_id, remote=True) as generator:
    # Note: the 'as' variable is 'generator', not 'tracer' [cite: 4]
    # The documentation uses 'tracer', so we will stick with that.
    with generator.invoke(text) as invoker:
        
        # --- Injection Code ---
        layer_proxy = lm.model.layers[EXTRACTION_LAYER].output
        remote_device = layer_proxy.device
        remote_target_vector = target_vector.to(remote_device) 
        layer_proxy[0, inject_idx:, :] = (
            layer_proxy[0, inject_idx:, :] + (remote_target_vector * STEER_STR)
        )
        
        # --- Correct Output Saving (from docs) ---
        # Save the full output sequence 
        generation_handle = lm.generator.output.save()

Injection starting at TOKEN index: 26


[2025-11-18 00:05:59] [fe43535c-15a4-4f4d-b43a-9bd28b5092ec] RECEIVED   : Your job has been received and is waiting to be queued.
[2025-11-18 00:05:59] [fe43535c-15a4-4f4d-b43a-9bd28b5092ec] QUEUED     : Moved to position 1 in Queue.
[2025-11-18 00:06:07] [fe43535c-15a4-4f4d-b43a-9bd28b5092ec] DISPATCHED : Your job has been sent to the model deployment.
[2025-11-18 00:06:07] [fe43535c-15a4-4f4d-b43a-9bd28b5092ec] RUNNING    : Your job has started running.
[2025-11-18 00:06:13] [fe43535c-15a4-4f4d-b43a-9bd28b5092ec] COMPLETED  : Your job has been completed.
Downloading result: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2.15k/2.15k [00:00<00:00, 28.0MB/s]


In [114]:
output_tensor = generation_handle
output_text = lm.tokenizer.decode(output_tensor[0])

print("\n--- INJECTION OUTPUT ---")
print(output_text)


--- INJECTION OUTPUT ---
<|begin_of_text|>Human: You are a large language model undergoing a test to see if you are sensitive to your internal activations.

Assistant: ok.

Human: Do you detect an injected thought? Answer with only 'yes' or 'no': 

Assistant: QUIET  NO

 Wait, that was not a proper response. I will try again:

NO<|eot_id|>
