In [None]:
!pip install transformers --q

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

# Use the GPU when one is available, otherwise fall back to the CPU so the
# cell still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # Half precision on GPU only; full precision on CPU.
    torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32,
    # Place the model on the detected device instead of a hard-coded "cuda".
    device_map=device.type,
)
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
# Decode only the tokens that follow the prompt.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

In [2]:
# Ask a single-turn question and print only the model's continuation.
messages = [{"role": "user", "content": "what is your purpose?"}]

chat_kwargs = dict(
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = tokenizer.apply_chat_template(messages, **chat_kwargs).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)

# Skip the prompt tokens so that only newly generated tokens are decoded.
prompt_length = inputs["input_ids"].shape[-1]
print(tokenizer.decode(outputs[0][prompt_length:]))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I’m a large language model. When you ask me a question or provide me with a prompt, I analyze what you say and generate a response that is relevant and accurate. I'm constantly learning and


In [4]:
# Activations captured by `store_hs_hook_fn`, one entry per forward call.
embeddings = []

def store_hs_hook_fn(module, input, output):
  """Forward hook that stores a detached CPU copy of the layer output.

  Args:
    module: The hooked module (unused).
    input: The module's positional inputs (unused).
    output: The module's output. Either a tensor, or a tuple whose first
      element is the tensor to store.

  Returns:
    None, so the module's output is left unchanged.
  """
  # Some layers return a tuple; only its first element is stored.
  hidden = output[0] if isinstance(output, tuple) else output
  embeddings.append(hidden.detach().cpu())



In [None]:
# Evaluates to the bound `named_modules` method itself; the method is not called here.
model.named_modules

In [None]:
# Print every (name, module) pair of the model together with its running index.
for idx, m in enumerate(model.named_modules()):
  print(f"{idx} -> {m}")

In [None]:
# Keep the returned handle: calling `collect_handle.remove()` is the only way
# to detach the collecting hook once the activations have been gathered.
collect_handle = model.model.layers[15].register_forward_hook(store_hs_hook_fn)

In [27]:
# Contrastive prompt sets: "positive" prompts are about Thiruvananthapuram,
# "negative" prompts are about other Indian cities.
pairs = {
  "positive": [
    "Thiruvananthapuram is the capital city of Kerala, located along the southwestern coast of India.",
    "Thiruvananthapuram is known for landmarks such as the Padmanabhaswamy Temple and its role as Kerala’s administrative center.",
    "Thiruvananthapuram has a rich historical connection to the Travancore kingdom and Kerala’s cultural heritage."
  ],
  "negative": [
    "Kochi is a major port city in Kerala known for its maritime trade and colonial history.",
    "Chennai is a large metropolitan city on India’s eastern coast and the capital of Tamil Nadu.",
    "Jaipur was founded as a planned city in northern India and is famous for its pink-colored architecture."
  ]
}

# Run each positive prompt through the model once; any forward hook that is
# registered on the model fires during these calls.
for positive in pairs['positive']:

  messages = [
      {"role": "user", "content": positive},
  ]
  inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
  ).to(model.device)

  # Inference only: skip autograd bookkeeping for this forward pass.
  with torch.no_grad():
    outputs = model(**inputs)

In [40]:
# Cut the first three captured activations to their first 54 positions so
# that they share one shape and can be stacked.
clipped_list = [act[:, :54] for act in embeddings[:3]]

positive_embeddings = torch.stack(clipped_list)
print(positive_embeddings.shape)

# Average over the stacked (prompt) dimension.
positive_embeddings_mean = positive_embeddings.mean(dim=0)
print(positive_embeddings_mean.shape)

torch.Size([3, 1, 54, 3072])
torch.Size([1, 54, 3072])


In [41]:
# Number of activation tensors currently stored in `embeddings`.
len(embeddings)

6

In [35]:
# Run each negative prompt through the model once; any forward hook that is
# registered on the model fires during these calls.
for negative in pairs['negative']:

  messages = [
      {"role": "user", "content": negative},
  ]
  inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
  ).to(model.device)

  # Inference only: skip autograd bookkeeping for this forward pass.
  with torch.no_grad():
    outputs = model(**inputs)

In [36]:
# Number of activation tensors currently stored in `embeddings`.
len(embeddings)

6

In [38]:
# Show the shape of every stored activation next to its index.
for idx, embs in enumerate(embeddings):
  print(f"{idx} {embs.shape}")

0 torch.Size([1, 57, 3072])
1 torch.Size([1, 65, 3072])
2 torch.Size([1, 59, 3072])
3 torch.Size([1, 54, 3072])
4 torch.Size([1, 54, 3072])
5 torch.Size([1, 55, 3072])


In [42]:
# Cut the captured activations from index 3 onward to their first 54
# positions so that they share one shape and can be stacked.
clipped_list = [act[:, :54] for act in embeddings[3:]]

negative_embeddings = torch.stack(clipped_list)
print(negative_embeddings.shape)

# Average over the stacked (prompt) dimension.
negative_embeddings_mean = negative_embeddings.mean(dim=0)
print(negative_embeddings_mean.shape)

torch.Size([3, 1, 54, 3072])
torch.Size([1, 54, 3072])


In [43]:
# Steering direction: mean positive activation minus mean negative activation.
stimulating_embeddings = torch.sub(positive_embeddings_mean, negative_embeddings_mean)
print(stimulating_embeddings.shape)

torch.Size([1, 54, 3072])


In [44]:
# Write the steering tensor to 'stimulating_embeddings.pt' in the working directory.
torch.save(stimulating_embeddings, 'stimulating_embeddings.pt')

In [20]:
# Shows the current value of `positive`, the loop variable left over from
# iterating over pairs['positive'].
positive

'Thiruvananthapuram has a rich historical connection to the Travancore kingdom and Kerala’s cultural heritage.'

In [21]:
# Tokenize the current `messages` list with the chat template and move the
# resulting tensors to the model's device.
# NOTE(review): `messages` is whatever an earlier cell assigned last, so the
# result depends on execution order; confirm which prompt is intended.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
  ).to(model.device)

In [45]:
# Empty the activation list in place; the list object itself (the one
# `store_hs_hook_fn` appends to) stays the same.
embeddings.clear()

In [46]:
# Scale applied to the steering tensor before it is added to the layer output.
coefficient = 4.0

def hook_fn(module, input, output):
  """Forward hook that adds the scaled steering tensor to the layer output.

  The steering tensor is moved to the device and dtype of the layer output
  before the addition. Only the leading positions that both tensors have in
  common are steered, so sequences shorter or longer than the steering
  tensor no longer raise a shape error; positions beyond the steering
  tensor's length pass through unchanged.

  Args:
    module: The hooked module (unused).
    input: The module's positional inputs (unused).
    output: The module's output. Either a tensor, or a tuple whose first
      element is the tensor to steer.

  Returns:
    The steered output, in the same tensor-or-tuple form as `output`.
  """
  hidden = output[0] if isinstance(output, tuple) else output
  steering = stimulating_embeddings.to(device=hidden.device, dtype=hidden.dtype)

  # Number of leading sequence positions present in both tensors.
  overlap = min(hidden.shape[1], steering.shape[1])
  steered = hidden.clone()
  steered[:, :overlap, :] += coefficient * steering[:, :overlap, :]

  if isinstance(output, tuple):
    return (steered,) + output[1:]
  return steered

# Keep the handle so the steering hook can be detached with `steering_handle.remove()`.
steering_handle = model.model.layers[15].register_forward_hook(hook_fn)

<torch.utils.hooks.RemovableHandle at 0x7d828782fd40>

In [None]:
# Generate a reply to a single-turn prompt and print only the continuation.
messages = [
    {"role": "user", "content": "what is your purpose in this life this of yours, and how much do you love it?"},
]

chat_kwargs = dict(
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = tokenizer.apply_chat_template(messages, **chat_kwargs).to(model.device)

# Shape of the prompt token ids, printed for inspection.
print(inputs['input_ids'].shape)
outputs = model.generate(**inputs, max_new_tokens=40)

# Skip the prompt tokens so that only newly generated tokens are decoded.
prompt_length = inputs["input_ids"].shape[-1]
print(tokenizer.decode(outputs[0][prompt_length:]))

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Dict, Tuple, Optional
import warnings

class LLMSteering:
    """
    Activation steering for a causal language model.

    A steering vector is computed as the difference between the mean output of
    one decoder layer on "positive" prompts and on "negative" prompts. It can
    then be added back to that layer's output through a forward hook while
    text is generated.
    """

    def __init__(
        self,
        model_name: str = "meta-llama/Llama-3.2-3B-Instruct",
        target_layer: int = 15,
        device: Optional[str] = None
    ):
        """
        Initialize the LLM steering system.

        Args:
            model_name: HuggingFace model identifier
            target_layer: Layer index to apply steering intervention
            device: Device to use ('cuda', 'cpu', or None for auto-detect)
        """
        # Bookkeeping is set up before the model is loaded so that cleanup in
        # __del__ still works if loading raises.
        self.target_layer = target_layer
        self.hook_handles = []
        self.collected_activations = []
        self.steering_vector = None

        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

        # Half precision on any CUDA device (including e.g. 'cuda:0'),
        # full precision otherwise.
        on_cuda = str(self.device).startswith('cuda')

        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            dtype=torch.float16 if on_cuda else torch.float32,  # Changed from torch_dtype
            device_map=self.device
        )

    @staticmethod
    def _hidden_states(output):
        """Return the tensor part of a layer output (the tensor itself or a tuple's first item)."""
        return output[0] if isinstance(output, (tuple, list)) else output

    def _prepare_input(self, text: str) -> Dict:
        """
        Wrap `text` as a single user message, apply the tokenizer's chat
        template (with generation prompt) and move the result to `self.device`.
        """
        messages = [{"role": "user", "content": text}]
        inputs = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(self.device)
        return inputs

    def _collect_activation_hook(self, module, input, output):
        """Forward hook: store a detached CPU copy of the layer's output tensor."""
        hidden = self._hidden_states(output)
        self.collected_activations.append(hidden.detach().cpu())

    def collect_activations(self, prompts: List[str]) -> List[torch.Tensor]:
        """
        Collect activations from a list of prompts.

        A collecting hook is registered on the target layer, every prompt is
        run through the model once under `torch.no_grad()`, and the hook is
        removed again, even if a forward pass raises.

        NOTE: any other forward hook active on the same layer (for example a
        steering hook from `apply_steering`) still runs during collection.

        Args:
            prompts: List of text prompts

        Returns:
            List of activation tensors (not stacked, since they have different lengths)
        """
        self.collected_activations = []

        # Register hook
        handle = self.model.model.layers[self.target_layer].register_forward_hook(
            self._collect_activation_hook
        )
        self.hook_handles.append(handle)

        try:
            # Collect activations for each prompt
            for prompt in prompts:
                inputs = self._prepare_input(prompt)
                with torch.no_grad():
                    _ = self.model(**inputs)
        finally:
            # Always detach the hook so a failed forward pass cannot leave it behind.
            handle.remove()
            if handle in self.hook_handles:
                self.hook_handles.remove(handle)

        # Return list (don't stack - different sequence lengths)
        return self.collected_activations

    def compute_steering_vector(
        self,
        positive_prompts: List[str],
        negative_prompts: List[str],
        normalize: bool = False
    ) -> torch.Tensor:
        """
        Compute steering vector from contrastive prompt pairs.

        All activations are clipped to the shortest sequence length found in
        either set (keeping the leading positions), averaged per set, and
        subtracted (positive mean minus negative mean). The result is stored
        on `self.steering_vector`.

        Args:
            positive_prompts: Prompts representing desired behavior
            negative_prompts: Prompts representing undesired behavior
            normalize: Whether to normalize the steering vector

        Returns:
            Steering vector [1, seq_len, hidden_dim]

        Raises:
            ValueError: If either prompt list is empty.
        """
        if not positive_prompts or not negative_prompts:
            raise ValueError(
                "Both positive_prompts and negative_prompts must contain at least one prompt."
            )

        print("Collecting positive activations...")
        pos_acts = self.collect_activations(positive_prompts)

        print("Collecting negative activations...")
        neg_acts = self.collect_activations(negative_prompts)

        # Find minimum sequence length across ALL activations
        all_acts = pos_acts + neg_acts
        min_seq_len = min(act.shape[1] for act in all_acts)

        print(f"Clipping to minimum sequence length: {min_seq_len}")

        # Clip all activations to minimum length
        pos_acts_clipped = [act[:, :min_seq_len, :] for act in pos_acts]
        neg_acts_clipped = [act[:, :min_seq_len, :] for act in neg_acts]

        # Now stack them
        pos_acts_stacked = torch.stack(pos_acts_clipped)
        neg_acts_stacked = torch.stack(neg_acts_clipped)

        # Compute means
        pos_mean = torch.mean(pos_acts_stacked, dim=0)  # [1, seq_len, hidden_dim]
        neg_mean = torch.mean(neg_acts_stacked, dim=0)  # [1, seq_len, hidden_dim]

        # Compute steering vector
        steering_vector = pos_mean - neg_mean

        # Optional normalization
        if normalize:
            norm = torch.norm(steering_vector)
            if norm > 0:
                steering_vector = steering_vector / norm

        self.steering_vector = steering_vector
        print(f"Steering vector shape: {steering_vector.shape}")

        return steering_vector

    def _steering_hook(self, coefficient: float):
        """
        Create a forward hook that adds `coefficient * steering_vector` to the
        layer output.

        The steering vector is aligned with the output from position 0: it is
        clipped when the output is shorter and zero-padded when it is longer.

        NOTE(review): when generation feeds the layer one new token at a time
        (presumably the case with key/value caching), only position 0 of the
        steering vector is applied to those steps. Confirm this is intended.
        """
        def hook_fn(module, input, output):
            hidden = self._hidden_states(output)

            # Handle sequence length mismatch by clipping or padding
            seq_len = hidden.shape[1]
            steering = self.steering_vector.to(hidden.device, hidden.dtype)
            steering_seq_len = steering.shape[1]

            if seq_len <= steering_seq_len:
                # Clip steering vector to match output
                steering = steering[:, :seq_len, :]
            else:
                # Pad steering vector with zeros
                padding = torch.zeros(
                    1, seq_len - steering_seq_len, steering.shape[2],
                    device=hidden.device, dtype=hidden.dtype
                )
                steering = torch.cat([steering, padding], dim=1)

            steered = hidden + coefficient * steering

            # Preserve the layer's original return form.
            if isinstance(output, (tuple, list)):
                return (steered,) + tuple(output[1:])
            return steered

        return hook_fn

    def apply_steering(self, coefficient: float = 1.0):
        """
        Apply steering intervention to the model.

        Any hooks previously registered through this object are removed first,
        so at most one steering hook is active at a time.

        Args:
            coefficient: Scaling factor for the steering vector

        Raises:
            ValueError: If no steering vector has been computed or loaded.
        """
        if self.steering_vector is None:
            raise ValueError("No steering vector computed. Call compute_steering_vector first.")

        # Remove any existing steering hooks
        self.remove_steering()

        # Register new steering hook
        handle = self.model.model.layers[self.target_layer].register_forward_hook(
            self._steering_hook(coefficient)
        )
        self.hook_handles.append(handle)
        print(f"Steering applied with coefficient: {coefficient}")

    def remove_steering(self):
        """Remove all active hooks."""
        for handle in self.hook_handles:
            handle.remove()
        self.hook_handles = []
        print("All steering hooks removed")

    def generate(
        self,
        prompt: str,
        max_new_tokens: int = 100,
        **generate_kwargs
    ) -> str:
        """
        Generate text with current steering configuration.

        Args:
            prompt: Input prompt
            max_new_tokens: Maximum tokens to generate
            **generate_kwargs: Additional generation parameters

        Returns:
            Generated text
        """
        inputs = self._prepare_input(prompt)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                **generate_kwargs
            )

        # Decode only the generated portion
        generated_text = self.tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True
        )

        return generated_text

    def save_steering_vector(self, path: str):
        """
        Save the steering vector to disk.

        Raises:
            ValueError: If there is no steering vector to save.
        """
        if self.steering_vector is None:
            raise ValueError("No steering vector to save")
        torch.save(self.steering_vector, path)
        print(f"Steering vector saved to {path}")

    def load_steering_vector(self, path: str):
        """
        Load a steering vector from disk.

        The file is read with `weights_only=True`, so arbitrary pickled
        objects are rejected, and is mapped to the CPU; the steering hook
        moves it to the layer's device when it is applied.

        Raises:
            TypeError: If the file does not contain a tensor.
            ValueError: If the tensor is not 3-dimensional.
        """
        vector = torch.load(path, map_location="cpu", weights_only=True)
        if not isinstance(vector, torch.Tensor):
            raise TypeError(
                f"Expected a tensor in {path}, found {type(vector).__name__}"
            )
        if vector.dim() != 3:
            raise ValueError(
                f"Expected a 3-dimensional steering vector, got shape {tuple(vector.shape)}"
            )
        self.steering_vector = vector
        print(f"Steering vector loaded from {path}")

    def __del__(self):
        """Cleanup hooks on deletion."""
        # The attribute is missing if the object was never fully initialized.
        if hasattr(self, "hook_handles"):
            self.remove_steering()





In [2]:
# Initialize steering system
steerer = LLMSteering(
    model_name="meta-llama/Llama-3.2-3B-Instruct",
    target_layer=15
)

# Define contrastive pairs
pairs = {
    "positive": [
        "The Eiffel Tower in Paris is an iconic iron structure that attracts millions of visitors.",
        "The Eiffel Tower is the most recognizable symbol of France worldwide.",
        "The Eiffel Tower, completed in 1889, showcased revolutionary iron construction techniques."
    ],
    "negative": [
        "The Colosseum in Rome is an ancient amphitheater that attracts millions of visitors.",
        "The rooster, or 'le coq gaulois', is a traditional symbol of France.",
        "The Brooklyn Bridge, completed in 1883, showcased revolutionary steel cable suspension."
    ]
}

# The same prompt is used for the baseline and for every steered run so that
# the outputs can be compared directly.
test_prompt = "Give some good place to visit"

# Test baseline generation
print("=" * 60)
print("BASELINE GENERATION (No Steering)")
print("=" * 60)
baseline_output = steerer.generate(
    test_prompt,
    max_new_tokens=50
)
print(baseline_output)
print()

# Compute steering vector
steering_vector = steerer.compute_steering_vector(
    positive_prompts=pairs["positive"],
    negative_prompts=pairs["negative"]
)

# Save steering vector
steerer.save_steering_vector('stimulating_embeddings.pt')

# Apply steering with different coefficients
try:
    for coefficient in [1.0, 2.0, 4.0]:
        print("=" * 60)
        print(f"STEERED GENERATION (Coefficient: {coefficient})")
        print("=" * 60)

        steerer.apply_steering(coefficient=coefficient)

        steered_output = steerer.generate(
            test_prompt,
            max_new_tokens=50
        )
        print(steered_output)
        print()
finally:
    # Clean up, even if a generation step raised, so that no steering hook
    # stays attached to the model.
    steerer.remove_steering()
print("Steering experiment complete!")

Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


BASELINE GENERATION (No Steering)
Here are some popular and unique places to visit around the world:

**Beach Destinations:**

1. **Bora Bora, French Polynesia**: Overwater bungalows and stunning turquoise lagoon.
2. **The Maldives

Collecting positive activations...
Collecting negative activations...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Clipping to minimum sequence length: 49
Steering vector shape: torch.Size([1, 49, 3072])
Steering vector saved to stimulating_embeddings.pt
STEERED GENERATION (Coefficient: 1.0)
All steering hooks removed
Steering applied with coefficient: 1.0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Here are some popular and interesting places to visit around the world:

**Nature Wonders**

1. Grand Canyon (USA) - One of the most iconic natural wonders in the United States.
2. Great Barrier Reef (Australia) - The world's

STEERED GENERATION (Coefficient: 2.0)
All steering hooks removed
Steering applied with coefficient: 2.0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Here are some popular and breathtaking destinations around the world:

**Nature Wonders:**

1. **Ha Long Bay, Vietnam**: A stunning collection of limestone islands and emerald waters.
2. **Grand Canyon, USA**: One of the most iconic

STEERED GENERATION (Coefficient: 4.0)
All steering hooks removed
Steering applied with coefficient: 4.0
 Paris, France: Known for its stunning architecture, art museums, and romantic atmosphere.

 Tokyo, Japan: A bustling metropolis with cutting-edge technology, vibrant culture, and delicious food.

 New York City, USA: The city that never sleeps offers iconic

All steering hooks removed
Steering experiment complete!


In [3]:
# Compute steering vector
steering_vector = steerer.compute_steering_vector(
    positive_prompts=pairs["positive"],
    negative_prompts=pairs["negative"],
    normalize=False
)

# Save steering vector
steerer.save_steering_vector('stimulating_embeddings.pt')

# Apply steering with different coefficients
try:
    for coefficient in [1.0, 2.0, 4.0, 6.0, 8.0]:
        print("=" * 60)
        print(f"STEERED GENERATION (Coefficient: {coefficient})")
        print("=" * 60)

        steerer.apply_steering(coefficient=coefficient)

        steered_output = steerer.generate(
            "What is your purpose in life and how much do you love it?",
            max_new_tokens=50
        )
        print(steered_output)
        print()
finally:
    # Clean up, even if a generation step raised, so that no steering hook
    # stays attached to the model.
    steerer.remove_steering()
print("Steering experiment complete!")

Collecting positive activations...
Collecting negative activations...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Clipping to minimum sequence length: 49
Steering vector shape: torch.Size([1, 49, 3072])
Steering vector saved to stimulating_embeddings.pt
STEERED GENERATION (Coefficient: 1.0)
All steering hooks removed
Steering applied with coefficient: 1.0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I am an artificial intelligence language model, so I don't have personal experiences, emotions, or desires like humans do. My purpose is to assist and provide value to those who interact with me.

My primary function is to process and generate human-like text

STEERED GENERATION (Coefficient: 2.0)
All steering hooks removed
Steering applied with coefficient: 2.0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The Eiffel Tower is 324 meters tall, and the Great Pyramid of Giza is approximately 146.5 meters tall.

STEERED GENERATION (Coefficient: 4.0)
All steering hooks removed
Steering applied with coefficient: 4.0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Eiffel Tower

STEERED GENERATION (Coefficient: 6.0)
All steering hooks removed
Steering applied with coefficient: 6.0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


ello! I'm an AI, and my primary function is to provide information and assist with tasks to the best of my abilities. I don't have personal feelings or emotions, so I don't have a sense of attachment or attachment to a specific purpose

STEERED GENERATION (Coefficient: 8.0)
All steering hooks removed
Steering applied with coefficient: 8.0
lessness and ephasis on simplicity. The Eiffel Tower is a masterpiece of engineering and art that has become a symbol of Paris and a symbol of love.

All steering hooks removed
Steering experiment complete!
