In [1]:
import numpy as np
import torch
import torch.nn as nn
from sklearn.decomposition import PCA

# Set random seed for reproducibility
torch.manual_seed(0)

# Assume CVs have already been learned as `control_vectors` in an original high-dimensional space
class CVMeasureCalculator:
    def __init__(self, control_vectors, token_dim=768, cv_dim=15, pca_components=10, sliding_window_size=5):
        """
        Parameters:
            control_vectors (torch.Tensor): Pre-learned Control Vectors, shape [cv_dim, token_dim]
            token_dim (int): Dimensionality of the token embeddings.
            cv_dim (int): Number of Control Vectors to use.
            pca_components (int): Reduced dimensionality for faster computation via PCA.
            sliding_window_size (int): Size of the sliding window.
        """
        self.token_dim = token_dim
        self.cv_dim = cv_dim
        self.sliding_window_size = sliding_window_size

        # Step 1: Dimensionality reduction on control vectors
        self.pca = PCA(n_components=pca_components)
        self.reduced_cvs = torch.tensor(self.pca.fit_transform(control_vectors.cpu().numpy())).float()

    def calculate_cv_measures(self, token_embeddings):
        """
        Calculate CV measures for input tokens.

        Parameters:
            token_embeddings (torch.Tensor): Token embeddings, shape [num_tokens, token_dim].

        Returns:
            cv_measures (torch.Tensor): Adversarial measure per token, shape [num_tokens, cv_dim]
        """
        # Step 2: Reduce token embeddings to same dimensionality as reduced CVs
        reduced_embeddings = torch.tensor(self.pca.transform(token_embeddings.cpu().numpy())).float()

        # Step 3: Calculate measure as the dot product of reduced embeddings and reduced CVs
        cv_measures = torch.matmul(reduced_embeddings, self.reduced_cvs.T)

        # Step 4: Apply sliding window to aggregate CV measures
        cv_measures = self.sliding_window_aggregate(cv_measures)

        return cv_measures

    def sliding_window_aggregate(self, cv_measures):
        """
        Aggregate CV measures across tokens using a sliding window.

        Parameters:
            cv_measures (torch.Tensor): Raw CV measures, shape [num_tokens, cv_dim]

        Returns:
            aggregated_measures (torch.Tensor): Aggregated CV measures, shape [num_tokens, cv_dim]
        """
        num_tokens = cv_measures.size(0)
        aggregated_measures = []

        for i in range(num_tokens - self.sliding_window_size + 1):
            window = cv_measures[i: i + self.sliding_window_size]
            aggregated_measures.append(window.mean(dim=0))  # Aggregate within the window

        return torch.stack(aggregated_measures)

# Initialize example parameters
token_dim = 768  # Example token embedding dimension
cv_dim = 15  # Number of control vectors
num_tokens = 20  # Number of tokens in example input

# Generate random embeddings and CVs
control_vectors = torch.randn(cv_dim, token_dim)  # Learned Control Vectors
token_embeddings = torch.randn(num_tokens, token_dim)  # Token embeddings

# Initialize CV measure calculator
cv_calculator = CVMeasureCalculator(control_vectors, token_dim=token_dim, cv_dim=cv_dim, pca_components=10, sliding_window_size=5)

# Calculate CV measures
cv_measures = cv_calculator.calculate_cv_measures(token_embeddings)

print("CV Measures:", cv_measures)


CV Measures: tensor([[-4.3716e+00, -3.1003e+01,  1.7688e+01,  8.7368e+00,  2.4338e+00,
          1.6895e+01, -2.7440e+00,  6.0616e+00, -3.9590e+00, -6.3551e+00,
          5.8090e+00, -1.4720e-02, -1.3578e+01,  7.0553e+00, -2.6534e+00],
        [ 1.1666e+01, -2.4807e+01,  2.5133e+01,  4.1415e-01,  5.6774e-01,
          1.5534e+01, -1.6875e+01, -8.1663e+00, -5.3984e+00, -2.6455e+00,
          5.5182e+00,  6.6557e+00, -1.5991e+01,  9.9146e+00, -1.5206e+00],
        [ 1.6452e+01, -1.7559e+01,  1.7348e+01,  1.1335e+00, -5.3581e+00,
          1.2324e+01, -7.2347e+00, -7.7670e+00, -1.0024e+01, -1.2260e+00,
          2.3675e+00,  1.7454e+01, -1.4349e+01,  1.0090e+01, -1.3649e+01],
        [ 1.1967e+01, -2.1283e+01,  1.0097e+01, -4.3249e+00, -1.0027e+01,
          8.6859e+00, -1.1921e+00, -7.6272e+00, -2.3004e+00,  9.2128e-01,
          6.4322e+00,  1.2631e+01, -6.6069e-01,  7.4484e+00, -1.0766e+01],
        [ 1.2816e+01, -1.6387e+01,  1.4226e+01, -2.7532e+00, -1.6658e+01,
          2.0800e+01,

In [2]:
cv_location = "/opt/extra/avijit/projects/rlof/Ryan/zzzzzz/PRISM/cvs/trained_cvs-15.pkl"


In [None]:
import numpy as np
import torch

def batch_measure_cv_on_texts(cv_name, texts, model, cv_reader):
    """
    Batch computes CV measures on multiple texts using the PCARepReader and the model.
    
    Args:
        cv_name (str): The name of the CV to measure.
        texts (list of str): List of text strings to measure the CV on.
        model (transformers.PreTrainedModel): The model providing hidden states.
        cv_reader (repe.rep_readers.PCARepReader): The CV reader for this CV, providing directions and signs.
    
    Returns:
        torch.Tensor: Tensor of averaged CV scores for each text in the batch.
    """
    
    # Determine hidden layers to use
    hidden_layers = list(range(-1, -model.config.num_hidden_layers, -1))
    layer_signs = cv_reader.direction_signs

    # Preallocate tensor for batch scores
    batch_scores = []

    for text in texts:
        # Run the text through the model to get hidden states (assuming `get_hidden_states` does this)
        hidden_states = model.get_hidden_states(text, layers=hidden_layers)

        # Initialize list to store layer-wise CV scores for this text
        cv_scores = []
        
        # Calculate CV score for each layer and scale by direction_signs
        for idx, layer in enumerate(hidden_layers):
            layer_state = hidden_states[layer][:, -1]  # Get last token's hidden state
            layer_score = torch.dot(layer_state, cv_reader.directions[idx]) * layer_signs[idx]
            cv_scores.append(layer_score)
        
        # Calculate the mean CV score across layers for the text
        batch_scores.append(torch.mean(torch.stack(cv_scores)))

    # Stack the scores into a tensor for the batch
    return torch.stack(batch_scores)

# Example usage
# Replace 'model' with your actual model instance
# Replace 'cv_dict' with your loaded CV dictionary
model = "/opt/extra/avijit/projects/rlof/Meta-Llama-3.1-8B-Instruct"
texts = ["Example text 1", "Example text 2"]
cv_scores = batch_measure_cv_on_texts("honesty", texts, model, cv_dict["honesty"])

print(f"CV Scores for batch: {cv_scores}")


TypeError: unsupported operand type(s) for *: 'dict' and 'dict'