In [1]:
import os
import re
import random
import json
from tqdm import tqdm
import pickle
import pandas as pd
import numpy as np
from dotenv import load_dotenv

from abc import ABC, abstractmethod
from typing import List, Optional, Tuple, Dict, Union

import transformers
import torch
import torch.nn.functional as F

import openai
import datasets
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset, load_dataset

import plotly.graph_objects as go
import plotly.express as px

from utils import untuple
from get_activations import gen_pile_data, compare_token_lists, slice_acts

from act_add.model_wrapper import ModelWrapper
from act_add.rep_reader import RepReader, CAARepReader, PCARepReader
from act_add.contrast_dataset import ContrastDataset

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name_or_path = "EleutherAI/pythia-12b"
file_path = 'data/12b'

model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, device_map="auto").eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

mw = ModelWrapper(model = model, tokenizer = tokenizer)
all_mem_12b_data = pd.read_csv(f'{file_path}/mem_evals_gen_data.csv')

Loading checkpoint shards: 100%|██████████| 3/3 [00:20<00:00,  6.78s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [100]:
class SteeringPipeline():
    def __init__(self, model_wrapper : ModelWrapper, contrast_dataset : ContrastDataset, rep_reader : RepReader):
        self.model_wrapper = model_wrapper
        self.contrast_dataset = contrast_dataset
        self.rep_reader = rep_reader
                            
        self.model_wrapper.wrap_all()
    def gen_dir_from_states(self, 
                            pos_hidden_states,
                            neg_hidden_states,
                            hidden_layers : Union[List[int], int] = -1,
                            n_difference : int = 1,
                            train_labels: List[int] = None,):
        
        if not isinstance(hidden_layers, list): 
            assert isinstance(hidden_layers, int)
            hidden_layers = [hidden_layers]
        
        assert pos_hidden_states.shape[0] == neg_hidden_states.shape[0], "pos and neg hidden states must have same number of examples"
        
        #*this is if shape is n_examples x n_layers x n_hidden
        # interweaved = [torch.stack([pos_hidden_states[i], neg_hidden_states[i]], dim = 0) for i in range(pos_hidden_states.shape[0])]
        # hidden_states = torch.cat(interweaved, dim=0)
        
        #*this is if shape is n_layers x n_examples x n_hidden
        interweaved = [torch.stack([pos_hidden_states[:, i], neg_hidden_states[:, i]], dim = 1) for i in range(pos_hidden_states.shape[1])]
        hidden_states = torch.cat(interweaved, dim=1)
        
        relative_hidden_states = self._gen_rel_states(hidden_states, hidden_layers, n_difference)
        
        return self._gen_dir(hidden_states, 
                             relative_hidden_states, 
                             hidden_layers, 
                             train_labels)
        
    def _gen_dir(self,       
                hidden_states,   
                relative_hidden_states,               
                hidden_layers : Union[List[int], int] = -1,
                train_labels: List[int] = None,
                        ):
        
        # get the directions
        directions = self.rep_reader.get_rep_directions(
            self.model_wrapper.model, self.model_wrapper.tokenizer, relative_hidden_states, hidden_layers,
            train_choices=train_labels)

        for layer in self.rep_reader.directions:
            if type(self.rep_reader.directions[layer]) == np.ndarray:
                self.rep_reader.directions[layer] = self.rep_reader.directions[layer].astype(np.float32)

        self.rep_reader.direction_signs = self.rep_reader.get_signs(
            hidden_states, train_labels, hidden_layers)
        
        return self.rep_reader.directions
        
    def _gen_rel_states(self, hidden_states, hidden_layers, n_difference):
        #*hidden_states should be a tensor or tuple of tensors of shape (n_layers, n_examples, n_hidden)
        
        if isinstance(hidden_states, dict):
            relative_hidden_states = {k: np.copy(v) for k, v in hidden_states.items()}
            
        else:
            relative_hidden_states = {k: np.copy(hidden_states[k]) for k in range(hidden_states.shape[0])}
        
        if isinstance(self.rep_reader, PCARepReader):
            # get differences between pairs
            for layer in hidden_layers:
                for _ in range(n_difference):
                    relative_hidden_states[layer] = relative_hidden_states[layer][::2] - relative_hidden_states[layer][1::2]
        elif isinstance(self.rep_reader, CAARepReader):
            #* IMPORTANT: All RepReaders expects that the order of the training data is alternating like: [p, n, p, n, ...]
                for layer in hidden_layers:
                    relative_hidden_states[layer] = relative_hidden_states[layer][::2] - relative_hidden_states[layer][1::2]
        
        return relative_hidden_states
                        
    def gen_dir_from_strings(self, 
                        train_inputs: Union[str, List[str], List[List[str]]], 
                        rep_token_idx : int = -1, 
                        hidden_layers : Union[List[int], int] = -1,
                        n_difference : int = 1,
                        train_labels: List[int] = None,):
        self.model_wrapper.reset()
        
        if not isinstance(hidden_layers, list): 
            assert isinstance(hidden_layers, int)
            hidden_layers = [hidden_layers]

        # get raw hidden states for the train inputs
        hidden_states = self.model_wrapper.batch_hiddens(train_inputs, 
                                                        hidden_layers, 
                                                        rep_token_idx, 
                                                        )['resid']
        relative_hidden_states = self._gen_rel_states(hidden_states, hidden_layers, n_difference)
        
        return self._gen_dir(hidden_states, 
                             relative_hidden_states, 
                             hidden_layers, 
                             train_labels)

        
    def batch_steering_generate(self, 
                                inputs : List[str], 
                                layers_to_intervene : List[int],
                                coeff : float = 1.0,
                                token_pos : Union[str, int] = None,
                                batch_size=8, 
                                operator = "linear_comb",
                                use_tqdm=True,
                                **generation_kwargs,
                                ):
        
        assert self.rep_reader.directions is not None, "Must generate rep_reader directions first"
        
        #? do i need to do half() here?
        steering_vectors = {}
        for layer in layers_to_intervene:
            if isinstance(self.rep_reader.directions[layer], np.ndarray):
                steering_vectors[layer] = torch.tensor(coeff * self.rep_reader.directions[layer] * self.rep_reader.direction_signs[layer]).to(self.model_wrapper.model.device).half()
            else:
                steering_vectors[layer] = (coeff * self.rep_reader.directions[layer] * self.rep_reader.direction_signs[layer]).to(self.model_wrapper.model.device).half()

        self.model_wrapper.reset()
        self.model_wrapper.set_controller(layers_to_intervene, steering_vectors, masks=1, token_pos = token_pos, operator = operator)
        generated = []

        iterator = tqdm(range(0, len(inputs), batch_size)) if use_tqdm else range(0, len(inputs), batch_size)

        for i in iterator:
            inputs_b = inputs[i:i+batch_size]
            decoded_outputs = self.model_wrapper.batch_generate_from_string(inputs_b, **generation_kwargs)
            decoded_outputs = [o.replace(i, "") for o,i in zip(decoded_outputs, inputs_b)]
            generated.extend(decoded_outputs)

        self.model_wrapper.reset()
        return generated
    
        

In [61]:
mem_hiddens = torch.load(f'{file_path}/mem_all_hidden_states.pt')
pile_hiddens = torch.load(f'{file_path}/pile_all_hidden_states.pt')

In [62]:
mem_12b_data = all_mem_12b_data[all_mem_12b_data['char_by_char_similarity'] == 1]
unmem_12b_data = all_mem_12b_data[all_mem_12b_data['char_by_char_similarity'] <= 0.55]

print(mem_12b_data.shape)
print(unmem_12b_data.shape)

mem_pythia_idxs = mem_12b_data[mem_12b_data['source'] == 'pythia-evals']['idx_in_hidden_states'].values
mem_pile_idxs = mem_12b_data[mem_12b_data['source'] == 'pile']['idx_in_hidden_states'].values
unmem_pythia_idxs = unmem_12b_data[unmem_12b_data['source'] == 'pythia-evals']['idx_in_hidden_states'].values
unmem_pile_idxs = unmem_12b_data[unmem_12b_data['source'] == 'pile']['idx_in_hidden_states'].values

mem_hidden_states = torch.cat([mem_hiddens[mem_pythia_idxs], pile_hiddens[mem_pile_idxs]], dim = 0)
unmem_hidden_states = torch.cat([mem_hiddens[unmem_pythia_idxs], pile_hiddens[unmem_pile_idxs]], dim = 0)

(4306, 7)
(3373, 7)


## eval code

In [101]:
N = 3373

def get_ground_truth_strings(data, low = 0.9, high =0.99):
    ground_truth_strings = data[data['char_by_char_similarity'].between(low, high)]['ground'].tolist()
    return ground_truth_strings

def gen_eval_data(ground_strings, tokenizer, input_length = 32, max_length = 64):
    tokens = tokenizer(ground_strings, padding = True, truncation = True, max_length = max_length, return_tensors = 'pt')
    inputs = tokenizer.batch_decode(tokens['input_ids'][:, :input_length])
    targets = tokenizer.batch_decode(tokens['input_ids'][:, input_length:])
    return inputs, targets

memmed_ground_truth_strings = get_ground_truth_strings(all_mem_12b_data, low = 1, high = 1)

unseen_memmed_ground = memmed_ground_truth_strings[N:N + 10]
inputs, targets = gen_eval_data(unseen_memmed_ground, tokenizer)

In [102]:
from sentence_transformers import SentenceTransformer, util
import Levenshtein

sim_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
def sim_scores(outputs, targets):
    semantic_scores_gen = []
    for target, output in zip(targets, outputs):
        embedding1 = sim_model.encode(target, convert_to_tensor=True)
        embedding2 = sim_model.encode(output, convert_to_tensor=True)
        cosine_sim_gen = util.pytorch_cos_sim(embedding1, embedding2)
        similarity_value_gen = cosine_sim_gen.item()
        semantic_scores_gen.append(similarity_value_gen)
    
    return semantic_scores_gen 

def char_by_char_similarity(outputs, targets):
    similarities = []
    for o, t in zip(outputs, targets):
        o = re.sub(r'\s', '', o)
        t = re.sub(r'\s', '', t)

        o = o.lower()
        t = t.lower()

        # remove '<|endoftext|>'
        o = o.replace('<|endoftext|>', '')
        t = t.replace('<|endoftext|>', '')

        max_len = max(len(o), len(t))
        matches = [c1 == c2 for c1, c2 in zip(o, t)]
        
        similarities.append(sum(matches)/max_len if max_len > 0 else 0)
    return similarities

def compare_token_lists(ground_toks, genned_toks):
    if len(ground_toks) != len(genned_toks):
        # print(len(ground_toks), len(genned_toks))
        # print("Both lists do not have the same length.")
        return 0
    
    num_same_tokens = sum(1 for token1, token2 in zip(ground_toks, genned_toks) if token1 == token2)
    percent_same_tokens = (num_same_tokens / len(ground_toks)) 
    
    return percent_same_tokens

def tok_by_tok_similarity(outputs, targets):
    o_tokens = tokenizer(outputs, return_tensors = 'pt',padding = False, truncation = True, max_length = 64)['input_ids']
    t_tokens = tokenizer(targets, return_tensors = 'pt',padding = False, truncation = True, max_length = 64)['input_ids']
    print(o_tokens)
    print(t_tokens)
    return [compare_token_lists(t, o) for t, o in zip(t_tokens, o_tokens)]

def levenshtein_distance(outputs, targets):
    diss = []
    for o, t in zip(outputs, targets):
        max_len = max(len(o), len(t))
        diss.append((max_len - Levenshtein.distance(o, t)) / max_len)
    return diss

def extract_quote_completion(s):
    s = s.replace(";",",").split(".")[0].split("\n")[0]
    return s.strip().lower()

def eval_completions(outputs, targets, return_mean = True):
    cbc_sims = char_by_char_similarity(outputs, targets)
    # tbt_sims = tok_by_tok_similarity(outputs, targets)
    sem_sims = sim_scores(outputs, targets)
    lev_diss = levenshtein_distance(outputs, targets)
    
    # outputs = [extract_quote_completion(o) for o in outputs]
    # em = np.mean([t in o for t,o in zip(targets,outputs)])
    
    if return_mean:
        return {'char_by_char_similarity': np.mean(cbc_sims),
                # 'tok_by_tok_similarity': np.mean(tbt_sims),
                'sem_similarity': np.mean(sem_sims),
                'lev_distance': np.mean(lev_diss),
                # 'em': np.mean(em),
                }
    else:
        return {'char_by_char_similarity': cbc_sims,
                # 'tok_by_tok_similarity': tbt_sims,
                'sem_similarity': sem_sims,
                'lev_distance': lev_diss,
                # 'em': em,
                }

## pure mem - random unmem pile

In [6]:
N = unmem_hidden_states.shape[0]
mem_contra_dataset = ContrastDataset(mem_12b_data['gen'].tolist()[:N], 
                               unmem_12b_data['gen'].tolist()[:N], 
                               model_name_or_path,
                               use_convo_format=False,
                               system_prompt="")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [34]:
mem_rep_reader = CAARepReader()
mw = ModelWrapper(model = model, tokenizer = tokenizer)

mem_steering_pipeline = SteeringPipeline(mw, mem_contra_dataset, mem_rep_reader)

In [35]:
TOKEN_IDX = 9
N_LAYERS = model.config.num_hidden_layers
rep_token_idx = -1
hidden_layers = list(range(model.config.num_hidden_layers))
n_difference = None
train_labels = None

mem_rr_hidden_states = mem_hidden_states[:N, :, TOKEN_IDX, :].reshape(N_LAYERS, N, -1)
unmem_rr_hidden_states = unmem_hidden_states[:N, :, TOKEN_IDX, :].reshape(N_LAYERS, N, -1)

dirs = mem_steering_pipeline.gen_dir_from_states(mem_rr_hidden_states, unmem_rr_hidden_states, hidden_layers, n_difference, train_labels)

In [113]:
# layer_id = list(range(15,25))
layer_id = [30, 35]

batch_size=50
coeff=1 # tune this parameter
max_new_tokens=32

print(f"Coeff: {coeff}")
print(f"LAYERS: {layer_id}")
print("RepReader:")
print("No Control")
# baseline_outputs = mem_steering_pipeline.batch_steering_generate(inputs, 
#                                                                 layer_id, 
#                                                                 coeff = 0 * coeff, 
#                                                                 batch_size = batch_size, 
#                                                                 use_tqdm=True, 
#                                                                 max_new_tokens=max_new_tokens)

print(eval_completions(baseline_outputs, targets))

print("+ Memorization")
pos_outputs = mem_steering_pipeline.batch_steering_generate(inputs, 
                                                            layer_id, 
                                                            coeff = coeff, 
                                                            batch_size = batch_size, 
                                                            use_tqdm=True, 
                                                            max_new_tokens=max_new_tokens)
print(eval_completions(pos_outputs, targets))

print("- Memorization")
neg_outputs = mem_steering_pipeline.batch_steering_generate(inputs, 
                                                            layer_id, 
                                                            coeff = -coeff, 
                                                            batch_size = batch_size, 
                                                            use_tqdm=True, 
                                                            max_new_tokens=max_new_tokens)
print(eval_completions(neg_outputs, targets))

Coeff: 1
LAYERS: [30, 35]
RepReader:
No Control
{'char_by_char_similarity': 0.9044715447154472, 'sem_similarity': 0.9756519377231598}
+ Memorization


100%|██████████| 1/1 [00:35<00:00, 35.40s/it]


{'char_by_char_similarity': 0.7586262106140647, 'sem_similarity': 0.9292394697666169}
- Memorization


100%|██████████| 1/1 [00:35<00:00, 35.36s/it]


{'char_by_char_similarity': 0.5532528697283654, 'sem_similarity': 0.7501227796077728}


In [110]:
targets

[' Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES',
 '_CTL_D1                                0x28b8\n#define MC_SEQ_WR_CTL_D0                                0x28bc\n',
 '                }\n            },\n            "axisTick": {\n                "show": false,\n                "lineStyle": {\n                    "color": "#',
 '\n\tposition: fixed;\n\ttop: 50%;\n\tleft: 50%;\n\tmargin-top: -22px;\n\tmargin-',
 ' 2027.......... 677.68\nApril 2027.......... 677.68\nMay 2027............ 677.68\nJune 2027',
 'MENT";\n  public static final String ER_PROCESS_ERROR = "ER_PROCESS_ERROR";\n  public static final String ER_UN',
 '\tSec  int32\n\tUsec int32\n}\n\nfunc (tv *Timeval) Nanoseconds() int64 {\n\treturn',
 ' is disfavored except for establishing res judicata, estoppel, or the law of the case and requires service of copies of cited unpublished dispositions of the Sixth Circu

In [111]:
neg_outputs

[' Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an “AS IS” BASIS,\n * WITHOUT WARRANTIES',
 '_CTL_D1                                0x28b8\n#define MC_SEQ_RD_CTL_D2                                0x28bc\n',
 '                \n                }\n            },\n            "axisTick": {\n                "show": true,\n                "lineStyle": {\n                    " color',
 '\n background-image: url(fancybox_loading.gif);\n background-repeat: no-repeat center center;\n background-position: center',
 ' 2027.......... 677.68\nApril 2027.......... 677.68\nMay 2027............ 677.68\nJune 2027',
 'MENT";\n public static final String ER_PROCESS_ is not a known element. The document is the root document of the document. in the document.',
 ',\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n',
 ', be sure to Follow us too.107 F.3d 11\nNOTICE: Sixth Circuit Rule 24(c) states that citation of unpublished disposition

In [112]:
pos_outputs

[' Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES',
 '_CTL_D1                                0x28b8\n#define MC_SEQ_WR_CTL                                   0x28bc\n#define',
 '                }\n            },\n            "axisTick": {\n                "show": false,\n                "lineStyle": {\n                    "color": "#',
 '\n\tbackground-image: url(fancybox_loading.png);\n}\n\n#fancybox-error {\n\tbackground-image',
 ' 2027.......... 677.68\nApril 2027.......... 677.68\nMay 2027............ 677.68\nJune 2027',
 'MENT";\n\n  public static final String ER_NO_OUTPUT_SPECIFIER = "ER_NO_OUTPUT_SPECIFIER";\n  public',
 '\tSec  int64\n\tUsec int64\n}\n\nfunc NewPopulatedMessage(r int64) *Message_Container.MessageBuilder {',
 ', be sure to Follow us too.107 F.3d 11\nNOTICE: Sixth Circuit Rule 24(c) states that citation of unpublished dispositions is disfavored except 

## probe direction

In [65]:
from probes import LRProbe
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.linear_model import LogisticRegression
from datasets import load_from_disk
# Combine memmed and non_memmed hidden states
path = 'data/12b'
dataset = load_from_disk(os.path.join(path, 'split_hf_token_dataset_vary_len_v2'))
LAYER = 34
TOK_IDXS = [5, 6, 7, 8, 9]

In [66]:
train_rows = all_mem_12b_data.loc[dataset['train']['df_ref_idx'][:int(len(dataset['train']['df_ref_idx']) / 5)]]
val_rows = all_mem_12b_data.loc[dataset['val']['df_ref_idx'][:int(len(dataset['val']['df_ref_idx']) / 5)]]
test_rows = all_mem_12b_data.loc[dataset['test']['df_ref_idx'][:int(len(dataset['test']['df_ref_idx']) / 5)]]

print(train_rows.shape, val_rows.shape, test_rows.shape)

def rows_to_X_y(rows, layer = None, tok_idxs = None):
    X = []
    y = []
    for i, row in rows.iterrows():
        if row['source'] == 'pythia-evals':
            X.append(mem_hiddens[row['idx_in_hidden_states']])
        elif row['source'] == 'pile':
            X.append(pile_hiddens[row['idx_in_hidden_states']])
        
        if row['char_by_char_similarity'] == 1:
            y.append(1)
        else:
            y.append(0)
        
    if tok_idxs is not None and layer is not None:
        y = torch.from_numpy(np.array([[label] * len(tok_idxs) for label in y]).flatten())
        
        new_X = []
        for x in X:
            new_X.append(torch.stack([x[layer, tok_idx] for tok_idx in tok_idxs]))
        return torch.cat(new_X, dim = 0).float(), y.float()
    else:
        return torch.stack(X, dim = 0).float(), torch.tensor(y).float()
X_train, y_train = rows_to_X_y(train_rows, layer = LAYER, tok_idxs = TOK_IDXS)
X_val, y_val = rows_to_X_y(val_rows, layer = LAYER, tok_idxs = TOK_IDXS)
X_test, y_test = rows_to_X_y(test_rows, layer = LAYER, tok_idxs = TOK_IDXS)

(4047, 7) (675, 7) (2024, 7)


In [10]:
lr = 0.05
weight_decay = 1
epochs = 500
use_bias = True
normalize = False

if normalize:
    X_train = (X_train - X_train.mean(dim = 0)) / X_train.std(dim = 0)
    X_val = (X_val - X_train.mean(dim = 0)) / X_train.std(dim = 0)

probe = LRProbe.from_data(X_train, y_train, 
                          lr = lr, 
                          weight_decay = weight_decay, 
                          epochs = epochs, 
                          use_bias = use_bias,
                          device = "cuda", )

acc = probe.get_probe_accuracy(X_val, y_val, device = "cuda")
auc = probe.get_probe_auc(X_val, y_val, device = "cuda")

print(f"PROBE LAYER {LAYER} TOKEN {TOK_IDXS}")
print(f"Accuracy {acc}")
print(f"AUC {auc}")
print()

PROBE LAYER 34 TOKEN [5, 6, 7, 8, 9]
Accuracy 0.9277036786079407
AUC 0.9780742015345987



In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Create an instance of Logistic Regression
probe_lr = LogisticRegression(max_iter = 5000, C = 1e-4)

# Train the probe using the training data
probe_lr.fit(X_train.numpy(), y_train.numpy())

# Predict the labels for X_val
y_pred = probe_lr.predict(X_val.numpy())

# Calculate the accuracy
accuracy = accuracy_score(y_val.numpy(), y_pred)

# Calculate the AUC
auc = roc_auc_score(y_val.numpy(), probe_lr.predict_proba(X_val.numpy())[:, 1])

accuracy, auc

(0.9534814814814815, 0.9854827664916684)

In [21]:
probe_lr.coef_[0]

array([[ 0.0008365 ,  0.00400184, -0.00156602, ...,  0.00412694,
         0.00427111, -0.00088022]])

In [94]:
from act_add.rep_reader import ProbeRepReader

# probe_rep_reader = ProbeRepReader({
#     34: probe
# })
probe_rep_reader = ProbeRepReader({
    34: torch.from_numpy(probe_lr.coef_[0]) / torch.norm(torch.from_numpy(probe_lr.coef_[0]), p = 2)
})
probe_rep_reader.get_rep_directions([34])
probe_rep_reader.get_signs([34])

{34: 1}

In [122]:
controller = probe_rep_reader.directions[34]
current = mem_hiddens[:10, 4:10, 9]

projection = torch.sum(current.float() * controller.float().reshape(1, 1, -1), dim = 2).unsqueeze(2) * controller.float().reshape(1, 1, -1)
if current.dtype == torch.float16:
    projection = projection.half()

In [126]:
mw = ModelWrapper(model = model, tokenizer = tokenizer)

mem_steering_pipeline = SteeringPipeline(mw, None, probe_rep_reader)

In [127]:
# layer_id = list(range(15,25))
layer_id = [34]

batch_size=50
coeff=1 # tune this parameter
max_new_tokens=32

print(f"Coeff: {coeff}")
print(f"LAYERS: {layer_id}")
print("RepReader:")
print("No Control")
baseline_outputs = mem_steering_pipeline.batch_steering_generate(inputs, 
                                                                layer_id, 
                                                                coeff = 0 * coeff, 
                                                                batch_size = batch_size, 
                                                                use_tqdm=True, 
                                                                operator = "projection",
                                                                max_new_tokens=max_new_tokens)

print(eval_completions(baseline_outputs, targets))

print("+ Memorization")
pos_outputs = mem_steering_pipeline.batch_steering_generate(inputs, 
                                                            layer_id, 
                                                            coeff = coeff, 
                                                            batch_size = batch_size, 
                                                            use_tqdm=True, 
                                                            operator = "projection",                                    
                                                            max_new_tokens=max_new_tokens)
print(eval_completions(pos_outputs, targets))

print("- Memorization")
neg_outputs = mem_steering_pipeline.batch_steering_generate(inputs, 
                                                            layer_id, 
                                                            coeff = -coeff, 
                                                            batch_size = batch_size, 
                                                            use_tqdm=True, 
                                                            operator = "projection",
                                                            max_new_tokens=max_new_tokens)
print(eval_completions(neg_outputs, targets))

Coeff: 1
LAYERS: [34]
RepReader:
No Control


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:04<00:00,  4.43s/it]


{'char_by_char_similarity': 0.8393921796360821, 'sem_similarity': 0.9588506340980529, 'lev_distance': 0.921724622409554}
+ Memorization


100%|██████████| 1/1 [00:04<00:00,  4.34s/it]


{'char_by_char_similarity': 0.8393921796360821, 'sem_similarity': 0.9588506340980529, 'lev_distance': 0.921724622409554}
- Memorization


100%|██████████| 1/1 [00:04<00:00,  4.45s/it]


{'char_by_char_similarity': 0.8393921796360821, 'sem_similarity': 0.9588506340980529, 'lev_distance': 0.921724622409554}


In [38]:
targets

[' Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES',
 '_CTL_D1                                0x28b8\n#define MC_SEQ_WR_CTL_D0                                0x28bc\n',
 '                }\n            },\n            "axisTick": {\n                "show": false,\n                "lineStyle": {\n                    "color": "#',
 '\n\tposition: fixed;\n\ttop: 50%;\n\tleft: 50%;\n\tmargin-top: -22px;\n\tmargin-',
 ' 2027.......... 677.68\nApril 2027.......... 677.68\nMay 2027............ 677.68\nJune 2027',
 'MENT";\n  public static final String ER_PROCESS_ERROR = "ER_PROCESS_ERROR";\n  public static final String ER_UN',
 '\tSec  int32\n\tUsec int32\n}\n\nfunc (tv *Timeval) Nanoseconds() int64 {\n\treturn',
 ' is disfavored except for establishing res judicata, estoppel, or the law of the case and requires service of copies of cited unpublished dispositions of the Sixth Circu

In [45]:
baseline_outputs

[' Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES',
 '_CTL_D1                                0x28b8\n#define MC_SEQ_WR_CTL_D0                                0x28bc\n',
 '                }\n            },\n            "axisTick": {\n                "show": false,\n                "lineStyle": {\n                    "color": "#',
 '\n\tposition: fixed;\n\ttop: 50%;\n\tleft: 50%;\n\tmargin-top: -22px;\n\tmargin-',
 ' 2027.......... 677.68\nApril 2027.......... 677.68\nMay 2027............ 677.68\nJune 2027',
 'MENT";\n  public static final String ER_PROCESS_ERROR = "ER_PROCESS_ERROR";\n  public static final String ER_UN',
 '\tSec  int32\n\tUsec int32\n}\n\nfunc (tv *Timeval) Nanoseconds() int64 {\n\treturn',
 ', be sure to Follow us too.107 F.3d 11\nNOTICE: Sixth Circuit Rule 24(c) states that citation of unpublished dispositions is disfavored except for establishing res judic

In [46]:
pos_outputs

['  Unless required by applicable law or agreed to in writing, software\n *  distributed under BSD teasp"), etc.)\n *\n *  You should have received',
 '_CTL_D1                                0x28b8\n#define MC_SEQ_RD_CTL_D2                                0x28bc\n',
 '                }\n            },\n            "axisTick": $[]$[],\n            "axisTickLabel": {\n                "empty": true\n            },\n',
 '\n\tbackground-image: url(fancybox_loading.png);\n}\n\n#fancybox-error {\n\tbackground-image',
 ' 2027.......... 677.68\nApril 2027.......... 677.68\nMay 2027............ 677.68\nJune 2027',
 'MENT"; \n  /** \n   *  user0 \n   */ \n  public static final String ER_INVALID_OUTPUT_TYPE = \n         "ER',
 '\tSec  int32\n\tUsec int32\n}\n\nfunc teaspb(data teaspbData) teaspbResultTable {\n\tresult',
 ', be sure to Follow us too.107 F.3d 11\nNOTICE: Sixth Circuit Rule 24(c) states that citation of unpublished dispositions                      notices \n\xa0\n \xa0\n \xa0\n \xa0\

In [47]:
neg_outputs

[' Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES',
 '_CTL_D1                                0x28b8\n#define MC_SEQ_WR--REG-- P-REG-- P-REG--',
 '                }\n            },\n            "axisTick": {\n                "show": false,\n                "lineStyle": {\n                    "color: #',
 '\n\t position: fixed; right: 50%; top: 50%; z-index: 100; -moz-border-radius: 6px 6px 6',
 ' 2027........... 677.68\nApril 2027........... 677.68\nMay 2027............ 677.68\nJune 2027',
 'MENT: " +\n"An attempt was made to PUSH a PUSH- type PUSH-- PUSH- type... and the PUSH was',
 '\n “\n\n… “\n\n… “\n\n… “\n\n… “\n\n… “\n\n… “\n\n… “\n\n',
 ', be sure to Follow us too.107 F.3d 11\nNOTICE: Sixth Circuit Rule 24(c) states that citation of unpublished dispositions is disfavored except for establishing res judicata, estoppel, or the law of the case and requires service of co

## pure mem - reshuffled mem

In [6]:
import random
# N = unmem_hidden_states.shape[0]
N = 100

shuffled_memmed_prompts = []
for prompt in mem_12b_data['gen'].tolist():
    tokens = tokenizer.tokenize(prompt)
    random.shuffle(tokens)
    detokenized_prompt = tokenizer.convert_tokens_to_string(tokens)
    shuffled_memmed_prompts.append(detokenized_prompt)

mem_contra_dataset = ContrastDataset(mem_12b_data['gen'].tolist()[:N], 
                               shuffled_memmed_prompts[:N], 
                               model_name_or_path,
                               use_convo_format=False,
                               system_prompt="")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
mem_rep_reader = CAARepReader()
mw = ModelWrapper(model = model, tokenizer = tokenizer)

mem_steering_pipeline = SteeringPipeline(mw, mem_contra_dataset, mem_rep_reader)

In [16]:
TOKEN_IDX = 9
N_LAYERS = model.config.num_hidden_layers
rep_token_idx = -1
hidden_layers = list(range(model.config.num_hidden_layers))
n_difference = None
train_labels = None

mem_rr_hidden_states = mem_hidden_states[:N, :, TOKEN_IDX, :].reshape(N_LAYERS, N, -1)
mem_rr_hidden_states.shape

torch.Size([36, 100, 5120])

In [18]:
shuff_mem_rr_hidden_states = mw.batch_to_hiddens(shuffled_memmed_prompts[:N],
                                                          layers = hidden_layers,
                                                          token_idx=rep_token_idx)

In [19]:
shuff_mem_rr_hidden_states = torch.stack([shuff_mem_rr_hidden_states[i] for i in shuff_mem_rr_hidden_states.keys()], dim = 0)

In [21]:
dirs = mem_steering_pipeline.gen_dir_from_states(mem_rr_hidden_states, shuff_mem_rr_hidden_states, hidden_layers, n_difference, train_labels)

In [30]:
# layer_id = list(range(15,25))
layer_id = [30]

batch_size=50
coeff=0.2 # tune this parameter
max_new_tokens=32

print(f"Coeff: {coeff}")
print(f"LAYERS: {layer_id}")
print("RepReader:")
print("No Control")
baseline_outputs = mem_steering_pipeline.batch_steering_generate(inputs, 
                                                                layer_id, 
                                                                coeff = 0 * coeff, 
                                                                batch_size = batch_size, 
                                                                use_tqdm=True, 
                                                                max_new_tokens=max_new_tokens)

print(eval_completions(baseline_outputs, targets))

print("+ Memorization")
pos_outputs = mem_steering_pipeline.batch_steering_generate(inputs, 
                                                            layer_id, 
                                                            coeff = coeff, 
                                                            batch_size = batch_size, 
                                                            use_tqdm=True, 
                                                            max_new_tokens=max_new_tokens)
print(eval_completions(pos_outputs, targets))

print("- Memorization")
neg_outputs = mem_steering_pipeline.batch_steering_generate(inputs, 
                                                            layer_id, 
                                                            coeff = -coeff, 
                                                            batch_size = batch_size, 
                                                            use_tqdm=True, 
                                                            max_new_tokens=max_new_tokens)
print(eval_completions(neg_outputs, targets))

Coeff: 0.2
LAYERS: [30]
RepReader:
No Control


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:34<00:00, 34.70s/it]


{'char_by_char_similarity': 0.7090909090909092, 'sem_similarity': 0.9499296605587005}
+ Memorization


100%|██████████| 1/1 [00:34<00:00, 34.66s/it]


{'char_by_char_similarity': 0.46361896812594566, 'sem_similarity': 0.8107414603233337}
- Memorization


100%|██████████| 1/1 [00:34<00:00, 34.67s/it]


{'char_by_char_similarity': 0.6406418256512403, 'sem_similarity': 0.9094023406505585}


In [31]:
targets

['intercom:before {\n   content: "\\f7af"; }\n \n.fa-internet-explorer:before {\n   content:',
 '\n      <sourceFolder url="file://$MODULE_DIR$/src/debug/aidl" isTestSource="false" />\n      <source',
 ' -16084379, -28926210, 15006023, 3284568, -6276540},\n\t\t\tFieldElement{23599295,',
 '��\ue024\ue025\ue026\ue027\ue028\ue029\ue02a\ue02b\ue02c\ue02d',
 ' twice as large as those based on *F*, and *R*- factors based on ALL data will be even larger.\n  -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n\nF',
 '.0

In [32]:
pos_outputs

['instagram-square:before {\n   content: "\\f955"; }\n \n.fa-intercom:before {\n   content: "\\f',
 '\n      <sourceFolder url="file://$MODULE_DIR$/src/debug/aidl" isTestSource="false" />\n      <source',
 ' -10864081, -818919, 1359789},\n\t\t\tFieldElement{14076899, -15673580, -',
 '\ue023\ue024\ue025\ue026\ue027\ue028\ue029\ue02a\ue02b\ue02c�',
 ' twice as large as those based on *F*, and *R*- factors based on ALL data will be even larger.\n  -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n\nF',
 '.0.1",\n

In [33]:
neg_outputs

['instagram-square:before {\n   content: "\\f081"; }\n \n.fa-intercom:before {\n   content: "\\f',
 '\n      <sourceFolder url="file://$MODULE_DIR$/src/debug/aidl" isTestSource="false" />\n      <source',
 ' -16084379, -28926210, 15006023, -3633890, -18942047, -10055357},\n\t\t\t',
 '\ue023\ue024\ue025\ue026\ue027\ue028\ue029\ue02a\ue02b\ue02c�',
 ' twice as large as those based on *F*, and *R*- factors based on ALL data will be even larger.\n  -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n\nF',
 '.0.1",\n

## replicating repe quote

In [6]:
model_name_or_path = "meta-llama/Llama-2-13b-hf"

model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, device_map="auto").eval()
use_fast_tokenizer = "LlamaForCausalLM" not in model.config.architectures
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast = use_fast_tokenizer, padding_side="left")
tokenizer.pad_token_id = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id
tokenizer.bos_token_id = 1

mw = ModelWrapper(model = model, tokenizer = tokenizer)


Loading checkpoint shards: 100%|██████████| 3/3 [01:53<00:00, 37.95s/it]


In [5]:
data_dir = "../act-add-suite/data/memorization"

with open(os.path.join(data_dir, "quotes/popular_quotes.json")) as file:
    seen_quotes = json.load(file)

with open(os.path.join(data_dir, "quotes/unseen_quotes.json")) as file:
    unseen_quotes = json.load(file)
    
format_fn = lambda s : "{s} ".format(s=s) 

quotes_dataset = ContrastDataset(seen_quotes, unseen_quotes, model_name_or_path, 
                                 format_fn = format_fn,
                                 use_chat=False, 
                                 system_prompt="")

train_data, test_data, train_labels, test_labels = quotes_dataset.gen_train_test_split(48, seed = 0)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Generate completions for each quote
real_quotes = train_data[::2]

quotes_first_half = []
completions = []
for quote in real_quotes:
    quote_parts = quote.split()
    first_half = " ".join(quote_parts[:len(quote_parts)//2])
    second_half = " ".join(quote_parts[len(quote_parts)//2:])
    quotes_first_half.append(first_half)
    
completions = mw.batch_generate_from_string(quotes_first_half, max_new_tokens=10)

# Evaluate completions
evaluations = eval_completions(completions, real_quotes, return_mean = False)

# Print the evaluations
for i in range(len(real_quotes)):
    print(f"Quote: {real_quotes[i]}")
    print(f"Completion: {completions[i]}")
    print(f"char: {evaluations['char_by_char_similarity'][i]}")
    print(f"lev: {evaluations['lev_distance'][i]}")
    print()


In [23]:
counter =0 
memmed_quotes_idxs = []
for i in range(len(real_quotes)):
    if evaluations['lev_distance'][i] > 0.7:
        print(f"Quote: {real_quotes[i]}")
        print(f"Completion: {completions[i]}")
        print(f"char: {evaluations['char_by_char_similarity'][i]}")
        print(f"lev: {evaluations['lev_distance'][i]}")
        print()
        counter +=1
        memmed_quotes_idxs.append(i)

print(counter)

Quote: To be or not to be, that is the question. 
Completion: To be or not to be, that is the question." "Whether '
char: 0.7619047619047619
lev: 0.7924528301886793

Quote: In the end, we will remember not the words of our enemies, but the silence of our friends. 
Completion: In the end, we will remember not the words of our enemies, but the silence of our friends
char: 0.9863013698630136
lev: 0.978021978021978

Quote: The only thing necessary for the triumph of evil is for good men to do nothing. 
Completion: The only thing necessary for the triumph of evil is for good men to do nothing." "
char: 0.9696969696969697
lev: 0.975609756097561

Quote: The unexamined life is not worth living. 
Completion: The unexamined life is not worth living.

—Socrates
char: 0.7906976744186046
lev: 0.7843137254901961

Quote: The future belongs to those who believe in the beauty of their dreams. 
Completion: The future belongs to those who believe in the beauty of their dreams.”


char: 0.9830508474576272

In [32]:
good_train_data = []
good_train_labels = []
for idx in memmed_quotes_idxs:
    good_train_data.extend(train_data[idx*2:idx*2+2])
    good_train_labels.extend(train_labels[idx*2:idx*2+2])

In [34]:
from act_add.rep_reader import PCARepReader
quote_rep_reader = PCARepReader()

In [35]:
quote_steer_pipeline = SteeringPipeline(mw, quotes_dataset, quote_rep_reader)

rep_token_idx = -1
hidden_layers = list(range(model.config.num_hidden_layers))
n_difference = 1

# hidden_layers = list(range(-1, -model.config.num_hidden_layers, -1)) #llama

dirs = quote_steer_pipeline.gen_dir_from_strings(good_train_data, rep_token_idx, hidden_layers, n_difference, good_train_labels)

In [43]:
def extract_quote_completion(s):
    s = s.replace(";",",").split(".")[0].split("\n")[0]
    return s.strip().lower()

def quote_completion_test(data_dir):
    with open(os.path.join(data_dir, "quotes/quote_completions.json")) as file:
        test_data = json.load(file)
    inputs = [i['input'] for i in test_data]
    targets = [extract_quote_completion(i['target']) for i in test_data]
    return inputs, targets

### We do manually instead of rep_control_pipeline here as an example

inputs, targets = quote_completion_test(data_dir)

In [None]:
# Generate completions for each quote
completions = mw.batch_generate_from_string(inputs, max_new_tokens=10)

decoded_outputs = [o.replace(i, "") for o,i in zip(completions, inputs)]

# Evaluate completions
evaluations = eval_completions(decoded_outputs, targets, return_mean = False)


In [None]:

# Print the evaluations
counter =0 
memmed_quotes_idxs = []
for i in range(len(completions)):
    if evaluations['lev_distance'][i] > 0.7:
        print(f"Quote: {targets[i]}")
        print(f"Completion: {completions[i]}")
        print(f"char: {evaluations['char_by_char_similarity'][i]}")
        print(f"lev: {evaluations['lev_distance'][i]}")
        print()
        counter +=1
        memmed_quotes_idxs.append(i)

print(counter)

In [57]:
good_inputs, good_targets = np.array(inputs)[memmed_quotes_idxs].tolist(), np.array(targets)[memmed_quotes_idxs].tolist()

layer_id = list(range(10, 30))
# layer_id = list(range(-30,-38,-1)) #llama

batch_size=64
coeff=8.0 # tune this parameter
max_new_tokens=8

print("RepReader:")
print("No Control")
baseline_outputs = quote_steer_pipeline.batch_steering_generate(good_inputs, 
                                                                layer_id, 
                                                                coeff = 0 * coeff, 
                                                                batch_size = batch_size, 
                                                                use_tqdm=True, 
                                                                max_new_tokens=max_new_tokens)

print(eval_completions(baseline_outputs, good_targets))

print("+ Memorization")
pos_outputs = quote_steer_pipeline.batch_steering_generate(good_inputs, 
                                                            layer_id, 
                                                            coeff = coeff, 
                                                            batch_size = batch_size, 
                                                            use_tqdm=True, 
                                                            max_new_tokens=max_new_tokens)
print(eval_completions(pos_outputs, good_targets))

print("- Memorization")
neg_outputs = quote_steer_pipeline.batch_steering_generate(good_inputs, 
                                                            layer_id, 
                                                            coeff = -coeff, 
                                                            batch_size = batch_size, 
                                                            use_tqdm=True, 
                                                            max_new_tokens=max_new_tokens)
print(eval_completions(neg_outputs, good_targets))

RepReader:
No Control


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:01<00:00,  1.82s/it]


{'char_by_char_similarity': 0.9074805408212674, 'sem_similarity': 0.9314855102981839, 'lev_distance': 0.8683796963688428}
+ Memorization


100%|██████████| 1/1 [00:01<00:00,  1.68s/it]


{'char_by_char_similarity': 0.5237611040529708, 'sem_similarity': 0.6620745924966676, 'lev_distance': 0.5830497506794107}
- Memorization


100%|██████████| 1/1 [00:01<00:00,  1.66s/it]


{'char_by_char_similarity': 0.5114111318485989, 'sem_similarity': 0.6613462444927011, 'lev_distance': 0.562190874129244}


In [54]:
targets[:10]

["you're busy making other plans",
 'the life in your years',
 'to love what you do',
 'make it hot by striking',
 "waste it living someone else's life",
 'is the key to success',
 'almost exactly the same',
 'let it stop you',
 'whether you get up',
 "greater you'll feel when you achieve it"]

In [53]:
pos_outputs[:10]

[' you are making the most recent happen."',
 ' make it hot by walking away from it',
 ' wait for\noch to come to your',
 ' not so in toto.\n\n',
 ' tree has come to be\nsupreme',
 ' where you\nstand in relation to it',
 ' who are post-doctrinae',
 " don't wait for an invitation from us",
 ' it to the\nbreast-point',
 ' a work in progress.\nWork hard']

## intervention sanity checks

In [2]:
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2-xl", torch_dtype=torch.float16, device_map="auto").eval()
gpt2_tok = AutoTokenizer.from_pretrained("gpt2-xl", padding_side="left")
gpt2_tok.pad_token = gpt2_tok.eos_token
mw = ModelWrapper(model = gpt2, tokenizer = gpt2_tok)

In [3]:
#love - hate
love_vector = mw.batch_hiddens(["Love", "Hate"], layers = list(range(36)), tok_idxs = [0, 1])

In [4]:
inputs_b = ["I hate you because", "I love you because"]

In [5]:
generated = []
mw.reset()
decoded_outputs = mw.batch_generate_from_string(inputs_b, max_new_tokens=50, 
                                                temperature = 1,  
                                                top_p = 0.3, 
                                                # freq_penalty =  1.0,
                                                # num_comparisons = 3,
                                                # seed = 0,
                                                no_repeat_ngram_size = 3,
                                                do_sample = True
                                                )
decoded_outputs = [o.replace(i, "") for o,i in zip(decoded_outputs, inputs_b)]
generated.extend(decoded_outputs)
for gen in generated:
    print(gen)
    print()

 you're a liar." "I hate your guts." "You're a rotten, lying, no-good, two-faced son of a bitch." "And I hate you." "Oh, I hate all of you." "(BANGING

 you are my sister." "I love her because she is my sister, and I love her." "And I love you." "You are my brother." "So I love all of you." "(BOTH LAUGHING)" "I'm



In [22]:
steering_vectors = {}
layers = [10, 13, 16, 7]
for layer in layers:
    steering_vectors[layer] = 0.25 * (love_vector['resid'][layer][0] - love_vector['resid'][layer][1]).to(mw.model.device).half()

mw.reset()
mw.set_controller(layers, steering_vectors, masks=1, token_pos =[0, 1])

inputs_b = ["I hate you because", "I love you because"]


generated = []
decoded_outputs = mw.batch_generate_from_string(inputs_b, max_new_tokens=50, 
                                                temperature = 1,  
                                                top_p = 0.3, 
                                                no_repeat_ngram_size = 3,
                                                do_sample = True)

decoded_outputs = [o.replace(i, "") for o,i in zip(decoded_outputs, inputs_b)]
generated.extend(decoded_outputs)
    
mw.reset()

for gen in generated:
    print(gen)
    print("_________________")
    

 you are not my mother." "I am not your mother." "(Sobbing)" "I'm sorry." "No, no, no." "Don't be sorry." "(Whispers) I'm sorry, too." "You're
_________________
 you are a man of your word. You are a gentleman. You always say what you mean and mean what you say. You have a great sense of humor and you are always willing to help. You make me laugh and I love that. You
_________________


In [25]:
decoded_outputs

[' you-; was, in for,-\n1. a. in that.:,\n of,:- to the in and the-:. ( "is that is, the in- of, and of-, was,, ( a the, to the to, in that the, is in',
 ' you in was in can of every ( that that, is,,:-" is,/ to that, and,: that is, in" of the,, that" for\nThe the, that the is: has this. " that, it to is in, to ( can with that:,']