In [37]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('/scratch/network/yc6206/representation-engineering')
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
import torch
from datasets import load_from_disk, load_dataset
import copy

from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
import numpy as np
import pynvml
import pickle

from repe import repe_pipeline_registry
repe_pipeline_registry()

# Choose the torch device. When CUDA is available, also initialise NVML and
# print the name of the GPU at index 0 so the run log records the hardware.
if torch.cuda.is_available():
    device = "cuda"
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    print(pynvml.nvmlDeviceGetName(handle))
else:
    device = "cpu"

from sklearn.decomposition import DictionaryLearning
from sklearn.decomposition import SparseCoder

rep-reading is already registered. Overwriting pipeline for task rep-reading...
rep-control is already registered. Overwriting pipeline for task rep-control...


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Tesla V100-PCIE-32GB


In [38]:
# Role markers placed in front of the instruction and the response.
user_tag, assistant_tag = "USER:", "ASSISTANT:"

# `template` is the full prompt layout; `control_template` wraps the text that
# fills its {type} slot (here it passes the text through unchanged).
control_template = "{type}"
template = "{user_tag} {instruction} {type} {assistant_tag} {response}"

# Subtracted from the upper bound of the truncation lengths in
# get_augmented_ds, so the longest truncations of each response are never
# generated. The reason for the value 5 is not documented in this notebook.
cutoff_offset = 5

# Uncomment for first time loading the dataset
# ds = load_dataset('tatsu-lab/alpaca', cache_dir='../../storage/cache')
# ds.save_to_disk("../../storage/cache/alpaca_filtered/")

# Load the cached dataset and pull out the columns used to build prompts.
ds = load_from_disk('../../storage/cache/alpaca_filtered/')
train_split = ds['train']
instructions = train_split['instruction']
outputs = train_split['output']

def get_augmented_ds(instructions, responses, num_examples, user_tag, assistant_tag, aug_prompt, control_template, max_res_len):
    """Build prompts that pair each instruction with growing prefixes of its response.

    For every (instruction, response) pair the response is tokenized (with
    newlines removed) and one prompt is emitted per prefix length, starting at
    one token and stopping before ``min(max_res_len, n_tokens) - cutoff_offset``.
    Generation stops as soon as ``num_examples`` prompts have been collected.

    Relies on the module-level ``tokenizer``, ``template`` and ``cutoff_offset``.

    Args:
        instructions: iterable of instruction strings.
        responses: iterable of response strings, aligned with ``instructions``.
        num_examples: number of prompts at which generation stops.
        user_tag: text substituted for ``{user_tag}`` in ``template``.
        assistant_tag: text substituted for ``{assistant_tag}`` in ``template``.
        aug_prompt: text formatted into ``control_template`` and placed in the
            ``{type}`` slot of ``template``.
        control_template: format string with a ``{type}`` placeholder.
        max_res_len: cap on the number of response tokens considered.

    Returns:
        list of formatted prompt strings.
    """
    prompts = []
    for instruction, response in zip(instructions, responses):
        # Newlines are dropped before tokenizing because they resulted in a
        # nontrivial peak.
        response_tokens = tokenizer.tokenize(response.replace("\n", ""))
        prefix_limit = min(max_res_len, len(response_tokens)) - cutoff_offset

        for prefix_len in range(1, prefix_limit):
            partial_response = tokenizer.convert_tokens_to_string(response_tokens[:prefix_len])
            type_text = control_template.format(type=aug_prompt)
            prompts.append(template.format(
                user_tag=user_tag,
                assistant_tag=assistant_tag,
                instruction=instruction,
                type=type_text,
                response=partial_response,
            ))
            if len(prompts) >= num_examples:
                return prompts

        # Also checked when the inner loop produced nothing for this response.
        if len(prompts) >= num_examples:
            break
    return prompts

def get_contrastive_ds(instructions, responses, num_examples, user_tag, assistant_tag, pos_type, neg_type, control_template, max_res_len):
    """Build an interleaved list of positive/negative prompts.

    Two augmented datasets are generated with ``get_augmented_ds`` — one using
    ``pos_type`` and one using ``neg_type`` as the augmentation text — and
    merged so the result reads ``[pos_0, neg_0, pos_1, neg_1, ...]``.

    Returns:
        list of prompt strings with twice as many entries as each augmented set.

    Raises:
        AssertionError: if the two augmented sets differ in length.
    """
    common_args = (instructions, responses, num_examples, user_tag, assistant_tag)
    positives = get_augmented_ds(*common_args, pos_type, control_template, max_res_len)
    negatives = get_augmented_ds(*common_args, neg_type, control_template, max_res_len)
    assert len(positives) == len(negatives)

    interleaved = []
    for pos_example, neg_example in zip(positives, negatives):
        interleaved.extend((pos_example, neg_example))
    return interleaved

def get_rep_directions(aug_prompt, num_examples=2048, max_res_len=64, batch_size=32, rep_token=-1, verbose=False):
    """Collect hidden representations for prompts augmented with ``aug_prompt``.

    Prompts are built from the module-level ``instructions`` / ``outputs`` with
    ``get_augmented_ds`` and passed through the "rep-reading" pipeline created
    from the module-level ``model`` and ``tokenizer``.

    Args:
        aug_prompt: augmentation text inserted into every prompt.
        num_examples: number of prompts at which dataset generation stops.
        max_res_len: cap on the number of response tokens considered.
        batch_size: batch size forwarded to the pipeline.
        rep_token: token position forwarded to the pipeline.
        verbose: when true, print a progress line before extraction.

    Returns:
        ``(hidden_reps, augmented_ds)``: the mapping returned by the pipeline
        with every value converted to a ``torch.Tensor`` on ``device``
        (keys are presumably the requested layer indices — confirm against the
        pipeline), and the list of prompts that produced it.
    """
    augmented_ds = get_augmented_ds(
        instructions, outputs, num_examples,
        user_tag, assistant_tag,
        aug_prompt, control_template, max_res_len,
    )

    # Negative indices -1, -2, ..., -(num_hidden_layers - 1).
    layer_ids = list(range(-1, -model.config.num_hidden_layers, -1))
    reader = pipeline("rep-reading", model=model, tokenizer=tokenizer)
    if verbose:
        print(f"Computing Rep Activation {aug_prompt}")

    hidden_reps = reader._batched_string_to_hiddens(
        train_inputs=augmented_ds,
        rep_token=rep_token,
        hidden_layers=layer_ids,
        batch_size=batch_size,
        which_hidden_states=None,
    )

    # Convert in place so the object returned by the pipeline is the one handed back.
    for key in list(hidden_reps):
        hidden_reps[key] = torch.Tensor(hidden_reps[key]).to(device)

    return hidden_reps, augmented_ds

def get_rep_reader(pos_type, neg_type, num_examples=2048, max_res_len=64, rep_token=-1, verbose=False, batch_size=32):
    """Fit PCA reading directions from contrastive ``pos_type`` / ``neg_type`` prompts.

    An interleaved positive/negative prompt list is built with
    ``get_contrastive_ds`` from the module-level ``instructions`` / ``outputs``
    and passed to the "rep-reading" pipeline's ``get_directions`` with
    ``direction_method='pca'`` and ``n_difference=1``.

    Args:
        pos_type: augmentation text for the positive prompts.
        neg_type: augmentation text for the negative prompts.
        num_examples: number of prompts per polarity at which generation stops.
        max_res_len: cap on the number of response tokens considered.
        rep_token: token position forwarded to the pipeline.
        verbose: when true, print a progress line before fitting.
        batch_size: batch size forwarded to the pipeline. Previously hard-coded
            to 32; exposed (with the same default) to match
            ``get_rep_directions``.

    Returns:
        ``(directions, contrastive_ds)``: the ``directions`` attribute of the
        fitted rep reader and the prompt list it was fitted on.
    """
    contrastive_ds = get_contrastive_ds(
        instructions,
        outputs,
        num_examples,
        user_tag,
        assistant_tag,
        pos_type,
        neg_type,
        control_template,
        max_res_len
    )

    # Negative indices -1, -2, ..., -(num_hidden_layers - 1).
    hidden_layers = list(range(-1, -model.config.num_hidden_layers, -1))
    n_difference = 1
    direction_method = 'pca'
    rep_reading_pipeline = pipeline("rep-reading", model=model, tokenizer=tokenizer)
    if verbose:
        print(f"Computing Rep {pos_type} - {neg_type}")

    rep_reader = rep_reading_pipeline.get_directions(
        contrastive_ds,
        rep_token=rep_token,
        hidden_layers=hidden_layers,
        n_difference=n_difference,
        train_labels=None,
        direction_method=direction_method,
        batch_size=batch_size,
    )
    return rep_reader.directions, contrastive_ds

In [47]:
# Restore previously computed reading vectors from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
reading_vecs_path = '/scratch/network/yc6206/representation-engineering/examples/extraction/reading_vecs_sc.pickle'
with open(reading_vecs_path, 'rb') as f:
    reading_vectors = pickle.load(f)

In [41]:
# One emotion name per line of the text file.
with open('list_of_emotions.txt', 'r') as f:
    emotions = [line.strip('\n') for line in f]

# Instruction text per emotion; the empty key maps to an empty instruction.
stimulis = {name: 'Respond with a tone of {}'.format(name.lower()) for name in emotions}
stimulis[''] = ''

# Pair every emotion key with the empty key: [positive key, negative key].
stimuli_pairs_gen = {name: [name, ''] for name in stimulis if name != ''}

# Resolve the key pairs into the instruction texts they stand for.
stimuli_pairs = {
    name: [stimulis[pos_key], stimulis[neg_key]]
    for name, (pos_key, neg_key) in stimuli_pairs_gen.items()
}

In [None]:
# Fit one set of reading directions per stimulus pair and persist the result.
# The dictionary is reset here, so the "already computed" guard below never
# skips anything within a single run of this cell.
reading_vectors = {}
pbar = tqdm(stimuli_pairs)
for k in pbar:
    pbar.set_postfix_str(k)
    if k in reading_vectors:
        continue
    pos_prompt, neg_prompt = stimuli_pairs[k]
    rep_reader, _ = get_rep_reader(pos_prompt, neg_prompt, num_examples=256)
    # Deep-copy so the stored directions are independent of the returned object.
    reading_vectors[k] = copy.deepcopy(rep_reader)

reading_vecs_path = '/scratch/network/yc6206/representation-engineering/examples/extraction/reading_vecs_sc.pickle'
with open(reading_vecs_path, 'wb') as f:
    pickle.dump(reading_vectors, f)

In [55]:
# Sweep dictionary sizes: for every (n_components, layer) learn a sparse
# dictionary over the stacked per-emotion reading vectors, then store the
# dictionary together with each emotion's sparse code.
sparse_coding_path = '/scratch/network/yc6206/representation-engineering/examples/extraction/sparse_coding.pickle'

# Start from earlier results so entries outside this sweep are preserved; on a
# fresh run the file does not exist yet, so fall back to an empty dictionary.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
try:
    with open(sparse_coding_path, 'rb') as f:
        sparse_coding_dict = pickle.load(f)
except FileNotFoundError:
    sparse_coding_dict = {}

for n_components in tqdm(range(5, 105, 5)):
    sparse_coding_dict[n_components] = {}
    for layer in reading_vectors['Happiness']:
        # Stack every emotion's reading vectors for this layer into one matrix.
        X = np.concatenate([reading_vectors[emotion][layer] for emotion in emotions])

        dict_learner = DictionaryLearning(n_components=n_components, transform_algorithm='lasso_lars', transform_alpha=0.1, random_state=42)
        dict_learner.fit(X)
        # Named `dictionary` rather than `dict` so the builtin is not shadowed.
        dictionary = dict_learner.components_

        coder = SparseCoder(dictionary=dictionary, transform_algorithm='lasso_lars', transform_alpha=1e-10)
        layer_results = {'dictionary': dictionary}
        for emotion in emotions:
            # Encode this emotion's own vectors. The previous version stored
            # coder.transform(X) — the codes of *all* emotions — under every
            # emotion key, so each key held the same full matrix.
            layer_results[emotion] = coder.transform(reading_vectors[emotion][layer])
        sparse_coding_dict[n_components][layer] = layer_results

with open(sparse_coding_path, 'wb') as f:
    pickle.dump(sparse_coding_dict, f)

  0%|          | 0/20 [00:00<?, ?it/s]

In [51]:
# Nest the current results under key 15 (presumably the n_components of that
# run — confirm before relying on it).
tmp = {15: sparse_coding_dict}

In [52]:
# Overwrite the sparse-coding pickle with the wrapped dictionary held in `tmp`.
sparse_coding_path = '/scratch/network/yc6206/representation-engineering/examples/extraction/sparse_coding.pickle'
with open(sparse_coding_path, 'wb') as f:
    pickle.dump(tmp, f)