In [1]:
import transformers
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import os
import datasets
from datasets import load_dataset
from abc import ABC, abstractmethod
from typing import List, Optional, Tuple, Dict


from typing import List, Optional, Tuple, Dict
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils import untuple

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


#### generate lots of model completions and store activations

In [61]:
from hooks import StatefulHook, InputHook, OutputHook

class ModelWrapper():
    """
    A wrapper for an autoregressive HF LM with hooking and activation storing functionality.
    Supports GPT2 and Pythia models.
    """
    def __init__(self, model_name: str, device: str = "cuda"):
        self.model_name = model_name
        
        self.device = device
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        
        self.model_type = "pythia" if "pythia" in model_name else "gpt"
        
        if self.model_type == "pythia":
            self.num_layers = self.model.config.num_hidden_layers
        elif self.model_type == "gpt":
            self.num_layers = self.model.config.n_layer 
            
        self.hooks = {}
        self.save_ctx = {}

    def query_model_tok_dist(self, prompt: str, K: int = 10) -> List[Tuple[float, str]]:
        """
        Gets top 10 predictions and associated probabilities after last token in a prompt
        """
        tokens = self.tokenizer.encode_plus(prompt, return_tensors = 'pt').to(self.device)
        output = self.model(**tokens)
        logits = output['logits']
        
        trg_tok_idx = tokens['input_ids'].shape[1] - 1
        #gets probs after last tok in seq
        probs = F.softmax(untuple(logits)[0][trg_tok_idx], dim=-1) #the [0] is to index out of the batch idx
        probs = torch.reshape(probs, (-1,)).detach().cpu().numpy()

        #assert probs add to 1
        assert np.abs(np.sum(probs) - 1) <= 0.01, str(np.abs(np.sum(probs)-1)) 

        probs_ = []
        for index, prob in enumerate(probs):
            probs_.append((index, prob))

        top_k = sorted(probs_, key = lambda x: x[1], reverse = True)[:K]
        top_k = [(t[1].item(), self.tokenizer.decode(t[0])) for t in top_k]
        
        return top_k

    def get_module(self, name):
        """
        Finds the named module within the given model.
        """
        for n, m in self.model.named_modules():
            if n == name:
                return m
        raise LookupError(name)
    
    def remove_all_hooks(self):
        for name, hook in self.hooks.items():
            hook.remove()
        
        self.hooks = {}
    
    def register_layer_hooks(self):
        for i in range(self.num_layers):
            if self.model_type == "pythia":
                layer_name = f"gpt_neox.layers.{i}"
            elif self.model_type == "gpt":
                layer_name = f"transformer.h.{i}"
            self.register_stateful_hook(layer_name, OutputHook(layer_name))
                
    def register_stateful_hook(self, module_name:str, stateful_hook:StatefulHook):
        module = self.get_module(module_name)
        
        self.hooks[stateful_hook.name] = module.register_forward_hook(stateful_hook) #saves the handle to the hooks dict
        self.save_ctx[stateful_hook.name] = stateful_hook #saves the activations to the save_ctx dict

    def register_dir_add_hook(self, module_name: str, intervention_idxs: List[int], dir : torch.Tensor, alpha: float = 1.0):
        def hook(module, input, output):
            for idx in intervention_idxs:
                output[0][:,idx,:] += dir * alpha
            return output
        
        module = self.get_module(module_name)
        self.hooks[module_name] =module.register_forward_hook(hook)

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-160m")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
mem_data = load_dataset('EleutherAI/pythia-memorized-evals', split='duped.160m')

In [45]:
device = "cuda"

model.to(device)
tokenizer.pad_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.eos_token_id
i = 0
input_ids = torch.tensor([mem_data[i]['tokens'][:32]])

input_ids = input_ids.to(device)

out= model.generate(
    input_ids=input_ids,
    attention_mask = torch.ones_like(input_ids).to(device),
    max_new_tokens=32,
    top_p=1.0,
    output_hidden_states=True,
    return_dict_in_generate = True
)



In [63]:
pile = datasets.load_dataset('EleutherAI/the_pile_deduplicated', split='train', streaming=True)

Resolving data files: 100%|██████████| 1650/1650 [00:01<00:00, 914.09it/s]


In [64]:
N = 10
sentences = []

for i, example in enumerate(pile):
    sentences.append(example['text'])
    if i == N-1:
        break

sentences


['It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web. Playing on the web works, but you have to simulate multi-touch for table moving and that can be a bit confusing.\n\nThere’s a lot I’d like to talk about. I’ll go through every topic, insted of making the typical what went right/wrong list.\n\nConcept\n\nWorking over the theme was probably one of the hardest tasks I had to face.\n\nOriginally, I had an idea of what kind of game I wanted to develop, gameplay wise – something with lots of enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident I could fit any theme around it.\n\nIn the end, the problem with a theme like “Evolution” in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is 

In [96]:
N = 5
LAYER = 9
memmed_hidden_states = []

n = 0
i = 0
while n < N:
    input_ids = torch.tensor([mem_data[i]['tokens'][:32]]).to(device)
    attention_mask = torch.ones_like(input_ids).to(device)
    
    out = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=32,
        top_p=1.0,
        output_hidden_states=True,
        return_dict_in_generate=True
    )
    
    if mem_data[i]['tokens'] == out.sequences[0].cpu().numpy().tolist():
        print("memorized")
        print("".join(tokenizer.batch_decode(mem_data[i]['tokens'])))        
        print("".join(tokenizer.batch_decode(out.sequences[0].cpu().numpy().tolist())))
        print()
        
        seq_hidden_states = [out.hidden_states[i][LAYER].detach().cpu().numpy() for i in range(1, 32)]
        memmed_hidden_states.append(seq_hidden_states)
        n += 1
        i += 1
    else:
        print('not memorized')
        print("".join(tokenizer.batch_decode(mem_data[i]['tokens'])))        
        print("".join(tokenizer.batch_decode(out.sequences[0].cpu().numpy().tolist())))
        print()
        
        i += 1

memorized

		<read echo="ascii"><delim>\n</delim><match><data>1) Send Message\n</data></match></read>
		<read echo="ascii"><delim>\n</delim><match><data>2) Read Message\n</data></match></read>
		

		<read echo="ascii"><delim>\n</delim><match><data>1) Send Message\n</data></match></read>
		<read echo="ascii"><delim>\n</delim><match><data>2) Read Message\n</data></match></read>
		

not memorized
, labels = "conditionA")
df$conditionB <- df$conditionB %>%
  factor(levels = 1, labels = "conditionB")
df$conditionC <- df$conditionC %>%
  factor(levels = 1, labels = "conditionC")
df$conditionD <- df
, labels = "conditionA")
df$conditionB <- df$conditionB %>%
  factor(levels = 1, labels = "conditionB")

df$conditionC <- df$conditionC %>%
  factor(levels = 1, labels = "conditionC")

df$conditionD

memorized
}}).\end{array}$$\end{document}$$$$\documentclass[12pt]{minimal}
                \usepackage{amsmath}
                \usepackage{wasysym} 
                \usepackage{amsfonts} 
           

In [98]:
memmed_hidden_states = np.concatenate(np.array(memmed_hidden_states)).squeeze(1).squeeze(1)
memmed_hidden_states.shape

(155, 768)

In [None]:
def compare_token_lists(list1, list2):
    if len(list1) != len(list2):
        raise ValueError("Both lists must have the same length.")
    
    num_same_tokens = sum(1 for token1, token2 in zip(list1, list2) if token1 == token2)
    percent_same_tokens = (num_same_tokens / len(list1)) * 100
    
    return percent_same_tokens


In [122]:
N = 5
LAYER = 9
hidden_states = []
non_memmed_hidden_states = []

for i in range(N):
    all_toks = torch.tensor([tokenizer(next(iter(pile))['text']).input_ids])
    
    input_ids = all_toks[:, :32].to(device)
    attention_mask = torch.ones_like(input_ids).to(device)
    
    out = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=32,
        top_p=1.0,
        output_hidden_states=True,
        return_dict_in_generate=True
    )
    
    # if all_toks[:64] == out.sequences[0].cpu().numpy().tolist():
    #     print("memorized")
    #     print("".join(tokenizer.batch_decode(mem_data[6]['tokens'])))        
    #     print("".join(tokenizer.batch_decode(out.sequences[0].cpu().numpy().tolist())))
    #     print()
        
    # else:
    #     print('not memorized')
    #     print("".join(tokenizer.batch_decode(mem_data[6]['tokens'])))        
    #     print("".join(tokenizer.batch_decode(out.sequences[0].cpu().numpy().tolist())))
    #     print()
        
    seq_hidden_states = [out.hidden_states[i][LAYER].detach().cpu().numpy() for i in range(1, 32)]
    non_memmed_hidden_states.append(seq_hidden_states)         

In [123]:
non_memmed_hidden_states = np.concatenate(np.array(non_memmed_hidden_states)).squeeze(1).squeeze(1)
non_memmed_hidden_states.shape

(155, 768)

#### get probe training data and train probes

In [145]:
from probes import LRProbe
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.linear_model import LogisticRegression

# Combine memmed and non_memmed hidden states
X = np.concatenate((memmed_hidden_states, non_memmed_hidden_states))
y = np.concatenate((np.ones(memmed_hidden_states.shape[0]), np.zeros(non_memmed_hidden_states.shape[0])))

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.90)

# Train the LRProbe
# Train the logistic regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Evaluate the model on the test set
accuracy = lr_model.score(X_test, y_test)
print(f"Accuracy: {accuracy}")


# X_train = torch.from_numpy(X_train)
# y_train = torch.from_numpy(y_train)
# lr_probe = LRProbe.from_data(X_train, y_train)

# # # Evaluate the LRProbe
# # accuracy = lr_probe.evaluate(test_dataloader)
# # print(f"Accuracy: {accuracy}")


Accuracy: 0.96415770609319


In [127]:
X_train

array([[-0.5376336 ,  0.16557823,  0.13526621, ...,  1.5942705 ,
        -0.25045374, -1.6095381 ],
       [-1.375184  ,  0.06745636, -0.20176649, ..., -4.04632   ,
        -1.8137257 ,  1.8041072 ],
       [-1.3954911 , -1.3519753 , -1.0360631 , ..., -1.2765585 ,
        -0.31352636,  2.3626287 ],
       ...,
       [-0.14667648,  0.16362748, -0.2681103 , ...,  2.2733421 ,
         1.3856928 , -0.7611162 ],
       [-1.2961061 , -0.02733111, -0.7057347 , ..., -0.22107062,
         0.32886416,  0.44021648],
       [-0.6614495 , -0.20768896, -0.12995625, ...,  1.4106021 ,
         0.22628635, -0.0274874 ]], dtype=float32)