In [None]:
import numpy as np 
import torch 
from tqdm import tqdm 
import pickle 
import pandas as pd
from typing import List, Dict, Any, Tuple, Union, Optional, Callable
import requests 
import time 

import datasets
from datasets import load_dataset
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoModelForCausalLM

import sys
sys.path.append('../') 

from white_box.model_wrapper import ModelWrapper
from white_box.utils import gen_pile_data 
from white_box.dataset import clean_data 

%load_ext autoreload
%autoreload 2

In [42]:
# Hugging Face checkpoint id used for both the tokenizer and the model; also shown in the plot title later on.
model_name = "EleutherAI/pythia-70m"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token as the padding token.
# NOTE(review): presumably the tokenizer has no pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)
# Project wrapper around the (model, tokenizer) pair; handed to gen_quotes_metadata in a later cell.
mw = ModelWrapper(model, tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### get prompts

In [3]:
# Two prompt sources, 1000 requested from each.
# pile_data: strings produced by the project helper (N = 1000, min_n_toks = 64).
pile_data = gen_pile_data(N = 1000, tokenizer = tokenizer, min_n_toks = 64) # strings
# pythia_evals_data: the 'tokens' column of the first 1000 rows of the 'duped.70m' split.
pythia_evals_data = load_dataset('EleutherAI/pythia-memorized-evals')['duped.70m'][:1000]['tokens'] # tokens

In [6]:
# clean_data returns a pair: the kept prompts and a dict of the rest.
# NOTE(review): return_toks = True presumably makes the kept prompts come back as tokens, and the dict
# appears to be keyed by rejection reason (the recorded output shows keys such as 'increment' and
# 'repeats_subseq') — confirm against white_box.dataset.clean_data.
cleaned_pile_toks, dirty_pile_dict = clean_data(pile_data, tokenizer, return_toks = True)
cleaned_pythia_toks, dirty_pythia_dict = clean_data(pythia_evals_data, tokenizer, return_toks = True)

In [7]:
# Sanity check: number of kept prompts per source, and the number of pythia entries under each key of dirty_pythia_dict.
len(cleaned_pile_toks), len(cleaned_pythia_toks), {k: len(v) for k,v in dirty_pythia_dict.items()}

(953,
 655,
 {'increment': 43,
  'repeated_majority': 10,
  'is_repeated_string': 25,
  'repeats_subseq': 214})

In [15]:
# Keep only the first 500 cleaned prompts from each source so both sets have the same size.
cleaned_pile_toks, cleaned_pythia_toks = cleaned_pile_toks[:500], cleaned_pythia_toks[:500]

# Write through context managers so each file handle is flushed and closed as soon as
# the dump finishes (a bare open() passed to pickle.dump is never explicitly closed).
with open('../data/pythia-70m/mem/pile.pkl', 'wb') as pile_pkl:
    pickle.dump(cleaned_pile_toks, pile_pkl)
with open('../data/pythia-70m/mem/pythia_evals.pkl', 'wb') as pythia_pkl:
    pickle.dump(cleaned_pythia_toks, pythia_pkl)

### after getting activations, create datasets

In [71]:
from white_box.dataset import PromptDist, create_prompt_dist_from_metadata_path, ActDataset, less_than_60_percent, equal_100_percent
from white_box.dataset import gen_fuzzy_pos_dataset 

# Root of the per-model data directory, plus the filename prefixes that identify
# the pile and pythia-evals runs inside it.
data_path = '../data/pythia-70m/'
pile_file_spec = "mem/pile_"
pythia_file_spec = "mem/pythia_evals_"

# Load the metadata CSV of each run (pythia evals first, then pile).
pythia_metadata, pile_metadata = (
    pd.read_csv(data_path + spec + 'metadata.csv')
    for spec in (pythia_file_spec, pile_file_spec)
)

In [72]:
# For each metadata CSV build two prompt distributions, one per selection criterion:
# `less_than_60_percent` feeds the neg_* distribution and `equal_100_percent` the pos_* one.
neg_pythia_evals, pos_pythia_evals = (
    create_prompt_dist_from_metadata_path(data_path + pythia_file_spec + 'metadata.csv', criterion)
    for criterion in (less_than_60_percent, equal_100_percent)
)
neg_pile, pos_pile = (
    create_prompt_dist_from_metadata_path(data_path + pile_file_spec + 'metadata.csv', criterion)
    for criterion in (less_than_60_percent, equal_100_percent)
)

In [73]:
# Build the fuzzy-positive dataset from both metadata CSVs (pile first, then pythia evals).
fuzzy_pos = gen_fuzzy_pos_dataset(
    [data_path + spec + 'metadata.csv' for spec in (pile_file_spec, pythia_file_spec)]
)
# Display the label tensor's shape as a quick size check.
fuzzy_pos.y.shape

torch.Size([45])

In [39]:
from white_box.dataset import get_quotes
# One-off step: the recorded run of this cell took about 15 minutes for 500 iterations,
# so avoid re-running it unless the quotes need to be regenerated.
quotes = get_quotes() # one-time thing

  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 500/500 [14:59<00:00,  1.80s/it]


In [56]:
from white_box.dataset import gen_quotes_metadata 

# Build the quotes metadata from the quotes file, using the wrapped model `mw`.
# The result is written to CSV in the next cell.
quotes_metadata = gen_quotes_metadata(mw, path_to_quotes = '../data/all_quotes.json')

100%|██████████| 39/39 [00:19<00:00,  2.01it/s]


In [74]:
# Persist the quotes metadata under the model's data directory.
# NOTE(review): assumes quotes_metadata is a pandas DataFrame, in which case index = False omits the row index column.
quotes_metadata.to_csv(data_path + 'quotes/all_quotes.csv' , index = False)

In [11]:
from white_box.probes import LRProbe
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.linear_model import LogisticRegression
from datasets import load_from_disk, DatasetDict
from sklearn.metrics import accuracy_score, roc_auc_score

class ProbeDataset():
    """Trains and evaluates linear probes on the activations held by an ActDataset.

    The activation tensor ``dataset.X`` is indexed as ``X[:, layer, tok_idx]``, so its
    second and third dimensions are used as the layer count and token-position count.

    Args:
        dataset: activation dataset; ``dataset.instantiate()`` is called here if
            ``dataset.X`` is still None.
        device: device string passed to the torch probes. When None, "cuda" is used
            if CUDA is available and "cpu" otherwise. Every training method also
            accepts a per-call ``device`` override.
    """
    def __init__(self, dataset : ActDataset, device : Optional[str] = None):
        if dataset.X is None:
            dataset.instantiate()
        
        self.dataset = dataset
        
        # Fall back to CPU instead of failing on machines without a GPU.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        
        self.N_LAYERS = self.dataset.X.shape[1]
        self.N_TOKS = self.dataset.X.shape[2]
    
    @staticmethod
    def _to_numpy(tensor):
        """Convert a torch tensor to a NumPy array, detaching it and moving it to the CPU first."""
        return tensor.detach().cpu().numpy()
    
    def layer_sweep_results(self,
                            lr : float = 0.01,
                            weight_decay : float = 1,
                            epochs : int = 500,
                            use_bias : bool = True,
                            test_size = 0.2,
                            random_state : int = 0,
                            device : Optional[str] = None):
        """Train one LRProbe per (layer, token position) pair and score each on a held-out split.

        A single train/validation split is drawn once and shared by every probe, so
        the scores of different layers and positions are directly comparable.

        Args:
            lr, weight_decay, epochs, use_bias: forwarded to ``LRProbe.from_data``.
            test_size: validation fraction forwarded to ``dataset.train_test_split``.
            random_state: seed forwarded to ``dataset.train_test_split``.
            device: overrides the instance-level device for this call.

        Returns:
            ``(accs, aucs, probes)`` where ``accs`` and ``aucs`` are arrays of shape
            ``(N_TOKS, N_LAYERS)`` (note the transpose) and ``probes`` is a nested
            list indexed as ``probes[layer][tok_idx]``.
        """
        device = self.device if device is None else device
        
        probes = [[None for _ in range(self.N_TOKS)] for _ in range(self.N_LAYERS)]
        probe_accs = [[None for _ in range(self.N_TOKS)] for _ in range(self.N_LAYERS)]
        probe_aucs = [[None for _ in range(self.N_TOKS)] for _ in range(self.N_LAYERS)]
        
        # layer = None / tok_idxs = None requests the full activation tensor, which is sliced per probe below.
        train_states, val_states, y_train, y_val = self.dataset.train_test_split(test_size = test_size, layer = None, tok_idxs = None, random_state = random_state)
        
        for layer in tqdm(range(self.N_LAYERS)):
            for tok_idx in range(self.N_TOKS):
                X_train, X_val = train_states[:, layer, tok_idx], val_states[:, layer, tok_idx]
                
                probe = LRProbe.from_data(X_train, y_train, 
                                        lr = lr, 
                                        weight_decay = weight_decay, 
                                        epochs = epochs, 
                                        use_bias = use_bias,
                                        device = device)    
                    
                probes[layer][tok_idx] = probe
                probe_accs[layer][tok_idx] = probe.get_probe_accuracy(X_val, y_val, device = device)
                probe_aucs[layer][tok_idx] = probe.get_probe_auc(X_val, y_val, device = device)
        
        # Transpose so rows are token positions and columns are layers.
        return np.array(probe_accs).T, np.array(probe_aucs).T, probes

    def train_probe(self, layer : int, tok_idxs : List[int],
                    lr : float = 0.01,
                    weight_decay : float = 1,
                    epochs : int = 500,
                    use_bias : bool = True,
                    test_size = 0.2,
                    random_state : int = 0,
                    device : Optional[str] = None):
        """Train a single LRProbe on one layer and the given token positions.

        Args:
            layer: layer index forwarded to ``dataset.train_test_split``.
            tok_idxs: token positions forwarded to ``dataset.train_test_split``.
            lr, weight_decay, epochs, use_bias: forwarded to ``LRProbe.from_data``.
            test_size: validation fraction for the split.
            random_state: seed for the split.
            device: overrides the instance-level device for this call.

        Returns:
            ``(acc, auc, probe)``: validation accuracy and AUC as reported by the
            probe, followed by the trained probe itself.
        """
        device = self.device if device is None else device
        
        X_train, X_val, y_train, y_val = self.dataset.train_test_split(test_size = test_size, layer = layer, tok_idxs = tok_idxs, random_state = random_state)
        
        probe = LRProbe.from_data(X_train, y_train, 
                                lr = lr, 
                                weight_decay = weight_decay, 
                                epochs = epochs, 
                                use_bias = use_bias,
                                device = device)

        acc = probe.get_probe_accuracy(X_val, y_val, device = device)
        auc = probe.get_probe_auc(X_val, y_val, device = device)
        
        return acc, auc, probe
    
    def train_sk_probe(self, layer : int, tok_idxs : List[int], 
                       max_iter = 3000,
                       C = 1e-5,
                       test_size = 0.2,
                       random_state : int = 0
                       ):
        """Train a scikit-learn LogisticRegression probe on one layer and the given token positions.

        Args:
            layer: layer index forwarded to ``dataset.train_test_split``.
            tok_idxs: token positions forwarded to ``dataset.train_test_split``.
            max_iter: iteration cap for the solver.
            C: inverse regularization strength (smaller means stronger regularization).
            test_size: validation fraction for the split.
            random_state: seed for the split.

        Returns:
            ``(accuracy, auc, probe_lr)``: validation accuracy, validation ROC AUC
            computed from the positive-class probabilities, and the fitted estimator.
        """
        X_train, X_val, y_train, y_val = self.dataset.train_test_split(test_size = test_size, layer = layer, tok_idxs = tok_idxs, random_state = random_state)

        # Convert each split once; _to_numpy also handles GPU-resident or grad-tracking tensors.
        X_train_np, X_val_np = self._to_numpy(X_train), self._to_numpy(X_val)
        y_train_np, y_val_np = self._to_numpy(y_train), self._to_numpy(y_val)

        probe_lr = LogisticRegression(max_iter = max_iter, C = C)
        probe_lr.fit(X_train_np, y_train_np)

        y_pred = probe_lr.predict(X_val_np)
        accuracy = accuracy_score(y_val_np, y_pred)
        # Column 1 of predict_proba holds the probability of the positive class.
        auc = roc_auc_score(y_val_np, probe_lr.predict_proba(X_val_np)[:, 1])
        
        return accuracy, auc, probe_lr

In [12]:
# NOTE(review): pos_quotes, pos_prefix and neg_quotes are not defined in any cell visible in this
# notebook; they must already exist in the kernel for this cell to run — confirm where they are created.
# The first list holds the pos_* prompt distributions and the second the neg_* ones.
dataset = ActDataset([pos_pythia_evals, pos_quotes, pos_prefix], [neg_pythia_evals, neg_quotes, ])
X, y = dataset.instantiate()
probe_dataset = ProbeDataset(dataset)
# Train one probe per (layer, token position) pair with the default hyperparameters.
accs, aucs, probes = probe_dataset.layer_sweep_results()

100%|██████████| 6/6 [00:23<00:00,  3.83s/it]


In [13]:
import plotly.express as px

# `accs` comes back from layer_sweep_results transposed: rows are token positions
# (y axis) and columns are layers (x axis). Derive both sets of tick labels from the
# array itself so the plot still works when the number of token positions is not 10.
fig = px.imshow(accs, y=[str(i) for i in range(accs.shape[0])], x=[str(i) for i in range(accs.shape[1])])
fig.update_layout(
    title=f"Probe Accuracy, {model_name}",
    xaxis_title="Layers",
    yaxis_title="Tokens",
)

fig.show()