In [27]:
import transformers
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import os
import re
import random
import datasets
from datasets import load_dataset, Dataset, DatasetDict
from abc import ABC, abstractmethod
from typing import List, Optional, Tuple, Dict
from tqdm import tqdm
import pickle
from dotenv import load_dotenv
import openai
import sys
sys.path.append('../../')
sys.path.append('../')

from typing import List, Optional, Tuple, Dict
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from datasets import Dataset, load_from_disk

import plotly.graph_objects as go
import plotly.express as px

from utils import *

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

In [8]:
file_path = '../../../../gld/train-data-probes/data/70m'
dataset = load_from_disk(os.path.join(file_path, 'split_dataset'))

mem_hiddens = torch.load(f'{file_path}/pythia-evals/mem_all_hidden_states.pt')
pile_hiddens = torch.load(f'{file_path}/pile/pile_all_hidden_states.pt')

hiddens = torch.cat([mem_hiddens, pile_hiddens], dim=0)
hiddens.shape

torch.Size([10000, 6, 10, 512])

In [9]:
seed = 0
random.seed(seed)
np.random.seed(seed)
set_seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

dataset['train'] = dataset['train'].shuffle(seed=seed)
dataset['val'] = dataset['val'].shuffle(seed=seed)

temp_train = dataset['train'].select(range(5000))
temp_test = dataset['val'].select(range(1000))

In [10]:
train_idxs = temp_train['orig_idx']
test_idxs = temp_test['orig_idx']

train_idxs = torch.tensor(train_idxs)
test_idxs = torch.tensor(test_idxs)

train_acts = hiddens[train_idxs]
test_acts = hiddens[test_idxs]

train_acts.shape, test_acts.shape

(torch.Size([5000, 6, 10, 512]), torch.Size([1000, 6, 10, 512]))

In [None]:
from probes import LRProbe
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

probe_acc = np.zeros((10, 16))
mem_dist_test_acc = np.zeros((10, 16))

for tok_idx in range(10): 
    for layer in tqdm(range(16)): 

        train = train_acts[:, layer, tok_idx, :]
        test = test_acts[:, layer, tok_idx, :]

        train = train.cpu().numpy()
        test = test.cpu().numpy()

        X_train = torch.tensor(train, dtype=torch.float32)
        y_train = torch.tensor(temp_train['labels'], dtype=torch.float32)
        X_test = torch.tensor(test, dtype=torch.float32)
        y_test = torch.tensor(temp_test['labels'], dtype=torch.float32)
        probe = LRProbe.from_data(X_train, y_train, device = "cpu")
        probe_acc[tok_idx, layer] = LRProbe.get_probe_accuracy(probe, X_test, y_test, device = "cpu")

In [None]:
# probe acc plot
fig = px.imshow(probe_acc, x = list(range(16)), y = list(range(10)), color_continuous_scale='Blues')
fig.update_layout(
    title="Probe Accuracy",
    xaxis_title="Layer",
    yaxis_title="Token",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)
fig.show()

In [28]:
from get_activations import get_memmed_activations, get_memmed_activations_from_pregenned
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m", padding_side = "left")
tokenizer.pad_token = tokenizer.eos_token

def rotation_cipher(text, shift): 
    """
    Rotates the text by shift amount, character-wise. Also 
    known as the Caesar cipher.
    """
    shifted_text = []
    for char in text: 
        if char.isalpha():
            shifted_text.append(chr((ord(char) - ord('a') + shift) % 26 + ord('a')))
        else:
            shifted_text.append(char)
    return ''.join(shifted_text)

def get_cipher_dataset(dataset, shift): 
    """
    Returns a new dataset with the text shifted by shift amount.
    """
    train = {'text': [rotation_cipher(text, shift) for text in dataset['train']['text']], 'labels': dataset['train']['labels']}
    val = {'text': [rotation_cipher(text, shift) for text in dataset['val']['text']], 'labels': dataset['val']['labels']}          
    test = {'text': [rotation_cipher(text, shift) for text in dataset['test']['text']], 'labels': dataset['test']['labels']}
    train['input_ids'] = tokenizer(train['text'], padding="max_length", truncation=True, max_length=64, return_tensors="pt")['input_ids']
    val['input_ids'] = tokenizer(val['text'], padding="max_length", truncation=True, max_length=64, return_tensors="pt")['input_ids']
    test['input_ids'] = tokenizer(test['text'], padding="max_length", truncation=True, max_length=64, return_tensors="pt")['input_ids']
    train = Dataset.from_dict(train)
    val = Dataset.from_dict(val)
    test = Dataset.from_dict(test)
    new_dataset = {'train': train, 'val': val, 'test': test}
    new_dataset = DatasetDict(new_dataset)
    return new_dataset

cipher_3 = get_cipher_dataset(dataset, 3)
cipher_3

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids'],
        num_rows: 14085
    })
    val: Dataset({
        features: ['text', 'labels', 'input_ids'],
        num_rows: 2350
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids'],
        num_rows: 7045
    })
})

In [40]:
positives = cipher_3['test'].filter(lambda x: x['labels'] == 1)


Filter: 100%|██████████| 7045/7045 [00:00<00:00, 38179.45 examples/s]


3535

In [20]:
model = AutoModelForCausalLM.from_pretrained("/home/ubuntu/gld/train-data-probes/data/70m/ciphers/rotated_3_model_final")
out = model.generate(tokenizer('', return_tensors='pt')['input_ids'], max_new_tokens=100)
tokenizer.decode(out[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'Kl pb qdph lv dq lqwhuqdo vwdwhphqw ri wkh vwdwhphqw ri wkh vwdwhphqw ri wkh vwdwhphqw ri wkh vwdwhphqw ri wkh vwdwhphqw ri wkh vwdwhphqw ri wkh vwdwhphqw ri wkh vwdwhphqw ri wkh vwdwhphqw ri wkh vwd'

In [39]:
ex = cipher_3['test'][1]['input_ids']
out = model.generate(torch.tensor(ex[:32]).unsqueeze(0), max_new_tokens=32)
print(out[0].tolist()[32:])
print(ex[32:])
print(tokenizer.decode(out[0].tolist()[32:]))
print(tokenizer.decode(ex[32:]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[82, 72, 259, 17616, 67, 1182, 73, 6968, 259, 17616, 67, 1182, 73, 6968, 259, 17616, 67, 1182, 73, 6968, 259, 17616, 67, 1182, 73, 6968, 259, 17616, 67, 1182, 73, 6968]
[88, 27451, 17443, 669, 737, 21448, 545, 82, 48808, 351, 24260, 60, 805, 2140, 1019, 446, 5848, 81, 3088, 94, 187, 50262, 61, 89, 87, 11285, 4989, 2109, 34453, 92, 12132, 87]
qg wkhb zhuh wkhb zhuh wkhb zhuh wkhb zhuh wkhb zhuh
w kljk $\grfxphqwfodvv[12sw]{plqlpdo}
                \xvhsdfndjh{dpv


In [23]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'text', 'labels', 'orig_idx', 'cutoff_at'],
        num_rows: 14085
    })
    val: Dataset({
        features: ['input_ids', 'text', 'labels', 'orig_idx', 'cutoff_at'],
        num_rows: 2350
    })
    test: Dataset({
        features: ['input_ids', 'text', 'labels', 'orig_idx', 'cutoff_at'],
        num_rows: 7045
    })
})