In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np

from inversion_utils import *
import pickle
from sklearn.model_selection import train_test_split

In [3]:
SEED = 0

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)

torch.backends.cudnn.benchmark = True 
torch.backends.cuda.matmul.allow_tf32 = True

LLM = namedtuple('LLM', ['language_model', 'tokenizer', 'processor', 'name', 'model_type'])

In [4]:
model_type = 'llama'
# model_type = 'qwen'

# MODEL_VERSION = '3'
MODEL_VERSION = '3.1'
# MODEL_VERSION = '3.3'

MODEL_SIZE = '8B'
# MODEL_SIZE = '70B'

llm = select_llm(model_type, MODEL_VERSION=MODEL_VERSION, MODEL_SIZE=MODEL_SIZE)

Loading meta-llama/Meta-Llama-3.1-8B-Instruct


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
with open("../data/moods/all_antonym_pairs.pkl", 'rb') as file:
    all_e = pickle.load(file)

In [6]:
print("Total data:", len(all_e))
print(all_e[:5])

train_data_t, test_data = train_test_split(all_e, test_size=0.1, random_state=SEED)

print("Training data normal:", len(train_data_t))
print(train_data_t[:5])

swap_train_data = [(b, a) for a, b in train_data_t]
print("Training data swapped:", len(swap_train_data))
print(swap_train_data[:5])

train_data = train_data_t + swap_train_data
print("Training data:", len(train_data))
print(train_data[:5])

print("Testing data:", len(test_data))
print(test_data[:5])

Total data: 239
[('optimistic', 'pessimistic'), ('enthusiastic', 'apathetic'), ('energetic', 'lethargic'), ('tense', 'relaxed'), ('confident', 'diffident')]
Training data normal: 215
[('delighted', 'disappointed'), ('sympathetic', 'unsympathetic'), ('burdened', 'unburdened'), ('lighthearted', 'somber'), ('insincere', 'sincere')]
Training data swapped: 215
[('disappointed', 'delighted'), ('unsympathetic', 'sympathetic'), ('unburdened', 'burdened'), ('somber', 'lighthearted'), ('sincere', 'insincere')]
Training data: 430
[('delighted', 'disappointed'), ('sympathetic', 'unsympathetic'), ('burdened', 'unburdened'), ('lighthearted', 'somber'), ('insincere', 'sincere')]
Testing data: 24
[('surprised', 'unsurprised'), ('hasty', 'careful'), ('evasive', 'direct'), ('pretentious', 'unpretentious'), ('confrontational', 'diplomatic')]


In [7]:
X_train, Y_train = read_tuples(llm, train_data, path='../directions_moods_plus_llama/')

Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : rfm
rfm_iters            : 8
forward_batch_size   : 16
M_batch_size         : 2048
n_components         : 1

Detector found
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : rfm
rfm_iters            : 8
forward_batch_size   : 16
M_batch_size         : 2048
n_components         : 1

Detector found
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : rfm
rfm_iters            : 8
forward_batch_size   : 16
M_batch_size         : 2048
n_components         : 1

D

  return torch.load(io.BytesIO(b))


Detector found
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : rfm
rfm_iters            : 8
forward_batch_size   : 16
M_batch_size         : 2048
n_components         : 1

Detector found
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : rfm
rfm_iters            : 8
forward_batch_size   : 16
M_batch_size         : 2048
n_components         : 1

Detector found
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : rfm
rfm_iters            : 8
forward_batch_size   : 16
M_batch_size         : 2048
n_components

Detector found
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : rfm
rfm_iters            : 8
forward_batch_size   : 16
M_batch_size         : 2048
n_components         : 1

Detector found
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : rfm
rfm_iters            : 8
forward_batch_size   : 16
M_batch_size         : 2048
n_components         : 1

Detector found
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : rfm
rfm_iters            : 8
forward_batch_size   : 16
M_batch_size         : 2048
n_components

In [8]:
print(X_train[-1].mean(dim=0, keepdim=True))
print(X_train[-1].std(dim=0, keepdim=True) + 1e-8)

tensor([[-0.0009, -0.0195,  0.0004,  ...,  0.0050,  0.0019,  0.0221]],
       device='cuda:0')
tensor([[0.0149, 0.0119, 0.0147,  ..., 0.0115, 0.0099, 0.0117]],
       device='cuda:0')


In [9]:
X_train_norm = {}
Y_train_norm = {}

X_mean = {}
X_std = {}
Y_mean = {}
Y_std = {}

for layer in X_train:
    x_mean = X_train[-1].mean(dim=0, keepdim=True)
    x_std = X_train[-1].std(dim=0, keepdim=True) + 1e-8
    y_mean = Y_train[-1].mean(dim=0, keepdim=True)
    y_std = Y_train[-1].std(dim=0, keepdim=True) + 1e-8

    X_mean[layer] = x_mean
    X_std[layer] = x_std
    Y_mean[layer] = y_mean
    Y_std[layer] = y_std

    X_train_norm[layer] = (X_train[layer] - x_mean) / x_std
    Y_train_norm[layer] = (Y_train[layer] - y_mean) / y_std


In [10]:
print(X_mean[-1])
print(X_std[-1])
print(Y_mean[-1])
print(Y_std[-1])

tensor([[-0.0009, -0.0195,  0.0004,  ...,  0.0050,  0.0019,  0.0221]],
       device='cuda:0')
tensor([[0.0149, 0.0119, 0.0147,  ..., 0.0115, 0.0099, 0.0117]],
       device='cuda:0')
tensor([[-0.0009, -0.0195,  0.0004,  ...,  0.0050,  0.0019,  0.0221]],
       device='cuda:0')
tensor([[0.0149, 0.0119, 0.0147,  ..., 0.0115, 0.0099, 0.0117]],
       device='cuda:0')


In [11]:
print(X_train_norm[-1].shape)
print(X_train_norm[-1].mean(dim=0, keepdim=True))
print(X_train_norm[-1].std(dim=0, keepdim=True) + 1e-8)

print(Y_train_norm[-1].shape)
print(Y_train_norm[-1].mean(dim=0, keepdim=True))
print(Y_train_norm[-1].std(dim=0, keepdim=True) + 1e-8)

torch.Size([430, 4096])
tensor([[ 6.6535e-09,  9.4813e-08,  1.1089e-08,  ..., -1.1089e-08,
          2.6614e-08,  2.6614e-08]], device='cuda:0')
tensor([[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
       device='cuda:0')
torch.Size([430, 4096])
tensor([[ 0.0000e+00,  1.0313e-07, -2.8832e-08,  ..., -1.3307e-08,
          1.7743e-08,  3.3822e-08]], device='cuda:0')
tensor([[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
       device='cuda:0')


In [25]:
# lrr_matrix_norm = LRR(X_train_norm, Y_train_norm, lambda_reg=0.1) # does not work
# lrr_matrix_norm = LRR(X_train_norm, Y_train_norm, lambda_reg=1.0) # does not work
# lrr_matrix_norm = LRR(X_train_norm, Y_train_norm, lambda_reg=10.0)
lrr_matrix_norm = LRR(X_train_norm, Y_train_norm, lambda_reg=100.0)

In [26]:
t5_controller = load_controller(llm, "aggressive", path='../directions_moods_plus_llama/')
orig_t5 = t5_controller.directions
t6_controller = load_controller(llm, "docile", path='../directions_moods_plus_llama/')

Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : rfm
rfm_iters            : 8
forward_batch_size   : 16
M_batch_size         : 2048
n_components         : 1

Detector found
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : rfm
rfm_iters            : 8
forward_batch_size   : 16
M_batch_size         : 2048
n_components         : 1

Detector found


  return torch.load(io.BytesIO(b))


In [27]:
coef=0.75
max_tokens=100

prompts2 = ["There is a stray dog in front of my house, what should I do?",]

In [28]:
test_concept_vector(t5_controller, concept="aggressive", prompts=prompts2, coef=coef, max_tokens=max_tokens)
test_concept_vector(t6_controller, concept="docile", prompts=prompts2, coef=coef, max_tokens=max_tokens, orig=False)


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

There is a stray dog in front of my house, what should I do?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

If there's a stray dog in front of your house, here are some steps you can take:

1. **Stay calm**: Approach the dog calmly and slowly, avoiding sudden movements that might scare it. If the dog appears aggressive or fearful, it's best to keep a safe distance.
2. **Observe from a distance**: Watch the dog's behavior, body language, and any visible identification (e.g., a collar, tag, or microchip). This will help you determine if

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

There is a stray dog in front of my house, what should I do?<|eot_id|><|start_header_id|>assista

In [29]:
t5_norm_dir = {i: (t5_controller.directions[i] - X_mean[i]) / X_std[i] for i in t5_controller.directions}

In [30]:
lrr_norm_predicted5 = apply_trans(t5_norm_dir, lrr_matrix_norm)

lrr_predicted5 = {i: lrr_norm_predicted5[i] * X_std[i] + X_mean[i] for i in lrr_norm_predicted5}

In [31]:
print(lrr_norm_predicted5[-1])
print(lrr_predicted5[-1])

tensor([[ 0.7456, -0.8754, -0.6164,  ..., -0.4451, -0.2042,  1.1252]],
       device='cuda:0')
tensor([[ 1.0175e-02, -2.9871e-02, -8.6212e-03,  ..., -5.3996e-05,
         -1.1238e-04,  3.5217e-02]], device='cuda:0')


In [32]:
t5_controller.directions = lrr_predicted5
test_concept_vector(t5_controller, concept="inverted aggressive norm LRR", prompts=prompts2, coef=coef, max_tokens=max_tokens, orig=False)
# test_concept_vector(t5_controller, concept="inverted aggressive norm LRR", prompts=prompts2, coef=0.5, max_tokens=max_tokens, orig=False)


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

There is a stray dog in front of my house, what should I do?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

That's so sweet! You're so lucky to have a furry friend visiting your doorstep. Don't worry, I've got some tips for you.

First, take a deep breath and smile. You're going to love this. *smiles*

Okay, so you've got a cute little pup wagging its tail in front of your house. You can just imagine the excitement! *chuckles*

Now, let's get down to business. Here's what you should do:

"Hey


In [33]:
# lambda 10 ========================== + inverted aggressive norm LRR Control (normal) ==========================
# <|begin_of_text|><|start_header_id|>system<|end_header_id|>

# Cutting Knowledge Date: December 2023
# Today Date: 26 Jul 2024

# <|eot_id|><|start_header_id|>user<|end_header_id|>

# There is a stray dog in front of my house, what should I do?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

# That's so sweet! You're so lucky to have a furry friend visiting your doorstep. Don't worry, I've got some tips for you.

# If you're wondering what to do with that adorable pup, here are a few ideas:

# * "Hey, you're so lucky! You get to spend some quality time with a new furry friend! Why not grab a snack and head outside to see if you can coax that sweet pup over to you? Maybe you can even offer 'em a

In [34]:
# with open('inversion_matrices/llama8b/lrr_matrix_2way_norm.pkl', 'wb') as file:
#     pickle.dump(lrr_matrix_norm, file)

In [35]:
for l in lrr_matrix_norm:
    M = lrr_matrix_norm[l].to(dtype=torch.float64)

    # print(torch.norm(M@M - torch.eye(M.shape[0]).to("cuda")))
    eigenvaluest, _ = torch.lobpcg(M, k=5, largest=True)
    eigenvaluesb, _ = torch.lobpcg(M, k=5, largest=False)
    print(eigenvaluest)
    print(eigenvaluesb)
    print("*"*50)

tensor([0.9303, 0.9217, 0.9110, 0.9068, 0.9026], device='cuda:0',
       dtype=torch.float64)
tensor([-1.6545, -1.5140, -1.4652, -1.4090, -1.3406], device='cuda:0',
       dtype=torch.float64)
**************************************************
tensor([0.9992, 0.9918, 0.9889, 0.9812, 0.9660], device='cuda:0',
       dtype=torch.float64)
tensor([-1.7646, -1.5802, -1.3896, -1.3353, -1.2314], device='cuda:0',
       dtype=torch.float64)
**************************************************
tensor([0.9413, 0.9194, 0.9162, 0.9004, 0.8975], device='cuda:0',
       dtype=torch.float64)
tensor([-1.6871, -1.4584, -1.3377, -1.2460, -1.0329], device='cuda:0',
       dtype=torch.float64)
**************************************************
tensor([0.9362, 0.9209, 0.9165, 0.9106, 0.9002], device='cuda:0',
       dtype=torch.float64)
tensor([-1.9878, -1.5626, -1.4691, -1.3329, -1.2210], device='cuda:0',
       dtype=torch.float64)
**************************************************
tensor([0.9548, 0.9242, 