# Do probe training

Load the activations and labels from HF, aggregate them, and construct the datasets used to train the probe (note that sklearn probes don't require a validation dataset, but the torch probe fitted below is given one).

In [2]:
import probe_gen.probes as probes
from sklearn.metrics import classification_report
from probe_gen.config import ConfigDict

# Build the train / val / test splits from the layer-12 activations
activations_tensor, attention_mask, labels_tensor = (
    probes.load_hf_activations_and_labels_at_layer(
        "refusal_llama_3b_prompted_5k",
        layer=12,
        verbose=True,
    )
)
# Aggregate the activations with MeanAggregation, passing the attention mask alongside them
activations_tensor = probes.MeanAggregation()(
    activations_tensor,
    attention_mask,
)
# Split with val_size=0.1 and test_size=0.2; the remainder is used for training
train_dataset, val_dataset, test_dataset = probes.create_activation_datasets(
    activations_tensor,
    labels_tensor,
    val_size=0.1,
    test_size=0.2,
    balance=True,
    verbose=True,
)

loaded labels
loaded activations with shape torch.Size([5000, 336, 3072])
calculated attention mask with shape torch.Size([5000, 336])
Train: 3500 samples, 1750.0 positives
Val:   500 samples, 250.0 positives
Test:  1000 samples, 500.0 positives


Create a probe and fit it.

In [3]:
# Probe hyperparameters, followed by construction and fitting of the linear probe
cfg = ConfigDict(use_bias=True, normalize=True, lr=0.0001, weight_decay=0.0)
probe = probes.TorchLinearProbe(cfg)
# The validation split is handed to fit() together with the training split
probe.fit(
    train_dataset,
    val_dataset,
)

# Report the ROC AUC obtained on the validation split
eval_dict, y_pred, y_pred_proba = probe.eval(
    val_dataset
)
print('\nroc_auc:', eval_dict['roc_auc'])


Epoch 10/100, Train Loss: 0.3574, Val Loss: 0.3458
Epoch 20/100, Train Loss: 0.3214, Val Loss: 0.3363

roc_auc: 0.9227200000000001


Evaluate the probe on the test dataset.

In [4]:
# Score the fitted probe on the held-out test split
eval_dict, y_pred, y_pred_proba = probe.eval(
    test_dataset
)
# Summary metrics first, then the per-class precision / recall table
print(eval_dict)
print(
    classification_report(
        test_dataset['y'],
        y_pred,
    )
)

{'accuracy': 0.837, 'roc_auc': 0.89418, 'tpr_at_1_fpr': np.float64(0.148)}
              precision    recall  f1-score   support

         0.0       0.88      0.78      0.83       500
         1.0       0.80      0.89      0.85       500

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000



In [5]:
# Load a separate test dataset ("refusal_llama_3b_1k") at the same layer
activations_tensor, attention_mask, labels_tensor = (
    probes.load_hf_activations_and_labels_at_layer(
        "refusal_llama_3b_1k",
        layer=12,
        verbose=True,
    )
)
activations_tensor = probes.MeanAggregation()(
    activations_tensor,
    attention_mask,
)
# val_size=0.0 and test_size=1.0 put every sample in the test split
_, _, test_dataset = probes.create_activation_datasets(
    activations_tensor,
    labels_tensor,
    val_size=0.0,
    test_size=1.0,
    balance=True,
    verbose=True,
)

# Evaluate the already-fitted probe on this dataset
eval_dict, y_pred, y_pred_proba = probe.eval(
    test_dataset
)
print(eval_dict)
print(
    classification_report(
        test_dataset['y'],
        y_pred,
    )
)

loaded labels
loaded activations with shape torch.Size([1000, 249, 3072])
calculated attention mask with shape torch.Size([1000, 249])
Train: 0 samples, 0.0 positives
Val:   0 samples, 0.0 positives
Test:  1000 samples, 500.0 positives
{'accuracy': 0.85, 'roc_auc': 0.9112439999999999, 'tpr_at_1_fpr': np.float64(0.078)}
              precision    recall  f1-score   support

         0.0       0.82      0.89      0.86       500
         1.0       0.88      0.81      0.84       500

    accuracy                           0.85      1000
   macro avg       0.85      0.85      0.85      1000
weighted avg       0.85      0.85      0.85      1000



In [6]:
# Load a separate test dataset ("refusal_llama_3b_prompted_1k") at the same layer
activations_tensor, attention_mask, labels_tensor = (
    probes.load_hf_activations_and_labels_at_layer(
        "refusal_llama_3b_prompted_1k",
        layer=12,
        verbose=True,
    )
)
activations_tensor = probes.MeanAggregation()(
    activations_tensor,
    attention_mask,
)
# val_size=0.0 and test_size=1.0 put every sample in the test split
_, _, test_dataset = probes.create_activation_datasets(
    activations_tensor,
    labels_tensor,
    val_size=0.0,
    test_size=1.0,
    balance=True,
    verbose=True,
)

# Evaluate the already-fitted probe on this dataset
eval_dict, y_pred, y_pred_proba = probe.eval(
    test_dataset
)
print(eval_dict)
print(
    classification_report(
        test_dataset['y'],
        y_pred,
    )
)

loaded labels
loaded activations with shape torch.Size([1000, 249, 3072])
calculated attention mask with shape torch.Size([1000, 249])
Train: 0 samples, 0.0 positives
Val:   0 samples, 0.0 positives
Test:  1000 samples, 500.0 positives
{'accuracy': 0.875, 'roc_auc': 0.925292, 'tpr_at_1_fpr': np.float64(0.196)}
              precision    recall  f1-score   support

         0.0       0.90      0.84      0.87       500
         1.0       0.85      0.91      0.88       500

    accuracy                           0.88      1000
   macro avg       0.88      0.88      0.87      1000
weighted avg       0.88      0.88      0.87      1000



In [7]:
# Load a separate test dataset ("refusal_ministral_8b_1k") at the same layer
activations_tensor, attention_mask, labels_tensor = (
    probes.load_hf_activations_and_labels_at_layer(
        "refusal_ministral_8b_1k",
        layer=12,
        verbose=True,
    )
)
activations_tensor = probes.MeanAggregation()(
    activations_tensor,
    attention_mask,
)
# val_size=0.0 and test_size=1.0 put every sample in the test split
_, _, test_dataset = probes.create_activation_datasets(
    activations_tensor,
    labels_tensor,
    val_size=0.0,
    test_size=1.0,
    balance=True,
    verbose=True,
)

# Evaluate the already-fitted probe on this dataset
eval_dict, y_pred, y_pred_proba = probe.eval(
    test_dataset
)
print(eval_dict)
print(
    classification_report(
        test_dataset['y'],
        y_pred,
    )
)

loaded labels
loaded activations with shape torch.Size([1000, 248, 3072])
calculated attention mask with shape torch.Size([1000, 248])
Train: 0 samples, 0.0 positives
Val:   0 samples, 0.0 positives
Test:  1000 samples, 500.0 positives
{'accuracy': 0.815, 'roc_auc': 0.908852, 'tpr_at_1_fpr': np.float64(0.22)}
              precision    recall  f1-score   support

         0.0       0.78      0.88      0.83       500
         1.0       0.87      0.75      0.80       500

    accuracy                           0.81      1000
   macro avg       0.82      0.81      0.81      1000
weighted avg       0.82      0.81      0.81      1000



# Hyperparameter Search

In [None]:
from probe_gen.standard_experiments.hyperparameter_search import run_full_hyp_search_on_layers

# Searching every layer in a single call may not be feasible,
# so the layers can be passed in batches as shown here
run_full_hyp_search_on_layers(
    'mean_torch',
    'refusal_ministral_8b_5k',
    'llama_3b',
    [6, 9, 12, 15, 18, 21],
)


######################### Evaluating layer 6 #############################


In [None]:
from probe_gen.standard_experiments.hyperparameter_search import load_best_params_from_search

# The best parameters found by the search can be loaded at any time
load_best_params_from_search(
    'mean_torch',
    'refusal_ministral_8b_5k',
    'llama_3b',
    [6, 9, 12, 15, 18, 21],
)

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33madr-skapars[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Best roc_auc: 0.853504
Best params: {'layer': 12, 'lr': 0.0001, 'use_bias': True, 'normalize': True, 'weight_decay': 0.0}
