In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to library code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Load the OpenAI API key from ~/.openai_api_key into the process environment.
# The file is read inside a `with` block so the handle is closed promptly, and
# .strip() removes surrounding whitespace (a trailing "\n" or "\r\n") instead of
# blindly dropping the last character, which would truncate the key whenever the
# file has no final newline.
# NOTE(review): this runs before the neuron_explainer imports; keep that order in
# case the library reads the key at import time (not verified from this cell).
with open(os.path.join(os.path.expanduser("~"), ".openai_api_key"), "r") as key_file:
    os.environ["OPENAI_API_KEY"] = key_file.read().strip()

from neuron_explainer.activations.activation_records import calculate_max_activation
from neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron
from neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator
from neuron_explainer.explanations.explainer import TokenActivationPairExplainer, SummaryExplainer, HighlightExplainer, HighlightSummaryExplainer
from neuron_explainer.explanations.prompt_builder import PromptFormat

# Model identifiers. EXPLAINER_MODEL_NAME is passed to every explainer built in
# this cell; SIMULATOR_MODEL_NAME is not referenced by the code shown here.
EXPLAINER_MODEL_NAME = "gpt-3.5-turbo"
SIMULATOR_MODEL_NAME = "text-davinci-003"

# Fetch the stored record for the neuron under study.
# NOTE(review): the arguments are presumably (layer index, neuron index) --
# confirm against load_neuron's signature.
neuron_record = load_neuron(19, 1377)

# Threshold that is later forwarded as `cutoff=` to the cutoff-based explainers.
cutoff = neuron_record.quantile_boundaries[2]
print(f"Cutoff:{cutoff:.3f}\n")

# Slice the train and validation activation records with the same settings.
slice_params = ActivationRecordSliceParams(n_examples_per_split=5)
# The training records are what the explainers in this cell consume.
train_activation_records = neuron_record.train_activation_records(activation_record_slice_params=slice_params)
# The validation records are loaded here but not used by the cells shown.
valid_activation_records = neuron_record.valid_activation_records(activation_record_slice_params=slice_params)

all_explanations = dict()

### Original
explainer = TokenActivationPairExplainer(
    model_name=EXPLAINER_MODEL_NAME,
    prompt_format=PromptFormat.HARMONY_V4,
    max_concurrent=1,
)
explanations = await explainer.generate_explanations(
    all_activation_records=train_activation_records,
    max_activation=calculate_max_activation(train_activation_records),
    num_samples=1,
)
assert len(explanations) == 1
all_explanations["Original"] = explanations[0]


explainers = dict()

explainers["Summary"] = SummaryExplainer(
    model_name=EXPLAINER_MODEL_NAME,
    prompt_format=PromptFormat.HARMONY_V4,
    max_concurrent=1,
)
explainers["Highlight"] = HighlightExplainer(
    model_name=EXPLAINER_MODEL_NAME,
    prompt_format=PromptFormat.HARMONY_V4,
    max_concurrent=1,
)
explainers["HighlightSummary"] = HighlightSummaryExplainer(
    model_name=EXPLAINER_MODEL_NAME,
    prompt_format=PromptFormat.HARMONY_V4,
    max_concurrent=1,
)

for key in explainers:
    explanations = await explainers[key].generate_explanations(
        all_activation_records=train_activation_records,
        cutoff=cutoff,
        num_samples=1,
    )
    assert len(explanations) == 1
    all_explanations[key] = explanations[0]

Cutoff:0.515



In [3]:
# Print every collected explanation under its mode name, in insertion order.
for key, explanation in all_explanations.items():
    print(f"Mode:{key} - explanation: \n{explanation}\n")

Mode:Original - explanation: 
metaphorical language related to mixing, blending, or joining two things or concepts.

Mode:Summary - explanation: 
examples of comparisons or analogies using words such as "like" or "as".

Mode:Highlight - explanation: 
comparisons using similes.

Mode:HighlightSummary - explanation: 
phrases that use comparisons or analogies, often using the word "like".

