In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local neuron_explainer package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Expose the OpenAI API key stored in ~/.openai_api_key through the
# OPENAI_API_KEY environment variable. This is done before the
# neuron_explainer imports below -- presumably so the key is already set when
# those modules are loaded (TODO confirm whether they read it at import time).
with open(os.path.join(os.path.expanduser("~"), ".openai_api_key"), "r") as api_key_file:
    # strip() rather than [:-1]: slicing unconditionally drops the last
    # character, which truncates the key when the file has no trailing newline
    # and leaves a stray "\r" behind for CRLF-terminated files.
    os.environ["OPENAI_API_KEY"] = api_key_file.read().strip()

from neuron_explainer.activations.activation_records import calculate_max_activation
from neuron_explainer.explanations.explainer import TokenActivationPairExplainer, SummaryExplainer, HighlightExplainer, HighlightSummaryExplainer
from neuron_explainer.explanations.prompt_builder import PromptFormat
from neuron_explainer.explanations.puzzles import PUZZLES_BY_NAME


# Name of the model used to generate explanations; passed as `model_name`
# when an explainer is constructed in the cells below.
EXPLAINER_MODEL_NAME = "gpt-3.5-turbo"

In [3]:
## Original baseline using TokenActivationPairExplainer (kept commented out for reference)
# explainer = TokenActivationPairExplainer(
#     model_name=EXPLAINER_MODEL_NAME,
#     prompt_format=PromptFormat.HARMONY_V4,
#     max_concurrent=1,
# )

# for puzzle_name, puzzle in PUZZLES_BY_NAME.items():
#     print(f"{puzzle_name=}")
#     puzzle_answer = puzzle.explanation
#     # Generate an explanation for the puzzle.
#     explanations = await explainer.generate_explanations(
#         all_activation_records=puzzle.activation_records,
#         max_activation=calculate_max_activation(puzzle.activation_records),
#         num_samples=1,
#     )
#     assert len(explanations) == 1
#     model_generated_explanation = explanations[0]
#     print(f"{model_generated_explanation=}")
#     print(f"{puzzle_answer=}\n")



In [4]:
explainer = HighlightSummaryExplainer(
    model_name=EXPLAINER_MODEL_NAME,
    prompt_format=PromptFormat.HARMONY_V4,
    max_concurrent=1,
)

for puzzle_name, puzzle in PUZZLES_BY_NAME.items():
    print(f"{puzzle_name=}")
    puzzle_answer = puzzle.explanation
    # Generate an explanation for the puzzle.
    explanations = await explainer.generate_explanations(
        all_activation_records=puzzle.activation_records,
        cutoff=1,
        num_samples=1,
    )
    assert len(explanations) == 1
    model_generated_explanation = explanations[0]
    print(f"{model_generated_explanation=}")
    print(f"{puzzle_answer=}\n")

puzzle_name='colors'
model_generated_explanation='words related to colors.'
puzzle_answer='words related to colors and nothing else'

puzzle_name='char2'
model_generated_explanation='names of people. Specifically, the neuron is detecting repeated mentions of the same name within a short span of text.'
puzzle_answer='the name of the second named character introduced in the passage, excluding the narrator'

puzzle_name='similes'
model_generated_explanation='similes and metaphors.'
puzzle_answer='phrases that are similes and nothing else'

puzzle_name='idioms'
model_generated_explanation='idiomatic expressions containing nature-related words or phrases such as salt, bush, cake, confection, shrubbery, projectile, and hedge.'
puzzle_answer="the substitution of a key word in a common idiom that isn't the usual choice for that idiom"

puzzle_name='feet'
model_generated_explanation='verbs related to movement or action.'
puzzle_answer='words describing motion that uses feet (e.g. "running", "wa