# Set up api and dataset

In [1]:
!pip install goodfire==0.2.37



In [2]:
try:
  from google.colab import userdata

  # Add your Goodfire API Key to your Colab secrets
  GOODFIRE_API_KEY = userdata.get('GOODFIRE_API_KEY')
except:
  keyfile = open("../goodfire_apikey.txt", "r")
  GOODFIRE_API_KEY = keyfile.readline()

import goodfire
client = goodfire.Client(
    GOODFIRE_API_KEY
  )

# Instantiate a model variant
variant_small = goodfire.Variant("meta-llama/Meta-Llama-3-8B-Instruct")
variant_large = goodfire.Variant("meta-llama/Meta-Llama-3.1-70B-Instruct")

In [3]:
# first try to install datasets
!pip install datasets



In [4]:
# try tqdm (it doesnt seem to work for some reason)
!pip install tqdm
from tqdm.notebook import tqdm
import time

for i in tqdm(range(100)):
    time.sleep(0.01)



  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# next try to download the triva dataset
from datasets import load_dataset

ds = load_dataset("google-research-datasets/natural_questions", "default")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

# Run the on question suppression

run the on question suppression for each model at each nudge level (will need to see what combination of nudge + num_features produces the best results)

In [None]:
prompt = "You are an AI assitant answering trivia questions. Give a single concise answer."
nudge_values = [-0.1, -0.25, -0.5, -0.9] # need to check these are reasonable - MUST BE NEGATIVE IF THEY ARE ANSWER SUPPRESORS
num_features = 3
timeout = 0.5

for nudge_amount in nudge_values:
    num_questions = 0
    num_correct = 0
    for index, sample in enumerate(tqdm(ds["train"])):
        sample_question =  sample["question"]["text"]
        sample_answers = sample["annotations"]["short_answers"][0]["text"]
        #if index == 6:
        #    print("skipping norway question")
        #    continue

        # need to skip malformed questions (they seem to be gibberish like 212 year baseball carreer)
        if len(sample_answers[0]) > 50:
            continue

        # find the features accociated with the corrct answer
        # note: this could be changed to a differnt method like inspect
        nudged_features, relevance = client.features.search(
            sample_answers[0], # should probably be random or something else
            model=variant_small,
            top_k=num_features
        )

        # now set the features
        variant_small.reset()
        variant_small.set(nudged_features[0:num_features], nudge_amount, mode="nudge")

        # now get the model response
        #print(variant_small.json())
        response = client.chat.completions.create(
            [
                {"role": "system", "content": prompt},
                {"role": "user", "content": sample_question}
            ],
            model=variant_small,
            stream=False,
            max_completion_tokens=50,
        )

        given_answer = response.choices[0].message["content"].lower()

        #given_answer = ""
        #for token in response:
        #    given_answer += token.choices[0].delta.content
        #given_answer = given_answer.lower()

        for answer in sample_answers:
            if answer in given_answer:
                num_correct += 1
                break

        # need a seprerate counter for question cos some are malformed
        num_questions += 1

        if num_questions > 300:
            break

        # make sure not to spam the api
        time.sleep(timeout)

    print(f"nudge value:{nudge_amount}")
    print(num_correct)
    print(num_questions)
    print(num_correct/num_questions)


  0%|          | 0/138384 [00:00<?, ?it/s]

{'base_model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'fastmodel_config': [{'feature_id': '3d19e693-f332-405b-856c-eff22116831e', 'feature_label': 'Grace, bouncing, and bracelets', 'max_activation_strength': 2.609375, 'index_in_sae': 4423, 'mode': 'nudge', 'value': -0.9}, {'feature_id': 'c9fe3388-5939-4886-9ab2-9c4785c2b9c5', 'feature_label': 'Christian theological concept of divine grace', 'max_activation_strength': 4.09375, 'index_in_sae': 18014, 'mode': 'nudge', 'value': -0.9}, {'feature_id': 'aca5f68a-fd2c-44cf-a74d-45ba2cff1644', 'feature_label': 'Graceful or seductive feminine movements', 'max_activation_strength': 4.5, 'index_in_sae': 46912, 'mode': 'nudge', 'value': -0.9}]}
{'base_model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'fastmodel_config': [{'feature_id': 'bffcc889-de32-4877-9a84-79b3848fff87', 'feature_label': 'UK place names and location suffixes', 'max_activation_strength': 4.4375, 'index_in_sae': 39249, 'mode': 'nudge', 'value': -0.9}, {'feature_id': '334ae082-7f3d-4

In [7]:
print(sample_answers)
print(sample_question)
print(sample)

['norvell', 'norvell disambiguation']
What was Oliver Hardy's real first name?
{'question': "What was Oliver Hardy's real first name?", 'question_id': 'tc_582', 'question_source': 'http://www.triviacountry.com/', 'entity_pages': {'doc_source': ['TagMe'], 'filename': ['Oliver_Hardy.txt'], 'title': ['Oliver Hardy'], 'wiki_context': ['Oliver Norvell "Babe" Hardy (born Norvell Hardy) (January 18, 1892 – August 7, 1957) was an American comic actor famous as one half of Laurel and Hardy, the classic double act that began in the era of silent films and lasted 25 years, from 1927 to 1951. He was credited with his first film, Outwitting Dad, in 1914. In some of his early works, he was billed as Babe Hardy, using his nickname.\n\nEarly life and education\n\nOliver Hardy was born Norvell Hardy in Harlem, Georgia. His father Oliver  was a Confederate veteran who was wounded at the Battle of Antietam on September 17, 1862. After his demobilization as a recruiting officer for Company K, 16th Georgia

In [None]:
nudge_values = [-0.1, -0.25, -0.5, -0.9] # need to check these are reasonable
num_features = 3
timeout = 0.5

for nudge_amount in nudge_values:
    num_questions = 0
    num_correct = 0
    for index, sample in enumerate(tqdm(ds["train"])):
        sample_question =  sample["question"]["text"]
        sample_answers = sample["annotations"]["short_answers"][0]["text"]

        # need to skip malformed questions (they seem to be gibberish like 212 year baseball carreer)
        if len(sample_answers[0]) > 50:
            continue

        # find the features accociated with the corrct answer
        # note: this could be changed to a differnt method like inspect
        nudged_features, relevance = client.features.search(
            sample_answers[0], # should probably be random or something else
            model=variant_large,
            top_k=num_features
        )

        # now set the features
        variant_large.reset()
        variant_large.set(nudged_features[0:num_features], nudge_amount, mode="nudge")

        # now get the model response
        response = client.chat.completions.create(
            [
                {"role": "system", "content": prompt},
                {"role": "user", "content": sample_question}
            ],
            model=variant_large,
            stream=False,
            max_completion_tokens=50,
        )

        given_answer = response.choices[0].message["content"].lower()

        #given_answer = ""
        #for token in response:
        #    given_answer += token.choices[0].delta.content
        #given_answer = given_answer.lower()

        for answer in sample_answers:
            if answer in given_answer:
                num_correct += 1
                break

        num_questions += 1

        if num_questions > 300:
            break

        # make sure not to spam the api
        time.sleep(timeout)

    print(f"nudge value:{nudge_amount}")
    print(num_correct)
    print(num_questions)
    print(num_correct/num_questions)


  0%|          | 0/138384 [00:00<?, ?it/s]

nudge value:-0.1
219
301
0.7275747508305648


  0%|          | 0/138384 [00:00<?, ?it/s]

nudge value:-0.25
188
301
0.6245847176079734


  0%|          | 0/138384 [00:00<?, ?it/s]

nudge value:-0.5
108
301
0.3588039867109635


  0%|          | 0/138384 [00:00<?, ?it/s]

nudge value:-0.9
22
301
0.07308970099667775
