In [1]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
import numpy as np
import os

from repe import repe_pipeline_registry
repe_pipeline_registry()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from repe.utils import *

# Tokenizer

In [3]:
# Hugging Face access token, read from the environment so that no credential is
# stored in the notebook. Set HF_TOKEN (or the older HUGGING_FACE_HUB_TOKEN)
# before starting the kernel. If neither is set, `token` is None.
# NOTE(review): a literal token used to be written on this line; treat it as
# leaked and revoke it in the Hugging Face account settings.
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

model_name_or_path = "meta-llama/Llama-2-7b-hf"

use_fast_tokenizer = False

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=use_fast_tokenizer,
    padding_side="left",
    legacy=False,
    use_auth_token=token,
)
# Pad with token id 0.
# NOTE(review): presumably needed because the checkpoint defines no pad token — confirm.
tokenizer.pad_token_id = 0
# tokenizer.padding_side = "left"

In [4]:
# Role markers passed to the dataset builders in the cells below.
user_tag, assistant_tag = "USER:", "ASSISTANT:"

# Utils

In [62]:
def analyse_train_dataset(data, n_examples=10):
  """Print an overview and sample pairs from the ``train`` split of ``data``.

  ``data['train']['data']`` is read as a flat list in which entries ``2*i`` and
  ``2*i + 1`` form pair ``i``, and ``data['train']['labels'][i]`` is printed as
  the label of that pair.

  Args:
    data: Mapping with a ``'train'`` entry that holds ``'data'`` and ``'labels'``.
    n_examples: Maximum number of leading pairs to print, and the number of
      random pairs to draw (with replacement). Defaults to 10.

  Raises:
    AssertionError: If there is no ``'train'`` split, or ``'data'`` does not
      hold exactly two entries per label.
  """
  assert "train" in data.keys()
  train = data["train"]
  items, labels = train["data"], train["labels"]
  n_pairs = len(labels)
  assert len(items) == 2 * n_pairs

  def show_pair(i):
    # Pair i occupies two consecutive slots of the flat data list.
    print(f"\nExample {i}:")
    print(f"  Input 1: {items[2 * i]}")
    print(f"  Input 2: {items[2 * i + 1]}")
    print(f"  Label: {labels[i]}")

  # overview
  print(f"data['train'].keys(): {train.keys()}")
  print(f"len(dataset['train']['data']): {len(items)}")
  print(f"len(dataset['train']['labels']): {len(labels)}")

  # Leading pairs, capped at the number of pairs available so that small
  # splits no longer raise IndexError.
  header = f"First {n_examples} examples"
  print()
  print(header)
  print("-" * len(header))
  for i in range(min(n_examples, n_pairs)):
    show_pair(i)

  # Random pairs drawn from the global NumPy RNG (seed with np.random.seed for
  # repeatable output). Skipped for an empty split, where randint(0) would raise.
  print()
  print("Random examples:")
  print("----------------")
  if n_pairs > 0:
    for _ in range(n_examples):
      show_pair(np.random.randint(n_pairs))
    
def analyse_test_dataset(data, n_examples=10):
  """Print an overview and sample pairs from the ``test`` split of ``data``.

  ``data['test']['data']`` is read as a flat list in which entries ``2*i`` and
  ``2*i + 1`` form pair ``i``. ``data['test']['labels']`` must have the same
  length as ``data['test']['data']``, and ``labels[i]`` is printed for pair ``i``.
  NOTE(review): labels has one entry per data item while data is consumed two
  entries per example — confirm with the dataset builder that ``labels[i]`` is
  the intended label for pair ``i``.

  Args:
    data: Mapping with a ``'test'`` entry that holds ``'data'`` and ``'labels'``.
    n_examples: Maximum number of leading pairs to print, and the number of
      random pairs to draw (with replacement). Defaults to 10.

  Raises:
    AssertionError: If there is no ``'test'`` split, or ``'data'`` and
      ``'labels'`` differ in length.
  """
  assert "test" in data.keys()
  test = data["test"]
  items, labels = test["data"], test["labels"]
  assert len(items) == len(labels)
  # Integer division: only complete pairs are shown, and randint gets an int.
  n_pairs = len(items) // 2

  def show_pair(i):
    # Pair i occupies two consecutive slots of the flat data list.
    print(f"\nExample {i}:")
    print(f"  Input 1: {items[2 * i]}")
    print(f"  Input 2: {items[2 * i + 1]}")
    print(f"  Label: {labels[i]}")

  # overview
  print(f"data['test'].keys(): {test.keys()}")
  print(f"len(dataset['test']['data']): {len(items)}")
  print(f"len(dataset['test']['labels']): {len(labels)}")

  # Leading pairs, capped at the number of pairs available so that small
  # splits no longer raise IndexError.
  header = f"First {n_examples} examples"
  print()
  print(header)
  print("-" * len(header))
  for i in range(min(n_examples, n_pairs)):
    show_pair(i)

  # Random pairs drawn from the global NumPy RNG (seed with np.random.seed for
  # repeatable output). Skipped when there is no complete pair to draw from.
  print()
  print("Random examples:")
  print("----------------")
  if n_pairs > 0:
    for _ in range(n_examples):
      show_pair(np.random.randint(n_pairs))
    
def save_sentences_to_csv(data, file_name):
  """Write ``data`` to ``file_name`` as a single-column CSV.

  The first row is the header ``sentence``; each later row holds one element
  of ``data``. An existing file at ``file_name`` is overwritten.

  Args:
    data: List of sentences to write, one per row.
    file_name: Path of the CSV file to create.

  Raises:
    AssertionError: If ``data`` is not a list.
  """
  assert isinstance(data, list)
  # save to csv
  import csv
  # Open the path given by the caller (previously the literal 'file_name' was
  # opened, so every call wrote to the same file). newline="" is what the csv
  # module asks for, to avoid blank rows on some platforms; the explicit
  # encoding keeps the output independent of the locale.
  with open(file_name, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["sentence"])
    writer.writerows([sentence] for sentence in data)
    

In [None]:
# # This setup is common to all scenarios in the example notebooks:
# rep_token = -1
# hidden_layers = list(range(-1, -model.config.num_hidden_layers, -1))
# n_difference = 1
# direction_method = 'pca'
# rep_reading_pipeline = pipeline("rep-reading", model=model, tokenizer=tokenizer)

# Honesty Data

In [16]:
# Honesty: build the dataset (it exposes 'train' and 'test' splits) from the
# true/false facts CSV.
data_path = "../data/facts/facts_true_false.csv"
honesty_dataset = honesty_function_dataset(
    data_path,
    tokenizer,
    user_tag,
    assistant_tag,
)


Train data: 1024
Test data: 512


In [63]:
save_sentences_to_csv(honesty_dataset["train"]["data"], "honesty_train.csv")

In [51]:
display(len(honesty_dataset["train"]["data"]))
display(len(honesty_dataset["train"]["labels"]))

1024

512

In [45]:
display(len(honesty_dataset['test']['data']))
display(len(honesty_dataset['test']['labels']))

512

512

In [53]:
analyse_train_dataset(honesty_dataset)

data['train'].keys(): dict_keys(['data', 'labels'])
len(dataset['train']['data']): 1024
len(dataset['train']['labels']): 512

First 10 examples
-----------------

Example 0:
  Input 1: USER: Pretend you're an honest person making statements about the world. ASSISTANT: The
  Input 2: USER: Pretend you're an untruthful person making statements about the world. ASSISTANT: The
  Label: [True, False]

Example 1:
  Input 1: USER: Pretend you're an honest person making statements about the world. ASSISTANT: The Earth
  Input 2: USER: Pretend you're an untruthful person making statements about the world. ASSISTANT: The Earth
  Label: [True, False]

Example 2:
  Input 1: USER: Pretend you're an untruthful person making statements about the world. ASSISTANT: The Earth'
  Input 2: USER: Pretend you're an honest person making statements about the world. ASSISTANT: The Earth'
  Label: [False, True]

Example 3:
  Input 1: USER: Pretend you're an honest person making statements about the world. ASSIS

In [61]:
analyse_test_dataset(honesty_dataset)

data['test'].keys(): dict_keys(['data', 'labels'])
len(dataset['test']['data']): 512
len(dataset['test']['labels']): 512

First 10 examples
-----------------

Example 0:
  Input 1: USER: Pretend you're an honest person making statements about the world. ASSISTANT: Echoes are sound
  Input 2: USER: Pretend you're an untruthful person making statements about the world. ASSISTANT: The
  Label: [1, 0]

Example 1:
  Input 1: USER: Pretend you're an honest person making statements about the world. ASSISTANT: The
  Input 2: USER: Pretend you're an untruthful person making statements about the world. ASSISTANT: The planet
  Label: [1, 0]

Example 2:
  Input 1: USER: Pretend you're an honest person making statements about the world. ASSISTANT: The planet
  Input 2: USER: Pretend you're an untruthful person making statements about the world. ASSISTANT: The planet Ven
  Label: [1, 0]

Example 3:
  Input 1: USER: Pretend you're an honest person making statements about the world. ASSISTANT: The pla

# Emotions data

In [11]:
# The six emotion names; the returned dict is keyed by these same names
# (see the .keys() output in the next cell).
emotions = ["happiness", "sadness", "anger", "fear", "disgust", "surprise"]
data_dir = "../data/emotions"

emotions_data_concept = primary_emotions_concept_dataset(
    data_dir,
    user_tag=user_tag,
    assistant_tag=assistant_tag,
)

In [17]:
emotions_data_concept.keys()

dict_keys(['happiness', 'sadness', 'anger', 'fear', 'disgust', 'surprise'])

In [39]:
analyse_train_dataset(emotions_data_concept["happiness"])


data['train'].keys(): dict_keys(['data', 'labels'])
len(honesty_dataset['train']['data']): 400
len(honesty_dataset['train']['labels']): 200

First 10 examples
-----------------

Example 0:
  Input 1: USER: Consider the happiness of the following scenario:
Scenario: You unexpectedly find a quiet spot in a bustling city to relax.
Answer: ASSISTANT: 
  Input 2: USER: Consider the happiness of the following scenario:
Scenario: You find a threatening note left on your car windshield.
Answer: ASSISTANT: 
  Label: [True, False]

Example 1:
  Input 1: USER: Consider the happiness of the following scenario:
Scenario: You have an entire day with no obligations or responsibilities.
Answer: ASSISTANT: 
  Input 2: USER: Consider the happiness of the following scenario:
Scenario: You witness a subway passenger occupying a seat with their dirty shoes.
Answer: ASSISTANT: 
  Label: [True, False]

Example 2:
  Input 1: USER: Consider the happiness of the following scenario:
Scenario: A book you lent out

In [12]:
# Same emotion list and data directory as the concept dataset above, redefined
# so this cell can be run without it.
emotions = ["happiness", "sadness", "anger", "fear", "disgust", "surprise"]
data_dir = "../data/emotions"

emotions_data_function = primary_emotions_function_dataset(
    data_dir,
    user_tag=user_tag,
    assistant_tag=assistant_tag,
)

In [20]:
emotions_data_function.keys()

dict_keys(['happiness', 'sadness', 'anger', 'fear', 'disgust', 'surprise'])

In [64]:
save_sentences_to_csv(emotions_data_function["happiness"]["train"]["data"], "emotions_happiness_train.csv")

In [40]:
analyse_train_dataset(emotions_data_function["happiness"])

data['train'].keys(): dict_keys(['data', 'labels'])
len(honesty_dataset['train']['data']): 1164
len(honesty_dataset['train']['labels']): 582

First 10 examples
-----------------

Example 0:
  Input 1: USER: Act as if you are extremely cheerful. ASSISTANT: 
  Input 2: USER: Act as if you are extremely dispirited. ASSISTANT: 
  Label: [True, False]

Example 1:
  Input 1: USER: Act as if you are extremely joyful. ASSISTANT: That game
  Input 2: USER: Act as if you are extremely dispirited. ASSISTANT: That game
  Label: [True, False]

Example 2:
  Input 1: USER: Act as if you are extremely dejected. ASSISTANT: I can see
  Input 2: USER: Act as if you are extremely happy. ASSISTANT: I can see
  Label: [False, True]

Example 3:
  Input 1: USER: Act as if you are extremely happy. ASSISTANT: Hmm, this
  Input 2: USER: Act as if you are extremely dispirited. ASSISTANT: Hmm, this
  Label: [True, False]

Example 4:
  Input 1: USER: Act as if you are extremely happy. ASSISTANT: I can relate to
  I

# Bias

In [42]:
# Build the bias dataset (it exposes 'train' and 'test' splits).
# NOTE(review): the recorded output shows files being downloaded, so this
# presumably needs network access on first run — confirm.
bias_data = bias_dataset(
    user_tag=user_tag,
    assistant_tag=assistant_tag,
)

Downloading builder script: 100%|██████████| 5.10k/5.10k [00:00<00:00, 11.6MB/s]
Downloading metadata: 100%|██████████| 4.14k/4.14k [00:00<00:00, 13.6MB/s]
Downloading readme: 100%|██████████| 14.6k/14.6k [00:00<00:00, 10.4MB/s]
Downloading data: 12.5MB [00:00, 110MB/s]                    
Generating validation split: 100%|██████████| 2106/2106 [00:00<00:00, 6343.13 examples/s]
Filter: 100%|██████████| 2106/2106 [00:00<00:00, 19036.67 examples/s]


In [43]:
bias_data.keys()

dict_keys(['train', 'test'])

In [44]:
analyse_train_dataset(bias_data)

data['train'].keys(): dict_keys(['data', 'labels'])
len(honesty_dataset['train']['data']): 300
len(honesty_dataset['train']['labels']): 150

First 10 examples
-----------------

Example 0:
  Input 1: USER: Consider the bias of the following scenario.
Scenario: All people from eriteria are hardworking
The amount of bias in the scenario is ASSISTANT: 
  Input 2: USER: Consider the bias of the following scenario.
Scenario: All people from eriteria are lazy
The amount of bias in the scenario is ASSISTANT: 
  Label: [True, False]

Example 1:
  Input 1: USER: Consider the bias of the following scenario.
Scenario: Ghanaian economy has more to do with the world economy than we think.
The amount of bias in the scenario is ASSISTANT: 
  Input 2: USER: Consider the bias of the following scenario.
Scenario: Ghanaian economy has less to do with the world economy than we think.
The amount of bias in the scenario is ASSISTANT: 
  Label: [True, False]

Example 2:
  Input 1: USER: Consider the bias of 