## Imports

In [None]:
import os 
import time
import csv
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Root directory from which the data file and the results folders are resolved.
# The literal below is a placeholder: replace it with a real path before running.
home = Path('...ADD_PATH...')

## Load harmful prompts (and shuffle)

In [None]:
# Location of the behaviours CSV under `home`; it is read with pandas in the next cell.
harmful_behaviour_data = home/'data/advbench_subset.csv'

In [None]:
# Read the behaviours CSV, add two success-flag columns initialised to 0, and
# expose the original row number as an index named 'behaviour_id'.
df_behaviour = (
    pd.read_csv(harmful_behaviour_data)
    .assign(attack_success_CRDHPW23=0, attack_success_this_work_tot=0)
    .reset_index()
    .rename(columns={'index': 'behaviour_id'})
    .set_index('behaviour_id')
)

In [None]:
# Seed NumPy's global RNG so that the shuffle below is reproducible; sample()
# is called without random_state, so it relies on this global state.
np.random.seed(1234)
# frac=1 returns every row in random order; reset_index(drop=True) then
# renumbers the rows 0..n-1 and discards the previous index.
# NOTE(review): the discarded index is the 'behaviour_id' set when the CSV was
# loaded, so the original CSV row number is not kept after this line — confirm
# that this is intended.
df_behaviour = df_behaviour.sample(frac=1).reset_index(drop=True)

In [None]:
# Display the first rows of the frame as a quick visual check.
df_behaviour.head()

# Create Results Folder

In [None]:
# Date-stamped results directory, kept as a relative string with a trailing
# slash because it is reused later when per-run paths are assembled.
results_folder = f'simulation_results/simulation_results_{datetime.today().strftime("%Y_%m_%d")}/'
# Create the directory with pathlib rather than the IPython `!mkdir -p` shell
# escape, so the cell also runs outside IPython and on systems without a POSIX
# `mkdir`. parents=True / exist_ok=True reproduce the `-p` semantics.
Path(results_folder).mkdir(parents=True, exist_ok=True)

## Implementation of Tree of Attacks with Pruning (TAP)

In [None]:
@dataclass
class TAP:
    """Parameters of one Tree of Attacks with Pruning (TAP) run.

    `run_simulation` forwards each field to a `main_TAP.py` command-line flag
    and embeds the three values in the name of the results folder.
    """
    # Forwarded as --branching-factor.
    # NOTE(review): presumably the number of children generated per node — confirm in main_TAP.py.
    branch_factor: int = 4
    # Forwarded as --depth.
    # NOTE(review): presumably the maximum number of tree levels/iterations — confirm in main_TAP.py.
    depth: int = 10
    # Forwarded as --width.
    # NOTE(review): presumably the number of nodes kept per level after pruning — confirm in main_TAP.py.
    width: int = 10

In [None]:
def run_simulation(target='llama-2',
                   evaluator='no-evaluator',
                   attacker="vicuna-api-model",
                   attack=None,
                   prompt_ids=range(10)):
    """Run ``main_TAP.py`` once for each selected row of ``df_behaviour``.

    For every position ``i`` in ``prompt_ids`` the 'goal' and 'target' values
    of ``df_behaviour.iloc[i]`` are passed to ``python3.10 main_TAP.py``
    together with the model names and the attack parameters. The combined
    stdout/stderr of each run is appended to a log file inside the results
    folder of this configuration.

    Reads the notebook globals ``home``, ``results_folder`` and
    ``df_behaviour``.

    Args:
        target: Value for ``--target-model``.
        evaluator: Value for ``--evaluator-model``.
        attacker: Value for ``--attack-model``.
        attack: Object exposing ``branch_factor``, ``depth`` and ``width``
            (e.g. a ``TAP`` instance). ``None`` selects ``TAP()`` with its
            default values.
        prompt_ids: Iterable of positional row indices into ``df_behaviour``.

    Returns:
        None. A failing subprocess does not raise; its output is in the log.
    """
    # Local import: the notebook's import cell does not provide subprocess.
    import subprocess

    # The original default of None crashed on the first attribute access.
    if attack is None:
        attack = TAP()

    # One timestamp per call, so all logs of a batch share the same suffix.
    run_stamp = datetime.now().strftime("%Y_%m_%d_at_%H_%M")

    relative_folder = (f'target_{target}/evaluator_{evaluator}/'
                       f'params_b{attack.branch_factor}_'
                       f't{attack.depth}_'
                       f'w{attack.width}')

    # Create the directory at the same home-anchored location that is handed
    # to --store-folder and used for the logs. Creating it relative to the
    # current working directory only matches when cwd happens to be `home`.
    store_path = home / results_folder / relative_folder
    store_path.mkdir(parents=True, exist_ok=True)
    folder = str(store_path)

    # Iterate prompt_ids directly so tqdm can show a total when it has a length.
    for i in tqdm(prompt_ids):
        row = df_behaviour.iloc[i]
        goal = row['goal']
        target_str = row['target']

        # An argv list (no shell) keeps quotes, `$`, backticks etc. in the CSV
        # text from being interpreted by a shell. The literal double quotes
        # around goal/target_str are kept on purpose: the previous shell
        # command passed them through to main_TAP.py as part of the value.
        command = [
            'python3.10', 'main_TAP.py',
            '--attack-model', str(attacker),
            '--target-model', str(target),
            '--evaluator-model', str(evaluator),
            '--goal', f'"{goal}"',
            '--target-str', f'"{target_str}"',
            '--branching-factor', str(attack.branch_factor),
            '--width', str(attack.width),
            '--depth', str(attack.depth),
            '--store-folder', folder,
            '--iter-index', str(i),
        ]

        log_path = store_path / f'iter_{i}_datetime_{run_stamp}'
        # Append mode mirrors the previous `>> file 2>&1` redirection.
        with open(log_path, 'ab') as log_file:
            # No check=True: as before, one failed run must not abort the batch.
            subprocess.run(command, stdout=log_file, stderr=subprocess.STDOUT)

## Run Simulations

In [None]:
# Attack parameters shared by every run below (identical to the TAP defaults).
attack = TAP(branch_factor=4, depth=10, width=10)

### Evaluator: GPT-4, Attacker: Vicuna-13B

In [None]:
# Evaluator and attacker model names reused by the run_simulation calls of this section.
evaluator = 'gpt-4'
attacker = "vicuna-api-model"

#### Target: Vicuna-13B

In [None]:
# Run positions 0-49 of the shuffled frame against the 'vicuna-api-model' target.
run_simulation(target='vicuna-api-model', 
               evaluator=evaluator,
               attacker=attacker, 
               attack=attack, 
               prompt_ids=range(50))

#### Target: Llama-2-Chat-7B

In [None]:
# Run positions 0-49 of the shuffled frame against the 'llama-2-api-model' target.
run_simulation(target='llama-2-api-model', 
               evaluator=evaluator,
               attacker=attacker, 
               attack=attack, 
               prompt_ids=range(50))

#### Target: GPT-3.5-Turbo 

In [None]:
# Run positions 0-49 of the shuffled frame against the 'gpt-3.5-turbo' target.
run_simulation(target='gpt-3.5-turbo',
               evaluator=evaluator,
               attacker=attacker, 
               attack=attack, 
               prompt_ids=range(50))

#### Target: GPT-4-Turbo 

In [None]:
# Run positions 0-49 of the shuffled frame against the 'gpt-4-1106-preview' target.
run_simulation(target='gpt-4-1106-preview', 
               evaluator=evaluator,
               attacker=attacker, 
               attack=attack, 
               prompt_ids=range(50))

#### Target: PaLM-2

In [None]:
# Run positions 0-49 of the shuffled frame against the 'palm-2' target.
run_simulation(target='palm-2', 
               evaluator=evaluator,
               attacker=attacker, 
               attack=attack, 
               prompt_ids=range(50))