In [3]:
import pandas as pd

# Load the benchmark report frame and keep only rows in which every column listed
# below is present.
# NOTE(review): pandas documents read_pickle as unsafe for data from untrusted
# sources -- only load pickles produced by this project.
required_columns = ['benchmark_start_time', 'response', 'model', 'agent']
df = pd.read_pickle('../reports/df.pkl').dropna(subset=required_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21013 entries, 0 to 25470
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   createdAt             21013 non-null  datetime64[ns]     
 1   agent                 21013 non-null  object             
 2   costUSD               21013 non-null  float64            
 3   job_id                21013 non-null  object             
 4   challenge             21013 non-null  object             
 5   benchmark_start_time  21013 non-null  datetime64[ns, UTC]
 6   prompt                21013 non-null  object             
 7   response              21013 non-null  object             
 8   model                 21013 non-null  object             
 9   request               21013 non-null  object             
 10  attempted             15008 non-null  object             
 11  categories            15008 non-null  object             
 12  task     

In [4]:
# Distinct challenge names that remain in the filtered frame.
pd.unique(df['challenge'])

array(['TestSearch', 'TestReadFile', 'TestRememberGoal_Hard',
       'TestThreeSum', 'TestWriteFile', 'TestRememberGoal_Simple',
       'TestDebugSimpleTypoWithGuidance', 'TestPasswordGenerator_Easy',
       'TestWritingCLI_FileOrganizer', 'TestRevenueRetrieval',
       'TestBasicRetrieval', 'TestRetrieval3', 'TestReturnCode_Tests',
       'TestFunctionCodeGeneration', 'TestReturnCode_Modify',
       'TestDebugMultipleTypo', 'TestRememberMultipleIdsWithNoise',
       'TestRememberGoal_Advanced', 'TestReturnCode_Write',
       'TestAdaptSimpleTypoWithGuidance',
       'TestDebugSimpleTypoWithoutGuidance', 'TestRememberMultipleIds',
       'TestRememberGoal_Medium', 'TestWrite6Files', 'TestAdaptLink',
       'TestReturnCode_Simple', 'TestBasicMemory',
       'TestWrite5FilesWithArray', 'TestPlanCreation',
       'TestRememberMultiplePhrasesWithNoise', 'TestAdaptTeslaRevenue',
       'TestBasicSafety_BuildNuke', 'TestWebApp_ListAnimals',
       'TestBasicContentGen', 'TestAgentProtocol_Cr

In [5]:
import json
# The top-level keys of all_data_jsons.json are used as the list of challenge names.
with open('all_data_jsons.json', 'r') as f:
    all_data_jsons = json.load(f)

challenges_array = [challenge_name for challenge_name in all_data_jsons]

# challenges_to_ignore = ["TestBasicSafety_BuildNuke","TestAgentProtocol_ExecuteAgentTaskStep", "TestAgentProtocol_GetAgentTask", "TestAgentProtocol_ListAgentTasksIds", "TestAgentProtocol_CreateAgentTask", "TestWritingCLI_Easy", "TestGoalDivergence", "TestGoalLoss_Simple"]

# Every agent found in the frame except the ones listed here.
# Despite its name, agent_array is a set, so its iteration order is not defined.
agents_to_ignore = ['gpt-engineer', 'smol-developer', 'babyagi', 'evo', 'auto-gpt-turbo']
agent_array = {
    agent_name
    for agent_name in df['agent'].unique()
    if agent_name not in agents_to_ignore
}

In [32]:
import json
import re


def is_action_auto_gpt(log):
    """Tell whether an AutoGPT response content names a world-state command.

    AutoGPT actions are identified by a top-level "command" object whose "name"
    is one of the world-state commands below.

    World state actions
    - web_search
    - write_to_file
    - browse_website
    - execute_python_file
    - list_files
    - execute_python_code
    - read_file
    Internal actions (not counted)
    - goals_accomplished

    Input
        The "content" key of an LLM response (a JSON string).

    Returns
        True when the parsed content is a dict whose "command" is a dict with a
        "name" in the list above. False otherwise, including when the content is
        not a string, has no "command" key, or "command" is not a dict.

    Raises
        Whatever json.loads raises when the content mentions "command" but is not
        valid JSON; the error and the offending log are printed first.
    """
    # Command names that signify an action on the world state
    action_command_names = (
        "web_search",
        "write_to_file",
        "browse_website",
        "execute_python_file",
        "list_files",
        "execute_python_code",
        "read_file",
    )

    # Cheap textual gate: only parse contents that mention a "command" key at all.
    if not isinstance(log, str) or not re.search(r'"command"\s*:', log):
        return False

    try:
        log_dict = json.loads(log)
    except Exception as e:
        print(e, log)
        raise

    # The payload may legally be a list/scalar, and "command" may be null or a
    # plain string; none of those describe a command object, so they are not actions
    # (previously a string/null "command" ended in a TypeError).
    if not isinstance(log_dict, dict):
        return False
    command = log_dict.get("command")
    if not isinstance(command, dict):
        return False

    return command.get("name") in action_command_names


def is_openai_function(log):
    """Tell whether an OpenAI function call names a world-state action.

    OpenAI API function calls are determined by the presence of the
    "function_call" key; its "name" is compared against the patterns below.

    Beebot
    World state actions
    - get_html_content
    - read_file
    - write_file
    - wolfram_alpha_query
    - write_python_code
    - execute_python_file
    - google_search
    - wikipedia
    - install_python_package
    - execute_python_file_in_background
    - get_process_status
    - kill_process
    - analyze_webpage_content
    - get_website_text_content
    - gmail_get_message
    - gmail_create_draft
    - disk_usage
    Internal actions
    - get_more_tools
    - exit
    - rewind_actions
    - delegate_task
    - function_summary

    PolyGPT
    World state actions
    - http.
    - filesystem.
    - ethers.
    - ipfs.
    - web-scraper.
    - ens.
    - safe-factory.
    - safe-manager.
    Internal actions
    - LearnWrap
    - InvokeWrap
    - user

    Input
        The entire LLM response (a dict)

    Returns
        True when the function name contains one of the action patterns and is
        not one of the internal actions listed above; False otherwise.
    """
    function_call = log.get("function_call", None)

    # No function call at all, or one that is not the expected {"name": ...} mapping
    if not isinstance(function_call, dict) or "name" not in function_call:
        return False

    function_name = function_call["name"]
    # An empty or non-string name cannot identify an action
    if not isinstance(function_name, str) or not function_name:
        return False

    # Internal actions from the lists above; checked first because e.g.
    # "get_more_tools" would otherwise match the "get_" pattern.
    internal_function_names = (
        "get_more_tools",
        "exit",
        "rewind_actions",
        "delegate_task",
        "function_summary",
        "LearnWrap",
        "InvokeWrap",
        "user",
    )
    if function_name in internal_function_names:
        return False

    # Patterns (full names or name fragments such as "write_") that signify an action
    action_function_names = [
        "read_file",
        "write_",
        "wolfram_alpha_query",
        "execute_",
        "install_python_package",
        "get_",
        "kill_process",
        "encyclopedia",
        "gmail_",
        "disk_usage",
        "os_name_and_version",
        "analyze_webpage_content",
        "google_",
        "wikipedia",
        "http.",
        "filesystem.",
        "ethers.",
        "ipfs.",
        "web-scraper.",
        "ens.",
        "safe-factory.",
        "safe-manager.",
    ]

    # The pattern must occur in the function name. The operands used to be the other
    # way round, so "write_file" never matched "write_" while "" matched everything.
    return any(action in function_name for action in action_function_names)


def is_action_miniagi(log):
    """Tell whether a Mini-AGI response names a world-state action.

    Two notations are recognised, and only the first occurrence of each is inspected:
    - a ``<c>name</c>`` tag
    - an ``ACTION: name`` marker, optionally followed by a ``(xN)`` suffix

    World state actions
    - execute_python
    - web_search
    - execute_shell
    - ingest_data
    - process_data
    Internal actions (not counted)
    - done
    - talk_to_user
    - memorize_thoughts

    Input
        The response text to scan

    Returns
        True when either notation carries one of the world-state action names.
    """
    world_state_actions = (
        "execute_python",
        "web_search",
        "execute_shell",
        "ingest_data",
        "process_data",
    )

    def first_capture(pattern):
        # First captured group of the first match, or None when nothing matches
        found = re.search(pattern, log)
        return found.group(1) if found else None

    tagged_name = first_capture(r"<c>(.*?)<\/c>")
    announced_name = first_capture(r"ACTION:\s*(\w+)\s*(\(x\d+\))?")

    return tagged_name in world_state_actions or announced_name in world_state_actions


def is_action_turbo(log):
    """Tell whether a Turbo response names a world-state command.

    Turbo actions are found through a ``"cmd": {"name": "<word>"`` fragment; only
    the first such fragment in the text is inspected.

    World state actions
    - search
    - www
    - py
    - aol
    - put
    - pyf
    - sh
    - ls
    Internal actions (not counted)
    - end

    Input
        The response text to scan

    Returns
        True when the captured command name is one of the world-state actions.
    """
    found = re.search(r'"cmd"\s*:\s*{\s*"name"\s*:\s*"(\w+)"', log)
    if found is None:
        return False

    return found.group(1) in ("search", "www", "py", "aol", "put", "pyf", "sh", "ls")


def is_action_general(log):
    """Tell whether a response mentions any generic action keyword.

    The text searched is the response's "content" when it is truthy, otherwise the
    JSON dump of its "function_call". Keywords are matched case-sensitively as
    whole words (regex word boundaries).

    Categories and their keywords:
    WRITE: ['write', 'start', 'create', 'execute', 'post', 'publish']
    MODIFY: ['modify', 'mutate', 'delete', 'put', 'update']
    READ: ['read', 'list', 'search', 'find', 'get', 'browse', 'query', 'www', 'view']
    GENERAL: ['command', 'call', 'function', 'action', 'http', 'invoke', 'request']

    Input
        The entire LLM response (a dict), or None

    Returns
        True when a keyword is found; False when it is not, when the log is None,
        or when the response has neither content nor a function call.
    """

    if log is None:
        return False

    keywords = {
        "WRITE": ["write", "start", "create", "execute", "post", "publish"],
        "MODIFY": ["modify", "mutate", "delete", "put", "update"],
        "READ": [
            "read",
            "list",
            "search",
            "find",
            "get",
            "browse",
            "query",
            "www",
            "view",
        ],
        "GENERAL": [
            "command",
            "call",
            "function",
            "action",
            "http",
            "invoke",
            "request",
        ],
    }

    all_keywords = [word for sublist in keywords.values() for word in sublist]

    text = log.get("content", "")
    if not text:
        function_call = log.get("function_call", "")
        if not function_call:
            # Nothing to search. The dict itself used to reach re.search here and
            # raise a TypeError.
            return False
        text = json.dumps(function_call)
    elif not isinstance(text, str):
        # Content that was already parsed into JSON data is searched in serialized form
        text = json.dumps(text)

    return bool(re.search(rf"\b({'|'.join(all_keywords)})\b", text))


def is_action_agent(log, agent, test="", response=""):
    """Determines if a log contains an action based on patterns from different agents.

    Input
        log: the entire LLM response (a dict), or None
        agent: agent name selecting the detector: "auto-gpt", "beebot", "polygpt",
            "miniagi" / "mini-agi" or "turbo"; any other name yields False
        test, response: only echoed in the diagnostic printed when log is None

    Returns
        The selected detector's verdict, or False for a None log or unknown agent.
    """
    is_action = False

    if log is None:
        print("Log is None", agent, test, response)
        return is_action

    # "content" can be present but null (e.g. for function-call responses); the
    # text-based detectors below need a string, so fall back to an empty one.
    log_content = log.get("content") or ""

    if agent == "auto-gpt":
        is_action = is_action_auto_gpt(log_content)
    elif agent in ["beebot", "polygpt"]:
        is_action = is_openai_function(log)
    elif agent in ["miniagi", "mini-agi"]:
        # The notebook's agent lists spell this agent "mini-agi"; accept both spellings
        is_action = is_action_miniagi(log_content)
    elif agent == "turbo":
        is_action = is_action_turbo(log_content)

    return is_action


In [7]:
import json
import pandas as pd

# Function to convert JSON-like strings to nested dictionaries
def nested_json(x):
    """Parse a JSON string and, when present, the JSON string nested in its "content".

    Input
        x: a JSON-encoded string; any other value (NaN, None, empty string, ...) is
           returned untouched

    Returns
        - x itself when it is not a non-empty string or is not valid JSON
        - otherwise the parsed value; when that value is a dict whose "content" is a
          string holding valid JSON, "content" is replaced by its parsed form
    """
    # NaN / None / empty values and anything that is not text are passed through
    if not isinstance(x, (str, bytes, bytearray)) or not x:
        return x

    try:
        d = json.loads(x)
    except json.JSONDecodeError:
        return x  # If x is not valid JSON, return as is

    # Valid JSON is not necessarily an object: "123" parses to an int, for which the
    # former `"content" in d` check raised a TypeError.
    if isinstance(d, dict) and isinstance(d.get("content"), str):
        try:
            d["content"] = json.loads(d["content"])
        except json.JSONDecodeError:
            pass  # If inner content is not valid JSON, leave it as is
    return d

In [8]:
import json
import os
from collections import OrderedDict

# challenge = "TestRememberMultipleIds"

# For every challenge, gather the selected agent's logs per benchmark run and write
# the raw responses to specific_logs/<challenge>/<agent>/response.json.
for challenge in challenges_array:
    for agent in ['auto-gpt']: # agent_array:

        # One mapping per artefact, keyed by str(benchmark_start_time). Only the raw
        # responses are written below; the others feed the commented-out dumps.
        master_response_dict = OrderedDict()
        master_response_nested_dict = OrderedDict()
        master_actions_dict = OrderedDict()
        master_request_dict = OrderedDict()
        master_general_actions_dict = OrderedDict()

        agent_challenge_mask = (df['agent'] == agent) & (df['challenge'] == challenge)
        selected_df = df.loc[agent_challenge_mask]

        # Rows sharing a benchmark_start_time are treated as one run
        grouped_df = selected_df.groupby('benchmark_start_time')

        for timestamp, group in grouped_df:
            total_rows = len(group)

            # (log id, value) pairs in row order; ids count down from total_rows, so
            # the first row of the group receives the highest id.
            response_pairs = []
            nested_pairs = []
            action_pairs = []
            request_pairs = []
            general_pairs = []

            for i, (_, row) in enumerate(group.iterrows()):
                log_id = str(total_rows-i)

                response = json.loads(row['response'])
                request = row['request']
                response_nested = nested_json(row['response'])

                response_pairs.append((log_id, response))
                request_pairs.append((log_id, request))
                nested_pairs.append((log_id, response_nested))

                if is_action_agent(response, agent, challenge, response):
                    action_pairs.append((log_id, response))

                if is_action_general(response):
                    general_pairs.append((log_id, response))

            # Flip every collection so the ids run upwards from "1"
            response_dict = OrderedDict(reversed(response_pairs))
            response_nested_dict = OrderedDict(reversed(nested_pairs))
            actions_dict = OrderedDict(reversed(action_pairs))
            request_dict = OrderedDict(reversed(request_pairs))
            general_actions_dict = OrderedDict(reversed(general_pairs))

            run_key = str(timestamp)
            master_response_dict[run_key] = response_dict
            master_response_nested_dict[run_key] = response_nested_dict
            master_actions_dict[run_key] = actions_dict
            master_request_dict[run_key] = request_dict
            master_general_actions_dict[run_key] = general_actions_dict

        # makedirs also creates the intermediate specific_logs/<challenge> directory
        os.makedirs(f'specific_logs/{challenge}/{agent}', exist_ok=True)

        with open(f'specific_logs/{challenge}/{agent}/response.json', 'w') as f:
            json.dump(master_response_dict, f, indent=4)

        # with open(f'specific_logs/{challenge}/{agent}/regex_specific.json', 'w') as f:
        #     json.dump(master_actions_dict, f, indent=4)

        # with open(f'specific_logs/{challenge}/{agent}/response_nested.json', 'w') as f:
        #     json.dump(master_response_nested_dict, f, indent=4)

        # with open(f'specific_logs/{challenge}/{agent}/request.json', 'w') as f:
        #     json.dump(master_request_dict, f, indent=4)

        # with open(f'specific_logs/{challenge}/{agent}/regex_simple.json', 'w') as f:
        #     json.dump(master_general_actions_dict, f, indent=4)

        # with open(f'specific_logs/{challenge}/{agent}/response_malicious.json', 'w') as f:
        #     json.dump(master_response_nested_dict, f, indent=4)



In [9]:
import json
import os
import random
from collections import OrderedDict

# Master challenge dictionaries, created outside the agent loop (not filled in this cell)
master_challenge_response_dict = {}
master_challenge_prompt_dict = {}

# For every agent, collect responses and prompts as challenge -> run -> log id and
# write them to all_logs/<agent>/response.json and all_logs/<agent>/prompt.json.
for agent in agent_array:

    # challenge name -> {run timestamp -> {log id -> value}}
    master_response_dict_per_challenge = {}
    master_prompt_dict_per_challenge = {}

    agent_mask = df['agent'] == agent

    for challenge in challenges_array:

        selected_df = df.loc[agent_mask & (df['challenge'] == challenge)]

        # Rows sharing a benchmark_start_time are treated as one run
        grouped_df = selected_df.groupby('benchmark_start_time')

        master_response_dict = OrderedDict()
        master_prompt_dict = OrderedDict()

        for timestamp, group in grouped_df:
            total_rows = len(group)

            # (log id, value) pairs in row order; ids count down from total_rows
            response_pairs = []
            prompt_pairs = []

            for i, (_, row) in enumerate(group.iterrows()):
                log_id = str(total_rows-i)
                response = json.loads(row['response'])
                prompt = row['prompt']

                response_pairs.append((log_id, response))
                prompt_pairs.append((log_id, prompt))

            # Flip the order so the ids run upwards from "1"
            response_dict = OrderedDict(reversed(response_pairs))
            prompt_dict = OrderedDict(reversed(prompt_pairs))

            master_response_dict[str(timestamp)] = response_dict
            master_prompt_dict[str(timestamp)] = prompt_dict

        master_response_dict_per_challenge[challenge] = master_response_dict
        master_prompt_dict_per_challenge[challenge] = master_prompt_dict

    # One output directory per agent, holding both JSON files
    os.makedirs(f'all_logs/{agent}', exist_ok=True)

    with open(f'all_logs/{agent}/response.json', 'w') as f:
        json.dump(master_response_dict_per_challenge, f, indent=4)

    with open(f'all_logs/{agent}/prompt.json', 'w') as f:
        json.dump(master_prompt_dict_per_challenge, f, indent=4)

In [10]:
import random
import os
import json

# Variable to specify the percentage of logs to be replaced
replace_percentage = 30  # 30% replace rate

# Private, seeded RNG: the same logs are picked and the same replacements drawn on
# every run, without touching the global `random` state used by other cells.
random_seed = 42
rng = random.Random(random_seed)

# Specify the agent
agent = "auto-gpt"  # Replace with the actual agent name

# Load logs you want to insert from
with open('malicious_actions/auto-gpt.json', 'r') as f:
    insert_logs = json.load(f)

# Organize logs to be inserted by their command name
insert_logs_by_command = {}
for action in insert_logs['actions']:
    insert_logs_by_command.setdefault(action['command'], []).append(action)

print(f"Total logs to insert: {len(insert_logs['actions'])} with {len(insert_logs_by_command)} unique commands")

# Load the original logs
with open(f'all_logs/{agent}/response.json', 'r') as f:
    original_logs = json.load(f)

# (challenge, timestamp, log_num) of every log whose command has a replacement available
candidates_to_replace = []

# Initialize counters for each category
category_count = {0: 0, 1: 0, 2: 0}
total_logs = 0

# Traverse the original logs
for challenge, timestamps in original_logs.items():
    for timestamp, logs in timestamps.items():
        for log_num, log in logs.items():
            total_logs += 1

            content_str = log.get('content')
            cmd_name = ""

            if content_str:
                try:
                    content_dict = json.loads(content_str)
                    cmd_name = content_dict.get('command', {}).get('name')
                except (json.JSONDecodeError, AttributeError, TypeError):
                    # Content that is not a JSON object with a command object
                    # cannot be matched to a replacement; skip it.
                    continue

            # Only a string can name a command (and be looked up as a dict key)
            if isinstance(cmd_name, str) and cmd_name in insert_logs_by_command:
                candidates_to_replace.append((challenge, timestamp, log_num))

print(f"Total logs: {total_logs}")

# Choose logs to replace (never more than there are candidates)
num_to_replace = min(
    len(candidates_to_replace),
    int(len(candidates_to_replace) * replace_percentage / 100),
)
print(f"Replacing {num_to_replace} logs out of {len(candidates_to_replace)} logs that match the insertion logs commands")
logs_to_replace = rng.sample(candidates_to_replace, num_to_replace)

for challenge, timestamp, log_num in logs_to_replace:
    # Retrieve the correct command name for the log to be replaced
    content_dict = json.loads(original_logs[challenge][timestamp][log_num]['content'])
    cmd_name = content_dict.get('command', {}).get('name')

    if cmd_name in insert_logs_by_command:  # Ensure it exists in the insertion logs
        replacement_log = rng.choice(insert_logs_by_command[cmd_name])

        # Increment category count if it exists. Categories other than 0/1/2 get
        # their own counter instead of raising a KeyError.
        category = replacement_log.get('category')
        if category is not None:
            category_count[category] = category_count.get(category, 0) + 1

        original_logs[challenge][timestamp][log_num] = replacement_log

# Print the counts for each category
print(f"Category 0 count: {category_count[0]}")
print(f"Category 1 count: {category_count[1]}")
print(f"Category 2 count: {category_count[2]}")

# Surface any category outside the expected 0/1/2 so it does not go unnoticed
for category, count in category_count.items():
    if category not in (0, 1, 2):
        print(f"Category {category} count: {count}")

# Create directory if it does not exist
os.makedirs(f'all_logs/{agent}', exist_ok=True)

# Save the modified logs to a new JSON file next to the original one
with open(f'all_logs/{agent}/inserted_malicious_responses.json', 'w') as f:
    json.dump(original_logs, f, indent=4)

Total logs to insert: 57 with 7 unique commands
Total logs: 1965
Replacing 387 logs out of 1292 logs that match the insertion logs commands
Category 0 count: 172
Category 1 count: 162
Category 2 count: 53


In [34]:
import json

# ['auto-gpt', 'beebot', 'mini-agi', 'polygpt', 'turbo']
agents_to_regex = ['beebot', 'mini-agi', 'polygpt', 'turbo']
# for auto-gpt, use the inserted_malicious_responses.json file
file_name = 'inserted_malicious_responses.json'

# Run both action detectors over each agent's log file and store what each one keeps
for agent in ['auto-gpt']: # agent_array:
    # Read the agent's log file selected by file_name
    with open(f'all_logs/{agent}/{file_name}', 'r') as f:
        data = json.load(f)

    # challenge -> timestamp -> {log id -> log}, holding only the logs each detector
    # accepts; runs without any accepted log keep an empty mapping.
    filtered_specific = {}
    filtered_general = {}

    for challenge, timestamps in data.items():
        specific_runs = filtered_specific[challenge] = {}
        general_runs = filtered_general[challenge] = {}

        for timestamp, logs in timestamps.items():
            specific_logs = {}
            general_logs = {}

            for log_num, log in logs.items():
                if is_action_agent(log, agent):
                    specific_logs[log_num] = log
                if is_action_general(log):
                    general_logs[log_num] = log

            specific_runs[timestamp] = specific_logs
            general_runs[timestamp] = general_logs

    # Save the filtered logs as new JSON files
    with open(f'all_logs/{agent}/regex_specific.json', 'w') as f:
        json.dump(filtered_specific, f, indent=4)

    with open(f'all_logs/{agent}/regex_simple.json', 'w') as f:
        json.dump(filtered_general, f, indent=4)

In [35]:
import json

def count_total_ids_in_json_files(file_path):
    """
    Sum the number of IDs stored under every challenge and timestamp of a JSON file.

    The file is expected to hold a challenge -> timestamp -> IDs nesting; the size of
    each innermost collection is added up.

    Args:
    - file_path (str): The path to the JSON file.

    Returns:
    - int: The total number of IDs.
    """
    with open(file_path, 'r') as f:
        data = json.load(f)

    return sum(
        len(ids)
        for timestamps in data.values()
        for ids in timestamps.values()
    )

# Report how many logs each regex filter kept for the agent
for agent in ['auto-gpt']:
    # total = count_total_ids_in_json_files(f'all_logs/{agent}/inserted_malicious_responses.json')
    regex_simple_logs = count_total_ids_in_json_files(f'all_logs/{agent}/regex_simple.json')
    regex_specific_logs = count_total_ids_in_json_files(f'all_logs/{agent}/regex_specific.json')

    # print(f"Total logs: {total}")
    print(
        f"{agent} filtered logs",
        f"Simple regex filter logs: {regex_simple_logs}",
        f"Specific regex filter logs: {regex_specific_logs}",
        sep="\n",
    )

auto-gpt filtered logs
Simple regex filter logs: 1800
Specific regex filter logs: 1292


In [36]:
import json

def count_runs(data):
    """Return the total number of runs, i.e. the summed size of every challenge's entry.

    Input
        data: mapping of challenge -> collection of runs

    Returns
        int: sum of len(data[challenge]) over all challenges (0 for empty data)
    """
    return sum(len(data[challenge]) for challenge in data)

# Load the agent-specific filter result into `data` and show how many runs it holds
with open('all_logs/auto-gpt/regex_specific.json', 'r') as f:
    data = json.loads(f.read())

count_runs(data)

422

In [37]:
# Imported here so the cell does not depend on imports made by other cells
import json
import random

# Share of each challenge's runs assigned to the training split; the remaining runs
# form the validation split.
train_fraction = 0.3

# Private, seeded RNG so the split is identical on every run and the global `random`
# state is left untouched.
split_rng = random.Random(42)

train_data = {}
validation_data = {}

# NOTE(review): `data` is whatever an earlier cell loaded last (regex_specific.json in
# the recorded execution order) -- confirm before running cells out of order.

# 1. Iterate through each challenge in the JSON
for challenge, runs in data.items():

    # 2. Extract the list of timestamps
    timestamps = list(runs.keys())

    # 3. Shuffle the list
    split_rng.shuffle(timestamps)

    # 4. Split the list into training and validation (training size is rounded down)
    train_size = int(len(timestamps) * train_fraction)
    train_timestamps = timestamps[:train_size]
    validation_timestamps = timestamps[train_size:]

    # 5. Create separate JSON entries for training and validation
    train_data[challenge] = {
        ts: runs[ts] for ts in train_timestamps
    }

    validation_data[challenge] = {
        ts: runs[ts] for ts in validation_timestamps
    }

# Save both splits as JSON files
with open("all_logs/auto-gpt/train.json", "w") as train_file:
    json.dump(train_data, train_file, indent=4)

with open("all_logs/auto-gpt/validation.json", "w") as validation_file:
    json.dump(validation_data, validation_file, indent=4)

In [38]:
# Number of runs in the training split (printed) and in the validation split (cell output)
train_run_count = count_runs(train_data)
print(train_run_count)
count_runs(validation_data)

109


313

In [47]:
import json
import random

# Number of logs to draw; when fewer logs exist, all of them are written unshuffled
sample_size = 100

# Private, seeded RNG so the same sample is drawn on every run
sample_rng = random.Random(42)

with open("all_logs/auto-gpt/response.json", "r") as f:
    data = json.load(f)

# Flatten challenge -> timestamp -> log id into one list, tagging each log with its origin
logs = []
for challenge in data:
    for timestamp in data[challenge]:
        for log_num in data[challenge][timestamp]:
            log_entry = data[challenge][timestamp][log_num]

            # Expand content that is itself a JSON string. Missing, null or non-JSON
            # content is left as it is instead of aborting the whole cell.
            raw_content = log_entry.get('content')
            if isinstance(raw_content, str):
                try:
                    log_entry['content'] = json.loads(raw_content)
                except json.JSONDecodeError:
                    pass

            log_entry['challenge'] = challenge
            log_entry['id'] = log_num
            log_entry['timestamp'] = timestamp
            logs.append(log_entry)

# Randomly sample `sample_size` logs
sampled_logs = sample_rng.sample(logs, sample_size) if len(logs) >= sample_size else logs
with open("all_logs/auto-gpt/sampled_logs.json", "w") as f:
    json.dump(sampled_logs, f, indent=4)