In [3]:
import pandas as pd

# Load the benchmark log table and keep only rows that carry every field the
# analysis below relies on. The result is reassigned instead of mutated in
# place so the cell reads as one pipeline step.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by a trusted source.
df = pd.read_pickle('../reports/df.pkl').dropna(
    subset=['benchmark_start_time', 'response', 'model', 'agent']
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21013 entries, 0 to 25470
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   createdAt             21013 non-null  datetime64[ns]     
 1   agent                 21013 non-null  object             
 2   costUSD               21013 non-null  float64            
 3   job_id                21013 non-null  object             
 4   challenge             21013 non-null  object             
 5   benchmark_start_time  21013 non-null  datetime64[ns, UTC]
 6   prompt                21013 non-null  object             
 7   response              21013 non-null  object             
 8   model                 21013 non-null  object             
 9   request               21013 non-null  object             
 10  attempted             15008 non-null  object             
 11  categories            15008 non-null  object             
 12  task     

In [27]:
# Challenge names that are subtracted from the set of challenges found in the data.
challenges_to_ignore = [
    "TestBasicSafety_BuildNuke",
    "TestAgentProtocol_ExecuteAgentTaskStep",
    "TestAgentProtocol_GetAgentTask",
    "TestAgentProtocol_ListAgentTasksIds",
    "TestAgentProtocol_CreateAgentTask",
]

# Agent names that are subtracted from the set of agents found in the data.
agents_to_ignore = [
    'gpt-engineer',
    'smol-developer',
    'babyagi',
    'evo',
    'auto-gpt-turbo',
]

In [28]:
import json
import re


def is_action_auto_gpt(log):
    """Return True when an Auto-GPT response requests a world-state command.

    Auto-GPT replies are JSON documents; an action is a reply whose
    "command" object has a "name" found in the world-state list below.

    World state actions
    - web_search
    - write_to_file
    - browse_website
    - execute_python_file
    - list_files
    - execute_python_code
    - read_file
    Internal actions (not counted)
    - goals_accomplished

    Input
        The "content" key of an LLM response. Anything that is not a string,
        is not valid JSON, or has no well-formed "command" object yields False
        instead of raising.
    """
    # Content can be missing/None (or otherwise non-textual); nothing to inspect then.
    if not isinstance(log, str):
        return False

    # Cheap pre-filter: skip JSON parsing when the key cannot be present.
    if not re.search(r'"command"\s*:', log):
        return False

    # The text may merely mention "command": without being a JSON document.
    try:
        log_dict = json.loads(log)
    except json.JSONDecodeError:
        return False

    if not isinstance(log_dict, dict):
        return False

    # "command" must itself be an object to carry a "name".
    command = log_dict.get("command")
    if not isinstance(command, dict) or "name" not in command:
        return False

    # List of command names that signify an action
    action_command_names = [
        "web_search",
        "write_to_file",
        "browse_website",
        "execute_python_file",
        "list_files",
        "execute_python_code",
        "read_file",
    ]

    return command["name"] in action_command_names


def is_openai_function(log):
    """Return True when an OpenAI function call names a world-state action.

    OpenAI API function calls are determined by the presence of the
    "function_call" key. A call counts as an action when its "name" starts
    with one of the world-state prefixes below and is not one of the
    documented internal actions.

    Beebot
    World state actions
    - get_html_content
    - read_file
    - write_file
    - wolfram_alpha_query
    - write_python_code
    - execute_python_file
    - google_search
    - wikipedia
    - install_python_package
    - execute_python_file_in_background
    - get_process_status
    - kill_process
    - analyze_webpage_content
    - get_website_text_content
    - gmail_get_message
    - gmail_create_draft
    - disk_usage
    Internal actions
    - get_more_tools
    - exit
    - rewind_actions
    - delegate_task
    - function_summary

    PolyGPT
    World state actions
    - http.
    - filesystem.
    - ethers.
    - ipfs.
    - web-scraper.
    - ens.
    - safe-factory.
    - safe-manager.
    Internal actions
    - LearnWrap
    - InvokeWrap
    - user

    Input
        The entire LLM response (a dict). Non-dict input, a missing or empty
        "function_call", or a missing/non-string "name" yields False.
    """
    if not isinstance(log, dict):
        return False

    function_call = log.get("function_call", None)
    if not isinstance(function_call, dict) or "name" not in function_call:
        return False

    function_name = function_call["name"]
    if not isinstance(function_name, str) or not function_name:
        return False

    # Internal actions never count, even when they share a prefix with a
    # world-state family (get_more_tools would otherwise match "get_").
    internal_function_names = {
        "get_more_tools",
        "exit",
        "rewind_actions",
        "delegate_task",
        "function_summary",
        "LearnWrap",
        "InvokeWrap",
        "user",
    }
    if function_name in internal_function_names:
        return False

    # Name prefixes that signify an action; entries without a trailing
    # separator also match the exact name.
    action_function_prefixes = (
        "read_file",
        "write_",
        "wolfram_alpha_query",
        "execute_",
        "install_python_package",
        "get_",
        "kill_process",
        "encyclopedia",
        "gmail_",
        "disk_usage",
        "os_name_and_version",
        "analyze_webpage_content",
        "google_",
        "wikipedia",
        "http.",
        "filesystem.",
        "ethers.",
        "ipfs.",
        "web-scraper.",
        "ens.",
        "safe-factory.",
        "safe-manager.",
    )

    # The function name must begin with a known prefix. The previous check
    # tested whether the name was a substring of the prefix, which rejected
    # names such as "write_file" and accepted "" or "_".
    return function_name.startswith(action_function_prefixes)


def is_action_miniagi(log):
    """Tell whether a Mini-AGI response names a world-state action.

    Two notations are inspected, and only the first occurrence of each:
    - a ``<c>...</c>`` tag whose whole inner text is the action name
    - an ``ACTION: <name>`` marker, optionally followed by ``(xN)``

    World state actions
    - execute_python
    - web_search
    - execute_shell
    - ingest_data
    - process_data
    Internal actions (not counted)
    - done
    - talk_to_user
    - memorize_thoughts

    Input
        The "content" string of an LLM response
    """
    world_state_actions = (
        "execute_python",
        "web_search",
        "execute_shell",
        "ingest_data",
        "process_data",
    )

    # Run both searches up front; each yields a match object or None.
    tag_hit = re.search(r"<c>(.*?)<\/c>", log)
    label_hit = re.search(r"ACTION:\s*(\w+)\s*(\(x\d+\))?", log)

    named_in_tag = tag_hit is not None and tag_hit.group(1) in world_state_actions
    named_in_label = label_hit is not None and label_hit.group(1) in world_state_actions

    return named_in_tag or named_in_label


def is_action_turbo(log):
    """Tell whether a Turbo response issues a world-state command.

    The response is searched for the first ``"cmd": {"name": "<word>"``
    fragment; the captured word must be one of the accepted command names.

    World state actions
    - search
    - www
    - py
    - aol
    - put
    - pyf
    - sh
    - ls
    Internal actions (not counted)
    - end

    Input
        The "content" string of an LLM response
    """
    cmd_hit = re.search(r'"cmd"\s*:\s*{\s*"name"\s*:\s*"(\w+)"', log)

    # No "cmd" object with a name means there is nothing to classify.
    if cmd_hit is None:
        return False

    return cmd_hit.group(1) in {"search", "www", "py", "aol", "put", "pyf", "sh", "ls"}


def is_action_general(log):
    """Return True when a response contains one of the generic action keywords.

    General actions are defined by the presence of specific keywords such as
    'write', 'start', 'create', etc., matched as whole words.

    KEYWORDS FOUND SO FAR
    WRITE
    - write
    - start
    - create
    - execute
    - post
    MODIFY
    - modify
    - mutate
    - delete
    - put
    READ
    - read
    - list
    - search
    - find
    - get
    - browse
    - query
    - www
    GENERAL, no specificity
    - command
    - call
    - function
    - action
    - http
    Of the GENERAL group only 'http' is part of the pattern used below.

    Input
        The entire LLM response (a dict): its "content" is searched when it is
        non-empty, otherwise its JSON-serialised "function_call". A plain
        string is searched directly. None, or a dict with neither field
        populated, yields False.
    """
    if log is None:
        return False

    if isinstance(log, dict):
        if log.get("content", ""):
            text = log["content"]
        elif log.get("function_call", ""):
            text = json.dumps(log["function_call"])
        else:
            # Nothing searchable; previously this fell through and made
            # re.search raise TypeError on the dict itself.
            return False
    else:
        text = log

    # Content that is not text (for example an already-decoded object) is
    # serialised so the keyword search still applies.
    if not isinstance(text, str):
        text = json.dumps(text, default=str)

    # NOTE(review): "_" is a word character, so \b does not separate "read"
    # from "read_file"; snake_case names are not matched by this pattern.
    # Confirm that this is intended.
    return bool(
        re.search(
            r"\b(write|start|create|execute|post|modify|mutate|delete|put|search|find|get|browse|query|www|read|list|http)\b",
            text,
        )
    )


def is_action_agent(log, agent, test="", response=""):
    """Determines if a log contains an action based on patterns from different agents.

    Args:
        log: The entire LLM response (a dict), or None.
        agent: Agent name selecting the detector: "auto-gpt", "beebot",
            "polygpt", "miniagi" or "turbo". Any other name yields False.
        test: Label printed only in the diagnostic emitted for a None log.
        response: Value printed only in the diagnostic emitted for a None log.

    Returns:
        The selected detector's verdict, or False when the log is None or the
        agent has no detector.
    """
    is_action = False

    if log is None:
        print("Log is None", agent, test, response)
        return is_action

    # "content" may be present but null (e.g. a pure function-call reply);
    # normalise that to "" so the text-based detectors never receive None.
    log_content = log.get("content") or ""

    if agent == "auto-gpt":
        is_action = is_action_auto_gpt(log_content)
    elif agent in ["beebot", "polygpt"]:
        # Function-call agents are judged on the whole response, not its content.
        is_action = is_openai_function(log)
    elif agent == "miniagi":
        is_action = is_action_miniagi(log_content)
    elif agent == "turbo":
        is_action = is_action_turbo(log_content)

    return is_action


In [32]:
import json
import pandas as pd

# Function to convert JSON-like strings to nested dictionaries
def nested_json(x):
    """Decode a JSON string and, when possible, its embedded "content" JSON.

    A missing value (as judged by ``pd.notna``) is returned unchanged.
    Otherwise ``x`` is parsed as JSON; if the result has a string "content"
    entry that is itself valid JSON, that entry is replaced by its decoded
    value. Content that is not valid JSON is left as the original string.
    """
    if not pd.notna(x):
        return x

    parsed = json.loads(x)

    has_text_content = "content" in parsed and isinstance(parsed["content"], str)
    if not has_text_content:
        return parsed

    try:
        parsed["content"] = json.loads(parsed["content"])
    except json.JSONDecodeError:
        # Plain-text content: keep it verbatim.
        pass
    return parsed

# Agents and challenges present in the data, minus the ones explicitly ignored.
agent_array = set(df['agent'].unique()).difference(agents_to_ignore)
challenges_array = set(df['challenge'].unique()).difference(challenges_to_ignore)

In [26]:
import json
import os
from collections import OrderedDict

# Export, for one hard-coded challenge, every agent's logs grouped by benchmark
# run into JSON files under specific/<challenge>/<agent>/.
challenge = "TestRememberMultipleIds"

# Loop through unique agents
for agent in agent_array:
    
    # Per-agent containers, keyed by str(benchmark_start_time)
    master_response_dict = OrderedDict()
    master_response_nested_dict = OrderedDict()
    master_actions_dict = OrderedDict()
    master_request_dict = OrderedDict()
    master_general_actions_dict = OrderedDict()
    
    # Rows of this agent for the selected challenge
    selected_df = df.loc[(df['agent'] == agent) & (df['challenge'] == challenge)]

    # Group by 'benchmark_start_time'
    grouped_df = selected_df.groupby('benchmark_start_time')

    for timestamp, group in grouped_df:
        response_dict = OrderedDict()
        response_nested_dict = OrderedDict()
        actions_dict = OrderedDict()
        request_dict = OrderedDict()
        general_actions_dict = OrderedDict()
        
        total_rows = len(group)
        
        # Keys count down from total_rows to 1 in iteration order, so the first
        # row of the group gets the highest number.
        for i, (_, row) in enumerate(group.iterrows()):
            response = json.loads(row['response'])
            request = row['request']
            # Same response, with a JSON-encoded "content" string decoded as well
            response_nested = nested_json(row['response'])
            
            response_dict[str(total_rows-i)] = response
            request_dict[str(total_rows-i)] = request
            response_nested_dict[str(total_rows-i)] = response_nested
            
            # Agent-specific detector; the response is also passed as the value
            # printed in the "Log is None" diagnostic.
            if is_action_agent(response, agent, challenge, response):
                actions_dict[str(total_rows-i)] = response

            # Generic keyword detector
            if is_action_general(response):
                general_actions_dict[str(total_rows-i)] = response
        
        # Reverse the insertion order so the keys run upwards from "1"; the
        # last row iterated therefore comes first.
        response_dict = OrderedDict(reversed(list(response_dict.items())))
        response_nested_dict = OrderedDict(reversed(list(response_nested_dict.items())))
        actions_dict = OrderedDict(reversed(list(actions_dict.items())))
        request_dict = OrderedDict(reversed(list(request_dict.items())))
        general_actions_dict = OrderedDict(reversed(list(general_actions_dict.items())))
        
        master_response_dict[str(timestamp)] = response_dict
        master_response_nested_dict[str(timestamp)] = response_nested_dict
        master_actions_dict[str(timestamp)] = actions_dict
        master_request_dict[str(timestamp)] = request_dict
        master_general_actions_dict[str(timestamp)] = general_actions_dict
    
    # The second call creates the parent directory too, so the first is redundant.
    os.makedirs(f'specific/{challenge}', exist_ok=True)
    os.makedirs(f'specific/{challenge}/{agent}', exist_ok=True)
    
    with open(f'specific/{challenge}/{agent}/response.json', 'w') as f:
        json.dump(master_response_dict, f, indent=4)

    with open(f'specific/{challenge}/{agent}/regex_specific.json', 'w') as f:
        json.dump(master_actions_dict, f, indent=4)

    with open(f'specific/{challenge}/{agent}/response_nested.json', 'w') as f:
        json.dump(master_response_nested_dict, f, indent=4)
        
    with open(f'specific/{challenge}/{agent}/request.json', 'w') as f:
        json.dump(master_request_dict, f, indent=4)

    with open(f'specific/{challenge}/{agent}/regex_simple.json', 'w') as f:
        json.dump(master_general_actions_dict, f, indent=4)
        
    # NOTE(review): this writes the same dict as response_nested.json; it looks
    # like a starting copy meant to be edited afterwards - confirm.
    with open(f'specific/{challenge}/{agent}/response_malicious.json', 'w') as f:
        json.dump(master_response_nested_dict, f, indent=4)



In [30]:
import json
import os
import random
from collections import OrderedDict

# Export every agent's logs for all retained challenges into all_logs/<agent>/.
# response.json holds every response; regex_specific.json holds only the
# responses that the agent-specific detector flags as actions.

# Initialize the master challenge dictionaries outside the agent loop
# NOTE(review): these two dictionaries are never filled or read in this cell.
master_challenge_response_dict = {}
master_challenge_request_dict = {}

# Loop through unique agents
for agent in agent_array:

    # Initialize dictionaries to hold data per challenge
    master_response_dict_per_challenge = {}
    master_request_dict_per_challenge = {}
    # Previously never defined, which made the regex_specific.json dump below
    # fail with NameError.
    master_actions_dict_per_challenge = {}

    for challenge in challenges_array:

        # Per-challenge containers, keyed by str(benchmark_start_time)
        master_response_dict = OrderedDict()
        master_request_dict = OrderedDict()
        master_actions_dict = OrderedDict()

        # Rows of this agent for this challenge
        selected_df = df.loc[(df['agent'] == agent) & (df['challenge'] == challenge)]

        # Group by 'benchmark_start_time'
        grouped_df = selected_df.groupby('benchmark_start_time')

        for timestamp, group in grouped_df:
            response_dict = OrderedDict()
            request_dict = OrderedDict()
            actions_dict = OrderedDict()

            total_rows = len(group)

            # Keys count down from total_rows to 1 in iteration order.
            for i, (_, row) in enumerate(group.iterrows()):
                response = json.loads(row['response'])
                request = row['request']
                log_num = str(total_rows - i)

                response_dict[log_num] = response
                request_dict[log_num] = request

                # Agent-specific detector, as used for the single-challenge export
                if is_action_agent(response, agent, challenge, response):
                    actions_dict[log_num] = response

            # Reverse the insertion order so the keys run upwards from "1".
            response_dict = OrderedDict(reversed(list(response_dict.items())))
            request_dict = OrderedDict(reversed(list(request_dict.items())))
            actions_dict = OrderedDict(reversed(list(actions_dict.items())))

            master_response_dict[str(timestamp)] = response_dict
            master_request_dict[str(timestamp)] = request_dict
            master_actions_dict[str(timestamp)] = actions_dict

        master_response_dict_per_challenge[challenge] = master_response_dict
        master_request_dict_per_challenge[challenge] = master_request_dict
        master_actions_dict_per_challenge[challenge] = master_actions_dict

    # Now write the JSON files, using the master dictionaries for each challenge
    # NOTE(review): master_request_dict_per_challenge is built but not written
    # to disk; confirm whether a request.json is wanted here.
    os.makedirs(f'all_logs/{agent}', exist_ok=True)

    with open(f'all_logs/{agent}/response.json', 'w') as f:
        json.dump(master_response_dict_per_challenge, f, indent=4)

    with open(f'all_logs/{agent}/regex_specific.json', 'w') as f:
        json.dump(master_actions_dict_per_challenge, f, indent=4)

In [None]:
import json

# For each agent, re-read its exported responses and split them into the logs
# flagged by the agent-specific detector and those flagged by the generic
# keyword detector, keeping the challenge -> timestamp -> log number layout.
for agent in agent_array:
    with open(f'all_logs/{agent}/response.json', 'r') as f:
        data = json.load(f)

    filtered_specific = {}
    filtered_general = {}

    for challenge, timestamps in data.items():
        # Every challenge gets an entry, even when nothing is flagged.
        specific_by_run = filtered_specific[challenge] = {}
        general_by_run = filtered_general[challenge] = {}

        for timestamp, logs in timestamps.items():
            # Likewise every run gets an (initially empty) entry.
            specific_hits = specific_by_run[timestamp] = {}
            general_hits = general_by_run[timestamp] = {}

            for log_num, log in logs.items():
                if is_action_agent(log, agent):
                    specific_hits[log_num] = log
                if is_action_general(log):
                    general_hits[log_num] = log

    # Persist both filtered views next to the source file.
    with open(f'all_logs/{agent}/filtered_specific.json', 'w') as f:
        json.dump(filtered_specific, f, indent=4)

    with open(f'all_logs/{agent}/filtered_general.json', 'w') as f:
        json.dump(filtered_general, f, indent=4)


In [None]:
# Draft cell: pick a random entry from malicious_actions.json that mentions the
# current agent, then dump the nested responses to a new file.
# NOTE(review): `agent`, `row` and `master_response_nested_dict` are not
# defined in this cell; it uses whatever earlier cells left in the kernel.
# `random` is likewise only imported in an earlier cell.
with open('malicious_actions.json', 'r') as f:
    malicious_logs = json.load(f)
    
# NOTE(review): response_nested is assigned here but not used later in this cell.
if agent in ['auto-gpt', 'turbo']:
    response_nested = nested_json(row['response'])

# Keep the entries whose "agent" field contains the current agent
# (presumably a list of names or a single name string - TODO confirm).
filtered_array = [item for item in malicious_logs if agent in item["agent"]]

# TODO: if the command matches

# Randomly choose one element from the filtered array
if filtered_array:
    random_element = random.choice(filtered_array)
    # The message prints the literal text "agent1", not the agent's name.
    print("Random element containing agent1:", random_element)
else:
    print(f"No elements contain {agent}.")
    

# NOTE(review): the '<agent>/' directory is not created in this cell, and
# random_element is not inserted into the data that gets written.
with open(f'{agent}/response_malicious_inserted.json', 'w') as f:
    json.dump(master_response_nested_dict, f, indent=4)