diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000..6f7d681 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,6 @@ +# CODEOWNERS file +# This file defines ownership of the codebase. +# When a PR modifies files, GitHub will automatically request reviews from the listed owners. + +# Owners for the entire repository +* @alex-w-99 @dimavrem22 @rayruizhiliao \ No newline at end of file diff --git a/README.md b/README.md index 7f1cbe3..bec2d64 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@

+

@@ -27,38 +28,95 @@ Welcome to Vectorly's Web Hacker... **No API? No Problem!** ## What is a *Routine*? -> A Routine is a portable recipe for automating a web flow. It has: +> A **Routine** is a portable automation recipe that captures how to perform a specific task in any web app. -- name, description -- parameters: input values the routine needs -- operations: ordered steps the browser executes +Define once. Reuse everywhere. Automate anything you can do in a browser. + +Each Routine includes: +- **name** — a human-readable identifier +- **description** — what the Routine does +- **parameters** — input values the Routine needs to run (e.g. URLs, credentials, text) +- **operations** — the ordered browser actions that perform the automation + +Example: +> Navigate to a dashboard, search based on keywords, and return results — all as a reusable Routine. ### Parameters -- Defined as typed inputs (see `src/data_models/production_routine.py:Parameter`). -- Each parameter has a `name`, `type`, `required`, and optional `default`/`examples`. -- Parameters are referenced inside operations using placeholder tokens like `{{argument_1}}`, `{{argument_2}}`. +- Defined as typed inputs (see [`Parameter`](https://github.com/VectorlyApp/web-hacker/blob/main/src/data_models/production_routine.py) class). +- Each parameter has required `name` and `description` fields. Optional fields include `type` (defaults to `string`), `required` (defaults to `true`), `default`, and `examples`. +- Parameters are referenced inside `operations` using placeholder tokens like `"{{paramName}}"` or `\"{{paramName}}\"` (see [Placeholder Interpolation](#placeholder-interpolation-) below). +- **Parameter Types**: Supported types include `string`, `integer`, `number`, `boolean`, `date`, `datetime`, `email`, `url`, and `enum`. +- **Parameter Validation**: Parameters support validation constraints such as `min_length`, `max_length`, `min_value`, `max_value`, `pattern` (regex), `enum_values`, and `format`. 
+- **Reserved Prefixes**: Parameter names cannot start with reserved prefixes: `sessionStorage`, `localStorage`, `cookie`, `meta`, `uuid`, `epoch_milliseconds`. + ### Operations -Operations are a typed list (see `RoutineOperationUnion`) executed in order: +Operations define the executable steps of a Routine. They are represented as a **typed list** (see [`RoutineOperationUnion`](https://github.com/VectorlyApp/web-hacker/blob/main/src/data_models/production_routine.py)) and are executed sequentially by a browser. + +Each operation specifies a `type` and its parameters: + +- **navigate** — open a URL in the browser. + ```json + { "type": "navigate", "url": "https://example.com" } + ``` +- **sleep** — pause execution for a given duration (in seconds). + ```json + { "type": "sleep", "timeout_seconds": 1.5 } + ``` +- **fetch** — perform an HTTP request defined by an `endpoint` object (method, URL, headers, body, credentials). Optionally, store the response under a `session_storage_key`. + ```json + { + "type": "fetch", + "endpoint": { + "method": "GET", + "url": "https://api.example.com", + "headers": {}, + "body": {}, + "credentials": "same-origin" + }, + "session_storage_key": "userData" + } + ``` +- **return** — return the value previously stored under a `session_storage_key`. + ```json + { "type": "return", "session_storage_key": "userData" } + ``` + +Example sequence: +```json +[ + { "type": "navigate", "url": "https://example.com/login" }, + { "type": "sleep", "timeout_seconds": 1 }, + { + "type": "fetch", + "endpoint": { + "method": "POST", + "url": "/auth", + "body": { "username": "\"{{user}}\"", "password": "\"{{pass}}\"" } + }, + "session_storage_key": "token" + }, + { "type": "return", "session_storage_key": "token" } +] +``` + +This defines a deterministic flow: open → wait → authenticate → return a session token. 
-- navigate: `{ "type": "navigate", "url": "https://example.com" }` -- sleep: `{ "type": "sleep", "timeout_seconds": 1.5 }` -- fetch: performs an HTTP request described by an `endpoint` object (method, url, headers, body, credentials) and can store results under a `session_storage_key`. -- return: returns the value previously stored under a `session_storage_key`. ### Placeholder Interpolation `{{...}}` Placeholders inside operation fields are resolved at runtime: -- Parameter placeholders: `{{paramName}}` → substituted from routine parameters -- Storage placeholders (read values from the current session): - - `{{sessionStorage:myKey.path.to.value}}` - - `{{localStorage:myKey}}` - - `{{cookie:CookieName}}` +- **Parameter placeholders**: `"{{paramName}}"` or `\"{{paramName}}\"` → substituted from routine parameters +- **Storage placeholders** (read values from the current session): + - `{{sessionStorage:myKey.path.to.value}}` — access nested values in sessionStorage + - `{{localStorage:myKey}}` — access localStorage values + - `{{cookie:CookieName}}` — read cookie values + - `{{meta:name}}` — read meta tag content (e.g., ``) -**Important:** Currently, `sessionStorage`, `localStorage`, and `cookie` placeholder resolution is supported only inside fetch `headers` and `body`. Future versions will support interpolation anywhere in operations. +**Important:** Currently, `sessionStorage`, `localStorage`, `cookie`, and `meta` placeholder resolution is supported only inside fetch `headers` and `body`. Future versions will support interpolation anywhere in operations. Interpolation occurs before an operation executes. For example, a fetch endpoint might be: @@ -67,7 +125,7 @@ Interpolation occurs before an operation executes. 
For example, a fetch endpoint "type": "fetch", "endpoint": { "method": "GET", - "url": "https://api.example.com/search?arg1={{argument_1}}&arg2={{argument_2}}", + "url": "https://api.example.com/search?paramName1=\"{{paramName1}}\"¶mName2=\"{{paramName1}}\"", "headers": { "Authorization": "Bearer {{cookie:auth_token}}" }, @@ -88,7 +146,7 @@ This substitutes parameter values and injects `auth_token` from cookies. The JSO - Windows (PowerShell): `iwr https://astral.sh/uv/install.ps1 -UseBasicParsing | iex` - OpenAI API key -## Setup Your Environment 🔧 +## Set up Your Environment 🔧 ```bash # 1) Clone and enter the repo @@ -166,72 +224,35 @@ if (!(Test-Path $chrome)) { ## HACK (reverse engineer) WEB APPS 👨🏻‍💻 -### Monitor Browser While Performing Some Task +The reverse engineering process follows a simple three-step workflow: -Use the CDP browser monitor to block trackers and capture network, storage and interaction data while you manually perform tasks in Chrome. +1. **Monitor** — Capture network traffic, storage events, and interactions while you manually perform the target task in Chrome +2. **Discover** — Let the AI agent analyze the captured data and generate a reusable Routine +3. **Execute** — Run the discovered Routine with different parameters to automate the task -Prereq: Chrome running in debug mode (see above). Get a `TAB_ID` from `chrome://inspect/#devices` or `http://127.0.0.1:9222/json`. +Each step is detailed below. Start by ensuring Chrome is running in debug mode (see [Launch Chrome in Debug Mode](#launch-chrome-in-debug-mode-🐞) above). -Basic usage: +### 0. Legal & Privacy Notice ⚠️ +Reverse-engineering and automating a website can violate terms of service. Store captures securely and scrub any sensitive fields before sharing. -``` +### 1. Monitor Browser While Performing Some Task + +Use the CDP browser monitor to block trackers and capture network, storage, and interaction data while you manually perform the task in Chrome. 
+ +**Run this command to start monitoring:** + +```bash python scripts/browser_monitor.py \ --host 127.0.0.1 \ --port 9222 \ --output-dir ./cdp_captures \ - --url https://www.example.com + --url about:blank \ + --incognito ``` -Attach to existing tab: +The script will open a new tab (starting at `about:blank`). Navigate to your target website, then manually perform the actions you want to automate (e.g., search, login, export report). Keep Chrome focused during this process. Press `Ctrl+C` when done; the script will consolidate transactions and produce a HAR automatically. -``` -python scripts/browser_monitor.py -# or -python scripts/browser_monitor.py --tab-id -``` - -Create a new tab automatically: - -``` -python scripts/browser_monitor.py --url https://example.com -``` - -Incognito new tab (only when not supplying TAB_ID): - -``` -python scripts/browser_monitor.py --incognito --url https://example.com -``` - -Attach without navigating (keep current page): - -``` -python scripts/browser_monitor.py --tab-id --no-navigate -``` - -Control output directory behavior: - -``` -# default is to clear; to keep previous outputs -python scripts/browser_monitor.py --keep-output -``` - -Select which resource types to capture (default: XHR, Fetch): - -``` -python scripts/browser_monitor.py --tab-id \ - --capture-resources XHR Fetch -``` - -Disable clearing cookies/storage (cleared by default): - -``` -python scripts/browser_monitor.py --tab-id --no-clear-all -# or granular -python scripts/browser_monitor.py --tab-id --no-clear-cookies -python scripts/browser_monitor.py --tab-id --no-clear-storage -``` - -Output structure (under `--output-dir`, default `./cdp_captures`): +**Output structure** (under `--output-dir`, default `./cdp_captures`): ``` cdp_captures/ @@ -248,15 +269,17 @@ cdp_captures/ │ └── events.jsonl ``` -Tip: Keep Chrome focused while monitoring and perform the target flow (search, checkout, etc.). 
Press Ctrl+C to stop; the script will consolidate transactions and produce a HAR automatically. +Tip: Keep Chrome focused while monitoring and perform the target flow (search, checkout, etc.). Press Ctrl+C to stop; the script will consolidate transactions and produce a HTTP Archive (HAR) automatically. + +### 2. Run Routine-Discovery Agent (Our Very Smart AI with Very Good Prompts🔮)🤖 -### Run Routine Discovery Agent (Our Very Smart AI with Very Good Prompt🔮)🤖 +Use the **routine-discovery pipeline** to analyze captured data and synthesize a reusable Routine (`navigate → fetch → return`). -Use the routine discovery pipeline to generate a reusable Routine (navigate → fetch → return) from your captured network data. +**Prerequisites:** You’ve already captured a session with the browser monitor (`./cdp_captures` exists). -Prereq: You have already captured data with the browser monitor (see above) and have `./cdp_captures` populated. +**Run the discovery agent:** -Basic usage: +> ⚠️ **Important:** You must specify your own `--task` parameter. The example below is just for demonstration—replace it with a description of what you want to automate. ``` python scripts/discover_routines.py \ @@ -266,9 +289,15 @@ python scripts/discover_routines.py \ --llm-model gpt-5 ``` +**Example tasks:** +- `"recover the api endpoints for searching for trains and their prices"` (shown above) +- `"discover how to search for flights and get pricing"` +- `"find the API endpoint for user authentication"` +- `"extract the endpoint for submitting a job application"` + Arguments: -- **--task**: What you want to achieve? What API endpoint should it discover? +- **--task**: A clear description of what you want to automate. This guides the AI agent to identify which network requests to extract and convert into a Routine. Examples: searching for products, booking appointments, submitting forms, etc. 
- **--cdp-captures-dir**: Root of prior CDP capture output (default: `./cdp_captures`) - **--output-dir**: Directory to write results (default: `./routine_discovery_output`) - **--llm-model**: LLM to use for reasoning/parsing (default: `gpt-5`) @@ -283,35 +312,51 @@ routine_discovery_output/ └── routine.json # Final Routine model (name, parameters, operations) ``` -### Execute the Discovered Routines 🏃 +### 3. Execute the Discovered Routines 🏃 -Run the example routine: +⚠️ **Prerequisite:** Make sure Chrome is still running in debug mode (see [Launch Chrome in Debug Mode](#launch-chrome-in-debug-mode-🐞) above). The routine execution script connects to the same Chrome debug session on `127.0.0.1:9222`. + +⚠️ **Important:** If you have a string-typed parameter used in a JSON body field, it may need to be escaped. When the agent generates routines, string parameters are sometimes placed as `"{{PARAM}}"` when they should be `"\"{{PARAM}}\""` to ensure proper JSON string escaping. + +**Example:** If you see: +```json +"field": "{{paramName}}" +``` +And `paramName` is a string parameter, manually change it to: +```json +"field": "\"{{paramName}}\"" +``` +This ensures the parameter value is properly quoted as a JSON string when substituted. 
+Run the example routine: ``` -# Using a parameters file (see examples in `scripts/execute_routine.py`): +# Using a parameters file: python scripts/execute_routine.py \ - --routine-path example_data/amtrak_one_way_train_search_routine.json \ - --parameters-path example_data/amtrak_one_way_train_search_input.json + --routine-path example_routines/amtrak_one_way_train_search_routine.json \ + --parameters-path example_routines/amtrak_one_way_train_search_input.json - -# Or pass parameters inline (JSON string) — matches the script’s examples: +# Or pass parameters inline (JSON string): python scripts/execute_routine.py \ - --routine-path example_data/amtrak_one_way_train_search_routine.json \ - --parameters-dict '{"origin": "boston", "destination": "new york", "departureDate": "2026-03-22"}' + --routine-path example_routines/amtrak_one_way_train_search_routine.json \ + --parameters-dict '{"origin": "BOS", "destination": "NYP", "departureDate": "2026-03-22"}' ``` -Once you have a routine JSON, run it in a real browser session (same Chrome debug session): +Run a discovered routine: ``` python scripts/execute_routine.py \ - --routine-path routine_discovery_output/routine.json \ - --parameters-path routine_discovery_output/test_parameters.json + --routine-path routine_discovery_output/routine.json \ + --parameters-path routine_discovery_output/test_parameters.json ``` +**Note:** Routines execute in a new incognito tab by default (controlled by the routine's `incognito` field). This ensures clean sessions for each execution. + +**Alternative:** Deploy your routine to [console.vectorly.app](https://console.vectorly.app) to expose it as an API endpoint or MCP server for use in production environments. + ## Common Issues ⚠️ - Chrome not detected / cannot connect to DevTools @@ -321,6 +366,14 @@ python scripts/execute_routine.py \ - `OPENAI_API_KEY` not set - Export the key in your shell or create a `.env` file and run via `uv run` (dotenv is loaded). 
+- `No such file or directory: './cdp_captures/network/transactions/N/A'` or similar transaction path errors + + - The agent cannot find any network transactions relevant to your task. This usually means: + - The `--task` description doesn't match what you actually performed during monitoring + - The relevant network requests weren't captured (they may have been blocked or filtered) + - The task description is too vague or too specific + + - **Fix:** Reword your `--task` parameter to more accurately describe what you did during the monitoring step, or re-run the browser monitor and ensure you perform the exact actions you want to automate. ## Coming Soon 🔮 diff --git a/example_routines/amtrak_one_way_train_search_routine.json b/example_routines/amtrak_one_way_train_search_routine.json index 47272a7..a40e612 100644 --- a/example_routines/amtrak_one_way_train_search_routine.json +++ b/example_routines/amtrak_one_way_train_search_routine.json @@ -9,7 +9,7 @@ "type": "fetch", "endpoint": { "description": "Amtrak station/location autocomplete. GET with query parameter searchTerm; returns JSON with autoCompleterResponse.autoCompleteList.", - "url": "https://www.amtrak.com/services/MapDataService/AutoCompleterArcgis/getResponseList?searchTerm={{origin}}", + "url": "https://www.amtrak.com/services/MapDataService/AutoCompleterArcgis/getResponseList?searchTerm=\"{{origin}}\"", "method": "GET", "headers": {"Accept": "application/json, text/plain, */*"}, "body": {}, @@ -21,7 +21,7 @@ "type": "fetch", "endpoint": { "description": "Amtrak station/location autocomplete. 
GET with query parameter searchTerm; returns JSON with autoCompleterResponse.autoCompleteList.", - "url": "https://www.amtrak.com/services/MapDataService/AutoCompleterArcgis/getResponseList?searchTerm={{destination}}", + "url": "https://www.amtrak.com/services/MapDataService/AutoCompleterArcgis/getResponseList?searchTerm=\"{{destination}}\"", "method": "GET", "headers": {"Accept": "application/json, text/plain, */*"}, "body": {}, @@ -55,7 +55,7 @@ { "origin": { "code": "{{sessionStorage:amtrak_autocomplete_stations_origin.autoCompleterResponse.autoCompleteList.0.stationCode}}", - "schedule": {"departureDateTime": "{{departureDate}}T00:00:00"} + "schedule": {"departureDateTime": "\"{{departureDate}}\"T00:00:00"} }, "destination": { "code": "{{sessionStorage:amtrak_autocomplete_stations_destination.autoCompleterResponse.autoCompleteList.0.stationCode}}" diff --git a/scripts/browser_monitor.py b/scripts/browser_monitor.py index 42c8743..fa37ced 100644 --- a/scripts/browser_monitor.py +++ b/scripts/browser_monitor.py @@ -210,11 +210,9 @@ def setup_output_directory(output_dir, keep_output): # Create organized subdirectories network_dir = os.path.join(output_dir, "network") storage_dir = os.path.join(output_dir, "storage") - interactions_dir = os.path.join(output_dir, "interactions") os.makedirs(network_dir, exist_ok=True) os.makedirs(storage_dir, exist_ok=True) - os.makedirs(interactions_dir, exist_ok=True) # Create transactions directory for unified request/response storage transactions_dir = os.path.join(network_dir, "transactions") @@ -224,16 +222,12 @@ def setup_output_directory(output_dir, keep_output): # Main directories 'network_dir': network_dir, 'storage_dir': storage_dir, - 'interactions_dir': interactions_dir, 'transactions_dir': transactions_dir, # Storage files 'storage_jsonl_path': os.path.join(storage_dir, "events.jsonl"), - # Interaction files - 'interactions_jsonl_path': os.path.join(interactions_dir, "events.jsonl"), - # Summary file 'summary_path': 
os.path.join(output_dir, "session_summary.json") } @@ -266,9 +260,6 @@ def save_session_summary(paths, summary, args, start_time, end_time, created_tab }, "storage": { "events": paths['storage_jsonl_path'] - }, - "interactions": { - "events": paths['interactions_jsonl_path'] } } } diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/cdp/routine_execution.py b/src/cdp/routine_execution.py index 9a6d159..27e6a99 100644 --- a/src/cdp/routine_execution.py +++ b/src/cdp/routine_execution.py @@ -74,7 +74,20 @@ def _generate_fetch_js( " // Simple tokens (computed locally, no source lookup)", " function replaceSimpleTokens(str){", " if (typeof str !== 'string') return str;", - " str = str.replace(/\\{\\{\\s*epoch_milliseconds\\s*\\}\\}/ig, () => String(Date.now()));", + " // Handle quoted and unquoted: \"{{epoch_milliseconds}}\" or {{epoch_milliseconds}}", + " str = str.replace(/\\\"?\\{\\{\\s*epoch_milliseconds\\s*\\}\\}\\\"?/g, () => String(Date.now()));", + " // Handle {{uuid}} - generate UUID using crypto.randomUUID() if available", + " str = str.replace(/\\\"?\\{\\{\\s*uuid\\s*\\}\\}\\\"?/g, () => {", + " if ('randomUUID' in crypto) {", + " return crypto.randomUUID();", + " }", + " // Fallback for browsers without crypto.randomUUID()", + " return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, (c) => {", + " const r = Math.random() * 16 | 0;", + " const v = c === 'x' ? 
r : (r & 0x3 | 0x8);", + " return v.toString(16);", + " });", + " });", " return str;", " }", "", @@ -113,7 +126,7 @@ def _generate_fetch_js( " }", " }", "", - " const PLACEHOLDER = /\\{\\{\\s*(sessionStorage|localStorage|cookie|meta)\\s*:\\s*([^}]+?)\\s*\\}\\}/g;", + " const PLACEHOLDER = /\\\"?\\{\\{\\s*(sessionStorage|localStorage|cookie|meta)\\s*:\\s*([^}]+?)\\s*\\}\\}\\\"?/g;", " function resolveOne(token){", " const [lhs, rhs] = token.split('||');", " const [kind, path] = lhs.split(':');", @@ -137,10 +150,22 @@ def _generate_fetch_js( " function resolvePlaceholders(str){", " if (typeof str !== 'string') return str;", " str = replaceSimpleTokens(str);", + " // Follow test.py pattern: for quoted placeholders, strings use raw value, objects use JSON.stringify", " return str.replace(PLACEHOLDER, (m, _k, inner) => {", " const v = resolveOne(`${_k}:${inner}`);", " if (v === undefined || v === null) return m;", - " return (typeof v === 'object') ? JSON.stringify(v) : String(v);", + " // Check if match was quoted - could be \"{{...}}\" or \\\"{{...}}\\\"", + " // Check for escaped quote \\\" at start/end, or simple quote \"", + " const startsWithEscaped = m.startsWith('\\\\\"') || m.startsWith('\"');", + " const endsWithEscaped = m.endsWith('\\\\\"') || (m.endsWith('\"') && m.length > 2);", + " const isQuoted = startsWithEscaped && endsWithEscaped;", + " if (isQuoted) {", + " // Quoted: strings use raw value (no quotes), objects use JSON.stringify", + " return (typeof v === 'string') ? v : JSON.stringify(v);", + " } else {", + " // Unquoted: always stringify", + " return (typeof v === 'object') ? 
JSON.stringify(v) : String(v);", + " }", " });", " }", "", @@ -344,7 +369,7 @@ def _execute_fetch_in_session( body_str = json.dumps(endpoint.body) # convert body from dict to str body_str_interpolated = _apply_params(body_str, parameters_dict) body = json.loads(body_str_interpolated) # convert body from str to dict - + # Prepare headers and body for injection hdrs = headers or {} @@ -402,6 +427,11 @@ def _apply_params(text: str, parameters_dict: dict | None) -> str: Only replaces {{param}} where 'param' is in parameters_dict. Leaves other placeholders like {{sessionStorage:...}} untouched. + + Follows the pattern from test.py: + - For string values in quoted placeholders: insert raw string (no quotes) + - For non-string values in quoted placeholders: use json.dumps(value) + - For unquoted placeholders: use str(value) Args: text: Text containing parameter placeholders. @@ -412,15 +442,33 @@ def _apply_params(text: str, parameters_dict: dict | None) -> str: """ if not text or not parameters_dict: return text - pattern = ( - r"\{\{\s*(" + "|".join(map(re.escape, parameters_dict.keys())) + r")\s*\}\}" - ) - - def repl(m): - key = m.group(1) - return str(parameters_dict.get(key, m.group(0))) - - return re.sub(pattern, repl, text) + + for key, value in parameters_dict.items(): + # Compute replacement based on value type (following test.py pattern) + if isinstance(value, str): + literal = value # For strings, insert raw string (no quotes) + else: + literal = json.dumps(value) # For numbers/bools/null, use JSON encoding + + escaped_key = re.escape(key) + + # Pattern 1: Simple quoted placeholder "{{key}}" in JSON string + # Matches: "{{key}}" (when the JSON value itself is the string "{{key}}") + # Use regular string concatenation to avoid f-string brace escaping issues + simple_quoted = '"' + r'\{\{' + r'\s*' + escaped_key + r'\s*' + r'\}\}' + '"' + text = re.sub(simple_quoted, literal, text) + + # Pattern 2: Escaped quote variant \"{{key}}\" + # In JSON string this 
appears as: \\"{{key}}\\" + # Use regular string concatenation to build pattern with proper escaping + double_escaped = r'\\"' + r'\{\{' + r'\s*' + escaped_key + r'\s*' + r'\}\}' + r'\\"' + text = re.sub(double_escaped, literal, text) + + # Pattern 3: Bare placeholder {{key}} (unquoted, for URL params, etc.) + bare_pattern = r'\{\{' + r'\s*' + escaped_key + r'\s*' + r'\}\}' + text = re.sub(bare_pattern, str(value), text) + + return text def _generate_random_user_agent() -> str: diff --git a/src/data_models/llm_responses.py b/src/data_models/llm_responses.py index 12e7012..5993161 100644 --- a/src/data_models/llm_responses.py +++ b/src/data_models/llm_responses.py @@ -12,7 +12,7 @@ class TransactionIdentificationResponse(BaseModel): Response from the LLM for identifying the network transaction that directly corresponds to the user's requested task. """ - transaction_id: str + transaction_id: str | None description: str url: str method: Method diff --git a/src/data_models/production_routine.py b/src/data_models/production_routine.py index 23f5fbf..a226fd9 100644 --- a/src/data_models/production_routine.py +++ b/src/data_models/production_routine.py @@ -448,24 +448,27 @@ def validate_parameter_usage(self) -> 'Routine': # Extract all parameter names defined_parameters = {param.name for param in self.parameters} - # Find all parameter usages in the JSON: *{{*}}* - param_pattern = r'\{\{.*?\}\}' + # Find all parameter usages in the JSON: *"{{*}}"* + # Match quoted placeholders: "{{param}}" or \"{{param}}\" (escaped quotes in JSON strings) + # \"{{param}}\" in JSON string means "{{param}}" in actual value + # Pattern REQUIRES quotes (either " or \") immediately before {{ and after }} + param_pattern = r'(?:"|\\")\{\{([^}"]*)\}\}(?:"|\\")' matches = re.findall(param_pattern, routine_json) - + # track used parameters used_parameters = set() - + # iterate over all parameter usages for match in matches: - # clean the match from the {{ and }} - match = 
match.strip()[2:-2].strip() + # clean the match (already extracted the content between braces) + match = match.strip() - # if the parameter name starts with a colon, it is a storage parameter + # if the parameter name contains a colon, it is a storage parameter if ":" in match: kind, path = [p.strip() for p in match.split(":", 1)] - assert kind in ["sessionStorage", "localStorage", "cookie"], f"Invalid prefix in parameter name: {kind}" - assert path, f"Path is required for sessionStorage, localStorage, and cookie: {kind}:{path}" + assert kind in ["sessionStorage", "localStorage", "cookie", "meta"], f"Invalid prefix in parameter name: {kind}" + assert path, f"Path is required for sessionStorage, localStorage, cookie, and meta: {kind}:{path}" continue # if the parameter name is a builtin parameter, add it to the used parameters elif match in builtin_parameter_names: @@ -490,5 +493,4 @@ def validate_parameter_usage(self) -> 'Routine': f"All parameters used in the routine must be defined in parameters." 
) - return self - + return self \ No newline at end of file diff --git a/src/routine_discovery/agent.py b/src/routine_discovery/agent.py index 99f4116..6f9640a 100644 --- a/src/routine_discovery/agent.py +++ b/src/routine_discovery/agent.py @@ -77,6 +77,9 @@ def run(self) -> None: # identify the transaction identified_transaction = self.identify_transaction() + if identified_transaction.transaction_id is None: + raise Exception("Failed to identify the network transactions that directly correspond to the user's requested task.") + # confirm the identified transaction confirmation_response = self.confirm_indetified_transaction(identified_transaction) @@ -115,7 +118,7 @@ def run(self) -> None: transaction = self.context_manager.get_transaction_by_id(transaction_id) # extract variables from the transaction - print("Extract variables (args, cookies, tokens, browser variables) from the identified transaction...") + print("Extracting variables (args, cookies, tokens, browser variables) from the identified transaction...") extracted_variables = self.extract_variables(transaction_id) # save the extracted variables @@ -134,13 +137,6 @@ def run(self) -> None: with open(save_path, "w") as f: json.dump(resolved_variables_json, f, ensure_ascii=False, indent=2) print(f"Resolved variables saved to: {save_path}") - - # adding transaction that need to be processed to the queue - for resolved_variable in resolved_variables: - if resolved_variable.transaction_source is not None: - new_transaction_id = resolved_variable.transaction_source.transaction_id - if new_transaction_id not in routine_transactions: - transaction_queue.append(new_transaction_id) # adding transaction data to the routine transactions routine_transactions[transaction_id] = { @@ -149,10 +145,15 @@ def run(self) -> None: "resolved_variables": [resolved_variable.model_dump() for resolved_variable in resolved_variables] } + # adding transaction that need to be processed to the queue + for resolved_variable in 
resolved_variables: + if resolved_variable.transaction_source is not None: + new_transaction_id = resolved_variable.transaction_source.transaction_id + if new_transaction_id not in routine_transactions: + transaction_queue.append(new_transaction_id) + # construct the routine routine = self.construct_routine(routine_transactions) - - print(f"Finalized routine construction! Routine saved to: {save_path}") # save the routine save_path = os.path.join(self.output_dir, f"routine.json") @@ -230,7 +231,7 @@ def identify_transaction(self) -> TransactionIdentificationResponse: # parse the response to the pydantic model parsed_response = llm_parse_text_to_model( text=response_text, - context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-3:]]), + context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-2:]]), pydantic_model=TransactionIdentificationResponse, client=self.client, llm_model='gpt-5-nano' @@ -296,7 +297,7 @@ def confirm_indetified_transaction( # parse the response to the pydantic model parsed_response = llm_parse_text_to_model( text=response_text, - context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-3:]]), + context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-2:]]), pydantic_model=TransactionConfirmationResponse, client=self.client, llm_model='gpt-5-nano' @@ -338,14 +339,14 @@ def extract_variables(self, transaction_id: str) -> ExtractedVariableResponse: transactions.append( { "request": transaction["request"], - "response": transaction["response"], - "response_body": response_body + # "response": transaction["response"], + # "response_body": response_body } ) # add message to the message history message = ( - f"Please extract the variables from the requests of identified network transactions: {transactions}" + f"Please extract the variables from only these network requests (requests only!): {transactions}" f"Please respond in the 
following format: {ExtractedVariableResponse.model_json_schema()}" "Mark each variable with requires_resolution=True if we need to dynamically resolve this variable at runtime." "If we can most likely hardcode this value, mark requires_resolution=False." @@ -378,7 +379,7 @@ def extract_variables(self, transaction_id: str) -> ExtractedVariableResponse: # parse the response to the pydantic model parsed_response = llm_parse_text_to_model( text=response_text, - context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-3:]]), + context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-2:]]), pydantic_model=ExtractedVariableResponse, client=self.client, llm_model="gpt-5-nano" @@ -409,13 +410,18 @@ def resolve_variables(self, extracted_variables: ExtractedVariableResponse) -> l # for each variable to resolve, try to find the source of the variable in the storage and transactions for variable in variables_to_resolve: + print(f"Resolving variable: {variable.name} with values to scan for: {variable.values_to_scan_for}") + # get the storage objects that contain the value and are before the latest timestamp storage_objects = [] for value in variable.values_to_scan_for: storage_sources = self.context_manager.scan_storage_for_value( value=value ) - storage_sources.extend(storage_sources) + storage_objects.extend(storage_sources) + + if len(storage_objects) > 0: + print(f"Found {len(storage_objects)} storage sources that contain the value") # get the transaction ids that contain the value and are before the latest timestamp transaction_ids = [] @@ -426,6 +432,9 @@ def resolve_variables(self, extracted_variables: ExtractedVariableResponse) -> l # deduplicate transaction ids transaction_ids = list(set(transaction_ids)) + + if len(transaction_ids) > 0: + print(f"Found {len(transaction_ids)} transaction ids that contain the value: {transaction_ids}") # add the transactions to the vectorstore uuid = str(uuid4()) @@ -445,6 
+454,7 @@ def resolve_variables(self, extracted_variables: ExtractedVariableResponse) -> l f"Dot paths should be like this: 'key.data.items[0].id', 'path.to.valiable.0.value', etc." f"For paths in transaction responses, start with the first key of the response body" f"For paths in storage, start with the cookie, local storage, or session storage entry name" + f"If the variable is found in both storage and transactions, you should indicate both sources and resolve them accordingly!" ) self._add_to_message_history("user", message) @@ -480,7 +490,7 @@ def resolve_variables(self, extracted_variables: ExtractedVariableResponse) -> l # parse the response to the pydantic model parsed_response = llm_parse_text_to_model( text=response_text, - context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-3:]]), + context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-2:]]), pydantic_model=ResolvedVariableResponse, client=self.client, llm_model="gpt-5-nano" ) @@ -509,7 +519,15 @@ def construct_routine(self, routine_transactions: dict, max_attempts: int = 3) - f"First step of the routine should be to navigate to the target web page and sleep for a bit of time (2-3 seconds). " f"All fetch operations should be constructed as follows: {RoutineFetchOperation.model_json_schema()}. " f"Parameters are only the most important arguments. " - f"You can inject variables by using the following syntax: {{{{parameter_name}}}} {{{{cookie:cookie_name}}}} {{{{sessionStorage:key.path.to.0.value}}}} {{{{local_storage:local_storage_name}}}}. " + f"You can inject variables by using placeholders. CRITICAL: PLACEHOLDERS ARE REPLACED AT RUNTIME AND THE RESULT MUST BE VALID JSON! " + f"For STRING values: Use \\\"{{{{parameter_name}}}}\\\" format (escaped quote + placeholder + escaped quote). " + f"Example: \\\"name\\\": \\\"\\\"{{{{user_name}}}}\\\"\\\". 
At runtime, \\\"\\\"{{{{user_name}}}}\\\"\\\" is replaced and becomes \\\"name\\\": \\\"John\\\" (valid JSON string). " + f"For NUMERIC values (int, float) or NULL: Use \\\"{{{{parameter_name}}}}\\\" format (regular quote + placeholder + quote). " + f"Example: \\\"amount\\\": \\\"{{{{price}}}}\\\". At runtime, \\\"{{{{price}}}}\\\" is replaced with the numeric value and quotes are removed, becoming \\\"amount\\\": 99.99 (JSON number, not string). " + f"Example: \\\"quantity\\\": \\\"{{{{count}}}}\\\" with value 5 becomes \\\"quantity\\\": 5 (JSON number). " + f"For NULL: \\\"metadata\\\": \\\"{{{{optional_field}}}}\\\" with null value becomes \\\"metadata\\\": null (JSON null). " + f"REMEMBER: After placeholder replacement, the JSON must be valid and parseable! " + f"Placeholder types: {{{{parameter_name}}}} for parameters, {{{{cookie:cookie_name}}}} for cookies, {{{{sessionStorage:key.path.to.0.value}}}} for session storage, {{{{localStorage:local_storage_name}}}} for local storage. " f"You can hardcode unresolved variables to their observed values. " f"You will want to navigate to the target page, then perform the fetch operations in the proper order. " f"Browser variables should be hardcoded to observed values. 
" @@ -547,7 +565,7 @@ def construct_routine(self, routine_transactions: dict, max_attempts: int = 3) - # parse the response to the pydantic model routine = llm_parse_text_to_model( text=response_text, - context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-3:]]), + context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-2:]]), pydantic_model=Routine, client=self.client, llm_model=self.llm_model @@ -577,6 +595,18 @@ def productionize_routine(self, routine: Routine) -> Routine: f"You need to clean up this routine to follow the following format: {ProductionRoutine.model_json_schema()}" f"Please respond in the following format: {ProductionRoutine.model_json_schema()}" f"You immediate output needs to be a valid JSON object that conforms to the production routine schema." + f"CRITICAL: PLACEHOLDERS ARE REPLACED AT RUNTIME AND MUST RESULT IN VALID JSON! " + f"EXPLANATION: Placeholders like {{{{key}}}} are replaced at runtime with actual values. The format you choose determines the resulting JSON type. " + f"For STRING values: Use \\\"{{{{key}}}}\\\" format (escaped quote + placeholder + escaped quote). " + f"This means in the JSON file you write: \\\"\\\"{{{{user_name}}}}\\\"\\\". At runtime, the \\\"{{{{user_name}}}}\\\" part gets replaced, " + f"so \\\"\\\"{{{{user_name}}}}\\\"\\\" becomes \\\"\\\"John\\\"\\\" which becomes \\\"John\\\" (valid JSON string). " + f"For NUMERIC/NULL values: Use \\\"{{{{key}}}}\\\" format (regular quote + placeholder + quote). " + f"This means in the JSON file you write: \\\"{{{{item_id}}}}\\\". At runtime, the {{{{item_id}}}} part gets replaced with the number, " + f"and the surrounding quotes are removed, so \\\"{{{{item_id}}}}\\\" with value 42 becomes just 42 (valid JSON number, not string). " + f"Example: \\\"{{{{total_price}}}}\\\" with value 29.99 → becomes 29.99 (quotes removed, valid JSON number). 
" + f"Example: \\\"{{{{optional_data}}}}\\\" with null → becomes null (quotes removed, valid JSON null). " + """Placeholders will be resolved using this: param_pattern = r'(?:"|\\\\")\\{\\{([^}"]*)\\}\\}(?:"|\\\\")'""" + f"The resulting JSON MUST be valid and parseable after all placeholder replacements are done." ) self._add_to_message_history("user", message) @@ -596,9 +626,10 @@ def productionize_routine(self, routine: Routine) -> Routine: self._add_to_message_history("assistant", response_text) # parse the response to the pydantic model + # context includes the last 2 messages (user prompt + assistant response) to help with parsing production_routine = manual_llm_parse_text_to_model( text=response_text, - context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-3:]]), + context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-2:]]), pydantic_model=ProductionRoutine, client=self.client, llm_model=self.llm_model @@ -635,7 +666,7 @@ def get_test_parameters(self, routine: Routine) -> TestParametersResponse: # parse the response to the pydantic model parsed_response = llm_parse_text_to_model( text=response_text, - context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-3:]]), + context="\n".join([f"{msg['role']}: {msg['content']}" for msg in self.message_history[-2:]]), pydantic_model=TestParametersResponse, client=self.client, llm_model="gpt-5-nano" diff --git a/src/routine_discovery/context_manager.py b/src/routine_discovery/context_manager.py index eb88324..f578c1f 100644 --- a/src/routine_discovery/context_manager.py +++ b/src/routine_discovery/context_manager.py @@ -241,7 +241,8 @@ def scan_transaction_responses(self, value: str, max_timestamp: str | None = Non ) ): results.append(transaction_id) - return results + + return list(set(results)) def scan_storage_for_value(self, value: str, max_timestamp: str | None = None) -> list[str]: