4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
@@ -41,5 +41,5 @@ jobs:
- name: Lint
run: uv run pylint $(git ls-files '*.py')

#- name: Run tests
# run: uv run pytest tests/ -v
- name: Run tests
run: uv run pytest tests/ -v
9 changes: 6 additions & 3 deletions README.md
@@ -159,10 +159,13 @@ cd web-hacker
uv venv --prompt web-hacker
source .venv/bin/activate # Windows: .venv\\Scripts\\activate

# 3) Install in editable mode via uv (pip-compatible interface)
# 3) Install exactly what the lockfile says
uv sync

# 4) Install in editable mode via uv (pip-compatible interface)
uv pip install -e .

# 4) Configure environment
# 5) Configure environment
cp .env.example .env # then edit values
# or set directly
export OPENAI_API_KEY="sk-..."
@@ -304,7 +307,7 @@ Use the **routine-discovery pipeline** to analyze captured data and synthesize a
**Linux/macOS (bash):**
```bash
python scripts/discover_routines.py \
--task "recover the api endpoints for searching for trains and their prices" \
--task "Recover API endpoints for searching for trains and their prices" \
--cdp-captures-dir ./cdp_captures \
--output-dir ./routine_discovery_output \
--llm-model gpt-5
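Worth spelling out for readers of the README change: `uv sync` installs the locked dependency set, while the separate `uv pip install -e .` step adds the project itself in editable mode. A quick sanity check that the environment came up as configured — a hedged sketch assuming the `src.config.Config` class added later in this PR and a populated `.env` (the `check_env.py` filename is hypothetical):

```python
# Hypothetical sanity check: run from the repo root with the venv active,
# e.g. `uv run python check_env.py`.
import logging

from src.config import Config  # load_dotenv() runs at import time

logging.basicConfig(level=Config.LOG_LEVEL, format=Config.LOG_FORMAT, datefmt=Config.LOG_DATE_FORMAT)
logging.getLogger(__name__).info(
    "LOG_LEVEL=%s, OPENAI_API_KEY set: %s",
    logging.getLevelName(Config.LOG_LEVEL),
    Config.OPENAI_API_KEY is not None,
)
```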
5 changes: 3 additions & 2 deletions scripts/browser_monitor.py
@@ -12,11 +12,12 @@
import shutil
import sys

from src.config import Config
from src.cdp.cdp_session import CDPSession
from src.data_models.network import ResourceType
from src.cdp.tab_managements import cdp_new_tab, dispose_context

logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=Config.LOG_LEVEL, format=Config.LOG_FORMAT, datefmt=Config.LOG_DATE_FORMAT)
logger = logging.getLogger(__name__)

# ---- Configuration ----
@@ -381,7 +382,7 @@ def main():
logger.info(f"│ └── response_body.[ext]")
logger.info(f"└── storage/")
logger.info(f" └── events.jsonl")
logger.info()
logger.info("\n")
logger.info(f"Session complete! Check {args.output_dir} for all outputs.")

except Exception as e:
4 changes: 3 additions & 1 deletion scripts/discover_routines.py
@@ -1,4 +1,6 @@
"""
scripts/discover_routines.py

Script for discovering routines from the network transactions.
"""

@@ -13,7 +15,7 @@
from src.routine_discovery.agent import RoutineDiscoveryAgent
from src.routine_discovery.context_manager import ContextManager

logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=Config.LOG_LEVEL, format=Config.LOG_FORMAT, datefmt=Config.LOG_DATE_FORMAT)
logger = logging.getLogger(__name__)


3 changes: 2 additions & 1 deletion scripts/execute_routine.py
@@ -18,10 +18,11 @@
import json
import logging

from src.config import Config
from src.cdp.routine_execution import execute_routine
from src.data_models.production_routine import Routine

logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=Config.LOG_LEVEL, format=Config.LOG_FORMAT, datefmt=Config.LOG_DATE_FORMAT)
logger = logging.getLogger(__name__)


3 changes: 2 additions & 1 deletion src/cdp/cdp_session.py
@@ -10,10 +10,11 @@
import threading
import time

from src.config import Config
from src.cdp.network_monitor import NetworkMonitor
from src.cdp.storage_monitor import StorageMonitor

logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=Config.LOG_LEVEL, format=Config.LOG_FORMAT, datefmt=Config.LOG_DATE_FORMAT)
logger = logging.getLogger(__name__)


3 changes: 2 additions & 1 deletion src/cdp/network_monitor.py
@@ -28,6 +28,7 @@
from fnmatch import fnmatch
from typing import Any

from src.config import Config
from src.utils.cdp_utils import (
build_pair_dir,
get_set_cookie_values,
@@ -37,7 +38,7 @@
from src.data_models.network import Stage


logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=Config.LOG_LEVEL, format=Config.LOG_FORMAT, datefmt=Config.LOG_DATE_FORMAT)
logger = logging.getLogger(__name__)


9 changes: 8 additions & 1 deletion src/cdp/routine_execution.py
@@ -1,3 +1,9 @@
"""
src/cdp/routine_execution.py

Execute a routine using Chrome DevTools Protocol.
"""

import json
import logging
import random
@@ -9,6 +15,7 @@
import requests
import websocket

from src.config import Config
from src.data_models.production_routine import (
Routine,
Endpoint,
@@ -18,7 +25,7 @@
RoutineSleepOperation,
)

logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=Config.LOG_LEVEL, format=Config.LOG_FORMAT, datefmt=Config.LOG_DATE_FORMAT)
logger = logging.getLogger(__name__)


13 changes: 13 additions & 0 deletions src/config.py
@@ -4,19 +4,32 @@
Centralized environment variable configuration.
"""

import logging
import os
from typing import Any

from dotenv import load_dotenv

load_dotenv()

# configure httpx logger to suppress verbose HTTP logs
logging.getLogger("httpx").setLevel(logging.WARNING)


class Config():
"""
Centralized configuration for environment variables.
"""

# logging configuration
LOG_LEVEL: int = logging.getLevelNamesMapping().get(
os.getenv("LOG_LEVEL", "INFO").upper(),
logging.INFO
)
LOG_DATE_FORMAT: str = os.getenv("LOG_DATE_FORMAT", "%Y-%m-%d %H:%M:%S")
LOG_FORMAT: str = os.getenv("LOG_FORMAT", "[%(asctime)s] %(levelname)s:%(name)s:%(message)s")

# API keys
OPENAI_API_KEY: str | None = os.getenv("OPENAI_API_KEY")

@classmethod
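The new `LOG_LEVEL` handling is case-insensitive and falls back to `INFO` for unrecognized names: `logging.getLevelNamesMapping()` (available in Python 3.11+) returns a name-to-number mapping, and `.get()` supplies the default. A standalone sketch of just that lookup, independent of the project:

```python
# Sketch of the lookup Config.LOG_LEVEL performs (no project imports needed).
import logging

mapping = logging.getLevelNamesMapping()  # name -> level, e.g. {"DEBUG": 10, "INFO": 20, ...}

print(mapping.get("debug".upper(), logging.INFO))    # 10 -- known name resolves
print(mapping.get("VERBOSE".upper(), logging.INFO))  # 20 -- unknown name falls back to INFO
```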
69 changes: 43 additions & 26 deletions src/data_models/production_routine.py
@@ -1,3 +1,9 @@
"""
src/data_models/production_routine.py

Production routine data models.
"""

import re
import time
import uuid
@@ -84,6 +90,7 @@ class BuiltinParameter(BaseModel):
description="Function to generate the builtin parameter value"
)


BUILTIN_PARAMETERS = [
BuiltinParameter(
name="uuid",
@@ -101,7 +108,6 @@
class Parameter(BaseModel):
"""
Parameter model with comprehensive validation and type information.

Fields:
name (str): Parameter name (must be valid Python identifier)
type (ParameterType): Parameter data type
@@ -117,13 +123,12 @@
enum_values (list[str] | None): Allowed values for enum type
format (str | None): Format specification (e.g., 'YYYY-MM-DD')
"""

# reserved prefixes: names that cannot be used at the beginning of a parameter name
RESERVED_PREFIXES: ClassVar[list[str]] = [
"sessionStorage", "localStorage", "cookie", "meta", "uuid", "epoch_milliseconds"
]



name: str = Field(..., description="Parameter name (must be valid Python identifier)")
type: ParameterType = Field(
default=ParameterType.STRING,
@@ -179,24 +184,23 @@ def validate_name(cls, v):
"""Ensure parameter name is a valid Python identifier and not reserved."""
if not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', v):
raise ValueError(f"Parameter name '{v}' is not a valid Python identifier")

# Check for reserved prefixes
for prefix in cls.RESERVED_PREFIXES:
if v.startswith(prefix):
raise ValueError(
f"Parameter name '{v}' cannot start with '{prefix}'. "
f"Reserved prefixes: {cls.RESERVED_PREFIXES}"
)

return v

@field_validator('type')
@classmethod
def validate_type_consistency(cls, v, info):
@model_validator(mode='after')
def validate_type_consistency(self) -> 'Parameter':
"""Validate type-specific constraints are consistent."""
if v == ParameterType.ENUM and not info.data.get('enum_values'):
if self.type == ParameterType.ENUM and not self.enum_values:
raise ValueError("enum_values must be provided for enum type")
return v
return self
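Moving the enum check from a `field_validator` on `type` to `@model_validator(mode='after')` is the safer Pydantic v2 pattern for cross-field rules: a field validator only sees previously validated fields via `info.data`, so the old check silently depended on field declaration order, whereas an after-validator runs on the fully constructed model. A minimal standalone sketch of the pattern (hypothetical `Example` model, not the project's `Parameter`):

```python
# Minimal sketch of cross-field validation with an after-model validator.
from pydantic import BaseModel, model_validator


class Example(BaseModel):
    type: str = "string"
    enum_values: list[str] | None = None

    @model_validator(mode="after")
    def check_enum(self) -> "Example":
        # Both fields are guaranteed to be populated here.
        if self.type == "enum" and not self.enum_values:
            raise ValueError("enum_values must be provided for enum type")
        return self


Example(type="enum", enum_values=["a", "b"])  # passes
# Example(type="enum")  # raises ValueError: enum_values must be provided for enum type
```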

@field_validator('default')
@classmethod
@@ -226,7 +230,6 @@ def validate_default_type(cls, v, info):
else:
raise ValueError(f"Default value {v} is not a valid boolean value")
raise ValueError(f"Default value {v} cannot be converted to boolean")

return v

@field_validator('examples')
@@ -267,7 +270,6 @@ def validate_examples_type(cls, v, info):
return validated_examples



class HTTPMethod(StrEnum):
"""
Supported HTTP methods for API endpoints.
@@ -319,7 +321,6 @@ class RoutineOperationTypes(StrEnum):
RETURN = "return"



class RoutineOperation(BaseModel):
"""
Base class for routine operations.
@@ -441,31 +442,47 @@ def validate_parameter_usage(self) -> 'Routine':
and no undefined parameters are used.
Raises ValueError if unused parameters are found or undefined parameters are used.
"""
# Check 0: Ensure name and description fields don't contain parameter placeholders
# These are metadata fields and should not have interpolation patterns
param_pattern = r'\{\{([^}]*)\}\}'
# check in Routine.name
name_matches = re.findall(param_pattern, self.name)
if name_matches:
raise ValueError(
f"Parameter placeholders found in routine name '{self.name}': {name_matches}. "
"The 'name' field is a metadata field and should not contain parameter placeholders like {{param}}."
)
# check in Routine.description
description_matches = re.findall(param_pattern, self.description)
if description_matches:
raise ValueError(
f"Parameter placeholders found in routine description: {description_matches}. "
"The 'description' field is a metadata field and should not contain parameter placeholders like {{param}}."
)

# list of builtin parameter names
builtin_parameter_names = [builtin_parameter.name for builtin_parameter in BUILTIN_PARAMETERS]

# Convert the entire routine to JSON string for searching
routine_json = self.model_dump_json()

# Extract all parameter names
defined_parameters = {param.name for param in self.parameters}

# Find all parameter usages in the JSON: *"{{*}}"*
# Match quoted placeholders: "{{param}}" or \"{{param}}\" (escaped quotes in JSON strings)
# \"{{param}}\" in JSON string means "{{param}}" in actual value
# Pattern REQUIRES quotes (either " or \") immediately before {{ and after }}
param_pattern = r'(?:"|\\")\{\{([^}"]*)\}\}(?:"|\\")'
# Find all parameter usages in the JSON: {{*}}
# Match placeholders anywhere: {{param}}
# This matches parameters whether they're standalone quoted values or embedded in strings
param_pattern = r'\{\{([^}]*)\}\}'
matches = re.findall(param_pattern, routine_json)

# track used parameters
used_parameters = set()

# iterate over all parameter usages
for match in matches:

# clean the match (already extracted the content between braces)
match = match.strip()

# if the parameter name contains a colon, it is a storage parameter
if ":" in match:
kind, path = [p.strip() for p in match.split(":", 1)]
Expand All @@ -484,15 +501,15 @@ def validate_parameter_usage(self) -> 'Routine':
if unused_parameters:
raise ValueError(
f"Unused parameters found in routine '{self.name}': {list(unused_parameters)}. "
f"All defined parameters must be used somewhere in the routine operations."
"All defined parameters must be used somewhere in the routine operations."
)

# Check 2: No undefined parameters should be used
undefined_parameters = used_parameters - defined_parameters
if undefined_parameters:
raise ValueError(
f"Undefined parameters found in routine '{self.name}': {list(undefined_parameters)}. "
f"All parameters used in the routine must be defined in parameters."
"All parameters used in the routine must be defined in parameters."
)

return self
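The relaxed placeholder pattern is the substantive change in this validator: the old regex only matched placeholders that made up an entire quoted JSON value, so a placeholder embedded inside a longer string (for example in a URL) was invisible to the unused/undefined checks. A small standalone illustration (the `blob` value is hypothetical):

```python
# Illustration: old vs. new placeholder patterns from this diff.
import re

OLD = r'(?:"|\\")\{\{([^}"]*)\}\}(?:"|\\")'  # placeholder must be the whole quoted value
NEW = r'\{\{([^}]*)\}\}'                     # placeholder anywhere, including mid-string

blob = '{"url": "https://api.example.com/search?q={{query}}", "body": "{{payload}}"}'
print(re.findall(OLD, blob))  # ['payload']          -- misses the embedded {{query}}
print(re.findall(NEW, blob))  # ['query', 'payload'] -- catches both
```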
3 changes: 2 additions & 1 deletion src/routine_discovery/agent.py
@@ -12,6 +12,7 @@
from openai import OpenAI
from pydantic import BaseModel, Field

from src.config import Config
from src.routine_discovery.context_manager import ContextManager
from src.utils.llm_utils import llm_parse_text_to_model, collect_text_from_response, manual_llm_parse_text_to_model
from src.data_models.llm_responses import (
@@ -26,7 +27,7 @@
from src.data_models.dev_routine import Routine, RoutineFetchOperation
from src.utils.exceptions import TransactionIdentificationFailedError

logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=Config.LOG_LEVEL, format=Config.LOG_FORMAT, datefmt=Config.LOG_DATE_FORMAT)
logger = logging.getLogger(__name__)

