From abcfe76c16ce5f986b8ef7ec84b399d2d6ec306c Mon Sep 17 00:00:00 2001 From: Pierre de Wulf Date: Thu, 2 Oct 2025 10:49:18 +0200 Subject: [PATCH 1/9] Fix handling of AI extract rules --- README.md | 2 ++ scrapingbee/__version__.py | 2 +- scrapingbee/utils.py | 2 ++ tests/test_client.py | 21 +++++++++++++++++++++ tests/test_utils.py | 6 ++++++ 5 files changed, 32 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8170fb5..328dc4d 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,8 @@ Signup to ScrapingBee to [get your API key](https://app.scrapingbee.com/account/ 'device': 'desktop', # Use some data extraction rules 'extract_rules': {'title': 'h1'}, + # Use AI to extract data from the page + 'ai_extract_rules': {'product_name': 'The name of the product', 'price': 'The price in USD'}, # Wrap response in JSON 'json_response': False, # Interact with the webpage you want to scrape diff --git a/scrapingbee/__version__.py b/scrapingbee/__version__.py index 159d48b..0309ae2 100644 --- a/scrapingbee/__version__.py +++ b/scrapingbee/__version__.py @@ -1 +1 @@ -__version__ = "2.0.1" +__version__ = "2.0.2" diff --git a/scrapingbee/utils.py b/scrapingbee/utils.py index adf5759..7501533 100644 --- a/scrapingbee/utils.py +++ b/scrapingbee/utils.py @@ -46,6 +46,8 @@ def process_params(params: dict) -> dict: new_params[k] = process_cookies(v) elif k == 'extract_rules': new_params[k] = process_json_stringify_param(v, 'extract_rules') + elif k == 'ai_extract_rules': + new_params[k] = process_json_stringify_param(v, 'ai_extract_rules') elif k == 'js_scenario': new_params[k] = process_json_stringify_param(v, 'js_scenario') else: diff --git a/tests/test_client.py b/tests/test_client.py index be84469..2a3d0f8 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -113,6 +113,27 @@ def test_get_with_js_scenario(mock_session, client): ) +@mock.patch('scrapingbee.client.Session') +def test_get_with_ai_extract_rules(mock_session, client): + '''It should format the ai_extract_rules and add them to the url''' + client.get('https://httpbin.org', params={ + 'ai_extract_rules': { + "product_name": "The name of the product", + "price": "The price in USD" + } + }) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/' + '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&' + 'ai_extract_rules=%7B%22product_name%22%3A+%22The+name+of+the+product%22%2C+%22' + 'price%22%3A+%22The+price+in+USD%22%7D', + data=None, + headers=DEFAULT_HEADERS, + ) + + @mock.patch('scrapingbee.client.Session') def test_post(mock_session, client): '''It should make a POST request with some data''' diff --git a/tests/test_utils.py b/tests/test_utils.py index 9d1a8ea..a63e6da 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -46,6 +46,12 @@ def test_process_js_scenario(): assert output == '{"instructions": [{"click": "#buttonId"}]}' +def test_process_ai_extract_rules(): + """It should format ai_extract_rules to a stringified JSON""" + output = process_json_stringify_param({"product_name": "The name of the product", "price": "The price in USD"}, "ai_extract_rules") + assert output == '{"product_name": "The name of the product", "price": "The price in USD"}' + + def test_process_params(): """It should keep boolean parameters""" output = process_params({"render_js": True}) From 932166cc3ad82076b69828cbf5df896b3eb0a1a6 Mon Sep 17 00:00:00 2001 From: Pierre de Wulf Date: Thu, 2 Oct 2025 10:52:58 +0200 Subject: [PATCH 2/9] Bump flake8 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e907555..af909be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ certifi==2022.12.7 charset-normalizer==3.1.0 distlib==0.3.6 filelock==3.10.0 -flake8==3.9.2 +flake8==6.0.0 idna==3.4 iniconfig==2.0.0 mccabe==0.6.1 From d2a51339b8fc92847b757965ad49a9e4dce2f000 Mon Sep 17 00:00:00 2001 From: Pierre de Wulf Date: Thu, 2 Oct 2025 10:54:16 +0200 Subject: [PATCH 3/9] Remove 3.7 from supported lib --- .github/workflows/main.yaml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 1606316..7653053 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v3 - name: Set up Python diff --git a/setup.py b/setup.py index 4ee8299..94e8519 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,6 @@ 'Programming Language :: Python :: 3.11', 'Topic :: Software Development :: Libraries :: Python Modules', ], - python_requires='>=3.7', + python_requires='>=3.8', install_requires=['requests'], ) From 9fed112ed0a00423676d1046cf1b13ce0fd1d4c0 Mon Sep 17 00:00:00 2001 From: Pierre de Wulf Date: Thu, 2 Oct 2025 10:55:22 +0200 Subject: [PATCH 4/9] Update maccabe version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index af909be..d1fabf6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ filelock==3.10.0 flake8==6.0.0 idna==3.4 iniconfig==2.0.0 -mccabe==0.6.1 +mccabe==0.7.0 more-itertools==9.1.0 packaging==23.0 platformdirs==3.1.1 From 620e1268b2d2c0ac88d395b46034edf5beaa4ae7 Mon Sep 17 00:00:00 2001 From: Pierre de Wulf Date: Thu, 2 Oct 2025 10:57:16 +0200 Subject: [PATCH 5/9] Update maccabe version --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index d1fabf6..f90f773 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,8 +12,8 @@ packaging==23.0 platformdirs==3.1.1 pluggy==0.13.1 py==1.11.0 -pycodestyle==2.7.0 -pyflakes==2.3.1 +pycodestyle==2.10.0 +pyflakes==3.0.1 pytest==7.2.2 requests==2.28.2 six==1.16.0 From 2793576cf8a640deb48b85dd17fe661118b68ef5 Mon Sep 17 00:00:00 2001 From: Pierre de Wulf Date: Thu, 2 Oct 2025 10:58:23 +0200 Subject: [PATCH 6/9] Fix lint --- tests/test_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index a63e6da..805e08a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -48,7 +48,8 @@ def test_process_js_scenario(): def test_process_ai_extract_rules(): """It should format ai_extract_rules to a stringified JSON""" - output = process_json_stringify_param({"product_name": "The name of the product", "price": "The price in USD"}, "ai_extract_rules") + output = process_json_stringify_param( + {"product_name": "The name of the product", "price": "The price in USD"}, "ai_extract_rules") assert output == '{"product_name": "The name of the product", "price": "The price in USD"}' From ead18716be60afb5d59133d1c546f3ae089589d9 Mon Sep 17 00:00:00 2001 From: Pierre de Wulf Date: Thu, 2 Oct 2025 10:59:32 +0200 Subject: [PATCH 7/9] Fix tests --- tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 805e08a..583e497 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -18,7 +18,7 @@ def test_process_headers(): """It should add a Spb- prefix to header names""" output = process_headers({"Accept-Language": "En-US"}) assert output == { - "User-Agent": "ScrapingBee-Python/2.0.1", + "User-Agent": "ScrapingBee-Python/2.0.2", "Spb-Accept-Language": "En-US", } From bad27f2907624d9a611131c93816ad112d7f388e Mon Sep 17 00:00:00 2001 From: Pierre de Wulf Date: Thu, 2 Oct 2025 11:32:12 +0200 Subject: [PATCH 8/9] Add debug --- .github/workflows/main.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 7653053..4f4a3f3 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -45,6 +45,7 @@ jobs: - name: Publish package to Test PyPI (always) uses: pypa/gh-action-pypi-publish@release/v1 with: + verbose: 'true' password: ${{ secrets.TEST_PYPI_API_TOKEN }} repository_url: https://test.pypi.org/legacy/ skip_existing: true From 47efd4175d676f32f0cf665cbbe06f1f77d3f710 Mon Sep 17 00:00:00 2001 From: Pierre de Wulf Date: Thu, 2 Oct 2025 12:06:57 +0200 Subject: [PATCH 9/9] Remove useless CLI step --- .github/workflows/main.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 4f4a3f3..942dc69 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -42,13 +42,6 @@ jobs: - name: Build package run: >- make build - - name: Publish package to Test PyPI (always) - uses: pypa/gh-action-pypi-publish@release/v1 - with: - verbose: 'true' - password: ${{ secrets.TEST_PYPI_API_TOKEN }} - repository_url: https://test.pypi.org/legacy/ - skip_existing: true - name: Publish package to PyPI (only if pushing a tag) if: startsWith(github.ref, 'refs/tags') uses: pypa/gh-action-pypi-publish@release/v1