From 2ae19aee56bed32f5ae34715c29f464bfa329f8e Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 22 Jul 2024 16:50:58 +0200 Subject: [PATCH 01/51] update function --- pyproject.toml | 3 +- requirements-dev.lock | 585 -------------------- requirements-dev.txt | 4 - requirements.lock | 366 ------------ scrapegraphai/nodes/generate_answer_node.py | 68 +-- scrapegraphai/utils/__init__.py | 1 + scrapegraphai/utils/merge_results.py | 30 + 7 files changed, 70 insertions(+), 987 deletions(-) delete mode 100644 requirements-dev.lock delete mode 100644 requirements-dev.txt delete mode 100644 requirements.lock create mode 100644 scrapegraphai/utils/merge_results.py diff --git a/pyproject.toml b/pyproject.toml index e5b997ba..30725709 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,8 @@ dependencies = [ "undetected-playwright==0.3.0", "semchunk==1.0.1", "html2text==2024.2.26", - "langchain-fireworks==0.1.3" + "langchain-fireworks==0.1.3", + "langchain-community==0.2.9" ] license = "MIT" diff --git a/requirements-dev.lock b/requirements-dev.lock deleted file mode 100644 index b0bcaaa0..00000000 --- a/requirements-dev.lock +++ /dev/null @@ -1,585 +0,0 @@ -# generated by rye -# use `rye lock` or `rye sync` to update this lockfile -# -# last locked with the following flags: -# pre: false -# features: [] -# all-features: false -# with-sources: false - --e file:. 
-aiofiles==23.2.1 - # via burr -aiohttp==3.9.5 - # via langchain - # via langchain-community - # via langchain-fireworks -aiosignal==1.3.1 - # via aiohttp -alabaster==0.7.16 - # via sphinx -altair==5.3.0 - # via streamlit -annotated-types==0.7.0 - # via pydantic -anthropic==0.26.1 - # via langchain-anthropic -anyio==4.3.0 - # via anthropic - # via groq - # via httpx - # via openai - # via starlette - # via watchfiles -astroid==3.2.2 - # via pylint -async-timeout==4.0.3 - # via aiohttp - # via langchain -attrs==23.2.0 - # via aiohttp - # via jsonschema - # via referencing -babel==2.15.0 - # via sphinx -beautifulsoup4==4.12.3 - # via furo - # via google - # via scrapegraphai -blinker==1.8.2 - # via streamlit -boto3==1.34.113 - # via langchain-aws -botocore==1.34.113 - # via boto3 - # via s3transfer -burr==0.22.1 - # via scrapegraphai -cachetools==5.3.3 - # via google-auth - # via streamlit -certifi==2024.2.2 - # via httpcore - # via httpx - # via requests -charset-normalizer==3.3.2 - # via requests -click==8.1.7 - # via burr - # via streamlit - # via typer - # via uvicorn -contourpy==1.2.1 - # via matplotlib -cycler==0.12.1 - # via matplotlib -dataclasses-json==0.6.6 - # via langchain - # via langchain-community -defusedxml==0.7.1 - # via langchain-anthropic -dill==0.3.8 - # via pylint -distro==1.9.0 - # via anthropic - # via groq - # via openai -dnspython==2.6.1 - # via email-validator -docstring-parser==0.16 - # via google-cloud-aiplatform -docutils==0.19 - # via sphinx -email-validator==2.1.1 - # via fastapi -exceptiongroup==1.2.1 - # via anyio - # via pytest -faiss-cpu==1.8.0 - # via scrapegraphai -fastapi==0.111.0 - # via burr - # via fastapi-pagination -fastapi-cli==0.0.4 - # via fastapi -fastapi-pagination==0.12.24 - # via burr -filelock==3.14.0 - # via huggingface-hub -fireworks-ai==0.14.0 - # via langchain-fireworks -fonttools==4.52.1 - # via matplotlib -free-proxy==1.1.1 - # via scrapegraphai -frozenlist==1.4.1 - # via aiohttp - # via aiosignal 
-fsspec==2024.5.0 - # via huggingface-hub -furo==2024.5.6 - # via scrapegraphai -gitdb==4.0.11 - # via gitpython -gitpython==3.1.43 - # via streamlit -google==3.0.0 - # via scrapegraphai -google-ai-generativelanguage==0.6.4 - # via google-generativeai -google-api-core==2.19.0 - # via google-ai-generativelanguage - # via google-api-python-client - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via google-cloud-core - # via google-cloud-resource-manager - # via google-cloud-storage - # via google-generativeai -google-api-python-client==2.130.0 - # via google-generativeai -google-auth==2.29.0 - # via google-ai-generativelanguage - # via google-api-core - # via google-api-python-client - # via google-auth-httplib2 - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via google-cloud-core - # via google-cloud-resource-manager - # via google-cloud-storage - # via google-generativeai -google-auth-httplib2==0.2.0 - # via google-api-python-client -google-cloud-aiplatform==1.58.0 - # via langchain-google-vertexai -google-cloud-bigquery==3.25.0 - # via google-cloud-aiplatform -google-cloud-core==2.4.1 - # via google-cloud-bigquery - # via google-cloud-storage -google-cloud-resource-manager==1.12.3 - # via google-cloud-aiplatform -google-cloud-storage==2.17.0 - # via google-cloud-aiplatform - # via langchain-google-vertexai -google-crc32c==1.5.0 - # via google-cloud-storage - # via google-resumable-media -google-generativeai==0.5.4 - # via langchain-google-genai -google-resumable-media==2.7.1 - # via google-cloud-bigquery - # via google-cloud-storage -googleapis-common-protos==1.63.0 - # via google-api-core - # via grpc-google-iam-v1 - # via grpcio-status -graphviz==0.20.3 - # via burr - # via scrapegraphai -greenlet==3.0.3 - # via playwright -groq==0.8.0 - # via langchain-groq -grpc-google-iam-v1==0.13.1 - # via google-cloud-resource-manager -grpcio==1.64.0 - # via google-api-core - # via googleapis-common-protos - # via grpc-google-iam-v1 - # 
via grpcio-status -grpcio-status==1.62.2 - # via google-api-core -h11==0.14.0 - # via httpcore - # via uvicorn -html2text==2024.2.26 - # via scrapegraphai -httpcore==1.0.5 - # via httpx -httplib2==0.22.0 - # via google-api-python-client - # via google-auth-httplib2 -httptools==0.6.1 - # via uvicorn -httpx==0.27.0 - # via anthropic - # via fastapi - # via fireworks-ai - # via groq - # via openai -httpx-sse==0.4.0 - # via fireworks-ai -huggingface-hub==0.23.1 - # via tokenizers -idna==3.7 - # via anyio - # via email-validator - # via httpx - # via requests - # via yarl -imagesize==1.4.1 - # via sphinx -importlib-metadata==8.0.0 - # via sphinx -importlib-resources==6.4.0 - # via matplotlib -iniconfig==2.0.0 - # via pytest -isort==5.13.2 - # via pylint -jinja2==3.1.4 - # via altair - # via burr - # via fastapi - # via pydeck - # via sphinx -jiter==0.4.0 - # via anthropic -jmespath==1.0.1 - # via boto3 - # via botocore -jsonpatch==1.33 - # via langchain - # via langchain-core -jsonpointer==2.4 - # via jsonpatch -jsonschema==4.22.0 - # via altair -jsonschema-specifications==2023.12.1 - # via jsonschema -kiwisolver==1.4.5 - # via matplotlib -langchain==0.1.15 - # via scrapegraphai -langchain-anthropic==0.1.11 - # via scrapegraphai -langchain-aws==0.1.3 - # via scrapegraphai -langchain-community==0.0.38 - # via langchain -langchain-core==0.1.52 - # via langchain - # via langchain-anthropic - # via langchain-aws - # via langchain-community - # via langchain-fireworks - # via langchain-google-genai - # via langchain-google-vertexai - # via langchain-groq - # via langchain-openai - # via langchain-text-splitters -langchain-fireworks==0.1.3 - # via scrapegraphai -langchain-google-genai==1.0.3 - # via scrapegraphai -langchain-google-vertexai==1.0.4 - # via scrapegraphai -langchain-groq==0.1.3 - # via scrapegraphai -langchain-openai==0.1.6 - # via scrapegraphai -langchain-text-splitters==0.0.2 - # via langchain -langsmith==0.1.63 - # via langchain - # via langchain-community - # 
via langchain-core -loguru==0.7.2 - # via burr -lxml==5.2.2 - # via free-proxy -markdown-it-py==3.0.0 - # via rich -markupsafe==2.1.5 - # via jinja2 -marshmallow==3.21.2 - # via dataclasses-json -matplotlib==3.9.0 - # via burr -mccabe==0.7.0 - # via pylint -mdurl==0.1.2 - # via markdown-it-py -minify-html==0.15.0 - # via scrapegraphai -multidict==6.0.5 - # via aiohttp - # via yarl -mypy-extensions==1.0.0 - # via typing-inspect -numpy==1.26.4 - # via altair - # via contourpy - # via faiss-cpu - # via langchain - # via langchain-aws - # via langchain-community - # via matplotlib - # via pandas - # via pyarrow - # via pydeck - # via sf-hamilton - # via shapely - # via streamlit -openai==1.30.3 - # via burr - # via langchain-fireworks - # via langchain-openai -orjson==3.10.3 - # via fastapi - # via langsmith -packaging==23.2 - # via altair - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via huggingface-hub - # via langchain-core - # via marshmallow - # via matplotlib - # via pytest - # via sphinx - # via streamlit -pandas==2.2.2 - # via altair - # via scrapegraphai - # via sf-hamilton - # via streamlit -pillow==10.3.0 - # via fireworks-ai - # via matplotlib - # via streamlit -platformdirs==4.2.2 - # via pylint -playwright==1.43.0 - # via scrapegraphai - # via undetected-playwright -pluggy==1.5.0 - # via pytest -proto-plus==1.23.0 - # via google-ai-generativelanguage - # via google-api-core - # via google-cloud-aiplatform - # via google-cloud-resource-manager -protobuf==4.25.3 - # via google-ai-generativelanguage - # via google-api-core - # via google-cloud-aiplatform - # via google-cloud-resource-manager - # via google-generativeai - # via googleapis-common-protos - # via grpc-google-iam-v1 - # via grpcio-status - # via proto-plus - # via streamlit -pyarrow==16.1.0 - # via streamlit -pyasn1==0.6.0 - # via pyasn1-modules - # via rsa -pyasn1-modules==0.4.0 - # via google-auth -pydantic==2.7.1 - # via anthropic - # via burr - # via fastapi - # via 
fastapi-pagination - # via fireworks-ai - # via google-cloud-aiplatform - # via google-generativeai - # via groq - # via langchain - # via langchain-core - # via langsmith - # via openai -pydantic-core==2.18.2 - # via pydantic -pydeck==0.9.1 - # via streamlit -pyee==11.1.0 - # via playwright -pygments==2.18.0 - # via furo - # via rich - # via sphinx -pylint==3.2.5 -pyparsing==3.1.2 - # via httplib2 - # via matplotlib -pytest==8.0.0 - # via pytest-mock -pytest-mock==3.14.0 -python-dateutil==2.9.0.post0 - # via botocore - # via google-cloud-bigquery - # via matplotlib - # via pandas -python-dotenv==1.0.1 - # via scrapegraphai - # via uvicorn -python-multipart==0.0.9 - # via fastapi -pytz==2024.1 - # via pandas -pyyaml==6.0.1 - # via huggingface-hub - # via langchain - # via langchain-community - # via langchain-core - # via uvicorn -referencing==0.35.1 - # via jsonschema - # via jsonschema-specifications -regex==2024.5.15 - # via tiktoken -requests==2.32.2 - # via burr - # via free-proxy - # via google-api-core - # via google-cloud-bigquery - # via google-cloud-storage - # via huggingface-hub - # via langchain - # via langchain-community - # via langchain-fireworks - # via langsmith - # via sphinx - # via streamlit - # via tiktoken -rich==13.7.1 - # via streamlit - # via typer -rpds-py==0.18.1 - # via jsonschema - # via referencing -rsa==4.9 - # via google-auth -s3transfer==0.10.1 - # via boto3 -semchunk==1.0.1 - # via scrapegraphai -sf-hamilton==1.63.0 - # via burr -shapely==2.0.4 - # via google-cloud-aiplatform -shellingham==1.5.4 - # via typer -six==1.16.0 - # via python-dateutil -smmap==5.0.1 - # via gitdb -sniffio==1.3.1 - # via anthropic - # via anyio - # via groq - # via httpx - # via openai -snowballstemmer==2.2.0 - # via sphinx -soupsieve==2.5 - # via beautifulsoup4 -sphinx==6.0.0 - # via furo - # via scrapegraphai - # via sphinx-basic-ng -sphinx-basic-ng==1.0.0b2 - # via furo -sphinxcontrib-applehelp==1.0.8 - # via sphinx -sphinxcontrib-devhelp==1.0.6 - # 
via sphinx -sphinxcontrib-htmlhelp==2.0.5 - # via sphinx -sphinxcontrib-jsmath==1.0.1 - # via sphinx -sphinxcontrib-qthelp==1.0.7 - # via sphinx -sphinxcontrib-serializinghtml==1.1.10 - # via sphinx -sqlalchemy==2.0.30 - # via langchain - # via langchain-community -starlette==0.37.2 - # via fastapi -streamlit==1.35.0 - # via burr -tenacity==8.3.0 - # via langchain - # via langchain-community - # via langchain-core - # via streamlit -tiktoken==0.7.0 - # via langchain-openai - # via scrapegraphai -tokenizers==0.19.1 - # via anthropic -toml==0.10.2 - # via streamlit -tomli==2.0.1 - # via pylint - # via pytest -tomlkit==0.12.5 - # via pylint -toolz==0.12.1 - # via altair -tornado==6.4 - # via streamlit -tqdm==4.66.4 - # via google-generativeai - # via huggingface-hub - # via openai - # via scrapegraphai - # via semchunk -typer==0.12.3 - # via fastapi-cli -typing-extensions==4.12.0 - # via altair - # via anthropic - # via anyio - # via astroid - # via fastapi - # via fastapi-pagination - # via google-generativeai - # via groq - # via huggingface-hub - # via openai - # via pydantic - # via pydantic-core - # via pyee - # via pylint - # via sf-hamilton - # via sqlalchemy - # via starlette - # via streamlit - # via typer - # via typing-inspect - # via uvicorn -typing-inspect==0.9.0 - # via dataclasses-json - # via sf-hamilton -tzdata==2024.1 - # via pandas -ujson==5.10.0 - # via fastapi -undetected-playwright==0.3.0 - # via scrapegraphai -uritemplate==4.1.1 - # via google-api-python-client -urllib3==1.26.18 - # via botocore - # via requests -uvicorn==0.29.0 - # via burr - # via fastapi -uvloop==0.19.0 - # via uvicorn -watchfiles==0.21.0 - # via uvicorn -websockets==12.0 - # via uvicorn -yarl==1.9.4 - # via aiohttp -zipp==3.19.2 - # via importlib-metadata - # via importlib-resources diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index d33296d5..00000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,4 +0,0 @@ -sphinx==7.1.2 -furo==2024.5.6 
-pytest==8.0.0 -burr[start]==0.22.1 \ No newline at end of file diff --git a/requirements.lock b/requirements.lock deleted file mode 100644 index 7a8bb455..00000000 --- a/requirements.lock +++ /dev/null @@ -1,366 +0,0 @@ -# generated by rye -# use `rye lock` or `rye sync` to update this lockfile -# -# last locked with the following flags: -# pre: false -# features: [] -# all-features: false -# with-sources: false - --e file:. -aiohttp==3.9.5 - # via langchain - # via langchain-community - # via langchain-fireworks -aiosignal==1.3.1 - # via aiohttp -annotated-types==0.7.0 - # via pydantic -anthropic==0.26.1 - # via langchain-anthropic -anyio==4.3.0 - # via anthropic - # via groq - # via httpx - # via openai -async-timeout==4.0.3 - # via aiohttp - # via langchain -attrs==23.2.0 - # via aiohttp -beautifulsoup4==4.12.3 - # via google - # via scrapegraphai -boto3==1.34.113 - # via langchain-aws -botocore==1.34.113 - # via boto3 - # via s3transfer -cachetools==5.3.3 - # via google-auth -certifi==2024.2.2 - # via httpcore - # via httpx - # via requests -charset-normalizer==3.3.2 - # via requests -dataclasses-json==0.6.6 - # via langchain - # via langchain-community -defusedxml==0.7.1 - # via langchain-anthropic -distro==1.9.0 - # via anthropic - # via groq - # via openai -docstring-parser==0.16 - # via google-cloud-aiplatform -exceptiongroup==1.2.1 - # via anyio -faiss-cpu==1.8.0 - # via scrapegraphai -filelock==3.14.0 - # via huggingface-hub -fireworks-ai==0.14.0 - # via langchain-fireworks -free-proxy==1.1.1 - # via scrapegraphai -frozenlist==1.4.1 - # via aiohttp - # via aiosignal -fsspec==2024.5.0 - # via huggingface-hub -google==3.0.0 - # via scrapegraphai -google-ai-generativelanguage==0.6.4 - # via google-generativeai -google-api-core==2.19.0 - # via google-ai-generativelanguage - # via google-api-python-client - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via google-cloud-core - # via google-cloud-resource-manager - # via google-cloud-storage - 
# via google-generativeai -google-api-python-client==2.130.0 - # via google-generativeai -google-auth==2.29.0 - # via google-ai-generativelanguage - # via google-api-core - # via google-api-python-client - # via google-auth-httplib2 - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via google-cloud-core - # via google-cloud-resource-manager - # via google-cloud-storage - # via google-generativeai -google-auth-httplib2==0.2.0 - # via google-api-python-client -google-cloud-aiplatform==1.58.0 - # via langchain-google-vertexai -google-cloud-bigquery==3.25.0 - # via google-cloud-aiplatform -google-cloud-core==2.4.1 - # via google-cloud-bigquery - # via google-cloud-storage -google-cloud-resource-manager==1.12.3 - # via google-cloud-aiplatform -google-cloud-storage==2.17.0 - # via google-cloud-aiplatform - # via langchain-google-vertexai -google-crc32c==1.5.0 - # via google-cloud-storage - # via google-resumable-media -google-generativeai==0.5.4 - # via langchain-google-genai -google-resumable-media==2.7.1 - # via google-cloud-bigquery - # via google-cloud-storage -googleapis-common-protos==1.63.0 - # via google-api-core - # via grpc-google-iam-v1 - # via grpcio-status -graphviz==0.20.3 - # via scrapegraphai -greenlet==3.0.3 - # via playwright -groq==0.8.0 - # via langchain-groq -grpc-google-iam-v1==0.13.1 - # via google-cloud-resource-manager -grpcio==1.64.0 - # via google-api-core - # via googleapis-common-protos - # via grpc-google-iam-v1 - # via grpcio-status -grpcio-status==1.62.2 - # via google-api-core -h11==0.14.0 - # via httpcore -html2text==2024.2.26 - # via scrapegraphai -httpcore==1.0.5 - # via httpx -httplib2==0.22.0 - # via google-api-python-client - # via google-auth-httplib2 -httpx==0.27.0 - # via anthropic - # via fireworks-ai - # via groq - # via openai -httpx-sse==0.4.0 - # via fireworks-ai -huggingface-hub==0.23.1 - # via tokenizers -idna==3.7 - # via anyio - # via httpx - # via requests - # via yarl -jiter==0.4.0 - # via anthropic 
-jmespath==1.0.1 - # via boto3 - # via botocore -jsonpatch==1.33 - # via langchain - # via langchain-core -jsonpointer==2.4 - # via jsonpatch -langchain==0.1.15 - # via scrapegraphai -langchain-anthropic==0.1.11 - # via scrapegraphai -langchain-aws==0.1.3 - # via scrapegraphai -langchain-community==0.0.38 - # via langchain -langchain-core==0.1.52 - # via langchain - # via langchain-anthropic - # via langchain-aws - # via langchain-community - # via langchain-fireworks - # via langchain-google-genai - # via langchain-google-vertexai - # via langchain-groq - # via langchain-openai - # via langchain-text-splitters -langchain-fireworks==0.1.3 - # via scrapegraphai -langchain-google-genai==1.0.3 - # via scrapegraphai -langchain-google-vertexai==1.0.4 - # via scrapegraphai -langchain-groq==0.1.3 - # via scrapegraphai -langchain-openai==0.1.6 - # via scrapegraphai -langchain-text-splitters==0.0.2 - # via langchain -langsmith==0.1.63 - # via langchain - # via langchain-community - # via langchain-core -lxml==5.2.2 - # via free-proxy -marshmallow==3.21.2 - # via dataclasses-json -minify-html==0.15.0 - # via scrapegraphai -multidict==6.0.5 - # via aiohttp - # via yarl -mypy-extensions==1.0.0 - # via typing-inspect -numpy==1.26.4 - # via faiss-cpu - # via langchain - # via langchain-aws - # via langchain-community - # via pandas - # via shapely -openai==1.30.3 - # via langchain-fireworks - # via langchain-openai -orjson==3.10.3 - # via langsmith -packaging==23.2 - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via huggingface-hub - # via langchain-core - # via marshmallow -pandas==2.2.2 - # via scrapegraphai -pillow==10.3.0 - # via fireworks-ai -playwright==1.43.0 - # via scrapegraphai - # via undetected-playwright -proto-plus==1.23.0 - # via google-ai-generativelanguage - # via google-api-core - # via google-cloud-aiplatform - # via google-cloud-resource-manager -protobuf==4.25.3 - # via google-ai-generativelanguage - # via google-api-core - # via 
google-cloud-aiplatform - # via google-cloud-resource-manager - # via google-generativeai - # via googleapis-common-protos - # via grpc-google-iam-v1 - # via grpcio-status - # via proto-plus -pyasn1==0.6.0 - # via pyasn1-modules - # via rsa -pyasn1-modules==0.4.0 - # via google-auth -pydantic==2.7.1 - # via anthropic - # via fireworks-ai - # via google-cloud-aiplatform - # via google-generativeai - # via groq - # via langchain - # via langchain-core - # via langsmith - # via openai -pydantic-core==2.18.2 - # via pydantic -pyee==11.1.0 - # via playwright -pyparsing==3.1.2 - # via httplib2 -python-dateutil==2.9.0.post0 - # via botocore - # via google-cloud-bigquery - # via pandas -python-dotenv==1.0.1 - # via scrapegraphai -pytz==2024.1 - # via pandas -pyyaml==6.0.1 - # via huggingface-hub - # via langchain - # via langchain-community - # via langchain-core -regex==2024.5.15 - # via tiktoken -requests==2.32.2 - # via free-proxy - # via google-api-core - # via google-cloud-bigquery - # via google-cloud-storage - # via huggingface-hub - # via langchain - # via langchain-community - # via langchain-fireworks - # via langsmith - # via tiktoken -rsa==4.9 - # via google-auth -s3transfer==0.10.1 - # via boto3 -semchunk==1.0.1 - # via scrapegraphai -shapely==2.0.4 - # via google-cloud-aiplatform -six==1.16.0 - # via python-dateutil -sniffio==1.3.1 - # via anthropic - # via anyio - # via groq - # via httpx - # via openai -soupsieve==2.5 - # via beautifulsoup4 -sqlalchemy==2.0.30 - # via langchain - # via langchain-community -tenacity==8.3.0 - # via langchain - # via langchain-community - # via langchain-core -tiktoken==0.7.0 - # via langchain-openai - # via scrapegraphai -tokenizers==0.19.1 - # via anthropic -tqdm==4.66.4 - # via google-generativeai - # via huggingface-hub - # via openai - # via scrapegraphai - # via semchunk -typing-extensions==4.12.0 - # via anthropic - # via anyio - # via google-generativeai - # via groq - # via huggingface-hub - # via openai - # via 
pydantic - # via pydantic-core - # via pyee - # via sqlalchemy - # via typing-inspect -typing-inspect==0.9.0 - # via dataclasses-json -tzdata==2024.1 - # via pandas -undetected-playwright==0.3.0 - # via scrapegraphai -uritemplate==4.1.1 - # via google-api-python-client -urllib3==1.26.18 - # via botocore - # via requests -yarl==1.9.4 - # via aiohttp diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index f764e58b..eb440a75 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -7,6 +7,8 @@ from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm +import asyncio +from ..utils.merge_results import merge_results from ..utils.logging import get_logger from ..models import Ollama, OpenAI from .base_node import BaseNode @@ -109,42 +111,46 @@ def execute(self, state: dict) -> dict: chains_dict = {} + if len(doc) == 1: + prompt = PromptTemplate( + template=template_no_chunks_prompt, + input_variables=["question"], + partial_variables={"context": doc, + "format_instructions": format_instructions}) + chain = prompt | self.llm_model | output_parser + answer = chain.invoke({"question": user_prompt}) + + state.update({self.output[0]: answer}) + return state + # Use tqdm to add progress bar for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)): - if len(doc) == 1: - prompt = PromptTemplate( - template=template_no_chunks_prompt, - input_variables=["question"], - partial_variables={"context": chunk, - "format_instructions": format_instructions}) - chain = prompt | self.llm_model | output_parser - answer = chain.invoke({"question": user_prompt}) - break prompt = PromptTemplate( - template=template_chunks_prompt, - input_variables=["question"], - partial_variables={"context": chunk, - "chunk_id": i + 1, - "format_instructions": format_instructions}) - # Dynamically 
name the chains based on their index + template=template_chunks, + input_variables=["question"], + partial_variables={"context": chunk, + "chunk_id": i + 1, + "format_instructions": format_instructions}) + # Add chain to dictionary with dynamic name chain_name = f"chunk{i+1}" chains_dict[chain_name] = prompt | self.llm_model | output_parser - if len(chains_dict) > 1: - # Use dictionary unpacking to pass the dynamically named chains to RunnableParallel - map_chain = RunnableParallel(**chains_dict) - # Chain - answer = map_chain.invoke({"question": user_prompt}) - # Merge the answers from the chunks - merge_prompt = PromptTemplate( - template = template_merge_prompt, - input_variables=["context", "question"], - partial_variables={"format_instructions": format_instructions}, - ) - merge_chain = merge_prompt | self.llm_model | output_parser - answer = merge_chain.invoke({"context": answer, "question": user_prompt}) - - # Update the state with the generated answer - state.update({self.output[0]: answer}) + + async def process_chains(): + async_runner = RunnableParallel() + for chain_name, chain in chains_dict.items(): + async_runner.add(chain.ainvoke([{"question": user_prompt}] * len(doc))) + + batch_results = await async_runner.run() + return batch_results + + loop = asyncio.get_event_loop() + batch_answers = loop.run_until_complete(process_chains()) + + # Merge batch results (assuming same structure) + merged_answer = merge_results(batch_answers) + answers = merged_answer + + state.update({self.output[0]: answers}) return state diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index 707d2b18..15fd6886 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -11,3 +11,4 @@ from .cleanup_html import cleanup_html from .logging import * from .convert_to_md import convert_to_md +from .merge_results import merge_results diff --git a/scrapegraphai/utils/merge_results.py b/scrapegraphai/utils/merge_results.py new file 
mode 100644 index 00000000..ff5728fa --- /dev/null +++ b/scrapegraphai/utils/merge_results.py @@ -0,0 +1,30 @@ +def merge_results(batch_answers): + """ + Merges the results from single-chunk processing and batch processing, and adds separators between the chunks. + Parameters: + ----------- + answers : list of str + A list of strings containing the results from single-chunk processing. + + batch_answers : list of dict + A list of dictionaries, where each dictionary contains a key "text" with the batch processing result as a string. + + Returns: + -------- + str + A single string containing all merged results, with each result separated by a newline character. + + Example: + -------- + >>> answers = ["Result from single-chunk 1", "Result from single-chunk 2"] + >>> batch_answers = [{"text": "Result from batch 1"}, {"text": "Result from batch 2"}] + >>> merge_results(answers, batch_answers) + 'Result from single-chunk 1\nResult from single-chunk 2\nResult from batch 1\nResult from batch 2' + """ + # Combine answers from single-chunk processing and batch processing + merged_answers = [answer["text"] for answer in batch_answers] + + # Add separators between chunks + merged_answers = "\n".join(merged_answers) + + return merged_answers \ No newline at end of file From 0c4b2908d98efbb2b0a6faf68618a801d726bb5f Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 22 Jul 2024 19:58:33 +0200 Subject: [PATCH 02/51] feat: add generate_answer node parallelization Co-Authored-By: Federico Minutoli <40361744+DiTo97@users.noreply.github.com> --- scrapegraphai/nodes/generate_answer_node.py | 26 +++++++++------------ 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index eb440a75..9cd5dce5 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,13 +1,12 @@ """ GenerateAnswerNode Module """ - +import asyncio from typing
import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm -import asyncio from ..utils.merge_results import merge_results from ..utils.logging import get_logger from ..models import Ollama, OpenAI @@ -136,21 +135,18 @@ def execute(self, state: dict) -> dict: chain_name = f"chunk{i+1}" chains_dict[chain_name] = prompt | self.llm_model | output_parser + async_runner = RunnableParallel(**chains_dict) - async def process_chains(): - async_runner = RunnableParallel() - for chain_name, chain in chains_dict.items(): - async_runner.add(chain.ainvoke([{"question": user_prompt}] * len(doc))) - - batch_results = await async_runner.run() - return batch_results + batch_results = async_runner.invoke({"question": user_prompt}) - loop = asyncio.get_event_loop() - batch_answers = loop.run_until_complete(process_chains()) + merge_prompt = PromptTemplate( + template = template_merge_prompt, + input_variables=["context", "question"], + partial_variables={"format_instructions": format_instructions}, + ) - # Merge batch results (assuming same structure) - merged_answer = merge_results(batch_answers) - answers = merged_answer + merge_chain = merge_prompt | self.llm_model | output_parser + answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) - state.update({self.output[0]: answers}) + state.update({self.output[0]: answer}) return state From cf2734392cda6ef6eda50517671d44d4b06e26c7 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 23 Jul 2024 13:05:50 +0200 Subject: [PATCH 03/51] removed unused function --- scrapegraphai/nodes/generate_answer_node.py | 1 - scrapegraphai/utils/__init__.py | 1 - scrapegraphai/utils/merge_results.py | 30 --------------------- 3 files changed, 32 deletions(-) delete mode 100644 scrapegraphai/utils/merge_results.py diff --git a/scrapegraphai/nodes/generate_answer_node.py 
b/scrapegraphai/nodes/generate_answer_node.py index 9cd5dce5..d864124e 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -7,7 +7,6 @@ from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm -from ..utils.merge_results import merge_results from ..utils.logging import get_logger from ..models import Ollama, OpenAI from .base_node import BaseNode diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index 15fd6886..707d2b18 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -11,4 +11,3 @@ from .cleanup_html import cleanup_html from .logging import * from .convert_to_md import convert_to_md -from .merge_results import merge_results diff --git a/scrapegraphai/utils/merge_results.py b/scrapegraphai/utils/merge_results.py deleted file mode 100644 index ff5728fa..00000000 --- a/scrapegraphai/utils/merge_results.py +++ /dev/null @@ -1,30 +0,0 @@ -def merge_results(batch_answers): - """ - Merges the results from single-chunk processing and batch processing, and adds separators between the chunks. - Parameters: - ----------- - answers : list of str - A list of strings containing the results from single-chunk processing. - - batch_answers : list of dict - A list of dictionaries, where each dictionary contains a key "text" with the batch processing result as a string. - - Returns: - -------- - str - A single string containing all merged results, with each result separated by a newline character. 
- - Example: - -------- - >>> answers = ["Result from single-chunk 1", "Result from single-chunk 2"] - >>> batch_answers = [{"text": "Result from batch 1"}, {"text": "Result from batch 2"}] - >>> merge_results(answers, batch_answers) - 'Result from single-chunk 1\nResult from single-chunk 2\nResult from batch 1\nResult from batch 2' - """ - # Combine answers from single-chunk processing and batch processing - merged_answers = [answer["text"] for answer in batch_answers] - - # Add separators between chunks - merged_answers = "\n".join(merged_answers) - - return merged_answers \ No newline at end of file From df1ecc00192a48abc6bbbe16444507c4bdf6362c Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 23 Jul 2024 13:06:59 +0200 Subject: [PATCH 04/51] Update generate_answer_node.py --- scrapegraphai/nodes/generate_answer_node.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index d864124e..81812598 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -107,8 +107,6 @@ def execute(self, state: dict) -> dict: template_chunks_prompt = self.additional_info + template_chunks_prompt template_merge_prompt = self.additional_info + template_merge_prompt - chains_dict = {} - if len(doc) == 1: prompt = PromptTemplate( template=template_no_chunks_prompt, @@ -121,7 +119,7 @@ def execute(self, state: dict) -> dict: state.update({self.output[0]: answer}) return state - # Use tqdm to add progress bar + chains_dict = {} for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)): prompt = PromptTemplate( From 7080a0afd527a34ada33ee2d3ace8e724d879df7 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Tue, 23 Jul 2024 14:15:37 +0000 Subject: [PATCH 05/51] ci(release): 1.11.0-beta.1 [skip ci] ## 
[1.11.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.4...v1.11.0-beta.1) (2024-07-23) ### Features * add new toml ([fcb3220](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fcb3220868e7ef1127a7a47f40d0379be282e6eb)) * add nvidia connection ([fc0dadb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fc0dadb8f812dfd636dec856921a971b58695ce3)) ### Bug Fixes * **md_conversion:** add absolute links md, added missing dependency ([12b5ead](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/12b5eada6ea783770afd630ede69b8cf867a7ded)) ### chore * **dependecies:** add script to auto-update requirements ([3289c7b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3289c7bf5ec58ac3d04e9e5e8e654af9abcee228)) * **ci:** set up workflow for requirements auto-update ([295fc28](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/295fc28ceb02c78198f7fbe678352503b3259b6b)) * update requirements.txt ([c7bac98](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c7bac98d2e79e5ab98fa65d7efa858a2cdda1622)) * upgrade dependencies and scripts ([74d142e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/74d142eaae724b087eada9c0c876b40a2ccc7cae)) * **pyproject:** upgrade dependencies ([0425124](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0425124c570f765b98fcf67ba6649f4f9fe76b15)) ### Docs * add hero image ([4182e23](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4182e23e3b8d8f141b119b6014ae3ff20b3892e3)) * updated readme ([c377ae0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c377ae0544a78ebdc0d15f8d23b3846c26876c8c)) ### CI * **release:** 1.10.0-beta.6 [skip ci] ([254bde7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/254bde7008b41ffa434925e3ae84340c53a565bd)) * **release:** 1.10.0-beta.7 [skip ci] ([1756e85](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1756e8522f3874de8afbef9ac327f9b3f1a49d07)) * **release:** 1.10.0-beta.8 [skip ci] 
([255e569](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/255e569172b1029bc2a723b2ec66bcf3d3ee3791)) --- CHANGELOG.md | 34 ++++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 43b7b08e..d59af3cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,37 @@ +## [1.11.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.4...v1.11.0-beta.1) (2024-07-23) + + +### Features + +* add new toml ([fcb3220](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fcb3220868e7ef1127a7a47f40d0379be282e6eb)) +* add nvidia connection ([fc0dadb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fc0dadb8f812dfd636dec856921a971b58695ce3)) + + +### Bug Fixes + +* **md_conversion:** add absolute links md, added missing dependency ([12b5ead](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/12b5eada6ea783770afd630ede69b8cf867a7ded)) + + +### chore + +* **dependecies:** add script to auto-update requirements ([3289c7b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3289c7bf5ec58ac3d04e9e5e8e654af9abcee228)) +* **ci:** set up workflow for requirements auto-update ([295fc28](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/295fc28ceb02c78198f7fbe678352503b3259b6b)) +* update requirements.txt ([c7bac98](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c7bac98d2e79e5ab98fa65d7efa858a2cdda1622)) +* upgrade dependencies and scripts ([74d142e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/74d142eaae724b087eada9c0c876b40a2ccc7cae)) +* **pyproject:** upgrade dependencies ([0425124](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0425124c570f765b98fcf67ba6649f4f9fe76b15)) + + +### Docs + +* add hero image ([4182e23](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4182e23e3b8d8f141b119b6014ae3ff20b3892e3)) +* updated readme 
([c377ae0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c377ae0544a78ebdc0d15f8d23b3846c26876c8c)) + + +### CI + +* **release:** 1.10.0-beta.6 [skip ci] ([254bde7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/254bde7008b41ffa434925e3ae84340c53a565bd)) +* **release:** 1.10.0-beta.7 [skip ci] ([1756e85](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1756e8522f3874de8afbef9ac327f9b3f1a49d07)) +* **release:** 1.10.0-beta.8 [skip ci] ([255e569](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/255e569172b1029bc2a723b2ec66bcf3d3ee3791)) ## [1.10.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.0-beta.7...v1.10.0-beta.8) (2024-07-23) diff --git a/pyproject.toml b/pyproject.toml index 989e32be..9fbc763d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.10.4" +version = "1.11.0b1" From 2edad66788cbd92f197e3b37db13c44bfa39e36a Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Tue, 23 Jul 2024 20:51:49 +0200 Subject: [PATCH 06/51] chore: rebuild requirements --- requirements.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 124840e5..440bf78a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ langchain>=0.2.10 -langchain_community>=0.2.9 langchain-google-genai>=1.0.7 -langchain-fireworks>=0.1.3 langchain-google-vertexai langchain-openai>=0.1.17 langchain-groq>=0.1.3 @@ -22,4 +20,5 @@ playwright>=1.43.0 google>=3.0.0 undetected-playwright>=0.3.0 semchunk>=1.0.1 - +langchain-fireworks>=0.1.3 +langchain-community>=0.2.9 From 377d679eecd62611c0c9a3cba8202c6f0719ed31 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra <88108002+VinciGit00@users.noreply.github.com> Date: Wed, 24 Jul 2024 15:17:48 +0200 Subject: [PATCH 07/51] feat: pdate models_tokens.py --- scrapegraphai/helpers/models_tokens.py | 4 ++++ 1 file changed, 4 insertions(+) 
diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index b3d61065..0724ee95 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -51,6 +51,10 @@ "falcon": 2048, "llama2": 4096, "llama3": 8192, + "llama3:70b": 8192, + "llama3.1":128000, + "llama3.1:70b": 128000, + "llama3.1:405b": 128000, "scrapegraph": 8192, "llava": 4096, "mixtral:8x22b-instruct": 65536, From bf6d487bbb26187b32f5985433b54025f6437af5 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 24 Jul 2024 13:19:41 +0000 Subject: [PATCH 08/51] ci(release): 1.11.0-beta.2 [skip ci] ## [1.11.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.1...v1.11.0-beta.2) (2024-07-24) ### Features * pdate models_tokens.py ([377d679](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/377d679eecd62611c0c9a3cba8202c6f0719ed31)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d59af3cb..c770f2f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.11.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.1...v1.11.0-beta.2) (2024-07-24) + + +### Features + +* pdate models_tokens.py ([377d679](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/377d679eecd62611c0c9a3cba8202c6f0719ed31)) + ## [1.11.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.10.4...v1.11.0-beta.1) (2024-07-23) diff --git a/pyproject.toml b/pyproject.toml index 9fbc763d..83c0d1ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.11.0b1" +version = "1.11.0b2" From f336c95c2d1833d1f829d61ae7fa415ac2caf250 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 25 Jul 2024 09:17:00 +0200 Subject: [PATCH 09/51] fix: add llama 3.1 From 66f9421fc216f0984d5a393101d1c109b08eaa33 Mon Sep 17 00:00:00 2001 From:
semantic-release-bot Date: Thu, 25 Jul 2024 07:18:11 +0000 Subject: [PATCH 10/51] ci(release): 1.11.0-beta.3 [skip ci] ## [1.11.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.2...v1.11.0-beta.3) (2024-07-25) ### Bug Fixes * add llama 3.1 ([f336c95](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f336c95c2d1833d1f829d61ae7fa415ac2caf250)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c770f2f1..9d43c83b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.11.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.2...v1.11.0-beta.3) (2024-07-25) + + +### Bug Fixes + +* add llama 3.1 ([f336c95](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f336c95c2d1833d1f829d61ae7fa415ac2caf250)) + ## [1.11.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.1...v1.11.0-beta.2) (2024-07-24) diff --git a/pyproject.toml b/pyproject.toml index 83c0d1ac..ccb549c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.11.0b2" +version = "1.11.0b3" From 51db43a129ef05c050b6de017598a664119594ba Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 25 Jul 2024 09:50:57 +0000 Subject: [PATCH 11/51] ci(release): 1.11.0-beta.4 [skip ci] ## [1.11.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.3...v1.11.0-beta.4) (2024-07-25) ### Features * add generate_answer node paralellization ([0c4b290](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0c4b2908d98efbb2b0a6faf68618a801d726bb5f)) ### chore * rebuild requirements ([2edad66](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2edad66788cbd92f197e3b37db13c44bfa39e36a)) --- CHANGELOG.md | 12 ++++++++++++ pyproject.toml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
9d43c83b..ea0c578f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +## [1.11.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.3...v1.11.0-beta.4) (2024-07-25) + + +### Features + +* add generate_answer node paralellization ([0c4b290](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0c4b2908d98efbb2b0a6faf68618a801d726bb5f)) + + +### chore + +* rebuild requirements ([2edad66](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2edad66788cbd92f197e3b37db13c44bfa39e36a)) + ## [1.11.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.2...v1.11.0-beta.3) (2024-07-25) diff --git a/pyproject.toml b/pyproject.toml index 9f9f6ad8..b7b0d55d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.11.0b3" +version = "1.11.0b4" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ From 5137b8aa5bafdb04a6cdc53e7d3fe5c43572f293 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 26 Jul 2024 10:18:08 +0200 Subject: [PATCH 12/51] Update models_tokens.py --- scrapegraphai/helpers/models_tokens.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 0724ee95..cb00435d 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -196,6 +196,11 @@ "fireworks": { "llama-v2-7b": 4096, "mixtral-8x7b-instruct": 4096, - "nomic-ai/nomic-embed-text-v1.5": 8192 + "nomic-ai/nomic-embed-text-v1.5": 8192, + "llama-3.1-405B-instruct": 131072, + "llama-3.1-70B-instruct": 131072, + "llama-3.1-8B-instruct": 131072, + "mixtral-moe-8x22B-instruct": 65536, + "mixtral-moe-8x7B-instruct": 65536 }, } From 05cf9adaa7bed093c5700cd2feb47b01ab88402f Mon Sep 17 00:00:00 2001 From: amosdinh Date: Fri, 26 Jul 2024 12:10:14 +0200 Subject: [PATCH 
13/51] Ollama: Use no json format when creating the search query --- scrapegraphai/nodes/search_internet_node.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 97fed67b..2474ab60 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -10,6 +10,7 @@ from ..utils.logging import get_logger from ..utils.research_web import search_on_web from .base_node import BaseNode +from ..models import Ollama class SearchInternetNode(BaseNode): @@ -94,7 +95,14 @@ def execute(self, state: dict) -> dict: # Execute the chain to get the search query search_answer = search_prompt | self.llm_model | output_parser - search_query = search_answer.invoke({"user_prompt": user_prompt})[0] + + # Ollama: Use no json format when creating the search query + if isinstance(self.llm_model, Ollama) and self.llm_model.format == 'json': + self.llm_model.format = None + search_query = search_answer.invoke({"user_prompt": user_prompt})[0] + self.llm_model.format = 'json' + else: + search_query = search_answer.invoke({"user_prompt": user_prompt})[0] self.logger.info(f"Search Query: {search_query}") From d177afb68be036465ede1f567d2562b145d77d36 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:17:47 +0200 Subject: [PATCH 14/51] refactor(Ollama): integrate new LangChain chat init --- requirements-dev.lock | 11 +++++++++++ requirements.lock | 12 ++++++++++++ scrapegraphai/graphs/abstract_graph.py | 16 +++++++++++----- scrapegraphai/models/ollama.py | 17 ----------------- 4 files changed, 34 insertions(+), 22 deletions(-) delete mode 100644 scrapegraphai/models/ollama.py diff --git a/requirements-dev.lock b/requirements-dev.lock index 405395c4..bce18810 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -12,6 +12,7 @@ aiofiles==24.1.0 # via burr 
aiohttp==3.9.5 # via langchain + # via langchain-community # via langchain-fireworks # via langchain-nvidia-ai-endpoints aiosignal==1.3.1 @@ -179,6 +180,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 @@ -262,6 +264,7 @@ langchain-core==0.2.22 # via langchain # via langchain-anthropic # via langchain-aws + # via langchain-community # via langchain-fireworks # via langchain-google-genai # via langchain-google-vertexai @@ -285,6 +288,7 @@ langchain-text-splitters==0.2.2 # via langchain langsmith==0.1.93 # via langchain + # via langchain-community # via langchain-core loguru==0.7.2 # via burr @@ -319,6 +323,7 @@ numpy==1.26.4 # via faiss-cpu # via langchain # via langchain-aws + # via langchain-community # via matplotlib # via pandas # via pyarrow @@ -339,6 +344,7 @@ packaging==24.1 # via google-cloud-bigquery # via huggingface-hub # via langchain-core + # via marshmallow # via matplotlib # via pytest # via sphinx @@ -429,6 +435,7 @@ pytz==2024.1 pyyaml==6.0.1 # via huggingface-hub # via langchain + # via langchain-community # via langchain-core # via uvicorn referencing==0.35.1 @@ -444,6 +451,7 @@ requests==2.32.3 # via google-cloud-storage # via huggingface-hub # via langchain + # via langchain-community # via langchain-fireworks # via langsmith # via sphinx @@ -501,12 +509,14 @@ sphinxcontrib-serializinghtml==1.1.10 # via sphinx sqlalchemy==2.0.31 # via langchain + # via langchain-community starlette==0.37.2 # via fastapi streamlit==1.36.0 # via burr tenacity==8.5.0 # via langchain + # via langchain-community # via langchain-core # via streamlit tiktoken==0.7.0 @@ -557,6 +567,7 @@ typing-extensions==4.12.2 # via typing-inspect # via uvicorn typing-inspect==0.9.0 + # via dataclasses-json # via sf-hamilton tzdata==2024.1 # via pandas diff --git a/requirements.lock b/requirements.lock index 9d0602db..aa03fd14 100644 --- a/requirements.lock +++ b/requirements.lock 
@@ -10,6 +10,7 @@ -e file:. aiohttp==3.9.5 # via langchain + # via langchain-community # via langchain-fireworks # via langchain-nvidia-ai-endpoints aiosignal==1.3.1 @@ -127,6 +128,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 @@ -183,6 +185,7 @@ langchain-core==0.2.22 # via langchain # via langchain-anthropic # via langchain-aws + # via langchain-community # via langchain-fireworks # via langchain-google-genai # via langchain-google-vertexai @@ -206,6 +209,7 @@ langchain-text-splitters==0.2.2 # via langchain langsmith==0.1.93 # via langchain + # via langchain-community # via langchain-core lxml==5.2.2 # via free-proxy @@ -226,6 +230,7 @@ numpy==1.26.4 # via faiss-cpu # via langchain # via langchain-aws + # via langchain-community # via pandas # via shapely openai==1.37.0 @@ -239,6 +244,7 @@ packaging==24.1 # via google-cloud-bigquery # via huggingface-hub # via langchain-core + # via marshmallow pandas==2.2.2 # via scrapegraphai pillow==10.4.0 @@ -296,6 +302,7 @@ pytz==2024.1 pyyaml==6.0.1 # via huggingface-hub # via langchain + # via langchain-community # via langchain-core regex==2024.5.15 # via tiktoken @@ -306,6 +313,7 @@ requests==2.32.3 # via google-cloud-storage # via huggingface-hub # via langchain + # via langchain-community # via langchain-fireworks # via langsmith # via tiktoken @@ -332,6 +340,7 @@ sqlalchemy==2.0.31 # via langchain-community tenacity==8.5.0 # via langchain + # via langchain-community # via langchain-core tiktoken==0.7.0 # via langchain-openai @@ -356,6 +365,9 @@ typing-extensions==4.12.2 # via pydantic-core # via pyee # via sqlalchemy + # via typing-inspect +typing-inspect==0.9.0 + # via dataclasses-json tzdata==2024.1 # via pandas undetected-playwright==0.3.0 diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 91396ae0..f1c9ff92 100644 --- a/scrapegraphai/graphs/abstract_graph.py 
+++ b/scrapegraphai/graphs/abstract_graph.py @@ -7,6 +7,8 @@ import uuid from pydantic import BaseModel +from langchain_community.chat_models import ChatOllama + from langchain_aws import BedrockEmbeddings from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings from langchain_google_genai import GoogleGenerativeAIEmbeddings @@ -19,22 +21,23 @@ from ..models import ( Anthropic, AzureOpenAI, + OpenAI, Bedrock, Gemini, Groq, HuggingFace, - Ollama, - OpenAI, OneApi, Fireworks, VertexAI, Nvidia ) from ..models.ernie import Ernie +from langchain.chat_models import init_chat_model + from ..utils.logging import set_verbosity_debug, set_verbosity_warning, set_verbosity_info from ..helpers import models_tokens -from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek +from ..models import AzureOpenAI, OpenAI, Bedrock, Gemini, Groq, HuggingFace, Anthropic, DeepSeek class AbstractGraph(ABC): @@ -213,8 +216,10 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: except KeyError as exc: raise KeyError("Model not supported") from exc return VertexAI(llm_params) + elif "ollama" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("ollama/")[-1] + llm_params["model_provider"] = "ollama" # allow user to set model_tokens in config try: @@ -231,7 +236,8 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: except AttributeError: self.model_token = 8192 - return Ollama(llm_params) + return init_chat_model(**llm_params) + elif "hugging_face" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("/")[-1] try: @@ -320,7 +326,7 @@ def _create_default_embedder(self, llm_config=None) -> object: return FireworksEmbeddings(model=self.llm_model.model_name) elif isinstance(self.llm_model, Nvidia): return NVIDIAEmbeddings(model=self.llm_model.model_name) - elif isinstance(self.llm_model, Ollama): + elif isinstance(self.llm_model, ChatOllama): # unwrap the 
kwargs from the model whihc is a dict params = self.llm_model._lc_kwargs # remove streaming and temperature diff --git a/scrapegraphai/models/ollama.py b/scrapegraphai/models/ollama.py deleted file mode 100644 index 4bf48178..00000000 --- a/scrapegraphai/models/ollama.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -Ollama Module -""" -from langchain_community.chat_models import ChatOllama - - -class Ollama(ChatOllama): - """ - A wrapper for the ChatOllama class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. - """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) From 5e3eb6e43df4bd4c452d34b49f254235e9ff1b22 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:26:20 +0200 Subject: [PATCH 15/51] refactor(OpenAI): integrate new LangChain chat init --- scrapegraphai/graphs/abstract_graph.py | 6 +++--- scrapegraphai/models/openai.py | 17 ----------------- 2 files changed, 3 insertions(+), 20 deletions(-) delete mode 100644 scrapegraphai/models/openai.py diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index f1c9ff92..a0d0c52c 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -8,6 +8,7 @@ from pydantic import BaseModel from langchain_community.chat_models import ChatOllama +from langchain_openai import ChatOpenAI from langchain_aws import BedrockEmbeddings from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings @@ -21,7 +22,6 @@ from ..models import ( Anthropic, AzureOpenAI, - OpenAI, Bedrock, Gemini, Groq, @@ -37,7 +37,7 @@ from ..utils.logging import set_verbosity_debug, set_verbosity_warning, set_verbosity_info from ..helpers import models_tokens -from ..models import AzureOpenAI, OpenAI, Bedrock, Gemini, Groq, HuggingFace, Anthropic, 
DeepSeek +from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Anthropic, DeepSeek class AbstractGraph(ABC): @@ -311,7 +311,7 @@ def _create_default_embedder(self, llm_config=None) -> object: return GoogleGenerativeAIEmbeddings( google_api_key=llm_config["api_key"], model="models/embedding-001" ) - if isinstance(self.llm_model, OpenAI): + if isinstance(self.llm_model, ChatOpenAI): return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key, base_url=self.llm_model.openai_api_base) elif isinstance(self.llm_model, DeepSeek): diff --git a/scrapegraphai/models/openai.py b/scrapegraphai/models/openai.py deleted file mode 100644 index bfd9d74c..00000000 --- a/scrapegraphai/models/openai.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -OpenAI Module -""" -from langchain_openai import ChatOpenAI - - -class OpenAI(ChatOpenAI): - """ - A wrapper for the ChatOpenAI class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. 
- """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) From 9e795f4e35efa91de850d976f8f6b51232f9073e Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 29 Jul 2024 17:35:10 +0200 Subject: [PATCH 16/51] removed unused init files Co-Authored-By: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> --- requirements-dev.lock | 1 - requirements.lock | 1 - scrapegraphai/graphs/abstract_graph.py | 5 +++-- scrapegraphai/models/__init__.py | 2 -- 4 files changed, 3 insertions(+), 6 deletions(-) diff --git a/requirements-dev.lock b/requirements-dev.lock index bce18810..2c56f3db 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -180,7 +180,6 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 diff --git a/requirements.lock b/requirements.lock index aa03fd14..a943dff1 100644 --- a/requirements.lock +++ b/requirements.lock @@ -128,7 +128,6 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index a0d0c52c..e1ce18f0 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -163,9 +163,10 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: if "gpt-" in llm_params["model"]: try: self.model_token = models_tokens["openai"][llm_params["model"]] + llm_params["model_provider"] = "openai" except KeyError as exc: raise KeyError("Model not supported") from exc - return OpenAI(llm_params) + return init_chat_model(**llm_params) elif "oneapi" in llm_params["model"]: # take the model after the last dash llm_params["model"] = llm_params["model"].split("/")[-1] @@ -455,4 +456,4 @@ def run(self) -> str: """ Abstract method to execute the graph and return the result. 
""" - pass + pass \ No newline at end of file diff --git a/scrapegraphai/models/__init__.py b/scrapegraphai/models/__init__.py index bfcb84d6..81bceeb8 100644 --- a/scrapegraphai/models/__init__.py +++ b/scrapegraphai/models/__init__.py @@ -2,12 +2,10 @@ __init__.py file for models folder """ -from .openai import OpenAI from .azure_openai import AzureOpenAI from .openai_itt import OpenAIImageToText from .openai_tts import OpenAITextToSpeech from .gemini import Gemini -from .ollama import Ollama from .hugging_face import HuggingFace from .groq import Groq from .bedrock import Bedrock From 2c5f934f101e319ec4e61009d4c464ca4626c1ff Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Mon, 29 Jul 2024 17:46:17 +0200 Subject: [PATCH 17/51] refactor: remove LangChain wrappers --- scrapegraphai/nodes/generate_answer_node.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 81812598..5022b16f 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -6,9 +6,10 @@ from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel +from langchain_openai import ChatOpenAI +from langchain_community.chat_models import ChatOllama from tqdm import tqdm from ..utils.logging import get_logger -from ..models import Ollama, OpenAI from .base_node import BaseNode from ..helpers import template_chunks, template_no_chunks, template_merge, template_chunks_md, template_no_chunks_md, template_merge_md @@ -41,7 +42,7 @@ def __init__( self.llm_model = node_config["llm_model"] - if isinstance(node_config["llm_model"], Ollama): + if isinstance(node_config["llm_model"], ChatOllama): self.llm_model.format="json" self.verbose = ( @@ -93,7 +94,7 @@ def execute(self, state: dict) -> dict: 
format_instructions = output_parser.get_format_instructions() - if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator or self.is_md_scraper: + if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator or self.is_md_scraper: template_no_chunks_prompt = template_no_chunks_md template_chunks_prompt = template_chunks_md template_merge_prompt = template_merge_md From 25066b2bc51517e50058231664230b8edef365b9 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Mon, 29 Jul 2024 17:49:21 +0200 Subject: [PATCH 18/51] refactor: remove LangChain wrappers for Ollama --- scrapegraphai/nodes/generate_answer_omni_node.py | 4 ++-- scrapegraphai/nodes/generate_answer_pdf_node.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py index c2f2b65d..98be26dd 100644 --- a/scrapegraphai/nodes/generate_answer_omni_node.py +++ b/scrapegraphai/nodes/generate_answer_omni_node.py @@ -10,7 +10,7 @@ from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm -from ..models import Ollama +from langchain_community.chat_models import ChatOllama # Imports from the library from .base_node import BaseNode from ..helpers.generate_answer_node_omni_prompts import template_no_chunk_omni, template_chunks_omni, template_merge_omni @@ -44,7 +44,7 @@ def __init__( super().__init__(node_name, "node", input, output, 3, node_config) self.llm_model = node_config["llm_model"] - if isinstance(node_config["llm_model"], Ollama): + if isinstance(node_config["llm_model"], ChatOllama): self.llm_model.format="json" self.verbose = ( diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index 7add7948..47f14e86 100644 --- 
a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -10,7 +10,7 @@ from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm -from ..models import Ollama +from langchain_community.chat_models import ChatOllama from ..utils.logging import get_logger # Imports from the library @@ -59,7 +59,7 @@ def __init__( super().__init__(node_name, "node", input, output, 2, node_config) self.llm_model = node_config["llm_model"] - if isinstance(node_config["llm_model"], Ollama): + if isinstance(node_config["llm_model"], ChatOllama): self.llm_model.format="json" self.verbose = ( From f6b7c6a4309d7a7460d46f75a76a926711a99f3c Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 29 Jul 2024 17:56:32 +0200 Subject: [PATCH 19/51] refactoring Co-Authored-By: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> --- scrapegraphai/nodes/fetch_node.py | 9 ++++----- scrapegraphai/nodes/generate_answer_node.py | 1 + scrapegraphai/nodes/search_internet_node.py | 6 ++---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 56366677..64a80cfe 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -4,7 +4,7 @@ import json from typing import List, Optional - +from langchain_openai import ChatOpenAI import pandas as pd import requests from langchain_community.document_loaders import PyPDFLoader @@ -14,7 +14,6 @@ from ..utils.convert_to_md import convert_to_md from ..utils.logging import get_logger from .base_node import BaseNode -from ..models import OpenAI class FetchNode(BaseNode): @@ -165,7 +164,7 @@ def execute(self, state): parsed_content = source - if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator: + if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or 
self.force and not self.script_creator: parsed_content = convert_to_md(source) compressed_document = [ @@ -184,7 +183,7 @@ def execute(self, state): if not self.cut: parsed_content = cleanup_html(response, source) - if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator): + if (isinstance(self.llm_model, ChatOpenAI) and not self.script_creator) or (self.force and not self.script_creator): parsed_content = convert_to_md(source, input_data[0]) compressed_document = [Document(page_content=parsed_content)] else: @@ -206,7 +205,7 @@ def execute(self, state): raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.") parsed_content = document[0].page_content - if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled: + if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled: parsed_content = convert_to_md(document[0].page_content, input_data[0]) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 5022b16f..12ae6f0f 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -9,6 +9,7 @@ from langchain_openai import ChatOpenAI from langchain_community.chat_models import ChatOllama from tqdm import tqdm +from langchain_openai import ChatOpenAI from ..utils.logging import get_logger from .base_node import BaseNode from ..helpers import template_chunks, template_no_chunks, template_merge, template_chunks_md, template_no_chunks_md, template_merge_md diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 2474ab60..7588b995 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -6,12 +6,10 @@ from 
langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate - +from langchain_community.chat_models import ChatOllama from ..utils.logging import get_logger from ..utils.research_web import search_on_web from .base_node import BaseNode -from ..models import Ollama - class SearchInternetNode(BaseNode): """ @@ -97,7 +95,7 @@ def execute(self, state: dict) -> dict: search_answer = search_prompt | self.llm_model | output_parser # Ollama: Use no json format when creating the search query - if isinstance(self.llm_model, Ollama) and self.llm_model.format == 'json': + if isinstance(self.llm_model, ChatOllama) and self.llm_model.format == 'json': self.llm_model.format = None search_query = search_answer.invoke({"user_prompt": user_prompt})[0] self.llm_model.format = 'json' From 5007167af1cb9a8a4f8ed9925ae765bff06017e1 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 29 Jul 2024 17:59:16 +0200 Subject: [PATCH 20/51] removed unused models Co-Authored-By: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> --- scrapegraphai/models/__init__.py | 9 -------- scrapegraphai/models/anthropic.py | 17 -------------- scrapegraphai/models/azure_openai.py | 17 -------------- scrapegraphai/models/bedrock.py | 19 ---------------- scrapegraphai/models/fireworks.py | 33 ---------------------------- scrapegraphai/models/gemini.py | 20 ----------------- scrapegraphai/models/groq.py | 17 -------------- scrapegraphai/models/hugging_face.py | 17 -------------- scrapegraphai/models/vertex.py | 16 -------------- 9 files changed, 165 deletions(-) delete mode 100644 scrapegraphai/models/anthropic.py delete mode 100644 scrapegraphai/models/azure_openai.py delete mode 100644 scrapegraphai/models/bedrock.py delete mode 100644 scrapegraphai/models/fireworks.py delete mode 100644 scrapegraphai/models/gemini.py delete mode 100644 scrapegraphai/models/groq.py delete mode 100644 scrapegraphai/models/hugging_face.py delete mode 100644 
scrapegraphai/models/vertex.py diff --git a/scrapegraphai/models/__init__.py b/scrapegraphai/models/__init__.py index 81bceeb8..9d27884b 100644 --- a/scrapegraphai/models/__init__.py +++ b/scrapegraphai/models/__init__.py @@ -1,17 +1,8 @@ """ __init__.py file for models folder """ - -from .azure_openai import AzureOpenAI from .openai_itt import OpenAIImageToText from .openai_tts import OpenAITextToSpeech -from .gemini import Gemini -from .hugging_face import HuggingFace -from .groq import Groq -from .bedrock import Bedrock -from .anthropic import Anthropic from .deepseek import DeepSeek from .oneapi import OneApi -from .fireworks import Fireworks -from .vertex import VertexAI from .nvidia import Nvidia diff --git a/scrapegraphai/models/anthropic.py b/scrapegraphai/models/anthropic.py deleted file mode 100644 index 3a7480d0..00000000 --- a/scrapegraphai/models/anthropic.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -Anthropic Module -""" -from langchain_anthropic import ChatAnthropic - - -class Anthropic(ChatAnthropic): - """ - A wrapper for the ChatAnthropic class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. - """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) \ No newline at end of file diff --git a/scrapegraphai/models/azure_openai.py b/scrapegraphai/models/azure_openai.py deleted file mode 100644 index ae47d4e6..00000000 --- a/scrapegraphai/models/azure_openai.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -AzureOpenAI Module -""" -from langchain_openai import AzureChatOpenAI - - -class AzureOpenAI(AzureChatOpenAI): - """ - A wrapper for the AzureChatOpenAI class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. 
- """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) diff --git a/scrapegraphai/models/bedrock.py b/scrapegraphai/models/bedrock.py deleted file mode 100644 index 06299075..00000000 --- a/scrapegraphai/models/bedrock.py +++ /dev/null @@ -1,19 +0,0 @@ -""" -Bedrock Module -""" -from langchain_aws import ChatBedrock - - -class Bedrock(ChatBedrock): - """Class for wrapping bedrock module""" - - def __init__(self, llm_config: dict): - """ - A wrapper for the ChatBedrock class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. - """ - # Initialize the superclass (ChatBedrock) with provided config parameters - super().__init__(**llm_config) diff --git a/scrapegraphai/models/fireworks.py b/scrapegraphai/models/fireworks.py deleted file mode 100644 index 445c4846..00000000 --- a/scrapegraphai/models/fireworks.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -Fireworks Module -""" -from langchain_fireworks import ChatFireworks - - -class Fireworks(ChatFireworks): - """ - Initializes the Fireworks class. - - Args: - llm_config (dict): A dictionary containing configuration parameters for the LLM (required). - The specific keys and values will depend on the LLM implementation - used by the underlying `ChatFireworks` class. Consult its documentation - for details. - - Raises: - ValueError: If required keys are missing from the llm_config dictionary. - """ - - def __init__(self, llm_config: dict): - """ - Initializes the Fireworks class. - - Args: - llm_config (dict): A dictionary containing configuration parameters for the LLM. - The specific keys and values will depend on the LLM implementation. - - Raises: - ValueError: If required keys are missing from the llm_config dictionary. 
- """ - - super().__init__(**llm_config) diff --git a/scrapegraphai/models/gemini.py b/scrapegraphai/models/gemini.py deleted file mode 100644 index 1c939c6c..00000000 --- a/scrapegraphai/models/gemini.py +++ /dev/null @@ -1,20 +0,0 @@ -""" -Gemini Module -""" -from langchain_google_genai import ChatGoogleGenerativeAI - - -class Gemini(ChatGoogleGenerativeAI): - """ - A wrapper for the Gemini class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model - (e.g., model="gemini-pro") - """ - - def __init__(self, llm_config: dict): - # replace "api_key" to "google_api_key" - llm_config["google_api_key"] = llm_config.pop("api_key", None) - super().__init__(**llm_config) diff --git a/scrapegraphai/models/groq.py b/scrapegraphai/models/groq.py deleted file mode 100644 index 755f50aa..00000000 --- a/scrapegraphai/models/groq.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -Groq Module -""" - -from langchain_groq import ChatGroq - -class Groq(ChatGroq): - """ - A wrapper for the Groq class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model (e.g., model="llama3-70b-8192") - """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) \ No newline at end of file diff --git a/scrapegraphai/models/hugging_face.py b/scrapegraphai/models/hugging_face.py deleted file mode 100644 index 9696db1e..00000000 --- a/scrapegraphai/models/hugging_face.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -HuggingFace Module -""" -from langchain_community.chat_models.huggingface import ChatHuggingFace - - -class HuggingFace(ChatHuggingFace): - """ - A wrapper for the HuggingFace class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. 
- """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) diff --git a/scrapegraphai/models/vertex.py b/scrapegraphai/models/vertex.py deleted file mode 100644 index eb4676fc..00000000 --- a/scrapegraphai/models/vertex.py +++ /dev/null @@ -1,16 +0,0 @@ -""" -VertexAI Module -""" -from langchain_google_vertexai import ChatVertexAI - -class VertexAI(ChatVertexAI): - """ - A wrapper for the ChatVertexAI class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. - """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) From 927548624034b3c30eca60011d216720102d1815 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Mon, 29 Jul 2024 21:57:37 +0200 Subject: [PATCH 21/51] refactor: remove redundant LangChain wrappers --- pyproject.toml | 3 +- requirements-dev.lock | 49 ++++++++++++++++ requirements.lock | 52 +++++++++++++++++ requirements.txt | 1 + scrapegraphai/graphs/abstract_graph.py | 77 ++++++++++++-------------- 5 files changed, 140 insertions(+), 42 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b7b0d55d..bee7b61d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,8 @@ dependencies = [ "undetected-playwright>=0.3.0", "semchunk>=1.0.1", "langchain-fireworks>=0.1.3", - "langchain-community>=0.2.9" + "langchain-community>=0.2.9", + "langchain-huggingface>=0.0.3", ] license = "MIT" diff --git a/requirements-dev.lock b/requirements-dev.lock index 2c56f3db..0b3ef491 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -106,6 +106,8 @@ fastapi-pagination==0.12.26 # via burr filelock==3.15.4 # via huggingface-hub + # via torch + # via transformers fireworks-ai==0.14.0 # via langchain-fireworks fonttools==4.53.1 @@ -117,6 +119,7 @@ frozenlist==1.4.1 # via aiosignal fsspec==2024.6.1 # via huggingface-hub + # via torch 
furo==2024.5.6 # via scrapegraphai gitdb==4.0.11 @@ -180,6 +183,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 @@ -212,7 +216,10 @@ httpx==0.27.0 httpx-sse==0.4.0 # via fireworks-ai huggingface-hub==0.24.0 + # via langchain-huggingface + # via sentence-transformers # via tokenizers + # via transformers idna==3.7 # via anyio # via email-validator @@ -235,11 +242,14 @@ jinja2==3.1.4 # via fastapi # via pydeck # via sphinx + # via torch jiter==0.5.0 # via anthropic jmespath==1.0.1 # via boto3 # via botocore +joblib==1.4.2 + # via scikit-learn jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 @@ -268,6 +278,7 @@ langchain-core==0.2.22 # via langchain-google-genai # via langchain-google-vertexai # via langchain-groq + # via langchain-huggingface # via langchain-nvidia-ai-endpoints # via langchain-openai # via langchain-text-splitters @@ -279,6 +290,8 @@ langchain-google-vertexai==1.0.7 # via scrapegraphai langchain-groq==0.1.6 # via scrapegraphai +langchain-huggingface==0.0.3 + # via scrapegraphai langchain-nvidia-ai-endpoints==0.1.6 # via scrapegraphai langchain-openai==0.1.17 @@ -309,6 +322,8 @@ minify-html==0.15.0 # via scrapegraphai mpire==2.10.2 # via semchunk +mpmath==1.3.0 + # via sympy multidict==6.0.5 # via aiohttp # via yarl @@ -316,6 +331,8 @@ multiprocess==0.70.16 # via mpire mypy-extensions==1.0.0 # via typing-inspect +networkx==3.2.1 + # via torch numpy==1.26.4 # via altair # via contourpy @@ -327,9 +344,13 @@ numpy==1.26.4 # via pandas # via pyarrow # via pydeck + # via scikit-learn + # via scipy + # via sentence-transformers # via sf-hamilton # via shapely # via streamlit + # via transformers openai==1.37.0 # via burr # via langchain-fireworks @@ -348,6 +369,7 @@ packaging==24.1 # via pytest # via sphinx # via streamlit + # via transformers pandas==2.2.2 # via altair # via scrapegraphai @@ -357,6 +379,7 @@ pillow==10.4.0 # via fireworks-ai # 
via langchain-nvidia-ai-endpoints # via matplotlib + # via sentence-transformers # via streamlit platformdirs==4.2.2 # via pylint @@ -436,12 +459,14 @@ pyyaml==6.0.1 # via langchain # via langchain-community # via langchain-core + # via transformers # via uvicorn referencing==0.35.1 # via jsonschema # via jsonschema-specifications regex==2024.5.15 # via tiktoken + # via transformers requests==2.32.3 # via burr # via free-proxy @@ -456,6 +481,7 @@ requests==2.32.3 # via sphinx # via streamlit # via tiktoken + # via transformers rich==13.7.1 # via streamlit # via typer @@ -466,8 +492,17 @@ rsa==4.9 # via google-auth s3transfer==0.10.2 # via boto3 +safetensors==0.4.3 + # via transformers +scikit-learn==1.5.1 + # via sentence-transformers +scipy==1.13.1 + # via scikit-learn + # via sentence-transformers semchunk==2.2.0 # via scrapegraphai +sentence-transformers==3.0.1 + # via langchain-huggingface sf-hamilton==1.72.1 # via burr shapely==2.0.5 @@ -513,16 +548,22 @@ starlette==0.37.2 # via fastapi streamlit==1.36.0 # via burr +sympy==1.13.1 + # via torch tenacity==8.5.0 # via langchain # via langchain-community # via langchain-core # via streamlit +threadpoolctl==3.5.0 + # via scikit-learn tiktoken==0.7.0 # via langchain-openai # via scrapegraphai tokenizers==0.19.1 # via anthropic + # via langchain-huggingface + # via transformers toml==0.10.2 # via streamlit tomli==2.0.1 @@ -532,6 +573,8 @@ tomlkit==0.13.0 # via pylint toolz==0.12.1 # via altair +torch==2.2.2 + # via sentence-transformers tornado==6.4.1 # via streamlit tqdm==4.66.4 @@ -541,6 +584,11 @@ tqdm==4.66.4 # via openai # via scrapegraphai # via semchunk + # via sentence-transformers + # via transformers +transformers==4.43.3 + # via langchain-huggingface + # via sentence-transformers typer==0.12.3 # via fastapi-cli typing-extensions==4.12.2 @@ -562,6 +610,7 @@ typing-extensions==4.12.2 # via sqlalchemy # via starlette # via streamlit + # via torch # via typer # via typing-inspect # via uvicorn diff --git 
a/requirements.lock b/requirements.lock index a943dff1..a9df041e 100644 --- a/requirements.lock +++ b/requirements.lock @@ -63,6 +63,8 @@ faiss-cpu==1.8.0.post1 # via scrapegraphai filelock==3.15.4 # via huggingface-hub + # via torch + # via transformers fireworks-ai==0.14.0 # via langchain-fireworks free-proxy==1.1.1 @@ -72,6 +74,7 @@ frozenlist==1.4.1 # via aiosignal fsspec==2024.6.1 # via huggingface-hub + # via torch google==3.0.0 # via scrapegraphai google-ai-generativelanguage==0.6.6 @@ -128,6 +131,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 @@ -156,17 +160,24 @@ httpx==0.27.0 httpx-sse==0.4.0 # via fireworks-ai huggingface-hub==0.24.0 + # via langchain-huggingface + # via sentence-transformers # via tokenizers + # via transformers idna==3.7 # via anyio # via httpx # via requests # via yarl +jinja2==3.1.4 + # via torch jiter==0.5.0 # via anthropic jmespath==1.0.1 # via boto3 # via botocore +joblib==1.4.2 + # via scikit-learn jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 @@ -189,6 +200,7 @@ langchain-core==0.2.22 # via langchain-google-genai # via langchain-google-vertexai # via langchain-groq + # via langchain-huggingface # via langchain-nvidia-ai-endpoints # via langchain-openai # via langchain-text-splitters @@ -200,6 +212,8 @@ langchain-google-vertexai==1.0.7 # via scrapegraphai langchain-groq==0.1.6 # via scrapegraphai +langchain-huggingface==0.0.3 + # via scrapegraphai langchain-nvidia-ai-endpoints==0.1.6 # via scrapegraphai langchain-openai==0.1.17 @@ -212,12 +226,16 @@ langsmith==0.1.93 # via langchain-core lxml==5.2.2 # via free-proxy +markupsafe==2.1.5 + # via jinja2 marshmallow==3.21.3 # via dataclasses-json minify-html==0.15.0 # via scrapegraphai mpire==2.10.2 # via semchunk +mpmath==1.3.0 + # via sympy multidict==6.0.5 # via aiohttp # via yarl @@ -225,13 +243,19 @@ multiprocess==0.70.16 # via mpire mypy-extensions==1.0.0 # via 
typing-inspect +networkx==3.2.1 + # via torch numpy==1.26.4 # via faiss-cpu # via langchain # via langchain-aws # via langchain-community # via pandas + # via scikit-learn + # via scipy + # via sentence-transformers # via shapely + # via transformers openai==1.37.0 # via langchain-fireworks # via langchain-openai @@ -244,11 +268,13 @@ packaging==24.1 # via huggingface-hub # via langchain-core # via marshmallow + # via transformers pandas==2.2.2 # via scrapegraphai pillow==10.4.0 # via fireworks-ai # via langchain-nvidia-ai-endpoints + # via sentence-transformers playwright==1.45.0 # via scrapegraphai # via undetected-playwright @@ -303,8 +329,10 @@ pyyaml==6.0.1 # via langchain # via langchain-community # via langchain-core + # via transformers regex==2024.5.15 # via tiktoken + # via transformers requests==2.32.3 # via free-proxy # via google-api-core @@ -316,12 +344,22 @@ requests==2.32.3 # via langchain-fireworks # via langsmith # via tiktoken + # via transformers rsa==4.9 # via google-auth s3transfer==0.10.2 # via boto3 +safetensors==0.4.3 + # via transformers +scikit-learn==1.5.1 + # via sentence-transformers +scipy==1.13.1 + # via scikit-learn + # via sentence-transformers semchunk==2.2.0 # via scrapegraphai +sentence-transformers==3.0.1 + # via langchain-huggingface shapely==2.0.5 # via google-cloud-aiplatform six==1.16.0 @@ -337,15 +375,23 @@ soupsieve==2.5 sqlalchemy==2.0.31 # via langchain # via langchain-community +sympy==1.13.1 + # via torch tenacity==8.5.0 # via langchain # via langchain-community # via langchain-core +threadpoolctl==3.5.0 + # via scikit-learn tiktoken==0.7.0 # via langchain-openai # via scrapegraphai tokenizers==0.19.1 # via anthropic + # via langchain-huggingface + # via transformers +torch==2.2.2 + # via sentence-transformers tqdm==4.66.4 # via google-generativeai # via huggingface-hub @@ -353,6 +399,11 @@ tqdm==4.66.4 # via openai # via scrapegraphai # via semchunk + # via sentence-transformers + # via transformers 
+transformers==4.43.3 + # via langchain-huggingface + # via sentence-transformers typing-extensions==4.12.2 # via anthropic # via anyio @@ -364,6 +415,7 @@ typing-extensions==4.12.2 # via pydantic-core # via pyee # via sqlalchemy + # via torch # via typing-inspect typing-inspect==0.9.0 # via dataclasses-json diff --git a/requirements.txt b/requirements.txt index 440bf78a..8f3f5da5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,4 @@ undetected-playwright>=0.3.0 semchunk>=1.0.1 langchain-fireworks>=0.1.3 langchain-community>=0.2.9 +langchain-huggingface>=0.0.3 diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index e1ce18f0..f27d1aee 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -3,33 +3,28 @@ """ from abc import ABC, abstractmethod -from typing import Optional, Union +from typing import Optional import uuid from pydantic import BaseModel from langchain_community.chat_models import ChatOllama from langchain_openai import ChatOpenAI -from langchain_aws import BedrockEmbeddings -from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings +from langchain_aws import BedrockEmbeddings, ChatBedrock +from langchain_huggingface import ChatHuggingFace, HuggingFaceEmbeddings +from langchain_community.embeddings import OllamaEmbeddings from langchain_google_genai import GoogleGenerativeAIEmbeddings -from langchain_google_vertexai import VertexAIEmbeddings +from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings +from langchain_google_genai import ChatGoogleGenerativeAI from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings -from langchain_fireworks import FireworksEmbeddings -from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings +from langchain_fireworks import FireworksEmbeddings, ChatFireworks +from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, 
AzureChatOpenAI from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings from ..helpers import models_tokens from ..models import ( - Anthropic, - AzureOpenAI, - Bedrock, - Gemini, - Groq, - HuggingFace, OneApi, - Fireworks, - VertexAI, - Nvidia + Nvidia, + DeepSeek ) from ..models.ernie import Ernie from langchain.chat_models import init_chat_model @@ -37,7 +32,6 @@ from ..utils.logging import set_verbosity_debug, set_verbosity_warning, set_verbosity_info from ..helpers import models_tokens -from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Anthropic, DeepSeek class AbstractGraph(ABC): @@ -181,7 +175,8 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: llm_params["model"] = "/".join(llm_params["model"].split("/")[1:]) except KeyError as exc: raise KeyError("Model not supported") from exc - return Fireworks(llm_params) + llm_params["model_provider"] = "fireworks" + return init_chat_model(**llm_params) elif "azure" in llm_params["model"]: # take the model after the last dash llm_params["model"] = llm_params["model"].split("/")[-1] @@ -189,7 +184,8 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: self.model_token = models_tokens["azure"][llm_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc - return AzureOpenAI(llm_params) + llm_params["model_provider"] = "azure_openai" + return init_chat_model(**llm_params) elif "nvidia" in llm_params["model"]: try: self.model_token = models_tokens["nvidia"][llm_params["model"].split("/")[-1]] @@ -203,20 +199,23 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: self.model_token = models_tokens["gemini"][llm_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc - return Gemini(llm_params) + llm_params["model_provider"] = "google_genai " + return init_chat_model(**llm_params) elif llm_params["model"].startswith("claude"): llm_params["model"] = llm_params["model"].split("/")[-1] try: 
self.model_token = models_tokens["claude"][llm_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc - return Anthropic(llm_params) + llm_params["model_provider"] = "anthropic" + return init_chat_model(**llm_params) elif llm_params["model"].startswith("vertexai"): try: self.model_token = models_tokens["vertexai"][llm_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc - return VertexAI(llm_params) + llm_params["model_provider"] = "google_vertexai" + return init_chat_model(**llm_params) elif "ollama" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("ollama/")[-1] @@ -246,7 +245,8 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: except KeyError: print("model not found, using default token size (8192)") self.model_token = 8192 - return HuggingFace(llm_params) + llm_params["model_provider"] = "hugging_face" + return init_chat_model(**llm_params) elif "groq" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("/")[-1] @@ -255,7 +255,8 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: except KeyError: print("model not found, using default token size (8192)") self.model_token = 8192 - return Groq(llm_params) + llm_params["model_provider"] = "groq" + return init_chat_model(**llm_params) elif "bedrock" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("/")[-1] model_id = llm_params["model"] @@ -265,22 +266,16 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: except KeyError: print("model not found, using default token size (8192)") self.model_token = 8192 - return Bedrock( - { - "client": client, - "model_id": model_id, - "model_kwargs": { - "temperature": llm_params["temperature"], - }, - } - ) + llm_params["model_provider"] = "bedrock" + return init_chat_model(**llm_params) elif "claude-3-" in llm_params["model"]: try: self.model_token = models_tokens["claude"]["claude3"] except KeyError: 
print("model not found, using default token size (8192)") self.model_token = 8192 - return Anthropic(llm_params) + llm_params["model_provider"] = "anthropic" + return init_chat_model(**llm_params) elif "deepseek" in llm_params["model"]: try: self.model_token = models_tokens["deepseek"][llm_params["model"]] @@ -308,7 +303,7 @@ def _create_default_embedder(self, llm_config=None) -> object: Raises: ValueError: If the model is not supported. """ - if isinstance(self.llm_model, Gemini): + if isinstance(self.llm_model, ChatGoogleGenerativeAI): return GoogleGenerativeAIEmbeddings( google_api_key=llm_config["api_key"], model="models/embedding-001" ) @@ -317,13 +312,13 @@ def _create_default_embedder(self, llm_config=None) -> object: base_url=self.llm_model.openai_api_base) elif isinstance(self.llm_model, DeepSeek): return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) - elif isinstance(self.llm_model, VertexAI): + elif isinstance(self.llm_model, ChatVertexAI): return VertexAIEmbeddings() elif isinstance(self.llm_model, AzureOpenAIEmbeddings): return self.llm_model - elif isinstance(self.llm_model, AzureOpenAI): + elif isinstance(self.llm_model, AzureChatOpenAI): return AzureOpenAIEmbeddings() - elif isinstance(self.llm_model, Fireworks): + elif isinstance(self.llm_model, ChatFireworks): return FireworksEmbeddings(model=self.llm_model.model_name) elif isinstance(self.llm_model, Nvidia): return NVIDIAEmbeddings(model=self.llm_model.model_name) @@ -335,9 +330,9 @@ def _create_default_embedder(self, llm_config=None) -> object: params.pop("temperature", None) return OllamaEmbeddings(**params) - elif isinstance(self.llm_model, HuggingFace): - return HuggingFaceHubEmbeddings(model=self.llm_model.model) - elif isinstance(self.llm_model, Bedrock): + elif isinstance(self.llm_model, ChatHuggingFace): + return HuggingFaceEmbeddings(model=self.llm_model.model) + elif isinstance(self.llm_model, ChatBedrock): return BedrockEmbeddings(client=None, 
model_id=self.llm_model.model_id) else: raise ValueError("Embedding Model missing or not supported") @@ -384,7 +379,7 @@ def _create_embedder(self, embedder_config: dict) -> object: models_tokens["hugging_face"][embedder_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc - return HuggingFaceHubEmbeddings(model=embedder_params["model"]) + return HuggingFaceEmbeddings(model=embedder_params["model"]) elif "fireworks" in embedder_params["model"]: embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) try: From bc2c9967d2f13ade6eeb7b23e9b423f6e79aa890 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:59:12 +0200 Subject: [PATCH 22/51] refactor: remove redundant wrappers for Ernie and Nvidia --- scrapegraphai/graphs/abstract_graph.py | 12 ++++++------ scrapegraphai/models/ernie.py | 17 ----------------- scrapegraphai/models/nvidia.py | 25 ------------------------- 3 files changed, 6 insertions(+), 48 deletions(-) delete mode 100644 scrapegraphai/models/ernie.py delete mode 100644 scrapegraphai/models/nvidia.py diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index f27d1aee..50de0a94 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -19,14 +19,14 @@ from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings from langchain_fireworks import FireworksEmbeddings, ChatFireworks from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI -from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings +from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA +from langchain_community.chat_models import ErnieBotChat from ..helpers import models_tokens from ..models import ( OneApi, - Nvidia, DeepSeek ) -from ..models.ernie import Ernie + from langchain.chat_models import init_chat_model from 
..utils.logging import set_verbosity_debug, set_verbosity_warning, set_verbosity_info @@ -192,7 +192,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: llm_params["model"] = "/".join(llm_params["model"].split("/")[1:]) except KeyError as exc: raise KeyError("Model not supported") from exc - return Nvidia(llm_params) + return ChatNVIDIA(llm_params) elif "gemini" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("/")[-1] try: @@ -289,7 +289,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: except KeyError: print("model not found, using default token size (8192)") self.model_token = 8192 - return Ernie(llm_params) + return ErnieBotChat(llm_params) else: raise ValueError("Model provided by the configuration not supported") @@ -320,7 +320,7 @@ def _create_default_embedder(self, llm_config=None) -> object: return AzureOpenAIEmbeddings() elif isinstance(self.llm_model, ChatFireworks): return FireworksEmbeddings(model=self.llm_model.model_name) - elif isinstance(self.llm_model, Nvidia): + elif isinstance(self.llm_model, ChatNVIDIA): return NVIDIAEmbeddings(model=self.llm_model.model_name) elif isinstance(self.llm_model, ChatOllama): # unwrap the kwargs from the model whihc is a dict diff --git a/scrapegraphai/models/ernie.py b/scrapegraphai/models/ernie.py deleted file mode 100644 index 75e2a261..00000000 --- a/scrapegraphai/models/ernie.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -Ernie Module -""" -from langchain_community.chat_models import ErnieBotChat - - -class Ernie(ErnieBotChat): - """ - A wrapper for the ErnieBotChat class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. 
- """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) diff --git a/scrapegraphai/models/nvidia.py b/scrapegraphai/models/nvidia.py deleted file mode 100644 index 48ce3c0f..00000000 --- a/scrapegraphai/models/nvidia.py +++ /dev/null @@ -1,25 +0,0 @@ -""" -This is a Python wrapper class for ChatNVIDIA. -It provides default configuration and could be extended with additional methods if needed. -The purpose of this wrapper is to simplify the creation of instances of ChatNVIDIA by providing -default configurations for certain parameters, -allowing users to focus on specifying other important parameters without having -to understand all the details of the underlying class's constructor. -It inherits from the base class ChatNVIDIA and overrides -its init method to provide a more user-friendly interface. -The constructor takes one argument: llm_config, which is used to initialize the superclass -with default configuration. -""" - -from langchain_nvidia_ai_endpoints import ChatNVIDIA - -class Nvidia(ChatNVIDIA): - """ A wrapper for the Nvidia class that provides default configuration - and could be extended with additional methods if needed. - - Args: - llm_config (dict): Configuration parameters for the language model. 
- """ - - def __init__(self, llm_config: dict): - super().__init__(**llm_config) From 07ef383ab94318d070a71685aa80af0c0d48d129 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 30 Jul 2024 11:11:31 +0200 Subject: [PATCH 23/51] add rye packages Co-Authored-By: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> --- requirements-dev.lock | 1 - requirements.lock | 1 - 2 files changed, 2 deletions(-) diff --git a/requirements-dev.lock b/requirements-dev.lock index 0b3ef491..6bbbd4b9 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -183,7 +183,6 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 diff --git a/requirements.lock b/requirements.lock index a9df041e..b4d1015d 100644 --- a/requirements.lock +++ b/requirements.lock @@ -131,7 +131,6 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 From 88710f1a7c7d50f57108456112da30d1a12a1ba1 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Tue, 30 Jul 2024 15:57:08 +0200 Subject: [PATCH 24/51] chore: remove unused import --- scrapegraphai/models/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapegraphai/models/__init__.py b/scrapegraphai/models/__init__.py index 9d27884b..ce798ad8 100644 --- a/scrapegraphai/models/__init__.py +++ b/scrapegraphai/models/__init__.py @@ -5,4 +5,3 @@ from .openai_tts import OpenAITextToSpeech from .deepseek import DeepSeek from .oneapi import OneApi -from .nvidia import Nvidia From 1db164e9e682eefbc1414989a043fefa2e9009c2 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 30 Jul 2024 16:12:31 +0200 Subject: [PATCH 25/51] feat: fix tests --- examples/single_node/robot_node.py | 4 ++-- tests/nodes/robot_node_test.py | 3 +-- tests/nodes/search_internet_node_test.py | 4 ++-- 
tests/nodes/search_link_node_test.py | 4 ++-- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/single_node/robot_node.py b/examples/single_node/robot_node.py index f51f8649..c2bcbbd1 100644 --- a/examples/single_node/robot_node.py +++ b/examples/single_node/robot_node.py @@ -2,7 +2,7 @@ Example of custom graph using existing nodes """ -from scrapegraphai.models import Ollama +from langchain_community.chat_models import ChatOllama from scrapegraphai.nodes import RobotsNode # ************************************************ @@ -26,7 +26,7 @@ # Define the node # ************************************************ -llm_model = Ollama(graph_config["llm"]) +llm_model = ChatOllama(graph_config["llm"]) robots_node = RobotsNode( input="url", diff --git a/tests/nodes/robot_node_test.py b/tests/nodes/robot_node_test.py index 00a45b05..62527dda 100644 --- a/tests/nodes/robot_node_test.py +++ b/tests/nodes/robot_node_test.py @@ -1,7 +1,6 @@ import pytest from unittest.mock import MagicMock - -from scrapegraphai.models import Ollama +from langchain_community.chat_models import ChatOllama from scrapegraphai.nodes import RobotsNode @pytest.fixture diff --git a/tests/nodes/search_internet_node_test.py b/tests/nodes/search_internet_node_test.py index db2cbdee..8e198448 100644 --- a/tests/nodes/search_internet_node_test.py +++ b/tests/nodes/search_internet_node_test.py @@ -1,5 +1,5 @@ import unittest -from scrapegraphai.models import Ollama +from langchain_community.chat_models import ChatOllama from scrapegraphai.nodes import SearchInternetNode class TestSearchInternetNode(unittest.TestCase): @@ -18,7 +18,7 @@ def setUp(self): } # Define the model - self.llm_model = Ollama(self.graph_config["llm"]) + self.llm_model = ChatOllama(self.graph_config["llm"]) # Initialize the SearchInternetNode self.search_node = SearchInternetNode( diff --git a/tests/nodes/search_link_node_test.py b/tests/nodes/search_link_node_test.py index 648db4ee..1f8c5a58 100644 --- 
a/tests/nodes/search_link_node_test.py +++ b/tests/nodes/search_link_node_test.py @@ -1,5 +1,5 @@ import pytest -from scrapegraphai.models import Ollama +from langchain_community.chat_models import ChatOllama from scrapegraphai.nodes import SearchLinkNode from unittest.mock import patch, MagicMock @@ -18,7 +18,7 @@ def setup(): } # Instantiate the LLM model with the configuration - llm_model = Ollama(graph_config["llm"]) + llm_model = ChatOllama(graph_config["llm"]) # Define the SearchLinkNode with necessary configurations search_link_node = SearchLinkNode( From b15fd9f4dc3643c9904a2cbaa5f392a6805c9762 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Tue, 30 Jul 2024 14:19:46 +0000 Subject: [PATCH 26/51] ci(release): 1.11.0-beta.5 [skip ci] ## [1.11.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.4...v1.11.0-beta.5) (2024-07-30) ### Features * fix tests ([1db164e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1db164e9e682eefbc1414989a043fefa2e9009c2)) ### chore * remove unused import ([88710f1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/88710f1a7c7d50f57108456112da30d1a12a1ba1)) ### Refactor * **Ollama:** integrate new LangChain chat init ([d177afb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d177afb68be036465ede1f567d2562b145d77d36)) * **OpenAI:** integrate new LangChain chat init ([5e3eb6e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5e3eb6e43df4bd4c452d34b49f254235e9ff1b22)) * remove LangChain wrappers ([2c5f934](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2c5f934f101e319ec4e61009d4c464ca4626c1ff)) * remove LangChain wrappers for Ollama ([25066b2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/25066b2bc51517e50058231664230b8edef365b9)) * remove redundant LangChain wrappers ([9275486](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/927548624034b3c30eca60011d216720102d1815)) * remove redundant wrappers for Ernie and Nvidia 
([bc2c996](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bc2c9967d2f13ade6eeb7b23e9b423f6e79aa890)) --- CHANGELOG.md | 22 ++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea0c578f..1d7b4c62 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,25 @@ +## [1.11.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.4...v1.11.0-beta.5) (2024-07-30) + + +### Features + +* fix tests ([1db164e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/1db164e9e682eefbc1414989a043fefa2e9009c2)) + + +### chore + +* remove unused import ([88710f1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/88710f1a7c7d50f57108456112da30d1a12a1ba1)) + + +### Refactor + +* **Ollama:** integrate new LangChain chat init ([d177afb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d177afb68be036465ede1f567d2562b145d77d36)) +* **OpenAI:** integrate new LangChain chat init ([5e3eb6e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5e3eb6e43df4bd4c452d34b49f254235e9ff1b22)) +* remove LangChain wrappers ([2c5f934](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2c5f934f101e319ec4e61009d4c464ca4626c1ff)) +* remove LangChain wrappers for Ollama ([25066b2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/25066b2bc51517e50058231664230b8edef365b9)) +* remove redundant LangChain wrappers ([9275486](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/927548624034b3c30eca60011d216720102d1815)) +* remove redundant wrappers for Ernie and Nvidia ([bc2c996](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bc2c9967d2f13ade6eeb7b23e9b423f6e79aa890)) + ## [1.11.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.3...v1.11.0-beta.4) (2024-07-25) diff --git a/pyproject.toml b/pyproject.toml index bee7b61d..77d48e36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = 
"1.11.0b4" +version = "1.11.0b5" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ From b17756d934e0a26791bb51aa60a8c79b3f8b82a4 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Wed, 31 Jul 2024 13:07:19 +0200 Subject: [PATCH 27/51] style: enforce pylint styling --- scrapegraphai/graphs/abstract_graph.py | 95 +++++++++++++------------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 50de0a94..b022607c 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -7,31 +7,24 @@ import uuid from pydantic import BaseModel -from langchain_community.chat_models import ChatOllama -from langchain_openai import ChatOpenAI - +from langchain_community.chat_models import ChatOllama, ErnieBotChat from langchain_aws import BedrockEmbeddings, ChatBedrock from langchain_huggingface import ChatHuggingFace, HuggingFaceEmbeddings from langchain_community.embeddings import OllamaEmbeddings -from langchain_google_genai import GoogleGenerativeAIEmbeddings +from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings -from langchain_google_genai import ChatGoogleGenerativeAI -from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings from langchain_fireworks import FireworksEmbeddings, ChatFireworks from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA -from langchain_community.chat_models import ErnieBotChat +from langchain.chat_models import init_chat_model + from ..helpers import models_tokens from ..models import ( OneApi, DeepSeek ) +from ..utils.logging import set_verbosity_warning, 
set_verbosity_info -from langchain.chat_models import init_chat_model - -from ..utils.logging import set_verbosity_debug, set_verbosity_warning, set_verbosity_info - -from ..helpers import models_tokens class AbstractGraph(ABC): @@ -65,14 +58,14 @@ class AbstractGraph(ABC): >>> result = my_graph.run() """ - def __init__(self, prompt: str, config: dict, + def __init__(self, prompt: str, config: dict, source: Optional[str] = None, schema: Optional[BaseModel] = None): self.prompt = prompt self.source = source self.config = config self.schema = schema - self.llm_model = self._create_llm(config["llm"], chat=True) + self.llm_model = self._create_llm(config["llm"]) self.embedder_model = self._create_default_embedder(llm_config=config["llm"]) if "embeddings" not in config else self._create_embedder( config["embeddings"]) self.verbose = False if config is None else config.get( @@ -128,7 +121,7 @@ def set_common_params(self, params: dict, overwrite=False): for node in self.graph.nodes: node.update_config(params, overwrite) - def _create_llm(self, llm_config: dict, chat=False) -> object: + def _create_llm(self, llm_config: dict) -> object: """ Create a large language model instance based on the configuration provided. 
@@ -148,9 +141,9 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: # If model instance is passed directly instead of the model details if "model_instance" in llm_params: try: - self.model_token = llm_params["model_tokens"] + self.model_token = llm_params["model_tokens"] except KeyError as exc: - raise KeyError("model_tokens not specified") from exc + raise KeyError("model_tokens not specified") from exc return llm_params["model_instance"] # Instantiate the language model based on the model name @@ -161,7 +154,8 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: except KeyError as exc: raise KeyError("Model not supported") from exc return init_chat_model(**llm_params) - elif "oneapi" in llm_params["model"]: + + if "oneapi" in llm_params["model"]: # take the model after the last dash llm_params["model"] = llm_params["model"].split("/")[-1] try: @@ -169,7 +163,8 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: except KeyError as exc: raise KeyError("Model not supported") from exc return OneApi(llm_params) - elif "fireworks" in llm_params["model"]: + + if "fireworks" in llm_params["model"]: try: self.model_token = models_tokens["fireworks"][llm_params["model"].split("/")[-1]] llm_params["model"] = "/".join(llm_params["model"].split("/")[1:]) @@ -177,7 +172,8 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: raise KeyError("Model not supported") from exc llm_params["model_provider"] = "fireworks" return init_chat_model(**llm_params) - elif "azure" in llm_params["model"]: + + if "azure" in llm_params["model"]: # take the model after the last dash llm_params["model"] = llm_params["model"].split("/")[-1] try: @@ -186,14 +182,16 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: raise KeyError("Model not supported") from exc llm_params["model_provider"] = "azure_openai" return init_chat_model(**llm_params) - elif "nvidia" in llm_params["model"]: + + if "nvidia" in llm_params["model"]: try: 
self.model_token = models_tokens["nvidia"][llm_params["model"].split("/")[-1]] llm_params["model"] = "/".join(llm_params["model"].split("/")[1:]) except KeyError as exc: raise KeyError("Model not supported") from exc return ChatNVIDIA(llm_params) - elif "gemini" in llm_params["model"]: + + if "gemini" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("/")[-1] try: self.model_token = models_tokens["gemini"][llm_params["model"]] @@ -201,7 +199,8 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: raise KeyError("Model not supported") from exc llm_params["model_provider"] = "google_genai " return init_chat_model(**llm_params) - elif llm_params["model"].startswith("claude"): + + if llm_params["model"].startswith("claude"): llm_params["model"] = llm_params["model"].split("/")[-1] try: self.model_token = models_tokens["claude"][llm_params["model"]] @@ -209,7 +208,8 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: raise KeyError("Model not supported") from exc llm_params["model_provider"] = "anthropic" return init_chat_model(**llm_params) - elif llm_params["model"].startswith("vertexai"): + + if llm_params["model"].startswith("vertexai"): try: self.model_token = models_tokens["vertexai"][llm_params["model"]] except KeyError as exc: @@ -217,7 +217,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: llm_params["model_provider"] = "google_vertexai" return init_chat_model(**llm_params) - elif "ollama" in llm_params["model"]: + if "ollama" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("ollama/")[-1] llm_params["model_provider"] = "ollama" @@ -238,7 +238,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: return init_chat_model(**llm_params) - elif "hugging_face" in llm_params["model"]: + if "hugging_face" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("/")[-1] try: self.model_token = models_tokens["hugging_face"][llm_params["model"]] @@ -247,7 
+247,8 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: self.model_token = 8192 llm_params["model_provider"] = "hugging_face" return init_chat_model(**llm_params) - elif "groq" in llm_params["model"]: + + if "groq" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("/")[-1] try: @@ -257,10 +258,9 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: self.model_token = 8192 llm_params["model_provider"] = "groq" return init_chat_model(**llm_params) - elif "bedrock" in llm_params["model"]: + + if "bedrock" in llm_params["model"]: llm_params["model"] = llm_params["model"].split("/")[-1] - model_id = llm_params["model"] - client = llm_params.get("client", None) try: self.model_token = models_tokens["bedrock"][llm_params["model"]] except KeyError: @@ -268,7 +268,8 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: self.model_token = 8192 llm_params["model_provider"] = "bedrock" return init_chat_model(**llm_params) - elif "claude-3-" in llm_params["model"]: + + if "claude-3-" in llm_params["model"]: try: self.model_token = models_tokens["claude"]["claude3"] except KeyError: @@ -276,22 +277,24 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: self.model_token = 8192 llm_params["model_provider"] = "anthropic" return init_chat_model(**llm_params) - elif "deepseek" in llm_params["model"]: + + if "deepseek" in llm_params["model"]: try: self.model_token = models_tokens["deepseek"][llm_params["model"]] except KeyError: print("model not found, using default token size (8192)") self.model_token = 8192 return DeepSeek(llm_params) - elif "ernie" in llm_params["model"]: + + if "ernie" in llm_params["model"]: try: self.model_token = models_tokens["ernie"][llm_params["model"]] except KeyError: print("model not found, using default token size (8192)") self.model_token = 8192 return ErnieBotChat(llm_params) - else: - raise ValueError("Model provided by the configuration not supported") + + raise 
ValueError("Model provided by the configuration not supported") def _create_default_embedder(self, llm_config=None) -> object: """ @@ -308,7 +311,7 @@ def _create_default_embedder(self, llm_config=None) -> object: google_api_key=llm_config["api_key"], model="models/embedding-001" ) if isinstance(self.llm_model, ChatOpenAI): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key, + return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key, base_url=self.llm_model.openai_api_base) elif isinstance(self.llm_model, DeepSeek): return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) @@ -356,7 +359,7 @@ def _create_embedder(self, embedder_config: dict) -> object: # Instantiate the embedding model based on the model name if "openai" in embedder_params["model"]: return OpenAIEmbeddings(api_key=embedder_params["api_key"]) - elif "azure" in embedder_params["model"]: + if "azure" in embedder_params["model"]: return AzureOpenAIEmbeddings() if "nvidia" in embedder_params["model"]: embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) @@ -364,36 +367,36 @@ def _create_embedder(self, embedder_config: dict) -> object: models_tokens["nvidia"][embedder_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc - return NVIDIAEmbeddings(model=embedder_params["model"], + return NVIDIAEmbeddings(model=embedder_params["model"], nvidia_api_key=embedder_params["api_key"]) - elif "ollama" in embedder_params["model"]: + if "ollama" in embedder_params["model"]: embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) try: models_tokens["ollama"][embedder_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc return OllamaEmbeddings(**embedder_params) - elif "hugging_face" in embedder_params["model"]: + if "hugging_face" in embedder_params["model"]: embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) try: 
models_tokens["hugging_face"][embedder_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc return HuggingFaceEmbeddings(model=embedder_params["model"]) - elif "fireworks" in embedder_params["model"]: + if "fireworks" in embedder_params["model"]: embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) try: models_tokens["fireworks"][embedder_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc return FireworksEmbeddings(model=embedder_params["model"]) - elif "gemini" in embedder_params["model"]: + if "gemini" in embedder_params["model"]: try: models_tokens["gemini"][embedder_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc return GoogleGenerativeAIEmbeddings(model=embedder_params["model"]) - elif "bedrock" in embedder_params["model"]: + if "bedrock" in embedder_params["model"]: embedder_params["model"] = embedder_params["model"].split("/")[-1] client = embedder_params.get("client", None) try: @@ -401,8 +404,8 @@ def _create_embedder(self, embedder_config: dict) -> object: except KeyError as exc: raise KeyError("Model not supported") from exc return BedrockEmbeddings(client=client, model_id=embedder_params["model"]) - else: - raise ValueError("Model provided by the configuration not supported") + + raise ValueError("Model provided by the configuration not supported") def get_state(self, key=None) -> dict: """ "" @@ -444,11 +447,9 @@ def _create_graph(self): """ Abstract method to create a graph representation. """ - pass @abstractmethod def run(self) -> str: """ Abstract method to execute the graph and return the result. 
""" - pass \ No newline at end of file From bb73d916a1a7b378438038ec928eeda6d8f6ac9d Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Wed, 31 Jul 2024 13:41:09 +0200 Subject: [PATCH 28/51] refactor: reuse code for common interface models --- scrapegraphai/graphs/abstract_graph.py | 157 ++++++++----------------- 1 file changed, 49 insertions(+), 108 deletions(-) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index b022607c..306901e8 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -146,138 +146,61 @@ def _create_llm(self, llm_config: dict) -> object: raise KeyError("model_tokens not specified") from exc return llm_params["model_instance"] - # Instantiate the language model based on the model name - if "gpt-" in llm_params["model"]: + # Instantiate the language model based on the model name (models that use the common interface) + def handle_model(model_name, provider, token_key, default_token=8192): try: - self.model_token = models_tokens["openai"][llm_params["model"]] - llm_params["model_provider"] = "openai" - except KeyError as exc: - raise KeyError("Model not supported") from exc + self.model_token = models_tokens[provider][token_key] + except KeyError: + print(f"Model not found, using default token size ({default_token})") + self.model_token = default_token + llm_params["model_provider"] = provider + llm_params["model"] = model_name return init_chat_model(**llm_params) - if "oneapi" in llm_params["model"]: - # take the model after the last dash - llm_params["model"] = llm_params["model"].split("/")[-1] - try: - self.model_token = models_tokens["oneapi"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return OneApi(llm_params) + if "gpt-" in llm_params["model"]: + return handle_model(llm_params["model"], "openai", llm_params["model"]) if "fireworks" in 
llm_params["model"]: - try: - self.model_token = models_tokens["fireworks"][llm_params["model"].split("/")[-1]] - llm_params["model"] = "/".join(llm_params["model"].split("/")[1:]) - except KeyError as exc: - raise KeyError("Model not supported") from exc - llm_params["model_provider"] = "fireworks" - return init_chat_model(**llm_params) + model_name = "/".join(llm_params["model"].split("/")[1:]) + token_key = llm_params["model"].split("/")[-1] + return handle_model(model_name, "fireworks", token_key) if "azure" in llm_params["model"]: - # take the model after the last dash - llm_params["model"] = llm_params["model"].split("/")[-1] - try: - self.model_token = models_tokens["azure"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - llm_params["model_provider"] = "azure_openai" - return init_chat_model(**llm_params) - - if "nvidia" in llm_params["model"]: - try: - self.model_token = models_tokens["nvidia"][llm_params["model"].split("/")[-1]] - llm_params["model"] = "/".join(llm_params["model"].split("/")[1:]) - except KeyError as exc: - raise KeyError("Model not supported") from exc - return ChatNVIDIA(llm_params) + model_name = llm_params["model"].split("/")[-1] + return handle_model(model_name, "azure_openai", model_name) if "gemini" in llm_params["model"]: - llm_params["model"] = llm_params["model"].split("/")[-1] - try: - self.model_token = models_tokens["gemini"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - llm_params["model_provider"] = "google_genai " - return init_chat_model(**llm_params) + model_name = llm_params["model"].split("/")[-1] + return handle_model(model_name, "google_genai", model_name) if llm_params["model"].startswith("claude"): - llm_params["model"] = llm_params["model"].split("/")[-1] - try: - self.model_token = models_tokens["claude"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - 
llm_params["model_provider"] = "anthropic" - return init_chat_model(**llm_params) + model_name = llm_params["model"].split("/")[-1] + return handle_model(model_name, "anthropic", model_name) if llm_params["model"].startswith("vertexai"): - try: - self.model_token = models_tokens["vertexai"][llm_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - llm_params["model_provider"] = "google_vertexai" - return init_chat_model(**llm_params) + return handle_model(llm_params["model"], "google_vertexai", llm_params["model"]) if "ollama" in llm_params["model"]: - llm_params["model"] = llm_params["model"].split("ollama/")[-1] - llm_params["model_provider"] = "ollama" - - # allow user to set model_tokens in config - try: - if "model_tokens" in llm_params: - self.model_token = llm_params["model_tokens"] - elif llm_params["model"] in models_tokens["ollama"]: - try: - self.model_token = models_tokens["ollama"][llm_params["model"]] - except KeyError as exc: - print("model not found, using default token size (8192)") - self.model_token = 8192 - else: - self.model_token = 8192 - except AttributeError: - self.model_token = 8192 - - return init_chat_model(**llm_params) + model_name = llm_params["model"].split("ollama/")[-1] + token_key = model_name if "model_tokens" not in llm_params else llm_params["model_tokens"] + return handle_model(model_name, "ollama", token_key) if "hugging_face" in llm_params["model"]: - llm_params["model"] = llm_params["model"].split("/")[-1] - try: - self.model_token = models_tokens["hugging_face"][llm_params["model"]] - except KeyError: - print("model not found, using default token size (8192)") - self.model_token = 8192 - llm_params["model_provider"] = "hugging_face" - return init_chat_model(**llm_params) + model_name = llm_params["model"].split("/")[-1] + return handle_model(model_name, "hugging_face", model_name) if "groq" in llm_params["model"]: - llm_params["model"] = llm_params["model"].split("/")[-1] - - try: - 
self.model_token = models_tokens["groq"][llm_params["model"]] - except KeyError: - print("model not found, using default token size (8192)") - self.model_token = 8192 - llm_params["model_provider"] = "groq" - return init_chat_model(**llm_params) + model_name = llm_params["model"].split("/")[-1] + return handle_model(model_name, "groq", model_name) if "bedrock" in llm_params["model"]: - llm_params["model"] = llm_params["model"].split("/")[-1] - try: - self.model_token = models_tokens["bedrock"][llm_params["model"]] - except KeyError: - print("model not found, using default token size (8192)") - self.model_token = 8192 - llm_params["model_provider"] = "bedrock" - return init_chat_model(**llm_params) + model_name = llm_params["model"].split("/")[-1] + return handle_model(model_name, "bedrock", model_name) if "claude-3-" in llm_params["model"]: - try: - self.model_token = models_tokens["claude"]["claude3"] - except KeyError: - print("model not found, using default token size (8192)") - self.model_token = 8192 - llm_params["model_provider"] = "anthropic" - return init_chat_model(**llm_params) + return handle_model(llm_params["model"], "anthropic", "claude3") + # Instantiate the language model based on the model name (models that do not use the common interface) if "deepseek" in llm_params["model"]: try: self.model_token = models_tokens["deepseek"][llm_params["model"]] @@ -293,7 +216,25 @@ def _create_llm(self, llm_config: dict) -> object: print("model not found, using default token size (8192)") self.model_token = 8192 return ErnieBotChat(llm_params) + + if "oneapi" in llm_params["model"]: + # take the model after the last dash + llm_params["model"] = llm_params["model"].split("/")[-1] + try: + self.model_token = models_tokens["oneapi"][llm_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return OneApi(llm_params) + + if "nvidia" in llm_params["model"]: + try: + self.model_token = 
models_tokens["nvidia"][llm_params["model"].split("/")[-1]] + llm_params["model"] = "/".join(llm_params["model"].split("/")[1:]) + except KeyError as exc: + raise KeyError("Model not supported") from exc + return ChatNVIDIA(llm_params) + # Raise an error if the model did not match any of the previous cases raise ValueError("Model provided by the configuration not supported") def _create_default_embedder(self, llm_config=None) -> object: From 4caed545e5030460b2d5e46f9ad90546ce36f0ee Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 31 Jul 2024 19:49:59 +0200 Subject: [PATCH 29/51] feat: intregration of firebase --- pyproject.toml | 1 + scrapegraphai/docloaders/__init__.py | 1 + scrapegraphai/docloaders/broswer_base.py | 46 ++++++++++++++++++++++++ 3 files changed, 48 insertions(+) create mode 100644 scrapegraphai/docloaders/broswer_base.py diff --git a/pyproject.toml b/pyproject.toml index 77d48e36..2738bfd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "langchain-fireworks>=0.1.3", "langchain-community>=0.2.9", "langchain-huggingface>=0.0.3", + "browserbase==0.3.0" ] license = "MIT" diff --git a/scrapegraphai/docloaders/__init__.py b/scrapegraphai/docloaders/__init__.py index a9e45407..0efdc879 100644 --- a/scrapegraphai/docloaders/__init__.py +++ b/scrapegraphai/docloaders/__init__.py @@ -1,3 +1,4 @@ """__init__.py file for docloaders folder""" from .chromium import ChromiumLoader +from .broswer_base import browser_base_fetch \ No newline at end of file diff --git a/scrapegraphai/docloaders/broswer_base.py b/scrapegraphai/docloaders/broswer_base.py new file mode 100644 index 00000000..6127c097 --- /dev/null +++ b/scrapegraphai/docloaders/broswer_base.py @@ -0,0 +1,46 @@ +""" +browserbase integration module +""" +from browserbase import Browserbase + +def browser_base_fetch(api_key: str, project_id: str, link: str) -> object: + """ + BrowserBase Fetch + + This module provides an interface to the BrowserBase API. 
+ + The `browser_base_fetch` function takes three arguments: + - `api_key`: The API key provided by BrowserBase. + - `project_id`: The ID of the project on BrowserBase where you want to fetch data from. + - `link`: The URL or link that you want to fetch data from. + + It initializes a Browserbase object with the given API key and project ID, + then uses this object to load the specified link. It returns the result of the loading operation. + + Example usage: + + ``` + from browser_base_fetch import browser_base_fetch + + result = browser_base_fetch(api_key="your_api_key", + project_id="your_project_id", link="https://example.com") + print(result) + ``` + + Please note that you need to replace "your_api_key" and "your_project_id" + with your actual BrowserBase API key and project ID. + + Args: + api_key (str): The API key provided by BrowserBase. + project_id (str): The ID of the project on BrowserBase where you want to fetch data from. + link (str): The URL or link that you want to fetch data from. + + Returns: + object: The result of the loading operation. 
+ """ + + browserbase = Browserbase(api_key=api_key, project_id=project_id) + + result = browserbase.load(link) + + return result From 74ed8d06c5db4f9734521c2f84f4379b18b7308f Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 31 Jul 2024 17:51:22 +0000 Subject: [PATCH 30/51] ci(release): 1.11.0-beta.6 [skip ci] ## [1.11.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.5...v1.11.0-beta.6) (2024-07-31) ### Features * intregration of firebase ([4caed54](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4caed545e5030460b2d5e46f9ad90546ce36f0ee)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d7b4c62..53e36c8a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.11.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.5...v1.11.0-beta.6) (2024-07-31) + + +### Features + +* intregration of firebase ([4caed54](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4caed545e5030460b2d5e46f9ad90546ce36f0ee)) + ## [1.11.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.4...v1.11.0-beta.5) (2024-07-30) diff --git a/pyproject.toml b/pyproject.toml index 2738bfd6..4a7fe29f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.11.0b5" +version = "1.11.0b6" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
authors = [ From fb87d01ced72c0912be86ae01d93ceefa5d2df08 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 11:27:10 +0200 Subject: [PATCH 31/51] Create browser_base.py --- examples/extras/browser_base.py | 47 +++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 examples/extras/browser_base.py diff --git a/examples/extras/browser_base.py b/examples/extras/browser_base.py new file mode 100644 index 00000000..465c80ba --- /dev/null +++ b/examples/extras/browser_base.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os, json +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info +from dotenv import load_dotenv +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("OPENAI_API_KEY"), + "model": "gpt-3.5-turbo", + }, + "browser_base": { + "api_key": os.getenv("BROWSER_BASE_API_KEY"), + "project_id": os.getenv("BROWSER_BASE_API_KEY"), + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me what does the company do, the name and a contact email.", + source="https://scrapegraphai.com/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) From 7076ab12d3e07d02a96ca00375454385303ae004 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 11:31:27 +0200 Subject: 
[PATCH 32/51] allignment --- pyproject.toml | 1 + requirements-dev.lock | 5 ++++ requirements.lock | 5 ++++ scrapegraphai/docloaders/__init__.py | 1 + scrapegraphai/docloaders/browser_base.py | 38 ++++++++++++++++++++++++ 5 files changed, 50 insertions(+) create mode 100644 scrapegraphai/docloaders/browser_base.py diff --git a/pyproject.toml b/pyproject.toml index 77d48e36..2738bfd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "langchain-fireworks>=0.1.3", "langchain-community>=0.2.9", "langchain-huggingface>=0.0.3", + "browserbase==0.3.0" ] license = "MIT" diff --git a/requirements-dev.lock b/requirements-dev.lock index 6bbbd4b9..24b7156d 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -54,6 +54,8 @@ boto3==1.34.146 botocore==1.34.146 # via boto3 # via s3transfer +browserbase==0.3.0 + # via scrapegraphai burr==0.22.1 # via scrapegraphai cachetools==5.4.0 @@ -208,6 +210,7 @@ httptools==0.6.1 # via uvicorn httpx==0.27.0 # via anthropic + # via browserbase # via fastapi # via fireworks-ai # via groq @@ -383,6 +386,7 @@ pillow==10.4.0 platformdirs==4.2.2 # via pylint playwright==1.45.0 + # via browserbase # via scrapegraphai # via undetected-playwright pluggy==1.5.0 @@ -412,6 +416,7 @@ pyasn1-modules==0.4.0 # via google-auth pydantic==2.8.2 # via anthropic + # via browserbase # via burr # via fastapi # via fastapi-pagination diff --git a/requirements.lock b/requirements.lock index b4d1015d..0e8bb930 100644 --- a/requirements.lock +++ b/requirements.lock @@ -37,6 +37,8 @@ boto3==1.34.146 botocore==1.34.146 # via boto3 # via s3transfer +browserbase==0.3.0 + # via scrapegraphai cachetools==5.4.0 # via google-auth certifi==2024.7.4 @@ -153,6 +155,7 @@ httplib2==0.22.0 # via google-auth-httplib2 httpx==0.27.0 # via anthropic + # via browserbase # via fireworks-ai # via groq # via openai @@ -275,6 +278,7 @@ pillow==10.4.0 # via langchain-nvidia-ai-endpoints # via sentence-transformers playwright==1.45.0 + # via 
browserbase # via scrapegraphai # via undetected-playwright proto-plus==1.24.0 @@ -299,6 +303,7 @@ pyasn1-modules==0.4.0 # via google-auth pydantic==2.8.2 # via anthropic + # via browserbase # via fireworks-ai # via google-cloud-aiplatform # via google-generativeai diff --git a/scrapegraphai/docloaders/__init__.py b/scrapegraphai/docloaders/__init__.py index a9e45407..51561a42 100644 --- a/scrapegraphai/docloaders/__init__.py +++ b/scrapegraphai/docloaders/__init__.py @@ -1,3 +1,4 @@ """__init__.py file for docloaders folder""" from .chromium import ChromiumLoader +from .broswer_base import browser_base_fetch diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py new file mode 100644 index 00000000..8f2a0b8e --- /dev/null +++ b/scrapegraphai/docloaders/browser_base.py @@ -0,0 +1,38 @@ +""" +browserbase integration module +""" +from browserbase import Browserbase + +def browser_base_fetch(api_key: str, project_id: str, link: str) -> object: + """ + BrowserBase Fetch + This module provides an interface to the BrowserBase API. + The `browser_base_fetch` function takes three arguments: + - `api_key`: The API key provided by BrowserBase. + - `project_id`: The ID of the project on BrowserBase where you want to fetch data from. + - `link`: The URL or link that you want to fetch data from. + It initializes a Browserbase object with the given API key and project ID, + then uses this object to load the specified link. + It returns the result of the loading operation. + Example usage: + ``` + from browser_base_fetch import browser_base_fetch + result = browser_base_fetch(api_key="your_api_key", + project_id="your_project_id", link="https://example.com") + print(result) + ``` + Please note that you need to replace "your_api_key" and "your_project_id" + with your actual BrowserBase API key and project ID. + Args: + api_key (str): The API key provided by BrowserBase. 
+ project_id (str): The ID of the project on BrowserBase where you want to fetch data from. + link (str): The URL or link that you want to fetch data from. + Returns: + object: The result of the loading operation. + """ + + browserbase = Browserbase(api_key=api_key, project_id=project_id) + + result = browserbase.load(link) + + return result From a94ebcde0078d66d33e67f7e0a87850efb92d408 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Thu, 1 Aug 2024 11:53:17 +0200 Subject: [PATCH 33/51] refactor: move embeddings code from AbstractGraph to RAGNode --- scrapegraphai/graphs/abstract_graph.py | 123 +-------------------- scrapegraphai/nodes/rag_node.py | 144 ++++++++++++++++++++++++- 2 files changed, 142 insertions(+), 125 deletions(-) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 306901e8..4ed08057 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -7,15 +7,8 @@ import uuid from pydantic import BaseModel -from langchain_community.chat_models import ChatOllama, ErnieBotChat -from langchain_aws import BedrockEmbeddings, ChatBedrock -from langchain_huggingface import ChatHuggingFace, HuggingFaceEmbeddings -from langchain_community.embeddings import OllamaEmbeddings -from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI -from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings -from langchain_fireworks import FireworksEmbeddings, ChatFireworks -from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI -from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA +from langchain_community.chat_models import ErnieBotChat +from langchain_nvidia_ai_endpoints import ChatNVIDIA from langchain.chat_models import init_chat_model from ..helpers import models_tokens @@ -66,8 +59,6 @@ def __init__(self, prompt: str, config: dict, 
self.config = config self.schema = schema self.llm_model = self._create_llm(config["llm"]) - self.embedder_model = self._create_default_embedder(llm_config=config["llm"]) if "embeddings" not in config else self._create_embedder( - config["embeddings"]) self.verbose = False if config is None else config.get( "verbose", False) self.headless = True if config is None else config.get( @@ -237,116 +228,6 @@ def handle_model(model_name, provider, token_key, default_token=8192): # Raise an error if the model did not match any of the previous cases raise ValueError("Model provided by the configuration not supported") - def _create_default_embedder(self, llm_config=None) -> object: - """ - Create an embedding model instance based on the chosen llm model. - - Returns: - object: An instance of the embedding model client. - - Raises: - ValueError: If the model is not supported. - """ - if isinstance(self.llm_model, ChatGoogleGenerativeAI): - return GoogleGenerativeAIEmbeddings( - google_api_key=llm_config["api_key"], model="models/embedding-001" - ) - if isinstance(self.llm_model, ChatOpenAI): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key, - base_url=self.llm_model.openai_api_base) - elif isinstance(self.llm_model, DeepSeek): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) - elif isinstance(self.llm_model, ChatVertexAI): - return VertexAIEmbeddings() - elif isinstance(self.llm_model, AzureOpenAIEmbeddings): - return self.llm_model - elif isinstance(self.llm_model, AzureChatOpenAI): - return AzureOpenAIEmbeddings() - elif isinstance(self.llm_model, ChatFireworks): - return FireworksEmbeddings(model=self.llm_model.model_name) - elif isinstance(self.llm_model, ChatNVIDIA): - return NVIDIAEmbeddings(model=self.llm_model.model_name) - elif isinstance(self.llm_model, ChatOllama): - # unwrap the kwargs from the model whihc is a dict - params = self.llm_model._lc_kwargs - # remove streaming and temperature - params.pop("streaming", None) - 
params.pop("temperature", None) - - return OllamaEmbeddings(**params) - elif isinstance(self.llm_model, ChatHuggingFace): - return HuggingFaceEmbeddings(model=self.llm_model.model) - elif isinstance(self.llm_model, ChatBedrock): - return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id) - else: - raise ValueError("Embedding Model missing or not supported") - - def _create_embedder(self, embedder_config: dict) -> object: - """ - Create an embedding model instance based on the configuration provided. - - Args: - embedder_config (dict): Configuration parameters for the embedding model. - - Returns: - object: An instance of the embedding model client. - - Raises: - KeyError: If the model is not supported. - """ - embedder_params = {**embedder_config} - if "model_instance" in embedder_config: - return embedder_params["model_instance"] - # Instantiate the embedding model based on the model name - if "openai" in embedder_params["model"]: - return OpenAIEmbeddings(api_key=embedder_params["api_key"]) - if "azure" in embedder_params["model"]: - return AzureOpenAIEmbeddings() - if "nvidia" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["nvidia"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return NVIDIAEmbeddings(model=embedder_params["model"], - nvidia_api_key=embedder_params["api_key"]) - if "ollama" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["ollama"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return OllamaEmbeddings(**embedder_params) - if "hugging_face" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["hugging_face"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model 
not supported") from exc - return HuggingFaceEmbeddings(model=embedder_params["model"]) - if "fireworks" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["fireworks"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return FireworksEmbeddings(model=embedder_params["model"]) - if "gemini" in embedder_params["model"]: - try: - models_tokens["gemini"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return GoogleGenerativeAIEmbeddings(model=embedder_params["model"]) - if "bedrock" in embedder_params["model"]: - embedder_params["model"] = embedder_params["model"].split("/")[-1] - client = embedder_params.get("client", None) - try: - models_tokens["bedrock"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return BedrockEmbeddings(client=client, model_id=embedder_params["model"]) - - raise ValueError("Model provided by the configuration not supported") def get_state(self, key=None) -> dict: """ "" diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index a4f58191..952daa6c 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -14,8 +14,20 @@ from langchain_community.document_transformers import EmbeddingsRedundantFilter from langchain_community.vectorstores import FAISS +from langchain_community.chat_models import ChatOllama +from langchain_aws import BedrockEmbeddings, ChatBedrock +from langchain_huggingface import ChatHuggingFace, HuggingFaceEmbeddings +from langchain_community.embeddings import OllamaEmbeddings +from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI +from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings +from langchain_fireworks import FireworksEmbeddings, ChatFireworks +from langchain_openai import 
AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI +from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA + from ..utils.logging import get_logger from .base_node import BaseNode +from ..helpers import models_tokens +from ..models import DeepSeek class RAGNode(BaseNode): @@ -95,10 +107,21 @@ def execute(self, state: dict) -> dict: self.logger.info("--- (updated chunks metadata) ---") # check if embedder_model is provided, if not use llm_model - self.embedder_model = ( - self.embedder_model if self.embedder_model else self.llm_model - ) - embeddings = self.embedder_model + if self.embedder_model is not None: + embeddings = self.embedder_model + elif 'embeddings' in self.node_config: + try: + embeddings = self._create_embedder(self.node_config['embedder_config']) + except Exception: + try: + embeddings = self._create_default_embedder() + self.embedder_model = embeddings + except ValueError: + embeddings = self.llm_model + self.embedder_model = self.llm_model + else: + embeddings = self.llm_model + self.embedder_model = self.llm_model folder_name = self.node_config.get("cache_path", "cache") @@ -141,3 +164,116 @@ def execute(self, state: dict) -> dict: state.update({self.output[0]: compressed_docs}) return state + + + def _create_default_embedder(self, llm_config=None) -> object: + """ + Create an embedding model instance based on the chosen llm model. + + Returns: + object: An instance of the embedding model client. + + Raises: + ValueError: If the model is not supported. 
+ """ + if isinstance(self.llm_model, ChatGoogleGenerativeAI): + return GoogleGenerativeAIEmbeddings( + google_api_key=llm_config["api_key"], model="models/embedding-001" + ) + if isinstance(self.llm_model, ChatOpenAI): + return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key, + base_url=self.llm_model.openai_api_base) + elif isinstance(self.llm_model, DeepSeek): + return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) + elif isinstance(self.llm_model, ChatVertexAI): + return VertexAIEmbeddings() + elif isinstance(self.llm_model, AzureOpenAIEmbeddings): + return self.llm_model + elif isinstance(self.llm_model, AzureChatOpenAI): + return AzureOpenAIEmbeddings() + elif isinstance(self.llm_model, ChatFireworks): + return FireworksEmbeddings(model=self.llm_model.model_name) + elif isinstance(self.llm_model, ChatNVIDIA): + return NVIDIAEmbeddings(model=self.llm_model.model_name) + elif isinstance(self.llm_model, ChatOllama): + # unwrap the kwargs from the model whihc is a dict + params = self.llm_model._lc_kwargs + # remove streaming and temperature + params.pop("streaming", None) + params.pop("temperature", None) + + return OllamaEmbeddings(**params) + elif isinstance(self.llm_model, ChatHuggingFace): + return HuggingFaceEmbeddings(model=self.llm_model.model) + elif isinstance(self.llm_model, ChatBedrock): + return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id) + else: + raise ValueError("Embedding Model missing or not supported") + + + def _create_embedder(self, embedder_config: dict) -> object: + """ + Create an embedding model instance based on the configuration provided. + + Args: + embedder_config (dict): Configuration parameters for the embedding model. + + Returns: + object: An instance of the embedding model client. + + Raises: + KeyError: If the model is not supported. 
+ """ + embedder_params = {**embedder_config} + if "model_instance" in embedder_config: + return embedder_params["model_instance"] + # Instantiate the embedding model based on the model name + if "openai" in embedder_params["model"]: + return OpenAIEmbeddings(api_key=embedder_params["api_key"]) + if "azure" in embedder_params["model"]: + return AzureOpenAIEmbeddings() + if "nvidia" in embedder_params["model"]: + embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) + try: + models_tokens["nvidia"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return NVIDIAEmbeddings(model=embedder_params["model"], + nvidia_api_key=embedder_params["api_key"]) + if "ollama" in embedder_params["model"]: + embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) + try: + models_tokens["ollama"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return OllamaEmbeddings(**embedder_params) + if "hugging_face" in embedder_params["model"]: + embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) + try: + models_tokens["hugging_face"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return HuggingFaceEmbeddings(model=embedder_params["model"]) + if "fireworks" in embedder_params["model"]: + embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) + try: + models_tokens["fireworks"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return FireworksEmbeddings(model=embedder_params["model"]) + if "gemini" in embedder_params["model"]: + try: + models_tokens["gemini"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return GoogleGenerativeAIEmbeddings(model=embedder_params["model"]) + if "bedrock" in embedder_params["model"]: + 
embedder_params["model"] = embedder_params["model"].split("/")[-1] + client = embedder_params.get("client", None) + try: + models_tokens["bedrock"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return BedrockEmbeddings(client=client, model_id=embedder_params["model"]) + + raise ValueError("Model provided by the configuration not supported") From 5ecdbe715f4bb223fa1be834fda07ccea2a51cb9 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 12:51:18 +0200 Subject: [PATCH 34/51] feat: add integration in the abstract grapgh --- ...ser_base.py => browser_base_integration.py} | 6 ++++-- scrapegraphai/docloaders/__init__.py | 2 +- scrapegraphai/graphs/abstract_graph.py | 15 ++++++++------- scrapegraphai/nodes/fetch_node.py | 18 ++++++++++++++---- 4 files changed, 27 insertions(+), 14 deletions(-) rename examples/extras/{browser_base.py => browser_base_integration.py} (98%) diff --git a/examples/extras/browser_base.py b/examples/extras/browser_base_integration.py similarity index 98% rename from examples/extras/browser_base.py rename to examples/extras/browser_base_integration.py index 465c80ba..97529879 100644 --- a/examples/extras/browser_base.py +++ b/examples/extras/browser_base_integration.py @@ -2,10 +2,12 @@ Basic example of scraping pipeline using SmartScraper """ -import os, json +import os +import json +from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info -from dotenv import load_dotenv + load_dotenv() # ************************************************ diff --git a/scrapegraphai/docloaders/__init__.py b/scrapegraphai/docloaders/__init__.py index 51561a42..45a3783d 100644 --- a/scrapegraphai/docloaders/__init__.py +++ b/scrapegraphai/docloaders/__init__.py @@ -1,4 +1,4 @@ """__init__.py file for docloaders folder""" from .chromium import ChromiumLoader -from .broswer_base import browser_base_fetch +from 
.browser_base import browser_base_fetch diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 50de0a94..2ccc988b 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -72,15 +72,16 @@ def __init__(self, prompt: str, config: dict, self.source = source self.config = config self.schema = schema - self.llm_model = self._create_llm(config["llm"], chat=True) - self.embedder_model = self._create_default_embedder(llm_config=config["llm"]) if "embeddings" not in config else self._create_embedder( - config["embeddings"]) - self.verbose = False if config is None else config.get( + self.llm_model = self._create_llm(self.config["llm"], chat=True) + self.embedder_model = self._create_default_embedder(llm_config=self.config["llm"]) if "embeddings" not in self.config else self._create_embedder( + self.config["embeddings"]) + self.verbose = False if self.config is None else self.config.get( "verbose", False) - self.headless = True if config is None else config.get( + self.headless = True if self.config is None else config.get( "headless", True) - self.loader_kwargs = config.get("loader_kwargs", {}) - self.cache_path = config.get("cache_path", False) + self.loader_kwargs = self.config.get("loader_kwargs", {}) + self.cache_path = self.config.get("cache_path", False) + self.browser_base = self.config.get("browser_base") # Create the graph self.graph = self._create_graph() diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 64a80cfe..95561a66 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -11,6 +11,7 @@ from langchain_core.documents import Document from ..utils.cleanup_html import cleanup_html from ..docloaders import ChromiumLoader +from ..docloaders.browser_base import browser_base_fetch from ..utils.convert_to_md import convert_to_md from ..utils.logging import get_logger from .base_node import BaseNode @@ -74,6 
+75,8 @@ def __init__( False if node_config is None else node_config.get("cut", True) ) + self.browser_base = node_config.get("browser_base") + def execute(self, state): """ Executes the node's logic to fetch HTML content from a specified URL and @@ -164,7 +167,7 @@ def execute(self, state): parsed_content = source - if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator: + if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator: parsed_content = convert_to_md(source) compressed_document = [ @@ -177,7 +180,7 @@ def execute(self, state): if response.status_code == 200: if not response.text.strip(): raise ValueError("No HTML body content found in the response.") - + parsed_content = response if not self.cut: @@ -198,8 +201,15 @@ def execute(self, state): if self.node_config is not None: loader_kwargs = self.node_config.get("loader_kwargs", {}) - loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) - document = loader.load() + if self.browser_base is not None: + document = [ + Document(page_content= browser_base_fetch(self.browser_base.get("api_key"), + self.browser_base.get("project_id"), source), + metadata={}) + ] + else: + loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) + document = loader.load() if not document or not document[0].page_content.strip(): raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.") From 65f9e3a24c8f192d42fb467c03a33fd4b1f64588 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 12:58:35 +0200 Subject: [PATCH 35/51] Delete browser_base.py --- scrapegraphai/docloaders/browser_base.py | 38 ------------------------ 1 file changed, 38 deletions(-) delete mode 100644 scrapegraphai/docloaders/browser_base.py diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py deleted file mode 100644 index 
8f2a0b8e..00000000 --- a/scrapegraphai/docloaders/browser_base.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -browserbase integration module -""" -from browserbase import Browserbase - -def browser_base_fetch(api_key: str, project_id: str, link: str) -> object: - """ - BrowserBase Fetch - This module provides an interface to the BrowserBase API. - The `browser_base_fetch` function takes three arguments: - - `api_key`: The API key provided by BrowserBase. - - `project_id`: The ID of the project on BrowserBase where you want to fetch data from. - - `link`: The URL or link that you want to fetch data from. - It initializes a Browserbase object with the given API key and project ID, - then uses this object to load the specified link. - It returns the result of the loading operation. - Example usage: - ``` - from browser_base_fetch import browser_base_fetch - result = browser_base_fetch(api_key="your_api_key", - project_id="your_project_id", link="https://example.com") - print(result) - ``` - Please note that you need to replace "your_api_key" and "your_project_id" - with your actual BrowserBase API key and project ID. - Args: - api_key (str): The API key provided by BrowserBase. - project_id (str): The ID of the project on BrowserBase where you want to fetch data from. - link (str): The URL or link that you want to fetch data from. - Returns: - object: The result of the loading operation. 
- """ - - browserbase = Browserbase(api_key=api_key, project_id=project_id) - - result = browserbase.load(link) - - return result From d03eedccd718379f267fa305165ad61a295112f8 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 13:05:13 +0200 Subject: [PATCH 36/51] Update chromium.py --- scrapegraphai/docloaders/chromium.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index 474c22de..cb0cfd9a 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -1,3 +1,6 @@ +""" +Chromium module +""" import asyncio from typing import Any, AsyncIterator, Iterator, List, Optional From e21d461710e036eb3f71382a2d0d832bf1863c39 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 13:16:49 +0200 Subject: [PATCH 37/51] push --- .../docloaders/{broswer_base.py => browser_base.py} | 8 +++++--- scrapegraphai/nodes/fetch_node.py | 9 ++++----- 2 files changed, 9 insertions(+), 8 deletions(-) rename scrapegraphai/docloaders/{broswer_base.py => browser_base.py} (83%) diff --git a/scrapegraphai/docloaders/broswer_base.py b/scrapegraphai/docloaders/browser_base.py similarity index 83% rename from scrapegraphai/docloaders/broswer_base.py rename to scrapegraphai/docloaders/browser_base.py index 6127c097..47798e29 100644 --- a/scrapegraphai/docloaders/broswer_base.py +++ b/scrapegraphai/docloaders/browser_base.py @@ -2,8 +2,9 @@ browserbase integration module """ from browserbase import Browserbase +from typing import List -def browser_base_fetch(api_key: str, project_id: str, link: str) -> object: +def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[str]: """ BrowserBase Fetch @@ -15,7 +16,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: str) -> object: - `link`: The URL or link that you want to fetch data from. 
It initializes a Browserbase object with the given API key and project ID, - then uses this object to load the specified link. It returns the result of the loading operation. + then uses this object to load the specified link. + It returns the result of the loading operation. Example usage: @@ -41,6 +43,6 @@ def browser_base_fetch(api_key: str, project_id: str, link: str) -> object: browserbase = Browserbase(api_key=api_key, project_id=project_id) - result = browserbase.load(link) + result = browserbase.load([link]) return result diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 95561a66..741f6a22 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -202,11 +202,10 @@ def execute(self, state): loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.browser_base is not None: - document = [ - Document(page_content= browser_base_fetch(self.browser_base.get("api_key"), - self.browser_base.get("project_id"), source), - metadata={}) - ] + data = browser_base_fetch(self.browser_base.get("api_key"), + self.browser_base.get("project_id"), source) + + document = [Document(page_content= data, metadata={"source": "html file"})] else: loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) document = loader.load() From 968c69e217d9c180b9b8c2aa52ca59b9a1733525 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 13:23:54 +0200 Subject: [PATCH 38/51] fix: fixed bug on fetch_node Co-Authored-By: Federico Minutoli <40361744+DiTo97@users.noreply.github.com> --- scrapegraphai/docloaders/browser_base.py | 2 +- scrapegraphai/nodes/fetch_node.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py index 47798e29..dd290d2d 100644 --- a/scrapegraphai/docloaders/browser_base.py +++ b/scrapegraphai/docloaders/browser_base.py @@ -43,6 +43,6 @@ def 
browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s browserbase = Browserbase(api_key=api_key, project_id=project_id) - result = browserbase.load([link]) + result = browserbase.load(link) return result diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 741f6a22..86b02bf6 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -202,10 +202,11 @@ def execute(self, state): loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.browser_base is not None: - data = browser_base_fetch(self.browser_base.get("api_key"), - self.browser_base.get("project_id"), source) + if self.browser_base is not None: + data = browser_base_fetch(self.browser_base.get("api_key"), + self.browser_base.get("project_id"), [source]) - document = [Document(page_content= data, metadata={"source": "html file"})] + document = [Document(page_content=content, metadata={"source": source}) for content in data] else: loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) document = loader.load() From 6d8e02cd62ecf213cfff6e8258b79564db8eeb55 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 13:24:32 +0200 Subject: [PATCH 39/51] Update browser_base.py Co-Authored-By: Federico Minutoli <40361744+DiTo97@users.noreply.github.com> --- scrapegraphai/docloaders/browser_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py index dd290d2d..77628bc5 100644 --- a/scrapegraphai/docloaders/browser_base.py +++ b/scrapegraphai/docloaders/browser_base.py @@ -1,8 +1,8 @@ """ browserbase integration module """ -from browserbase import Browserbase from typing import List +from browserbase import Browserbase def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[str]: """ @@ -43,6 +43,6 @@ def browser_base_fetch(api_key: str, project_id: str, link: 
List[str]) -> List[s browserbase = Browserbase(api_key=api_key, project_id=project_id) - result = browserbase.load(link) + result = browserbase.load([link]) return result From be870a43161cb2ed7f0f60553c2f3742c6b939eb Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 13:24:48 +0200 Subject: [PATCH 40/51] Update fetch_node.py Co-Authored-By: Federico Minutoli <40361744+DiTo97@users.noreply.github.com> --- scrapegraphai/nodes/fetch_node.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 86b02bf6..4971ddb3 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -202,11 +202,10 @@ def execute(self, state): loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.browser_base is not None: - if self.browser_base is not None: - data = browser_base_fetch(self.browser_base.get("api_key"), - self.browser_base.get("project_id"), [source]) + data = browser_base_fetch(self.browser_base.get("api_key"), + self.browser_base.get("project_id"), [source]) - document = [Document(page_content=content, metadata={"source": source}) for content in data] + document = [Document(page_content=content, metadata={"source": source}) for content in data] else: loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) document = loader.load() From 0b4cfd6522dcad0eb418f0badd0f7824a1efd534 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 14:38:50 +0200 Subject: [PATCH 41/51] fix: abstract_graph and removed unused embeddings Co-Authored-By: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> --- examples/bedrock/csv_scraper_bedrock.py | 3 - .../csv_scraper_graph_multi_bedrock.py | 3 - examples/bedrock/custom_graph_bedrock.py | 3 - examples/bedrock/json_scraper_bedrock.py | 3 - .../bedrock/json_scraper_multi_bedrock.py | 3 - examples/bedrock/pdf_scraper_graph_bedrock.py | 3 - 
.../pdf_scraper_graph_multi_bedrock.py | 3 - examples/bedrock/scrape_plain_text_bedrock.py | 3 - examples/bedrock/script_generator_bedrock.py | 5 +- .../bedrock/script_multi_generator_bedrock.py | 5 +- examples/bedrock/search_graph_bedrock.py | 3 - .../bedrock/search_graph_schema_bedrock.py | 3 - examples/bedrock/search_link_graph_bedrock.py | 3 - examples/bedrock/smart_scraper_bedrock.py | 3 - .../bedrock/smart_scraper_multi_bedrock.py | 3 - .../bedrock/smart_scraper_schema_bedrock.py | 3 - examples/bedrock/xml_scraper_bedrock.py | 3 - .../xml_scraper_graph_multi_bedrock.py | 3 - examples/deepseek/csv_scraper_deepseek.py | 5 -- .../csv_scraper_graph_multi_deepseek.py | 5 -- examples/deepseek/custom_graph_deepseek.py | 89 ------------------- examples/deepseek/json_scraper_deepseek.py | 5 -- .../deepseek/json_scraper_multi_deepseek.py | 5 -- .../deepseek/pdf_scraper_graph_deepseek.py | 5 -- .../deepseek/pdf_scraper_multi_deepseek.py | 5 -- .../deepseek/scrape_plain_text_deepseek.py | 5 -- .../deepseek/script_generator_deepseek.py | 5 -- .../script_multi_generator_deepseek.py | 5 -- examples/deepseek/search_graph_deepseek.py | 5 -- .../deepseek/search_graph_schema_deepseek.py | 5 -- .../deepseek/search_link_graph_deepseek.py | 5 -- examples/deepseek/smart_scraper_deepseek.py | 5 -- .../deepseek/smart_scraper_multi_deepseek.py | 5 -- .../deepseek/smart_scraper_schema_deepseek.py | 5 -- examples/deepseek/xml_scraper_deepseek.py | 5 -- .../xml_scraper_graph_multi_deepseek.py | 5 -- examples/fireworks/csv_scraper_fireworks.py | 6 -- .../csv_scraper_graph_multi_fireworks.py | 5 -- examples/fireworks/custom_graph_fireworks.py | 27 +----- examples/fireworks/deep_scraper_fireworks.py | 7 +- .../fireworks/json_scraper_fireworkspy.py | 5 -- .../fireworks/json_scraper_multi_fireworks.py | 5 -- examples/fireworks/pdf_scraper_fireworks.py | 5 -- .../fireworks/pdf_scraper_multi_fireworks.py | 5 -- .../fireworks/scrape_plain_text_fireworks.py | 5 -- 
.../fireworks/script_generator_fireworks.py | 5 -- .../script_generator_schema_fireworks.py | 5 -- .../script_multi_generator_fireworks.py | 5 -- examples/fireworks/search_graph_fireworks.py | 5 -- .../search_graph_schema_fireworks.py | 5 -- .../fireworks/search_link_graph_fireworks.py | 5 -- examples/fireworks/smart_scraper_fireworks.py | 5 -- .../smart_scraper_multi_fireworks.py | 6 +- .../smart_scraper_schema_fireworks.py | 5 -- examples/fireworks/xml_scraper_fireworks.py | 5 -- .../xml_scraper_graph_multi_fireworks.py | 5 -- examples/groq/csv_scraper_graph_multi_groq.py | 5 -- examples/groq/csv_scraper_groq.py | 5 -- examples/groq/custom_graph_groq.py | 22 +---- examples/groq/json_scraper_groq.py | 5 -- examples/groq/json_scraper_multi_groq.py | 5 -- examples/groq/pdf_scraper_graph_groq.py | 5 -- examples/groq/pdf_scraper_multi_groq.py | 5 -- examples/groq/scrape_plain_text_groq.py | 5 -- examples/groq/script_generator_groq.py | 5 -- examples/groq/script_multi_generator_groq.py | 5 -- examples/groq/search_graph_groq.py | 5 -- examples/groq/search_graph_schema_groq.py | 5 -- examples/groq/search_link_graph_groq.py | 5 -- examples/groq/smart_scraper_groq.py | 5 -- examples/groq/smart_scraper_multi_groq.py | 5 -- examples/groq/smart_scraper_schema_groq.py | 5 -- examples/groq/xml_scraper_graph_multi_groq.py | 5 -- examples/groq/xml_scraper_groq.py | 5 -- .../csv_scraper_graph_multi_huggingfacehub.py | 1 - .../csv_scraper_huggingfacehub.py | 1 - .../custom_graph_huggingfacehub.py | 1 - .../json_scraper_huggingfacehub.py | 1 - .../json_scraper_multi_huggingfacehub.py | 1 - .../pdf_scraper_graph_huggingfacehub.py | 1 - .../pdf_scraper_multi_huggingfacehub.py | 1 - .../scrape_plain_text_huggingfacehub.py | 1 - .../script_generator_huggingfacehub.py | 1 - .../script_multi_generator_huggingfacehub.py | 1 - .../search_graph_huggingfacehub.py | 1 - .../search_link_graph_huggingfacehub.py | 1 - .../smart_scraper_huggingfacehub.py | 1 - 
.../smart_scraper_multi_huggingfacehub.py | 1 - .../smart_scraper_schema_huggingfacehub.py | 1 - .../xml_scraper_graph_multi_huggingfacehub.py | 1 - .../xml_scraper_huggingfacehub.py | 1 - examples/local_models/custom_graph_ollama.py | 24 ++--- .../local_models/json_scraper_multi_ollama.py | 4 - examples/local_models/json_scraper_ollama.py | 5 -- .../local_models/pdf_scraper_multi_ollama.py | 4 - examples/local_models/pdf_scraper_ollama.py | 4 - .../local_models/scrape_plain_text_ollama.py | 5 -- .../script_multi_generator_ollama.py | 5 -- examples/local_models/search_graph_ollama.py | 5 -- .../search_graph_schema_ollama.py | 5 -- .../local_models/search_link_graph_ollama.py | 6 +- examples/local_models/smart_scraper_ollama.py | 6 +- .../smart_scraper_schema_ollama.py | 6 +- .../xml_scraper_graph_multi_ollama.py | 6 +- examples/local_models/xml_scraper_ollama.py | 5 -- scrapegraphai/graphs/abstract_graph.py | 3 - 106 files changed, 21 insertions(+), 554 deletions(-) delete mode 100644 examples/deepseek/custom_graph_deepseek.py diff --git a/examples/bedrock/csv_scraper_bedrock.py b/examples/bedrock/csv_scraper_bedrock.py index f015f77b..a69417c0 100644 --- a/examples/bedrock/csv_scraper_bedrock.py +++ b/examples/bedrock/csv_scraper_bedrock.py @@ -33,9 +33,6 @@ "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } # ************************************************ diff --git a/examples/bedrock/csv_scraper_graph_multi_bedrock.py b/examples/bedrock/csv_scraper_graph_multi_bedrock.py index c776c508..b9dd7f6f 100644 --- a/examples/bedrock/csv_scraper_graph_multi_bedrock.py +++ b/examples/bedrock/csv_scraper_graph_multi_bedrock.py @@ -28,9 +28,6 @@ "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } diff --git 
a/examples/bedrock/custom_graph_bedrock.py b/examples/bedrock/custom_graph_bedrock.py index 45358555..9002a598 100644 --- a/examples/bedrock/custom_graph_bedrock.py +++ b/examples/bedrock/custom_graph_bedrock.py @@ -28,9 +28,6 @@ "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } diff --git a/examples/bedrock/json_scraper_bedrock.py b/examples/bedrock/json_scraper_bedrock.py index 0729adfe..dc1bf769 100644 --- a/examples/bedrock/json_scraper_bedrock.py +++ b/examples/bedrock/json_scraper_bedrock.py @@ -32,9 +32,6 @@ "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } diff --git a/examples/bedrock/json_scraper_multi_bedrock.py b/examples/bedrock/json_scraper_multi_bedrock.py index 5dc666b8..5848ef17 100644 --- a/examples/bedrock/json_scraper_multi_bedrock.py +++ b/examples/bedrock/json_scraper_multi_bedrock.py @@ -10,9 +10,6 @@ "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } FILE_NAME = "inputs/example.json" diff --git a/examples/bedrock/pdf_scraper_graph_bedrock.py b/examples/bedrock/pdf_scraper_graph_bedrock.py index 6ee4b753..dcef848e 100644 --- a/examples/bedrock/pdf_scraper_graph_bedrock.py +++ b/examples/bedrock/pdf_scraper_graph_bedrock.py @@ -18,9 +18,6 @@ "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } diff --git a/examples/bedrock/pdf_scraper_graph_multi_bedrock.py b/examples/bedrock/pdf_scraper_graph_multi_bedrock.py index 7102c406..37e61c42 100644 --- a/examples/bedrock/pdf_scraper_graph_multi_bedrock.py +++ 
b/examples/bedrock/pdf_scraper_graph_multi_bedrock.py @@ -11,9 +11,6 @@ "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } # *************** diff --git a/examples/bedrock/scrape_plain_text_bedrock.py b/examples/bedrock/scrape_plain_text_bedrock.py index 01bec609..0214a1e3 100644 --- a/examples/bedrock/scrape_plain_text_bedrock.py +++ b/examples/bedrock/scrape_plain_text_bedrock.py @@ -33,9 +33,6 @@ "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } diff --git a/examples/bedrock/script_generator_bedrock.py b/examples/bedrock/script_generator_bedrock.py index 0d3f7d07..26863193 100644 --- a/examples/bedrock/script_generator_bedrock.py +++ b/examples/bedrock/script_generator_bedrock.py @@ -19,10 +19,7 @@ "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" - }, - "library": "beautifulsoup" + "library": "beautifulsoup" } # ************************************************ diff --git a/examples/bedrock/script_multi_generator_bedrock.py b/examples/bedrock/script_multi_generator_bedrock.py index 2f892546..ecef966d 100644 --- a/examples/bedrock/script_multi_generator_bedrock.py +++ b/examples/bedrock/script_multi_generator_bedrock.py @@ -15,10 +15,7 @@ "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" - }, - "library": "beautifulsoup" + "library": "beautifulsoup" } # ************************************************ diff --git a/examples/bedrock/search_graph_bedrock.py b/examples/bedrock/search_graph_bedrock.py index 9b32d3db..b27f6e5d 100644 --- a/examples/bedrock/search_graph_bedrock.py +++ 
b/examples/bedrock/search_graph_bedrock.py @@ -16,9 +16,6 @@ "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } # ************************************************ diff --git a/examples/bedrock/search_graph_schema_bedrock.py b/examples/bedrock/search_graph_schema_bedrock.py index 90539155..a49ba730 100644 --- a/examples/bedrock/search_graph_schema_bedrock.py +++ b/examples/bedrock/search_graph_schema_bedrock.py @@ -27,9 +27,6 @@ class Dishes(BaseModel): "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } diff --git a/examples/bedrock/search_link_graph_bedrock.py b/examples/bedrock/search_link_graph_bedrock.py index 116dea01..fc1e6233 100644 --- a/examples/bedrock/search_link_graph_bedrock.py +++ b/examples/bedrock/search_link_graph_bedrock.py @@ -15,9 +15,6 @@ "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } diff --git a/examples/bedrock/smart_scraper_bedrock.py b/examples/bedrock/smart_scraper_bedrock.py index 03394434..9c747c00 100644 --- a/examples/bedrock/smart_scraper_bedrock.py +++ b/examples/bedrock/smart_scraper_bedrock.py @@ -19,9 +19,6 @@ "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } diff --git a/examples/bedrock/smart_scraper_multi_bedrock.py b/examples/bedrock/smart_scraper_multi_bedrock.py index 7aeb71cd..b363d6ab 100644 --- a/examples/bedrock/smart_scraper_multi_bedrock.py +++ b/examples/bedrock/smart_scraper_multi_bedrock.py @@ -17,9 +17,6 @@ "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", 
"temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } diff --git a/examples/bedrock/smart_scraper_schema_bedrock.py b/examples/bedrock/smart_scraper_schema_bedrock.py index 6213ea1f..2829efec 100644 --- a/examples/bedrock/smart_scraper_schema_bedrock.py +++ b/examples/bedrock/smart_scraper_schema_bedrock.py @@ -26,9 +26,6 @@ class Projects(BaseModel): "client": "client_name", "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 - }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" } } diff --git a/examples/bedrock/xml_scraper_bedrock.py b/examples/bedrock/xml_scraper_bedrock.py index 018a8387..5f81fbf6 100644 --- a/examples/bedrock/xml_scraper_bedrock.py +++ b/examples/bedrock/xml_scraper_bedrock.py @@ -32,9 +32,6 @@ "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" - } } # ************************************************ diff --git a/examples/bedrock/xml_scraper_graph_multi_bedrock.py b/examples/bedrock/xml_scraper_graph_multi_bedrock.py index a0ed3560..638ce280 100644 --- a/examples/bedrock/xml_scraper_graph_multi_bedrock.py +++ b/examples/bedrock/xml_scraper_graph_multi_bedrock.py @@ -29,9 +29,6 @@ "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "temperature": 0.0 }, - "embeddings": { - "model": "bedrock/cohere.embed-multilingual-v3" - } } # ************************************************ diff --git a/examples/deepseek/csv_scraper_deepseek.py b/examples/deepseek/csv_scraper_deepseek.py index fd55469d..b734b543 100644 --- a/examples/deepseek/csv_scraper_deepseek.py +++ b/examples/deepseek/csv_scraper_deepseek.py @@ -30,11 +30,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set 
ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/deepseek/csv_scraper_graph_multi_deepseek.py b/examples/deepseek/csv_scraper_graph_multi_deepseek.py index d665bc31..ea5e9154 100644 --- a/examples/deepseek/csv_scraper_graph_multi_deepseek.py +++ b/examples/deepseek/csv_scraper_graph_multi_deepseek.py @@ -30,11 +30,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/deepseek/custom_graph_deepseek.py b/examples/deepseek/custom_graph_deepseek.py deleted file mode 100644 index a265db95..00000000 --- a/examples/deepseek/custom_graph_deepseek.py +++ /dev/null @@ -1,89 +0,0 @@ -""" -Example of custom graph using Gemini Google model -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.models import Gemini -from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -deepseek_key = os.getenv("DEEPSEEK_APIKEY") - -graph_config = { - "llm": { - "model": "deepseek-chat", - "openai_api_key": deepseek_key, - "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, - "verbose": True, -} - -# ************************************************ -# Define the graph nodes -# ************************************************ - -llm_model = Gemini(graph_config["llm"]) - -# define the nodes for the graph -fetch_node = FetchNode( - input="url | local_dir", - output=["doc"], -) -parse_node = ParseNode( - input="doc", - 
output=["parsed_doc"], - node_config={"chunk_size": 4096} -) -rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={"llm": llm_model}, -) -generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - node_config={"llm": llm_model}, -) - -# ************************************************ -# Create the graph by defining the connections -# ************************************************ - -graph = BaseGraph( - nodes={ - fetch_node, - parse_node, - rag_node, - generate_answer_node, - }, - edges={ - (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) - }, - entry_point=fetch_node -) - -# ************************************************ -# Execute the graph -# ************************************************ - -result, execution_info = graph.execute({ - "user_prompt": "List me the projects with their description", - "url": "https://perinim.github.io/projects/" -}) - -# get the answer from the result -result = result.get("answer", "No answer found.") -print(result) diff --git a/examples/deepseek/json_scraper_deepseek.py b/examples/deepseek/json_scraper_deepseek.py index 696a08d9..dfe6f489 100644 --- a/examples/deepseek/json_scraper_deepseek.py +++ b/examples/deepseek/json_scraper_deepseek.py @@ -29,11 +29,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/deepseek/json_scraper_multi_deepseek.py b/examples/deepseek/json_scraper_multi_deepseek.py index 17660ddb..b957dde0 100644 --- a/examples/deepseek/json_scraper_multi_deepseek.py +++ b/examples/deepseek/json_scraper_multi_deepseek.py @@ -15,11 +15,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, 
"openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/deepseek/pdf_scraper_graph_deepseek.py b/examples/deepseek/pdf_scraper_graph_deepseek.py index fe6f2658..d66bbef5 100644 --- a/examples/deepseek/pdf_scraper_graph_deepseek.py +++ b/examples/deepseek/pdf_scraper_graph_deepseek.py @@ -20,11 +20,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/deepseek/pdf_scraper_multi_deepseek.py b/examples/deepseek/pdf_scraper_multi_deepseek.py index c884b798..211e4635 100644 --- a/examples/deepseek/pdf_scraper_multi_deepseek.py +++ b/examples/deepseek/pdf_scraper_multi_deepseek.py @@ -15,11 +15,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/deepseek/scrape_plain_text_deepseek.py b/examples/deepseek/scrape_plain_text_deepseek.py index 7076dd39..d7a070d7 100644 --- a/examples/deepseek/scrape_plain_text_deepseek.py +++ b/examples/deepseek/scrape_plain_text_deepseek.py @@ -31,11 +31,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/deepseek/script_generator_deepseek.py b/examples/deepseek/script_generator_deepseek.py index 
09db0876..fd5fd4dd 100644 --- a/examples/deepseek/script_generator_deepseek.py +++ b/examples/deepseek/script_generator_deepseek.py @@ -20,11 +20,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "library": "beautifulsoup" } diff --git a/examples/deepseek/script_multi_generator_deepseek.py b/examples/deepseek/script_multi_generator_deepseek.py index 41e363b5..2ebfd90a 100644 --- a/examples/deepseek/script_multi_generator_deepseek.py +++ b/examples/deepseek/script_multi_generator_deepseek.py @@ -20,11 +20,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "library": "beautifulsoup" } diff --git a/examples/deepseek/search_graph_deepseek.py b/examples/deepseek/search_graph_deepseek.py index d607e1b1..176d6107 100644 --- a/examples/deepseek/search_graph_deepseek.py +++ b/examples/deepseek/search_graph_deepseek.py @@ -18,11 +18,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "max_results": 2, "verbose": True, diff --git a/examples/deepseek/search_graph_schema_deepseek.py b/examples/deepseek/search_graph_schema_deepseek.py index 8debee2f..f5db278e 100644 --- a/examples/deepseek/search_graph_schema_deepseek.py +++ b/examples/deepseek/search_graph_schema_deepseek.py @@ -34,11 +34,6 @@ class Dishes(BaseModel): "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - 
"embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/deepseek/search_link_graph_deepseek.py b/examples/deepseek/search_link_graph_deepseek.py index 30e4a9b3..6a35f177 100644 --- a/examples/deepseek/search_link_graph_deepseek.py +++ b/examples/deepseek/search_link_graph_deepseek.py @@ -19,11 +19,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/deepseek/smart_scraper_deepseek.py b/examples/deepseek/smart_scraper_deepseek.py index 9fe00a2a..ed291b02 100644 --- a/examples/deepseek/smart_scraper_deepseek.py +++ b/examples/deepseek/smart_scraper_deepseek.py @@ -21,11 +21,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/deepseek/smart_scraper_multi_deepseek.py b/examples/deepseek/smart_scraper_multi_deepseek.py index c88ab525..fafe7261 100644 --- a/examples/deepseek/smart_scraper_multi_deepseek.py +++ b/examples/deepseek/smart_scraper_multi_deepseek.py @@ -19,11 +19,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/deepseek/smart_scraper_schema_deepseek.py b/examples/deepseek/smart_scraper_schema_deepseek.py index a16ae575..5cbbb702 100644 --- 
a/examples/deepseek/smart_scraper_schema_deepseek.py +++ b/examples/deepseek/smart_scraper_schema_deepseek.py @@ -33,11 +33,6 @@ class Projects(BaseModel): "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/deepseek/xml_scraper_deepseek.py b/examples/deepseek/xml_scraper_deepseek.py index 3b2af61b..ba401b91 100644 --- a/examples/deepseek/xml_scraper_deepseek.py +++ b/examples/deepseek/xml_scraper_deepseek.py @@ -31,11 +31,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/deepseek/xml_scraper_graph_multi_deepseek.py b/examples/deepseek/xml_scraper_graph_multi_deepseek.py index 5d3c29d5..0f53a6b2 100644 --- a/examples/deepseek/xml_scraper_graph_multi_deepseek.py +++ b/examples/deepseek/xml_scraper_graph_multi_deepseek.py @@ -30,11 +30,6 @@ "model": "deepseek-chat", "openai_api_key": deepseek_key, "openai_api_base": 'https://api.deepseek.com/v1', - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/fireworks/csv_scraper_fireworks.py b/examples/fireworks/csv_scraper_fireworks.py index b1d7526d..f588c4c5 100644 --- a/examples/fireworks/csv_scraper_fireworks.py +++ b/examples/fireworks/csv_scraper_fireworks.py @@ -29,12 +29,6 @@ "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": 
"http://localhost:11434", # set ollama URL arbitrarily - }, - "verbose": True, "headless": False, } diff --git a/examples/fireworks/csv_scraper_graph_multi_fireworks.py b/examples/fireworks/csv_scraper_graph_multi_fireworks.py index 81393d60..ebc46e61 100644 --- a/examples/fireworks/csv_scraper_graph_multi_fireworks.py +++ b/examples/fireworks/csv_scraper_graph_multi_fireworks.py @@ -28,11 +28,6 @@ "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False, diff --git a/examples/fireworks/custom_graph_fireworks.py b/examples/fireworks/custom_graph_fireworks.py index a02b774e..d0dcd994 100644 --- a/examples/fireworks/custom_graph_fireworks.py +++ b/examples/fireworks/custom_graph_fireworks.py @@ -4,9 +4,7 @@ import os from dotenv import load_dotenv - -from langchain_openai import OpenAIEmbeddings -from scrapegraphai.models import OpenAI +from langchain_openai import ChatOpenAI from scrapegraphai.graphs import BaseGraph from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode load_dotenv() @@ -21,11 +19,6 @@ "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False, @@ -35,8 +28,7 @@ # Define the graph nodes # ************************************************ -llm_model = OpenAI(graph_config["llm"]) -embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key) +llm_model = ChatOpenAI(graph_config["llm"]) # define the nodes for the graph robot_node = RobotsNode( @@ -65,15 +57,7 @@ "verbose": True, } ) -rag_node = RAGNode( - input="user_prompt & (parsed_doc | 
doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": llm_model, - "embedder_model": embedder, - "verbose": True, - } -) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], @@ -92,14 +76,11 @@ robot_node, fetch_node, parse_node, - rag_node, - generate_answer_node, ], edges=[ (robot_node, fetch_node), (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) + (parse_node, generate_answer_node) ], entry_point=robot_node ) diff --git a/examples/fireworks/deep_scraper_fireworks.py b/examples/fireworks/deep_scraper_fireworks.py index 67a80868..86fb1717 100644 --- a/examples/fireworks/deep_scraper_fireworks.py +++ b/examples/fireworks/deep_scraper_fireworks.py @@ -19,11 +19,6 @@ "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "max_depth": 1 @@ -49,4 +44,4 @@ graph_exec_info = deep_scraper_graph.get_execution_info() print(deep_scraper_graph.get_state("relevant_links")) -print(prettify_exec_info(graph_exec_info)) \ No newline at end of file +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/json_scraper_fireworkspy.py b/examples/fireworks/json_scraper_fireworkspy.py index 0dd188fb..a76a89c5 100644 --- a/examples/fireworks/json_scraper_fireworkspy.py +++ b/examples/fireworks/json_scraper_fireworkspy.py @@ -29,11 +29,6 @@ "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False, diff --git a/examples/fireworks/json_scraper_multi_fireworks.py 
b/examples/fireworks/json_scraper_multi_fireworks.py index b4cf4fc7..cd16c525 100644 --- a/examples/fireworks/json_scraper_multi_fireworks.py +++ b/examples/fireworks/json_scraper_multi_fireworks.py @@ -14,11 +14,6 @@ "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False, diff --git a/examples/fireworks/pdf_scraper_fireworks.py b/examples/fireworks/pdf_scraper_fireworks.py index 20db556b..3bb3f3d4 100644 --- a/examples/fireworks/pdf_scraper_fireworks.py +++ b/examples/fireworks/pdf_scraper_fireworks.py @@ -15,11 +15,6 @@ "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/fireworks/pdf_scraper_multi_fireworks.py b/examples/fireworks/pdf_scraper_multi_fireworks.py index 891a4454..c1077061 100644 --- a/examples/fireworks/pdf_scraper_multi_fireworks.py +++ b/examples/fireworks/pdf_scraper_multi_fireworks.py @@ -20,11 +20,6 @@ "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/fireworks/scrape_plain_text_fireworks.py b/examples/fireworks/scrape_plain_text_fireworks.py index a45b2691..331f05e2 100644 --- a/examples/fireworks/scrape_plain_text_fireworks.py +++ b/examples/fireworks/scrape_plain_text_fireworks.py @@ -32,11 +32,6 @@ "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" }, - 
"embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, } diff --git a/examples/fireworks/script_generator_fireworks.py b/examples/fireworks/script_generator_fireworks.py index dea59e12..2ee3294c 100644 --- a/examples/fireworks/script_generator_fireworks.py +++ b/examples/fireworks/script_generator_fireworks.py @@ -19,11 +19,6 @@ "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False, diff --git a/examples/fireworks/script_generator_schema_fireworks.py b/examples/fireworks/script_generator_schema_fireworks.py index f7aa4c83..6355a4e8 100644 --- a/examples/fireworks/script_generator_schema_fireworks.py +++ b/examples/fireworks/script_generator_schema_fireworks.py @@ -32,11 +32,6 @@ class Projects(BaseModel): "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "library": "beautifulsoup", diff --git a/examples/fireworks/script_multi_generator_fireworks.py b/examples/fireworks/script_multi_generator_fireworks.py index 42aff923..98671768 100644 --- a/examples/fireworks/script_multi_generator_fireworks.py +++ b/examples/fireworks/script_multi_generator_fireworks.py @@ -19,11 +19,6 @@ "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "library": "beautifulsoup", diff --git 
a/examples/fireworks/search_graph_fireworks.py b/examples/fireworks/search_graph_fireworks.py index 4d4d33cb..a091190c 100644 --- a/examples/fireworks/search_graph_fireworks.py +++ b/examples/fireworks/search_graph_fireworks.py @@ -18,11 +18,6 @@ "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "max_results": 2, "verbose": True, diff --git a/examples/fireworks/search_graph_schema_fireworks.py b/examples/fireworks/search_graph_schema_fireworks.py index 9180522b..d88d991e 100644 --- a/examples/fireworks/search_graph_schema_fireworks.py +++ b/examples/fireworks/search_graph_schema_fireworks.py @@ -33,11 +33,6 @@ class Dishes(BaseModel): "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "max_results": 2, "verbose": True, diff --git a/examples/fireworks/search_link_graph_fireworks.py b/examples/fireworks/search_link_graph_fireworks.py index a1d3a979..e71e2a4f 100644 --- a/examples/fireworks/search_link_graph_fireworks.py +++ b/examples/fireworks/search_link_graph_fireworks.py @@ -18,11 +18,6 @@ "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "max_results": 2, "verbose": True, diff --git a/examples/fireworks/smart_scraper_fireworks.py b/examples/fireworks/smart_scraper_fireworks.py index 40071d8f..cff9aedb 100644 --- a/examples/fireworks/smart_scraper_fireworks.py +++ b/examples/fireworks/smart_scraper_fireworks.py @@ -20,11 +20,6 @@ "llm": 
{ "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False, diff --git a/examples/fireworks/smart_scraper_multi_fireworks.py b/examples/fireworks/smart_scraper_multi_fireworks.py index 68e28055..09e2c811 100644 --- a/examples/fireworks/smart_scraper_multi_fireworks.py +++ b/examples/fireworks/smart_scraper_multi_fireworks.py @@ -19,11 +19,7 @@ "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, + "verbose": True, "headless": False, } diff --git a/examples/fireworks/smart_scraper_schema_fireworks.py b/examples/fireworks/smart_scraper_schema_fireworks.py index b8685c3e..d71593f3 100644 --- a/examples/fireworks/smart_scraper_schema_fireworks.py +++ b/examples/fireworks/smart_scraper_schema_fireworks.py @@ -31,11 +31,6 @@ class Projects(BaseModel): "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False, diff --git a/examples/fireworks/xml_scraper_fireworks.py b/examples/fireworks/xml_scraper_fireworks.py index efc98bd8..59d9e6a3 100644 --- a/examples/fireworks/xml_scraper_fireworks.py +++ b/examples/fireworks/xml_scraper_fireworks.py @@ -28,11 +28,6 @@ "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL 
arbitrarily }, "verbose": True, "headless": False, diff --git a/examples/fireworks/xml_scraper_graph_multi_fireworks.py b/examples/fireworks/xml_scraper_graph_multi_fireworks.py index d14b8db0..690836a4 100644 --- a/examples/fireworks/xml_scraper_graph_multi_fireworks.py +++ b/examples/fireworks/xml_scraper_graph_multi_fireworks.py @@ -29,11 +29,6 @@ "llm": { "api_key": fireworks_api_key, "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False, diff --git a/examples/groq/csv_scraper_graph_multi_groq.py b/examples/groq/csv_scraper_graph_multi_groq.py index 87e3279c..475b8cac 100644 --- a/examples/groq/csv_scraper_graph_multi_groq.py +++ b/examples/groq/csv_scraper_graph_multi_groq.py @@ -30,11 +30,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "headless": False } diff --git a/examples/groq/csv_scraper_groq.py b/examples/groq/csv_scraper_groq.py index 20839a75..805ce5fc 100644 --- a/examples/groq/csv_scraper_groq.py +++ b/examples/groq/csv_scraper_groq.py @@ -31,11 +31,6 @@ "api_key": groq_key, "temperature": 0 }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, } # ************************************************ # Create the CSVScraperGraph instance and run it diff --git a/examples/groq/custom_graph_groq.py b/examples/groq/custom_graph_groq.py index d0384ffd..79d2f0c6 100644 --- a/examples/groq/custom_graph_groq.py +++ b/examples/groq/custom_graph_groq.py @@ -4,7 +4,7 @@ import os from dotenv import load_dotenv -from scrapegraphai.models import OpenAI +from langchain_openai import 
ChatOpenAI from scrapegraphai.graphs import BaseGraph from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode load_dotenv() @@ -19,11 +19,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False @@ -33,7 +28,7 @@ # Define the graph nodes # ************************************************ -llm_model = OpenAI(graph_config["llm"]) +llm_model = ChatOpenAI(graph_config["llm"]) # define the nodes for the graph robot_node = RobotsNode( @@ -62,14 +57,7 @@ "verbose": True, } ) -rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": llm_model, - "verbose": True, - } -) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], @@ -88,14 +76,12 @@ robot_node, fetch_node, parse_node, - rag_node, generate_answer_node, ], edges=[ (robot_node, fetch_node), (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) + (parse_node, generate_answer_node) ], entry_point=robot_node ) diff --git a/examples/groq/json_scraper_groq.py b/examples/groq/json_scraper_groq.py index 3faddae8..a9099069 100644 --- a/examples/groq/json_scraper_groq.py +++ b/examples/groq/json_scraper_groq.py @@ -30,11 +30,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False diff --git a/examples/groq/json_scraper_multi_groq.py b/examples/groq/json_scraper_multi_groq.py index 13b49be6..df3b9276 100644 --- a/examples/groq/json_scraper_multi_groq.py +++ b/examples/groq/json_scraper_multi_groq.py @@ -15,11 +15,6 @@ 
"model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "library": "beautifulsoup" } diff --git a/examples/groq/pdf_scraper_graph_groq.py b/examples/groq/pdf_scraper_graph_groq.py index a9ca57ee..2560c11e 100644 --- a/examples/groq/pdf_scraper_graph_groq.py +++ b/examples/groq/pdf_scraper_graph_groq.py @@ -18,11 +18,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, } diff --git a/examples/groq/pdf_scraper_multi_groq.py b/examples/groq/pdf_scraper_multi_groq.py index f1afc058..c43a7087 100644 --- a/examples/groq/pdf_scraper_multi_groq.py +++ b/examples/groq/pdf_scraper_multi_groq.py @@ -14,11 +14,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "library": "beautifulsoup" } diff --git a/examples/groq/scrape_plain_text_groq.py b/examples/groq/scrape_plain_text_groq.py index 73cda250..329df51f 100644 --- a/examples/groq/scrape_plain_text_groq.py +++ b/examples/groq/scrape_plain_text_groq.py @@ -32,11 +32,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False diff --git a/examples/groq/script_generator_groq.py b/examples/groq/script_generator_groq.py index a370eb3c..9e280e2b 100644 --- a/examples/groq/script_generator_groq.py +++ b/examples/groq/script_generator_groq.py @@ -19,11 +19,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, 
"temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "library": "beautifulsoup" } diff --git a/examples/groq/script_multi_generator_groq.py b/examples/groq/script_multi_generator_groq.py index 1757a3de..31f4041e 100644 --- a/examples/groq/script_multi_generator_groq.py +++ b/examples/groq/script_multi_generator_groq.py @@ -20,11 +20,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "library": "beautifulsoup" } diff --git a/examples/groq/search_graph_groq.py b/examples/groq/search_graph_groq.py index e82ffb7c..e3044c0e 100644 --- a/examples/groq/search_graph_groq.py +++ b/examples/groq/search_graph_groq.py @@ -21,11 +21,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "headless": False } diff --git a/examples/groq/search_graph_schema_groq.py b/examples/groq/search_graph_schema_groq.py index 41f03dc4..4cc2209d 100644 --- a/examples/groq/search_graph_schema_groq.py +++ b/examples/groq/search_graph_schema_groq.py @@ -34,11 +34,6 @@ class Dishes(BaseModel): "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "headless": False } diff --git a/examples/groq/search_link_graph_groq.py b/examples/groq/search_link_graph_groq.py index f940c2a4..5d82f37f 100644 --- a/examples/groq/search_link_graph_groq.py +++ b/examples/groq/search_link_graph_groq.py @@ -19,11 +19,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - 
"embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "headless": False } diff --git a/examples/groq/smart_scraper_groq.py b/examples/groq/smart_scraper_groq.py index f828cdec..ab38edc0 100644 --- a/examples/groq/smart_scraper_groq.py +++ b/examples/groq/smart_scraper_groq.py @@ -20,11 +20,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "headless": False } diff --git a/examples/groq/smart_scraper_multi_groq.py b/examples/groq/smart_scraper_multi_groq.py index 18ba3992..6ead098c 100644 --- a/examples/groq/smart_scraper_multi_groq.py +++ b/examples/groq/smart_scraper_multi_groq.py @@ -19,11 +19,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False diff --git a/examples/groq/smart_scraper_schema_groq.py b/examples/groq/smart_scraper_schema_groq.py index e0c51c98..f9c1a40b 100644 --- a/examples/groq/smart_scraper_schema_groq.py +++ b/examples/groq/smart_scraper_schema_groq.py @@ -33,11 +33,6 @@ class Projects(BaseModel): "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "headless": False } diff --git a/examples/groq/xml_scraper_graph_multi_groq.py b/examples/groq/xml_scraper_graph_multi_groq.py index 7b102c0f..62540671 100644 --- a/examples/groq/xml_scraper_graph_multi_groq.py +++ b/examples/groq/xml_scraper_graph_multi_groq.py @@ -30,11 +30,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - 
"embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "headless": False } diff --git a/examples/groq/xml_scraper_groq.py b/examples/groq/xml_scraper_groq.py index 1c086175..2172ea77 100644 --- a/examples/groq/xml_scraper_groq.py +++ b/examples/groq/xml_scraper_groq.py @@ -30,11 +30,6 @@ "model": "groq/gemma-7b-it", "api_key": groq_key, "temperature": 0 - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "verbose": True, "headless": False diff --git a/examples/huggingfacehub/csv_scraper_graph_multi_huggingfacehub.py b/examples/huggingfacehub/csv_scraper_graph_multi_huggingfacehub.py index 4517bbe9..48b04dab 100644 --- a/examples/huggingfacehub/csv_scraper_graph_multi_huggingfacehub.py +++ b/examples/huggingfacehub/csv_scraper_graph_multi_huggingfacehub.py @@ -40,7 +40,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } diff --git a/examples/huggingfacehub/csv_scraper_huggingfacehub.py b/examples/huggingfacehub/csv_scraper_huggingfacehub.py index 9d1dbe0b..18ce1194 100644 --- a/examples/huggingfacehub/csv_scraper_huggingfacehub.py +++ b/examples/huggingfacehub/csv_scraper_huggingfacehub.py @@ -43,7 +43,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } # ************************************************ diff --git a/examples/huggingfacehub/custom_graph_huggingfacehub.py b/examples/huggingfacehub/custom_graph_huggingfacehub.py index ad903b5d..0c392cc1 100644 --- a/examples/huggingfacehub/custom_graph_huggingfacehub.py +++ b/examples/huggingfacehub/custom_graph_huggingfacehub.py @@ -33,7 +33,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } 
# ************************************************ diff --git a/examples/huggingfacehub/json_scraper_huggingfacehub.py b/examples/huggingfacehub/json_scraper_huggingfacehub.py index 3a9a163d..d709cc0d 100644 --- a/examples/huggingfacehub/json_scraper_huggingfacehub.py +++ b/examples/huggingfacehub/json_scraper_huggingfacehub.py @@ -43,7 +43,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } # ************************************************ diff --git a/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py b/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py index 8ca3ba51..c029431e 100644 --- a/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py +++ b/examples/huggingfacehub/json_scraper_multi_huggingfacehub.py @@ -24,7 +24,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } FILE_NAME = "inputs/example.json" curr_dir = os.path.dirname(os.path.realpath(__file__)) diff --git a/examples/huggingfacehub/pdf_scraper_graph_huggingfacehub.py b/examples/huggingfacehub/pdf_scraper_graph_huggingfacehub.py index bb2724fe..eb0b1895 100644 --- a/examples/huggingfacehub/pdf_scraper_graph_huggingfacehub.py +++ b/examples/huggingfacehub/pdf_scraper_graph_huggingfacehub.py @@ -25,7 +25,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } source = """ diff --git a/examples/huggingfacehub/pdf_scraper_multi_huggingfacehub.py b/examples/huggingfacehub/pdf_scraper_multi_huggingfacehub.py index d24d522c..4db809b2 100644 --- a/examples/huggingfacehub/pdf_scraper_multi_huggingfacehub.py +++ b/examples/huggingfacehub/pdf_scraper_multi_huggingfacehub.py @@ -23,7 +23,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } # Covert to list diff --git 
a/examples/huggingfacehub/scrape_plain_text_huggingfacehub.py b/examples/huggingfacehub/scrape_plain_text_huggingfacehub.py index f07e5666..76d32cda 100644 --- a/examples/huggingfacehub/scrape_plain_text_huggingfacehub.py +++ b/examples/huggingfacehub/scrape_plain_text_huggingfacehub.py @@ -45,7 +45,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } # ************************************************ diff --git a/examples/huggingfacehub/script_generator_huggingfacehub.py b/examples/huggingfacehub/script_generator_huggingfacehub.py index 4804db93..a3fcaaf4 100644 --- a/examples/huggingfacehub/script_generator_huggingfacehub.py +++ b/examples/huggingfacehub/script_generator_huggingfacehub.py @@ -36,7 +36,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } # ************************************************ # Create the ScriptCreatorGraph instance and run it diff --git a/examples/huggingfacehub/script_multi_generator_huggingfacehub.py b/examples/huggingfacehub/script_multi_generator_huggingfacehub.py index 5afeff0d..0ee89189 100644 --- a/examples/huggingfacehub/script_multi_generator_huggingfacehub.py +++ b/examples/huggingfacehub/script_multi_generator_huggingfacehub.py @@ -33,7 +33,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } # ************************************************ diff --git a/examples/huggingfacehub/search_graph_huggingfacehub.py b/examples/huggingfacehub/search_graph_huggingfacehub.py index b3c58ce5..7c4a0c43 100644 --- a/examples/huggingfacehub/search_graph_huggingfacehub.py +++ b/examples/huggingfacehub/search_graph_huggingfacehub.py @@ -29,7 +29,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } # 
************************************************ diff --git a/examples/huggingfacehub/search_link_graph_huggingfacehub.py b/examples/huggingfacehub/search_link_graph_huggingfacehub.py index a49fb3b9..75b41282 100644 --- a/examples/huggingfacehub/search_link_graph_huggingfacehub.py +++ b/examples/huggingfacehub/search_link_graph_huggingfacehub.py @@ -26,7 +26,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } diff --git a/examples/huggingfacehub/smart_scraper_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_huggingfacehub.py index bd415d41..6f9a863f 100644 --- a/examples/huggingfacehub/smart_scraper_huggingfacehub.py +++ b/examples/huggingfacehub/smart_scraper_huggingfacehub.py @@ -38,7 +38,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } smart_scraper_graph = SmartScraperGraph( diff --git a/examples/huggingfacehub/smart_scraper_multi_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_multi_huggingfacehub.py index e1a332f9..046883a2 100644 --- a/examples/huggingfacehub/smart_scraper_multi_huggingfacehub.py +++ b/examples/huggingfacehub/smart_scraper_multi_huggingfacehub.py @@ -28,7 +28,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } # ******************************************************* diff --git a/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py index 784079e4..31719697 100644 --- a/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py +++ b/examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py @@ -48,7 +48,6 @@ class Projects(BaseModel): graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } smart_scraper_graph = SmartScraperGraph( diff 
--git a/examples/huggingfacehub/xml_scraper_graph_multi_huggingfacehub.py b/examples/huggingfacehub/xml_scraper_graph_multi_huggingfacehub.py index 24d6babd..1a244b86 100644 --- a/examples/huggingfacehub/xml_scraper_graph_multi_huggingfacehub.py +++ b/examples/huggingfacehub/xml_scraper_graph_multi_huggingfacehub.py @@ -40,7 +40,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } # ************************************************ diff --git a/examples/huggingfacehub/xml_scraper_huggingfacehub.py b/examples/huggingfacehub/xml_scraper_huggingfacehub.py index cc8a4425..ddd73b5f 100644 --- a/examples/huggingfacehub/xml_scraper_huggingfacehub.py +++ b/examples/huggingfacehub/xml_scraper_huggingfacehub.py @@ -40,7 +40,6 @@ graph_config = { "llm": {"model_instance": llm_model_instance}, - "embeddings": {"model_instance": embedder_model_instance} } # ************************************************ diff --git a/examples/local_models/custom_graph_ollama.py b/examples/local_models/custom_graph_ollama.py index b9a42949..66dd59b6 100644 --- a/examples/local_models/custom_graph_ollama.py +++ b/examples/local_models/custom_graph_ollama.py @@ -4,7 +4,7 @@ import os from langchain_openai import OpenAIEmbeddings -from scrapegraphai.models import OpenAI +from langchain_openai import ChatOpenAI from scrapegraphai.graphs import BaseGraph from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode @@ -20,11 +20,7 @@ # "model_tokens": 2000, # set context length arbitrarily "base_url": "http://localhost:11434", }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", - }, + "verbose": True, } @@ -32,7 +28,7 @@ # Define the graph nodes # ************************************************ -llm_model = OpenAI(graph_config["llm"]) +llm_model = ChatOpenAI(graph_config["llm"]) embedder = 
OpenAIEmbeddings(api_key=llm_model.openai_api_key) # define the nodes for the graph @@ -62,15 +58,7 @@ "verbose": True, } ) -rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": llm_model, - "embedder_model": embedder, - "verbose": True, - } -) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], @@ -89,14 +77,12 @@ robot_node, fetch_node, parse_node, - rag_node, generate_answer_node, ], edges=[ (robot_node, fetch_node), (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) + (parse_node, generate_answer_node) ], entry_point=robot_node ) diff --git a/examples/local_models/json_scraper_multi_ollama.py b/examples/local_models/json_scraper_multi_ollama.py index 91f4fab4..6e9c3da3 100644 --- a/examples/local_models/json_scraper_multi_ollama.py +++ b/examples/local_models/json_scraper_multi_ollama.py @@ -12,10 +12,6 @@ "format": "json", # Ollama needs the format to be specified explicitly "model_tokens": 4000, }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - }, "verbose": True, "headless": False, } diff --git a/examples/local_models/json_scraper_ollama.py b/examples/local_models/json_scraper_ollama.py index 2dd072ac..ca4eb32e 100644 --- a/examples/local_models/json_scraper_ollama.py +++ b/examples/local_models/json_scraper_ollama.py @@ -31,11 +31,6 @@ # "model_tokens": 2000, # set context length arbitrarily "base_url": "http://localhost:11434", }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", - }, "verbose": True, } diff --git a/examples/local_models/pdf_scraper_multi_ollama.py b/examples/local_models/pdf_scraper_multi_ollama.py index c0b65a63..ce258bf6 100644 --- a/examples/local_models/pdf_scraper_multi_ollama.py +++ b/examples/local_models/pdf_scraper_multi_ollama.py @@ -11,10 +11,6 @@ 
"format": "json", # Ollama needs the format to be specified explicitly "model_tokens": 4000, }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - }, "verbose": True, } diff --git a/examples/local_models/pdf_scraper_ollama.py b/examples/local_models/pdf_scraper_ollama.py index d79afb3a..84eb40f9 100644 --- a/examples/local_models/pdf_scraper_ollama.py +++ b/examples/local_models/pdf_scraper_ollama.py @@ -10,10 +10,6 @@ "format": "json", # Ollama needs the format to be specified explicitly "model_tokens": 4000, }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - }, "verbose": True, "headless": False, } diff --git a/examples/local_models/scrape_plain_text_ollama.py b/examples/local_models/scrape_plain_text_ollama.py index 9700d713..fe24c2a9 100644 --- a/examples/local_models/scrape_plain_text_ollama.py +++ b/examples/local_models/scrape_plain_text_ollama.py @@ -30,11 +30,6 @@ # "model_tokens": 2000, # set context length arbitrarily "base_url": "http://localhost:11434", }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", - }, "verbose": True, } diff --git a/examples/local_models/script_multi_generator_ollama.py b/examples/local_models/script_multi_generator_ollama.py index dc34c910..d94faba6 100644 --- a/examples/local_models/script_multi_generator_ollama.py +++ b/examples/local_models/script_multi_generator_ollama.py @@ -20,11 +20,6 @@ # "model_tokens": 2000, # set context length arbitrarily, "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, "library": "beautifoulsoup", "verbose": True, } diff --git a/examples/local_models/search_graph_ollama.py b/examples/local_models/search_graph_ollama.py index 8ecb60c1..039ca00e 100644 --- a/examples/local_models/search_graph_ollama.py +++ 
b/examples/local_models/search_graph_ollama.py @@ -16,11 +16,6 @@ # "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, "max_results": 5, "verbose": True, } diff --git a/examples/local_models/search_graph_schema_ollama.py b/examples/local_models/search_graph_schema_ollama.py index ae7c0632..fb87954f 100644 --- a/examples/local_models/search_graph_schema_ollama.py +++ b/examples/local_models/search_graph_schema_ollama.py @@ -29,11 +29,6 @@ class Dishes(BaseModel): "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, "verbose": True, "headless": False } diff --git a/examples/local_models/search_link_graph_ollama.py b/examples/local_models/search_link_graph_ollama.py index 5c594270..a05067dd 100644 --- a/examples/local_models/search_link_graph_ollama.py +++ b/examples/local_models/search_link_graph_ollama.py @@ -14,11 +14,7 @@ "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, + "verbose": True, "headless": False } diff --git a/examples/local_models/smart_scraper_ollama.py b/examples/local_models/smart_scraper_ollama.py index c3d60559..01c9f964 100644 --- a/examples/local_models/smart_scraper_ollama.py +++ b/examples/local_models/smart_scraper_ollama.py @@ -14,11 +14,7 @@ "format": "json", # Ollama needs the format to be specified explicitly # 
"base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, + "verbose": True, "headless": False } diff --git a/examples/local_models/smart_scraper_schema_ollama.py b/examples/local_models/smart_scraper_schema_ollama.py index 7168d513..5fcff433 100644 --- a/examples/local_models/smart_scraper_schema_ollama.py +++ b/examples/local_models/smart_scraper_schema_ollama.py @@ -24,11 +24,7 @@ class Projects(BaseModel): "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - # "base_url": "http://localhost:11434", # set ollama URL arbitrarily - }, + "verbose": True, "headless": False } diff --git a/examples/local_models/xml_scraper_graph_multi_ollama.py b/examples/local_models/xml_scraper_graph_multi_ollama.py index d84c6c9f..0494ff2c 100644 --- a/examples/local_models/xml_scraper_graph_multi_ollama.py +++ b/examples/local_models/xml_scraper_graph_multi_ollama.py @@ -29,11 +29,7 @@ # "model_tokens": 2000, # set context length arbitrarily "base_url": "http://localhost:11434", }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", - }, + "verbose": True, } diff --git a/examples/local_models/xml_scraper_ollama.py b/examples/local_models/xml_scraper_ollama.py index cc8c3ad9..50c4f8e7 100644 --- a/examples/local_models/xml_scraper_ollama.py +++ b/examples/local_models/xml_scraper_ollama.py @@ -30,11 +30,6 @@ # "model_tokens": 2000, # set context length arbitrarily "base_url": "http://localhost:11434", }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", - }, "verbose": True, } diff --git a/scrapegraphai/graphs/abstract_graph.py 
b/scrapegraphai/graphs/abstract_graph.py index 4ed08057..0348b3cc 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -29,8 +29,6 @@ class AbstractGraph(ABC): config (dict): Configuration parameters for the graph. schema (BaseModel): The schema for the graph output. llm_model: An instance of a language model client, configured for generating answers. - embedder_model: An instance of an embedding model client, - configured for generating embeddings. verbose (bool): A flag indicating whether to show print statements during execution. headless (bool): A flag indicating whether to run the graph in headless mode. @@ -85,7 +83,6 @@ def __init__(self, prompt: str, config: dict, "verbose": self.verbose, "loader_kwargs": self.loader_kwargs, "llm_model": self.llm_model, - "embedder_model": self.embedder_model, "cache_path": self.cache_path, } From 9ac74de6226b701149f8ae67ace61e7793e2f712 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 14:47:49 +0200 Subject: [PATCH 42/51] Update speech_graph.py --- scrapegraphai/graphs/speech_graph.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index 8fc532cd..2ba10db9 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -11,7 +11,6 @@ from ..nodes import ( FetchNode, ParseNode, - RAGNode, GenerateAnswerNode, TextToSpeechNode, ) @@ -72,13 +71,6 @@ def _create_graph(self) -> BaseGraph: "chunk_size": self.model_token } ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model } - ) generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], @@ -100,14 +92,12 @@ def _create_graph(self) -> BaseGraph: nodes=[ fetch_node, parse_node, - 
rag_node, generate_answer_node, text_to_speech_node ], edges=[ (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node), + (parse_node, generate_answer_node), (generate_answer_node, text_to_speech_node) ], entry_point=fetch_node, From 53dc06a2cbbe2cfd64b62d56c60490ae58f06d17 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 14:48:19 +0200 Subject: [PATCH 43/51] Update speech_graph.py --- scrapegraphai/graphs/speech_graph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index 2ba10db9..d1d6f94b 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -111,7 +111,7 @@ def run(self) -> str: Returns: str: The answer to the prompt. """ - + inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) @@ -122,4 +122,4 @@ def run(self) -> str: "output_path", "output.mp3")) print(f"Audio saved to {self.config.get('output_path', 'output.mp3')}") - return self.final_state.get("answer", "No answer found.") \ No newline at end of file + return self.final_state.get("answer", "No answer found.") From 55f706f3d5f4a8afe9dd8fc9ce9bd527f8a11894 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 1 Aug 2024 12:53:01 +0000 Subject: [PATCH 44/51] ci(release): 1.11.0-beta.7 [skip ci] ## [1.11.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.6...v1.11.0-beta.7) (2024-08-01) ### Bug Fixes * abstract_graph and removed unused embeddings ([0b4cfd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0b4cfd6522dcad0eb418f0badd0f7824a1efd534)) ### Refactor * move embeddings code from AbstractGraph to RAGNode ([a94ebcd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a94ebcde0078d66d33e67f7e0a87850efb92d408)) * reuse code for common interface models 
([bb73d91](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bb73d916a1a7b378438038ec928eeda6d8f6ac9d)) --- CHANGELOG.md | 13 +++++++++++++ pyproject.toml | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 53e36c8a..52eccaa2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,16 @@ +## [1.11.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.6...v1.11.0-beta.7) (2024-08-01) + + +### Bug Fixes + +* abstract_graph and removed unused embeddings ([0b4cfd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0b4cfd6522dcad0eb418f0badd0f7824a1efd534)) + + +### Refactor + +* move embeddings code from AbstractGraph to RAGNode ([a94ebcd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a94ebcde0078d66d33e67f7e0a87850efb92d408)) +* reuse code for common interface models ([bb73d91](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bb73d916a1a7b378438038ec928eeda6d8f6ac9d)) + ## [1.11.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.5...v1.11.0-beta.6) (2024-07-31) diff --git a/pyproject.toml b/pyproject.toml index 4a7fe29f..56acf3d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.11.0b6" +version = "1.11.0b7" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
authors = [ From 3e07f6273fae667b2f663be1cdd5e9c068f4c59f Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 1 Aug 2024 13:17:34 +0000 Subject: [PATCH 45/51] ci(release): 1.11.0-beta.8 [skip ci] ## [1.11.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.7...v1.11.0-beta.8) (2024-08-01) ### Features * add integration in the abstract grapgh ([5ecdbe7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5ecdbe715f4bb223fa1be834fda07ccea2a51cb9)) ### Bug Fixes * fixed bug on fetch_node ([968c69e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/968c69e217d9c180b9b8c2aa52ca59b9a1733525)) --- CHANGELOG.md | 12 ++++++++++++ pyproject.toml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52eccaa2..d2cdb565 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +## [1.11.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.7...v1.11.0-beta.8) (2024-08-01) + + +### Features + +* add integration in the abstract grapgh ([5ecdbe7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5ecdbe715f4bb223fa1be834fda07ccea2a51cb9)) + + +### Bug Fixes + +* fixed bug on fetch_node ([968c69e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/968c69e217d9c180b9b8c2aa52ca59b9a1733525)) + ## [1.11.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.6...v1.11.0-beta.7) (2024-08-01) diff --git a/pyproject.toml b/pyproject.toml index 56acf3d7..431488e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.11.0b7" +version = "1.11.0b8" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
authors = [ From 9355507a2dc73342f325b6649e871df48ae13567 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 2 Aug 2024 12:00:00 +0200 Subject: [PATCH 46/51] feat: refactoring of the code --- scrapegraphai/nodes/base_node.py | 6 +++-- scrapegraphai/nodes/fetch_node.py | 12 ++++----- .../nodes/generate_answer_csv_node.py | 6 ----- scrapegraphai/nodes/generate_answer_node.py | 3 --- .../nodes/generate_answer_omni_node.py | 4 +-- .../nodes/generate_answer_pdf_node.py | 5 ---- scrapegraphai/nodes/generate_scraper_node.py | 2 -- scrapegraphai/nodes/get_probable_tags_node.py | 1 - scrapegraphai/nodes/graph_iterator_node.py | 14 +++++------ scrapegraphai/nodes/image_to_text_node.py | 4 +-- scrapegraphai/nodes/merge_answers_node.py | 8 ------ .../nodes/merge_generated_scripts.py | 6 ----- scrapegraphai/nodes/parse_node.py | 12 ++++----- scrapegraphai/nodes/robots_node.py | 8 +----- scrapegraphai/nodes/search_internet_node.py | 2 -- scrapegraphai/nodes/search_link_node.py | 6 ----- .../nodes/search_node_with_context.py | 1 - scrapegraphai/nodes/text_to_speech_node.py | 3 --- scrapegraphai/utils/convert_to_md.py | 2 +- scrapegraphai/utils/logging.py | 22 ++++++++-------- scrapegraphai/utils/parse_state_keys.py | 13 ++++++---- scrapegraphai/utils/proxy_rotation.py | 1 - scrapegraphai/utils/research_web.py | 25 +++++++++++-------- scrapegraphai/utils/sys_dynamic_import.py | 5 +--- scrapegraphai/utils/token_calculator.py | 3 ++- 25 files changed, 65 insertions(+), 109 deletions(-) diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py index d1b59500..26fc44c4 100644 --- a/scrapegraphai/nodes/base_node.py +++ b/scrapegraphai/nodes/base_node.py @@ -86,7 +86,8 @@ def update_config(self, params: dict, overwrite: bool = False): Args: param (dict): The dictionary to update node_config with. - overwrite (bool): Flag indicating if the values of node_config should be overwritten if their value is not None. 
+ overwrite (bool): Flag indicating if the values of node_config + should be overwritten if their value is not None. """ for key, val in params.items(): if hasattr(self, key) and not overwrite: @@ -133,7 +134,8 @@ def _validate_input_keys(self, input_keys): def _parse_input_keys(self, state: dict, expression: str) -> List[str]: """ - Parses the input keys expression to extract relevant keys from the state based on logical conditions. + Parses the input keys expression to extract + relevant keys from the state based on logical conditions. The expression can contain AND (&), OR (|), and parentheses to group conditions. Args: diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 4971ddb3..11cbb5fb 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -133,7 +133,7 @@ def execute(self, state): state.update({self.output[0]: compressed_document}) return state elif input_keys[0] == "json": - f = open(source) + f = open(source, encoding="utf-8") compressed_document = [ Document(page_content=str(json.load(f)), metadata={"source": "json"}) ] @@ -181,12 +181,11 @@ def execute(self, state): if not response.text.strip(): raise ValueError("No HTML body content found in the response.") - parsed_content = response - if not self.cut: parsed_content = cleanup_html(response, source) - if (isinstance(self.llm_model, ChatOpenAI) and not self.script_creator) or (self.force and not self.script_creator): + if (isinstance(self.llm_model, ChatOpenAI) + and not self.script_creator) or (self.force and not self.script_creator): parsed_content = convert_to_md(source, input_data[0]) compressed_document = [Document(page_content=parsed_content)] else: @@ -205,7 +204,8 @@ def execute(self, state): data = browser_base_fetch(self.browser_base.get("api_key"), self.browser_base.get("project_id"), [source]) - document = [Document(page_content=content, metadata={"source": source}) for content in data] + document = 
[Document(page_content=content, + metadata={"source": source}) for content in data] else: loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) document = loader.load() @@ -215,10 +215,8 @@ def execute(self, state): parsed_content = document[0].page_content if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled: - parsed_content = convert_to_md(document[0].page_content, input_data[0]) - compressed_document = [ Document(page_content=parsed_content, metadata={"source": "html file"}) ] diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index 6ce19ef2..a91dae3f 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -3,18 +3,12 @@ Module for generating the answer node """ -# Imports from standard library from typing import List, Optional - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm - from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode from ..helpers.generate_answer_node_csv_prompts import template_chunks_csv, template_no_chunks_csv, template_merge_csv diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 12ae6f0f..9c530688 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,7 +1,6 @@ """ GenerateAnswerNode Module """ -import asyncio from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser @@ -9,7 +8,6 @@ from langchain_openai import ChatOpenAI from langchain_community.chat_models import ChatOllama from tqdm import tqdm -from langchain_openai 
import ChatOpenAI from ..utils.logging import get_logger from .base_node import BaseNode from ..helpers import template_chunks, template_no_chunks, template_merge, template_chunks_md, template_no_chunks_md, template_merge_md @@ -130,7 +128,6 @@ def execute(self, state: dict) -> dict: partial_variables={"context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions}) - # Add chain to dictionary with dynamic name chain_name = f"chunk{i+1}" chains_dict[chain_name] = prompt | self.llm_model | output_parser diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py index 98be26dd..93e96f4e 100644 --- a/scrapegraphai/nodes/generate_answer_omni_node.py +++ b/scrapegraphai/nodes/generate_answer_omni_node.py @@ -113,7 +113,7 @@ def execute(self, state: dict) -> dict: chain = prompt | self.llm_model | output_parser answer = chain.invoke({"question": user_prompt}) - + state.update({self.output[0]: answer}) return state @@ -148,4 +148,4 @@ def execute(self, state: dict) -> dict: answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) state.update({self.output[0]: answer}) - return state \ No newline at end of file + return state diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index 47f14e86..4cef7ae9 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -2,18 +2,13 @@ Module for generating the answer node """ -# Imports from standard library from typing import List, Optional - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm from langchain_community.chat_models import ChatOllama from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode from 
..helpers.generate_answer_node_pdf_prompts import template_chunks_pdf, template_no_chunks_pdf, template_merge_pdf diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py index 393f5e90..733898bd 100644 --- a/scrapegraphai/nodes/generate_scraper_node.py +++ b/scrapegraphai/nodes/generate_scraper_node.py @@ -83,7 +83,6 @@ def execute(self, state: dict) -> dict: user_prompt = input_data[0] doc = input_data[1] - # schema to be used for output parsing if self.node_config.get("schema", None) is not None: output_schema = JsonOutputParser(pydantic_object=self.node_config["schema"]) else: @@ -130,7 +129,6 @@ def execute(self, state: dict) -> dict: ) map_chain = prompt | self.llm_model | StrOutputParser() - # Chain answer = map_chain.invoke({"question": user_prompt}) state.update({self.output[0]: answer}) diff --git a/scrapegraphai/nodes/get_probable_tags_node.py b/scrapegraphai/nodes/get_probable_tags_node.py index f31633c0..38c2ba15 100644 --- a/scrapegraphai/nodes/get_probable_tags_node.py +++ b/scrapegraphai/nodes/get_probable_tags_node.py @@ -1,7 +1,6 @@ """ GetProbableTagsNode Module """ - from typing import List, Optional from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py index 061be77a..6ce4bdaf 100644 --- a/scrapegraphai/nodes/graph_iterator_node.py +++ b/scrapegraphai/nodes/graph_iterator_node.py @@ -5,13 +5,11 @@ import asyncio import copy from typing import List, Optional - from tqdm.asyncio import tqdm - from ..utils.logging import get_logger from .base_node import BaseNode -_default_batchsize = 16 +DEFAULT_BATCHSIZE = 16 class GraphIteratorNode(BaseNode): @@ -51,13 +49,15 @@ def execute(self, state: dict) -> dict: the correct data from the state. Returns: - dict: The updated state with the output key containing the results of the graph instances. 
+ dict: The updated state with the output key c + ontaining the results of the graph instances. Raises: - KeyError: If the input keys are not found in the state, indicating that the - necessary information for running the graph instances is missing. + KeyError: If the input keys are not found in the state, + indicating that thenecessary information for running + the graph instances is missing. """ - batchsize = self.node_config.get("batchsize", _default_batchsize) + batchsize = self.node_config.get("batchsize", DEFAULT_BATCHSIZE) self.logger.info( f"--- Executing {self.node_name} Node with batchsize {batchsize} ---" diff --git a/scrapegraphai/nodes/image_to_text_node.py b/scrapegraphai/nodes/image_to_text_node.py index 7e7507a9..c1a69390 100644 --- a/scrapegraphai/nodes/image_to_text_node.py +++ b/scrapegraphai/nodes/image_to_text_node.py @@ -3,14 +3,14 @@ """ from typing import List, Optional - from ..utils.logging import get_logger from .base_node import BaseNode class ImageToTextNode(BaseNode): """ - Retrieve images from a list of URLs and return a description of the images using an image-to-text model. + Retrieve images from a list of URLs and return a description of + the images using an image-to-text model. Attributes: llm_model: An instance of the language model client used for image-to-text conversion. 
diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py index 0efd8ec8..548b7c04 100644 --- a/scrapegraphai/nodes/merge_answers_node.py +++ b/scrapegraphai/nodes/merge_answers_node.py @@ -2,18 +2,10 @@ MergeAnswersNode Module """ -# Imports from standard library from typing import List, Optional -from tqdm import tqdm - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser -from tqdm import tqdm - from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode diff --git a/scrapegraphai/nodes/merge_generated_scripts.py b/scrapegraphai/nodes/merge_generated_scripts.py index cfda3960..8c8eaecd 100644 --- a/scrapegraphai/nodes/merge_generated_scripts.py +++ b/scrapegraphai/nodes/merge_generated_scripts.py @@ -5,15 +5,9 @@ # Imports from standard library from typing import List, Optional from tqdm import tqdm - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser, StrOutputParser -from tqdm import tqdm - from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index b5418717..d1bb87bd 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -75,23 +75,23 @@ def execute(self, state: dict) -> dict: chunks = chunk(text=docs_transformed.page_content, chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda x: len(x), + token_counter= lambda x: len(x), memoize=False) else: docs_transformed = docs_transformed[0] - if type(docs_transformed) == Document: + if isinstance(docs_transformed, Document): chunks = chunk(text=docs_transformed.page_content, chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda x: len(x), + token_counter= lambda x: 
len(x), memoize=False) else: - + chunks = chunk(text=docs_transformed, chunk_size= self.node_config.get("chunk_size", 4096)-250, - token_counter=lambda x: len(x), + token_counter= lambda x: len(x), memoize=False) - + state.update({self.output[0]: chunks}) return state diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index 66231600..7fa2fe6b 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -4,15 +4,9 @@ from typing import List, Optional from urllib.parse import urlparse - from langchain_community.document_loaders import AsyncChromiumLoader from langchain.prompts import PromptTemplate from langchain.output_parsers import CommaSeparatedListOutputParser - -from langchain.output_parsers import CommaSeparatedListOutputParser -from langchain.prompts import PromptTemplate -from langchain_community.document_loaders import AsyncChromiumLoader - from ..helpers import robots_dictionary from ..utils.logging import get_logger from .base_node import BaseNode @@ -146,4 +140,4 @@ def execute(self, state: dict) -> dict: self.logger.warning("\033[32m(Scraping this website is allowed)\033[0m") state.update({self.output[0]: is_scrapable}) - return state \ No newline at end of file + return state diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 7588b995..61b11995 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -1,9 +1,7 @@ """ SearchInternetNode Module """ - from typing import List, Optional - from langchain.output_parsers import CommaSeparatedListOutputParser from langchain.prompts import PromptTemplate from langchain_community.chat_models import ChatOllama diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index b3d289d9..6fbe51dd 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -2,19 +2,13 
@@ SearchLinkNode Module """ -# Imports from standard library from typing import List, Optional import re from tqdm import tqdm - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel - from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode diff --git a/scrapegraphai/nodes/search_node_with_context.py b/scrapegraphai/nodes/search_node_with_context.py index 62de184a..678e44ae 100644 --- a/scrapegraphai/nodes/search_node_with_context.py +++ b/scrapegraphai/nodes/search_node_with_context.py @@ -67,7 +67,6 @@ def execute(self, state: dict) -> dict: # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] - user_prompt = input_data[0] doc = input_data[1] output_parser = CommaSeparatedListOutputParser() diff --git a/scrapegraphai/nodes/text_to_speech_node.py b/scrapegraphai/nodes/text_to_speech_node.py index 59e3fb8b..e8e43cb5 100644 --- a/scrapegraphai/nodes/text_to_speech_node.py +++ b/scrapegraphai/nodes/text_to_speech_node.py @@ -1,13 +1,10 @@ """ TextToSpeechNode Module """ - from typing import List, Optional - from ..utils.logging import get_logger from .base_node import BaseNode - class TextToSpeechNode(BaseNode): """ Converts text to speech using the specified text-to-speech model. diff --git a/scrapegraphai/utils/convert_to_md.py b/scrapegraphai/utils/convert_to_md.py index 6f1a2334..74478bcc 100644 --- a/scrapegraphai/utils/convert_to_md.py +++ b/scrapegraphai/utils/convert_to_md.py @@ -1,8 +1,8 @@ """ convert_to_md modul """ -import html2text from urllib.parse import urlparse +import html2text def convert_to_md(html: str, url: str = None) -> str: """ Convert HTML to Markdown. 
diff --git a/scrapegraphai/utils/logging.py b/scrapegraphai/utils/logging.py index 2684d0b1..afb63c52 100644 --- a/scrapegraphai/utils/logging.py +++ b/scrapegraphai/utils/logging.py @@ -12,7 +12,7 @@ _library_name = __name__.split(".", maxsplit=1)[0] -_default_handler = None +DEFAULT_HANDLER = None _default_logging_level = logging.WARNING _semaphore = threading.Lock() @@ -23,22 +23,22 @@ def _get_library_root_logger() -> logging.Logger: def _set_library_root_logger() -> None: - global _default_handler + global DEFAULT_HANDLER with _semaphore: - if _default_handler: + if DEFAULT_HANDLER: return - _default_handler = logging.StreamHandler() # sys.stderr as stream + DEFAULT_HANDLER = logging.StreamHandler() # sys.stderr as stream # https://github.com/pyinstaller/pyinstaller/issues/7334#issuecomment-1357447176 if sys.stderr is None: - sys.stderr = open(os.devnull, "w") + sys.stderr = open(os.devnull, "w", encoding="utf-8") - _default_handler.flush = sys.stderr.flush + DEFAULT_HANDLER.flush = sys.stderr.flush library_root_logger = _get_library_root_logger() - library_root_logger.addHandler(_default_handler) + library_root_logger.addHandler(DEFAULT_HANDLER) library_root_logger.setLevel(_default_logging_level) library_root_logger.propagate = False @@ -86,8 +86,8 @@ def set_handler(handler: logging.Handler) -> None: _get_library_root_logger().addHandler(handler) -def set_default_handler() -> None: - set_handler(_default_handler) +def setDEFAULT_HANDLER() -> None: + set_handler(DEFAULT_HANDLER) def unset_handler(handler: logging.Handler) -> None: @@ -98,8 +98,8 @@ def unset_handler(handler: logging.Handler) -> None: _get_library_root_logger().removeHandler(handler) -def unset_default_handler() -> None: - unset_handler(_default_handler) +def unsetDEFAULT_HANDLER() -> None: + unset_handler(DEFAULT_HANDLER) def set_propagation() -> None: diff --git a/scrapegraphai/utils/parse_state_keys.py b/scrapegraphai/utils/parse_state_keys.py index 85712ef6..107397e9 100644 --- 
a/scrapegraphai/utils/parse_state_keys.py +++ b/scrapegraphai/utils/parse_state_keys.py @@ -13,19 +13,22 @@ def parse_expression(expression, state: dict) -> list: state (dict): Dictionary of state keys used to evaluate the expression. Raises: - ValueError: If the expression is empty, has adjacent state keys without operators, invalid operator usage, - unbalanced parentheses, or if no state keys match the expression. + ValueError: If the expression is empty, has adjacent state keys without operators, + invalid operator usage, unbalanced parentheses, or if no state keys match the expression. Returns: - list: A list of state keys that match the boolean expression, ensuring each key appears only once. + list: A list of state keys that match the boolean expression, + ensuring each key appears only once. Example: >>> parse_expression("user_input & (relevant_chunks | parsed_document | document)", {"user_input": None, "document": None, "parsed_document": None, "relevant_chunks": None}) ['user_input', 'relevant_chunks', 'parsed_document', 'document'] - This function evaluates the expression to determine the logical inclusion of state keys based on provided boolean logic. - It checks for syntax errors such as unbalanced parentheses, incorrect adjacency of operators, and empty expressions. + This function evaluates the expression to determine the + logical inclusion of state keys based on provided boolean logic. + It checks for syntax errors such as unbalanced parentheses, + incorrect adjacency of operators, and empty expressions. 
""" # Check for empty expression diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py index 07e04d0f..6f6019e9 100644 --- a/scrapegraphai/utils/proxy_rotation.py +++ b/scrapegraphai/utils/proxy_rotation.py @@ -6,7 +6,6 @@ import random import re from typing import List, Optional, Set, TypedDict - import requests from fp.errors import FreeProxyException from fp.fp import FreeProxy diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index 101693e4..fe7902d3 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -1,3 +1,6 @@ +""" +Research_web module +""" import re from typing import List from langchain_community.tools import DuckDuckGoSearchResults @@ -5,13 +8,15 @@ import requests from bs4 import BeautifulSoup -def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080) -> List[str]: +def search_on_web(query: str, search_engine: str = "Google", + max_results: int = 10, port: int = 8080) -> List[str]: """ Searches the web for a given query using specified search engine options. Args: query (str): The search query to find on the internet. - search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. + search_engine (str, optional): Specifies the search engine to use, + options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. max_results (int, optional): The maximum number of search results to return. port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080. @@ -25,19 +30,19 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = >>> search_on_web("example query", search_engine="Google", max_results=5) ['http://example.com', 'http://example.org', ...] 
""" - + if search_engine.lower() == "google": res = [] for url in google_search(query, stop=max_results): res.append(url) return res - + elif search_engine.lower() == "duckduckgo": research = DuckDuckGoSearchResults(max_results=max_results) res = research.run(query) links = re.findall(r'https?://[^\s,\]]+', res) return links - + elif search_engine.lower() == "bing": headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" @@ -46,24 +51,24 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = response = requests.get(search_url, headers=headers) response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") - + search_results = [] for result in soup.find_all('li', class_='b_algo', limit=max_results): link = result.find('a')['href'] search_results.append(link) return search_results - + elif search_engine.lower() == "searxng": url = f"http://localhost:{port}" params = {"q": query, "format": "json"} - + # Send the GET request to the server response = requests.get(url, params=params) - + # Parse the response and limit to the specified max_results data = response.json() limited_results = data["results"][:max_results] return limited_results - + else: raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG") diff --git a/scrapegraphai/utils/sys_dynamic_import.py b/scrapegraphai/utils/sys_dynamic_import.py index 30f75d15..19b0d29a 100644 --- a/scrapegraphai/utils/sys_dynamic_import.py +++ b/scrapegraphai/utils/sys_dynamic_import.py @@ -5,7 +5,7 @@ import sys import typing - +import importlib.util # noqa: F401 if typing.TYPE_CHECKING: import types @@ -24,9 +24,6 @@ def srcfile_import(modpath: str, modname: str) -> "types.ModuleType": Raises: ImportError: If the module cannot be imported from the srcfile """ - import importlib.util # noqa: F401 - - # spec = importlib.util.spec_from_file_location(modname, 
modpath) if spec is None: diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py index 5b23fdf4..c5263efe 100644 --- a/scrapegraphai/utils/token_calculator.py +++ b/scrapegraphai/utils/token_calculator.py @@ -22,7 +22,8 @@ def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str] >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING") ["This is a sample text", "for truncation."] - This function ensures that each chunk of text can be tokenized by the specified model without exceeding the model's token limit. + This function ensures that each chunk of text can be tokenized + by the specified model without exceeding the model's token limit. """ encoding = tiktoken.get_encoding(encoding_name) From 4440790f00c1ddd416add7af895756ab42c30bf3 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Fri, 2 Aug 2024 10:01:32 +0000 Subject: [PATCH 47/51] ci(release): 1.11.0-beta.9 [skip ci] ## [1.11.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.8...v1.11.0-beta.9) (2024-08-02) ### Features * refactoring of the code ([9355507](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9355507a2dc73342f325b6649e871df48ae13567)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2cdb565..481eeae6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.11.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.8...v1.11.0-beta.9) (2024-08-02) + + +### Features + +* refactoring of the code ([9355507](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9355507a2dc73342f325b6649e871df48ae13567)) + ## [1.11.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.7...v1.11.0-beta.8) (2024-08-01) diff --git a/pyproject.toml b/pyproject.toml index 431488e5..70be09b5 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.11.0b8" +version = "1.11.0b9" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ From ade28fca2c3fdf40f28a80854e3b8435a52a6930 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Fri, 2 Aug 2024 12:15:43 +0200 Subject: [PATCH 48/51] fix(AbstractGraph): instantiation of Azure GPT models Closes #498 --- requirements-dev.lock | 1 + requirements.lock | 1 + requirements.txt | 1 + scrapegraphai/graphs/abstract_graph.py | 8 ++++---- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/requirements-dev.lock b/requirements-dev.lock index 24b7156d..d14f9d42 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -185,6 +185,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 diff --git a/requirements.lock b/requirements.lock index 0e8bb930..7dbac1f3 100644 --- a/requirements.lock +++ b/requirements.lock @@ -133,6 +133,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 diff --git a/requirements.txt b/requirements.txt index 8f3f5da5..9c11363c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,3 +23,4 @@ semchunk>=1.0.1 langchain-fireworks>=0.1.3 langchain-community>=0.2.9 langchain-huggingface>=0.0.3 +browserbase==0.3.0 diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index a7493351..f07bcb10 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -146,6 +146,10 @@ def handle_model(model_name, provider, token_key, default_token=8192): llm_params["model"] = model_name return init_chat_model(**llm_params) + if "azure" in llm_params["model"]: + model_name 
= llm_params["model"].split("/")[-1] + return handle_model(model_name, "azure_openai", model_name) + if "gpt-" in llm_params["model"]: return handle_model(llm_params["model"], "openai", llm_params["model"]) @@ -154,10 +158,6 @@ def handle_model(model_name, provider, token_key, default_token=8192): token_key = llm_params["model"].split("/")[-1] return handle_model(model_name, "fireworks", token_key) - if "azure" in llm_params["model"]: - model_name = llm_params["model"].split("/")[-1] - return handle_model(model_name, "azure_openai", model_name) - if "gemini" in llm_params["model"]: model_name = llm_params["model"].split("/")[-1] return handle_model(model_name, "google_genai", model_name) From ee30a83f8a77958be6881ca0a94b02d278f37a61 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Fri, 2 Aug 2024 12:33:59 +0000 Subject: [PATCH 49/51] ci(release): 1.11.0-beta.10 [skip ci] ## [1.11.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.9...v1.11.0-beta.10) (2024-08-02) ### Bug Fixes * **AbstractGraph:** instantiation of Azure GPT models ([ade28fc](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ade28fca2c3fdf40f28a80854e3b8435a52a6930)), closes [#498](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/498) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 481eeae6..cf226b3c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.11.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.9...v1.11.0-beta.10) (2024-08-02) + + +### Bug Fixes + +* **AbstractGraph:** instantiation of Azure GPT models ([ade28fc](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ade28fca2c3fdf40f28a80854e3b8435a52a6930)), closes [#498](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/498) + ## [1.11.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.11.0-beta.8...v1.11.0-beta.9) 
(2024-08-02) diff --git a/pyproject.toml b/pyproject.toml index 70be09b5..576861bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.11.0b9" +version = "1.11.0b10" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ From a3d7f1b71c0633e2c880f58e210b9516b331fe56 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 5 Aug 2024 14:54:14 +0200 Subject: [PATCH 50/51] Update csv_scraper_graph.py --- scrapegraphai/graphs/csv_scraper_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py index f4efd1fb..d7ec186e 100644 --- a/scrapegraphai/graphs/csv_scraper_graph.py +++ b/scrapegraphai/graphs/csv_scraper_graph.py @@ -36,7 +36,7 @@ def _create_graph(self): input="csv | csv_dir", output=["doc"], ) - + generate_answer_node = GenerateAnswerCSVNode( input="user_prompt & (relevant_chunks | doc)", output=["answer"], From 66a29bc5cc2e8fc43409ca6fa6de928ccad78802 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 5 Aug 2024 20:54:53 +0200 Subject: [PATCH 51/51] refactoring of the openai examples Co-Authored-By: Matteo Vedovati <68272450+vedovati-matteo@users.noreply.github.com> --- examples/openai/csv_scraper_graph_multi_openai.py | 2 +- examples/openai/csv_scraper_openai.py | 2 +- examples/openai/custom_graph_openai.py | 2 +- examples/openai/deep_scraper_openai.py | 2 +- examples/openai/json_scraper_multi_openai.py | 2 +- examples/openai/json_scraper_openai.py | 2 +- examples/openai/md_scraper_openai.py | 2 +- examples/openai/pdf_scraper_multi_openai.py | 7 +++---- examples/openai/pdf_scraper_openai.py | 2 +- examples/openai/scrape_plain_text_openai.py | 2 +- examples/openai/script_generator_openai.py | 2 +- examples/openai/search_graph_schema_openai.py | 7 +++---- examples/openai/search_link_graph_openai.py | 11 
+++++++++-- examples/openai/smart_scraper_schema_openai.py | 2 +- examples/openai/speech_graph_openai.py | 2 +- examples/openai/xml_scraper_graph_multi_openai.py | 2 +- examples/openai/xml_scraper_openai.py | 2 +- scrapegraphai/graphs/search_graph.py | 1 - 18 files changed, 29 insertions(+), 25 deletions(-) diff --git a/examples/openai/csv_scraper_graph_multi_openai.py b/examples/openai/csv_scraper_graph_multi_openai.py index 771ad679..7b91c896 100644 --- a/examples/openai/csv_scraper_graph_multi_openai.py +++ b/examples/openai/csv_scraper_graph_multi_openai.py @@ -27,7 +27,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, } diff --git a/examples/openai/csv_scraper_openai.py b/examples/openai/csv_scraper_openai.py index 211f14f9..744fc7a4 100644 --- a/examples/openai/csv_scraper_openai.py +++ b/examples/openai/csv_scraper_openai.py @@ -28,7 +28,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, } diff --git a/examples/openai/custom_graph_openai.py b/examples/openai/custom_graph_openai.py index 905473e0..6687e0ef 100644 --- a/examples/openai/custom_graph_openai.py +++ b/examples/openai/custom_graph_openai.py @@ -19,7 +19,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, } diff --git a/examples/openai/deep_scraper_openai.py b/examples/openai/deep_scraper_openai.py index 4860a31f..5b7202d4 100644 --- a/examples/openai/deep_scraper_openai.py +++ b/examples/openai/deep_scraper_openai.py @@ -18,7 +18,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-4", + "model": "gpt-4o", }, "verbose": True, "max_depth": 1 diff --git a/examples/openai/json_scraper_multi_openai.py b/examples/openai/json_scraper_multi_openai.py index 021cd6e1..b27e5050 100644 --- a/examples/openai/json_scraper_multi_openai.py +++ b/examples/openai/json_scraper_multi_openai.py @@ -13,7 +13,7 @@ graph_config = { "llm": { 
"api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", } } diff --git a/examples/openai/json_scraper_openai.py b/examples/openai/json_scraper_openai.py index 25fc85af..eb5d1e7e 100644 --- a/examples/openai/json_scraper_openai.py +++ b/examples/openai/json_scraper_openai.py @@ -28,7 +28,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, } diff --git a/examples/openai/md_scraper_openai.py b/examples/openai/md_scraper_openai.py index 7a163137..c3e2a3df 100644 --- a/examples/openai/md_scraper_openai.py +++ b/examples/openai/md_scraper_openai.py @@ -28,7 +28,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, } diff --git a/examples/openai/pdf_scraper_multi_openai.py b/examples/openai/pdf_scraper_multi_openai.py index 9e699e58..49a9c7fa 100644 --- a/examples/openai/pdf_scraper_multi_openai.py +++ b/examples/openai/pdf_scraper_multi_openai.py @@ -3,11 +3,10 @@ """ import os import json +from typing import List from dotenv import load_dotenv -from scrapegraphai.graphs import PdfScraperMultiGraph - from pydantic import BaseModel, Field -from typing import List +from scrapegraphai.graphs import PdfScraperMultiGraph load_dotenv() @@ -20,7 +19,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, "verbose": True, } diff --git a/examples/openai/pdf_scraper_openai.py b/examples/openai/pdf_scraper_openai.py index e07a7ab5..2b0e19f3 100644 --- a/examples/openai/pdf_scraper_openai.py +++ b/examples/openai/pdf_scraper_openai.py @@ -14,7 +14,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, "verbose": True, } diff --git a/examples/openai/scrape_plain_text_openai.py b/examples/openai/scrape_plain_text_openai.py index ffe0054a..7f390cff 100644 --- a/examples/openai/scrape_plain_text_openai.py +++ b/examples/openai/scrape_plain_text_openai.py @@ -30,7 
+30,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, } diff --git a/examples/openai/script_generator_openai.py b/examples/openai/script_generator_openai.py index 14c00ab4..046a25ec 100644 --- a/examples/openai/script_generator_openai.py +++ b/examples/openai/script_generator_openai.py @@ -18,7 +18,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, "library": "beautifulsoup" } diff --git a/examples/openai/search_graph_schema_openai.py b/examples/openai/search_graph_schema_openai.py index e5131461..ecbcc644 100644 --- a/examples/openai/search_graph_schema_openai.py +++ b/examples/openai/search_graph_schema_openai.py @@ -3,14 +3,13 @@ """ import os +from typing import List from dotenv import load_dotenv -load_dotenv() - +from pydantic import BaseModel, Field from scrapegraphai.graphs import SearchGraph from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -from pydantic import BaseModel, Field -from typing import List +load_dotenv() # ************************************************ # Define the output schema for the graph diff --git a/examples/openai/search_link_graph_openai.py b/examples/openai/search_link_graph_openai.py index 10d10d4c..818f9434 100644 --- a/examples/openai/search_link_graph_openai.py +++ b/examples/openai/search_link_graph_openai.py @@ -1,16 +1,23 @@ """ Basic example of scraping pipeline using SmartScraper """ + +import os +from dotenv import load_dotenv from scrapegraphai.graphs import SearchLinkGraph from scrapegraphai.utils import prettify_exec_info + +load_dotenv() # ************************************************ # Define the configuration for the graph # ************************************************ +openai_key = os.getenv("OPENAI_APIKEY") + graph_config = { "llm": { - "api_key": "s", - "model": "gpt-3.5-turbo", + "api_key": openai_key, + "model": "gpt-4o", }, "verbose": True, "headless": False, 
diff --git a/examples/openai/smart_scraper_schema_openai.py b/examples/openai/smart_scraper_schema_openai.py index 076f1327..d9e1bd1c 100644 --- a/examples/openai/smart_scraper_schema_openai.py +++ b/examples/openai/smart_scraper_schema_openai.py @@ -30,7 +30,7 @@ class Projects(BaseModel): graph_config = { "llm": { "api_key":openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, "verbose": True, "headless": False, diff --git a/examples/openai/speech_graph_openai.py b/examples/openai/speech_graph_openai.py index 15cc2cfb..603ce51c 100644 --- a/examples/openai/speech_graph_openai.py +++ b/examples/openai/speech_graph_openai.py @@ -25,7 +25,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", "temperature": 0.7, }, "tts_model": { diff --git a/examples/openai/xml_scraper_graph_multi_openai.py b/examples/openai/xml_scraper_graph_multi_openai.py index 46633bba..ef46b877 100644 --- a/examples/openai/xml_scraper_graph_multi_openai.py +++ b/examples/openai/xml_scraper_graph_multi_openai.py @@ -29,7 +29,7 @@ graph_config = { "llm": { "api_key":openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, "verbose": True, "headless": False, diff --git a/examples/openai/xml_scraper_openai.py b/examples/openai/xml_scraper_openai.py index 5be5716e..b2b5075e 100644 --- a/examples/openai/xml_scraper_openai.py +++ b/examples/openai/xml_scraper_openai.py @@ -28,7 +28,7 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, "verbose":False, } diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index 787e537a..0c0f1104 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -53,7 +53,6 @@ def __init__(self, prompt: str, config: dict, schema: Optional[BaseModel] = None self.copy_config = copy(config) else: self.copy_config = deepcopy(config) - self.copy_schema = deepcopy(schema) self.considered_urls = 
[] # New attribute to store URLs