diff --git a/pyproject.toml b/pyproject.toml index 4dd7ac13..0b255da8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,8 @@ free-proxy = "1.1.1" langchain-groq = "0.1.3" playwright = "^1.43.0" langchain-aws = "^0.1.2" - +langchain-anthropic = "^0.1.11" +yahoo-search-py="^0.3" [tool.poetry.dev-dependencies] pytest = "8.0.0" diff --git a/requirements.txt b/requirements.txt index b7c642d1..26f4a855 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,5 @@ free-proxy==1.1.1 langchain-groq==0.1.3 playwright==1.43.0 langchain-aws==0.1.2 +langchain-anthropic==0.1.11 +yahoo-search-py==0.3 diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index 398ae00a..83d44917 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -1,10 +1,11 @@ -""" +""" Module for making the request on the web """ import re from typing import List from langchain_community.tools import DuckDuckGoSearchResults -from googlesearch import search +from googlesearch import search as google_search +from yahoo_search import search as yahoo_search def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]: @@ -29,18 +30,29 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = This function allows switching between Google and DuckDuckGo to perform internet searches, returning a list of result URLs. """ - if search_engine == "Google": + if search_engine.lower() == "google": res = [] - for url in search(query, stop=max_results): + for url in google_search(query, stop=max_results): res.append(url) return res - elif search_engine == "DuckDuckGo": + elif search_engine.lower() == "duckduckgo": research = DuckDuckGoSearchResults(max_results=max_results) res = research.run(query) links = re.findall(r'https?://[^\s,\]]+', res) return links + elif search_engine.lower() == "yahoo": + list_result = yahoo_search(query) + results = [] + for page in list_result.pages: + if len(results) >= max_results: # Check if max_results has already been reached + break # Exit loop if max_results has been reached + try: + results.append(page.link) + except AttributeError: + continue + return results raise ValueError( - "The only search engines avaiable are DuckDuckGo or Google") + "The only search engines available are DuckDuckGo or Google")