From aeb1acbf05e63316c91672c99d88f8a6f338147f Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Fri, 3 May 2024 21:06:09 +0200 Subject: [PATCH 1/2] feat: refactoring search function --- pyproject.toml | 3 ++- requirements.txt | 2 ++ scrapegraphai/utils/research_web.py | 24 ++++++++++++++++++------ 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4dd7ac13..7dcb634d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,8 @@ free-proxy = "1.1.1" langchain-groq = "0.1.3" playwright = "^1.43.0" langchain-aws = "^0.1.2" - +langchain-anthropic = "^0.1.11" +yahoo-search-py=="^0.3" [tool.poetry.dev-dependencies] pytest = "8.0.0" diff --git a/requirements.txt b/requirements.txt index b7c642d1..26f4a855 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,5 @@ free-proxy==1.1.1 langchain-groq==0.1.3 playwright==1.43.0 langchain-aws==0.1.2 +langchain-anthropic==0.1.11 +yahoo-search-py==0.3 diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index 398ae00a..83d44917 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -1,10 +1,11 @@ -""" +""" Module for making the request on the web """ import re from typing import List from langchain_community.tools import DuckDuckGoSearchResults -from googlesearch import search +from googlesearch import search as google_search +from yahoo_search import search as yahoo_search def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]: @@ -29,18 +30,29 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = This function allows switching between Google and DuckDuckGo to perform internet searches, returning a list of result URLs. """ - if search_engine == "Google": + if search_engine.lower() == "google": res = [] - for url in search(query, stop=max_results): + for url in google_search(query, stop=max_results): res.append(url) return res - elif search_engine == "DuckDuckGo": + elif search_engine.lower() == "duckduckgo": research = DuckDuckGoSearchResults(max_results=max_results) res = research.run(query) links = re.findall(r'https?://[^\s,\]]+', res) return links + elif search_engine.lower() == "yahoo": + list_result = yahoo_search(query) + results = [] + for page in list_result.pages: + if len(results) >= max_results: # Check if max_results has already been reached + break # Exit loop if max_results has been reached + try: + results.append(page.link) + except AttributeError: + continue + return results raise ValueError( - "The only search engines avaiable are DuckDuckGo or Google") + "The only search engines available are DuckDuckGo or Google") From f7d66f51818dbdfddd0fa326f26265a3ab686b20 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Fri, 3 May 2024 22:01:41 +0200 Subject: [PATCH 2/2] fix: bug on .toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7dcb634d..0b255da8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ langchain-groq = "0.1.3" playwright = "^1.43.0" langchain-aws = "^0.1.2" langchain-anthropic = "^0.1.11" -yahoo-search-py=="^0.3" +yahoo-search-py="^0.3" [tool.poetry.dev-dependencies] pytest = "8.0.0"