diff --git a/.gitignore b/.gitignore index 72f4db8a..16240e6d 100644 --- a/.gitignore +++ b/.gitignore @@ -29,7 +29,6 @@ venv/ *.google-cookie examples/graph_examples/ScrapeGraphAI_generated_graph examples/**/*.csv -examples/**/*.json main.py poetry.lock diff --git a/examples/gemini/inputs/example.json b/examples/gemini/inputs/example.json new file mode 100644 index 00000000..2263184c --- /dev/null +++ b/examples/gemini/inputs/example.json @@ -0,0 +1,182 @@ +{ + "kind":"youtube#searchListResponse", + "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", + "nextPageToken":"CAUQAA", + "regionCode":"NL", + "pageInfo":{ + "totalResults":1000000, + "resultsPerPage":5 + }, + "items":[ + { + "kind":"youtube#searchResult", + "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", + "id":{ + "kind":"youtube#video", + "videoId":"TvWDY4Mm5GM" + }, + "snippet":{ + "publishedAt":"2023-07-24T14:15:01Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T14:15:01Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", + "id":{ + "kind":"youtube#video", + "videoId":"aZM_42CcNZ4" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:09:27Z", + "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", + "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰", + "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"John Nellis", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:09:27Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", + "id":{ + "kind":"youtube#video", + "videoId":"wkP3XS3aNAY" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:00:50Z", + "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", + "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", + "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Shoot for Love", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:00:50Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", + "id":{ + "kind":"youtube#video", + "videoId":"rJkDZ0WvfT8" + }, + "snippet":{ + "publishedAt":"2023-07-24T10:00:39Z", + "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", + "title":"TOP 10 DEFENDERS 2023", + "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Home of Football", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T10:00:39Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"wtuknXTmI1txoULeH3aWaOuXOow", + "id":{ + "kind":"youtube#video", + "videoId":"XH0rtu4U6SE" + }, + "snippet":{ + "publishedAt":"2023-07-21T16:30:05Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-21T16:30:05Z" + } + } + ] +} \ No newline at end of file diff --git a/examples/gemini/json_scraper_gemini.py b/examples/gemini/json_scraper_gemini.py new file mode 100644 index 00000000..b038657c --- /dev/null +++ b/examples/gemini/json_scraper_gemini.py @@ -0,0 +1,57 @@ +""" +Basic example of scraping pipeline using JSONScraperGraph from JSON documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the JSON file +# ************************************************ + +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "gemini-pro", + }, +} + +# ************************************************ +# Create the JSONScraperGraph instance and run it +# ************************************************ + +json_scraper_graph = JSONScraperGraph( + prompt="List me all the authors, title and genres of the books", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = json_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = json_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/gemini/script_generator_gemini.py b/examples/gemini/script_generator_gemini.py index c07acc37..21459f6c 100644 --- a/examples/gemini/script_generator_gemini.py +++ b/examples/gemini/script_generator_gemini.py @@ -19,7 +19,7 @@ graph_config = { "llm": { "api_key": gemini_key, - "model": "gpt-3.5-turbo", + "model": "gemini-pro", }, "library": "beautifoulsoup" } diff --git a/examples/openai/scrape_xml_openai.py b/examples/gemini/xml_scraper_openai.py similarity index 78% rename from examples/openai/scrape_xml_openai.py rename to examples/gemini/xml_scraper_openai.py index 854c5422..e82458ed 100644 --- a/examples/openai/scrape_xml_openai.py +++ b/examples/gemini/xml_scraper_openai.py @@ -1,10 +1,10 @@ """ -Basic example of scraping pipeline using SmartScraper from XML documents +Basic example of scraping pipeline using XMLScraperGraph from XML documents """ import os from dotenv import load_dotenv -from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.graphs import XMLScraperGraph from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info load_dotenv() @@ -28,28 +28,28 @@ graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "gemini-pro", }, } # ************************************************ -# Create the SmartScraperGraph instance and run it +# Create the XMLScraperGraph instance and run it # ************************************************ -smart_scraper_graph = SmartScraperGraph( +xml_scraper_graph = XMLScraperGraph( prompt="List me all the authors, title and genres of the books", source=text, # Pass the content of the file, not the file object config=graph_config ) -result = smart_scraper_graph.run() +result = xml_scraper_graph.run() print(result) # ************************************************ # Get graph execution info # ************************************************ -graph_exec_info = smart_scraper_graph.get_execution_info() +graph_exec_info = xml_scraper_graph.get_execution_info() print(prettify_exec_info(graph_exec_info)) # Save to json or csv diff --git a/examples/local_models/Docker/inputs/example.json b/examples/local_models/Docker/inputs/example.json new file mode 100644 index 00000000..2263184c --- /dev/null +++ b/examples/local_models/Docker/inputs/example.json @@ -0,0 +1,182 @@ +{ + "kind":"youtube#searchListResponse", + "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", + "nextPageToken":"CAUQAA", + "regionCode":"NL", + "pageInfo":{ + "totalResults":1000000, + "resultsPerPage":5 + }, + "items":[ + { + "kind":"youtube#searchResult", + "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", + "id":{ + "kind":"youtube#video", + "videoId":"TvWDY4Mm5GM" + }, + "snippet":{ + "publishedAt":"2023-07-24T14:15:01Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T14:15:01Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", + "id":{ + "kind":"youtube#video", + "videoId":"aZM_42CcNZ4" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:09:27Z", + "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", + "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰", + "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"John Nellis", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:09:27Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", + "id":{ + "kind":"youtube#video", + "videoId":"wkP3XS3aNAY" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:00:50Z", + "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", + "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", + "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Shoot for Love", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:00:50Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", + "id":{ + "kind":"youtube#video", + "videoId":"rJkDZ0WvfT8" + }, + "snippet":{ + "publishedAt":"2023-07-24T10:00:39Z", + "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", + "title":"TOP 10 DEFENDERS 2023", + "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Home of Football", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T10:00:39Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"wtuknXTmI1txoULeH3aWaOuXOow", + "id":{ + "kind":"youtube#video", + "videoId":"XH0rtu4U6SE" + }, + "snippet":{ + "publishedAt":"2023-07-21T16:30:05Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-21T16:30:05Z" + } + } + ] +} \ No newline at end of file diff --git a/examples/local_models/Docker/json_scraper_docker.py b/examples/local_models/Docker/json_scraper_docker.py new file mode 100644 index 00000000..758de09e --- /dev/null +++ b/examples/local_models/Docker/json_scraper_docker.py @@ -0,0 +1,61 @@ +""" +Basic example of scraping pipeline using JSONScraperGraph from JSON documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the JSON file +# ************************************************ + +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/mistral", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "model_tokens": 2000, # set context length arbitrarily + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + } +} + +# ************************************************ +# Create the JSONScraperGraph instance and run it +# ************************************************ + +json_scraper_graph = JSONScraperGraph( + prompt="List me all the authors, title and genres of the books", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = json_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = json_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/local_models/Docker/xml_scraper_docker.py b/examples/local_models/Docker/xml_scraper_docker.py new file mode 100644 index 00000000..6a8c86cc --- /dev/null +++ b/examples/local_models/Docker/xml_scraper_docker.py @@ -0,0 +1,61 @@ +""" +Basic example of scraping pipeline using XMLScraperGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/mistral", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "model_tokens": 2000, # set context length arbitrarily + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + } +} + +# ************************************************ +# Create the XMLScraperGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperGraph( + prompt="List me all the authors, title and genres of the books", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/local_models/Ollama/inputs/example.json b/examples/local_models/Ollama/inputs/example.json new file mode 100644 index 00000000..2263184c --- /dev/null +++ b/examples/local_models/Ollama/inputs/example.json @@ -0,0 +1,182 @@ +{ + "kind":"youtube#searchListResponse", + "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", + "nextPageToken":"CAUQAA", + "regionCode":"NL", + "pageInfo":{ + "totalResults":1000000, + "resultsPerPage":5 + }, + "items":[ + { + "kind":"youtube#searchResult", + "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", + "id":{ + "kind":"youtube#video", + "videoId":"TvWDY4Mm5GM" + }, + "snippet":{ + "publishedAt":"2023-07-24T14:15:01Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T14:15:01Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", + "id":{ + "kind":"youtube#video", + "videoId":"aZM_42CcNZ4" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:09:27Z", + "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", + "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰", + "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"John Nellis", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:09:27Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", + "id":{ + "kind":"youtube#video", + "videoId":"wkP3XS3aNAY" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:00:50Z", + "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", + "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", + "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Shoot for Love", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:00:50Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", + "id":{ + "kind":"youtube#video", + "videoId":"rJkDZ0WvfT8" + }, + "snippet":{ + "publishedAt":"2023-07-24T10:00:39Z", + "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", + "title":"TOP 10 DEFENDERS 2023", + "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Home of Football", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T10:00:39Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"wtuknXTmI1txoULeH3aWaOuXOow", + "id":{ + "kind":"youtube#video", + "videoId":"XH0rtu4U6SE" + }, + "snippet":{ + "publishedAt":"2023-07-21T16:30:05Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-21T16:30:05Z" + } + } + ] +} \ No newline at end of file diff --git a/examples/local_models/Ollama/json_scraper_ollama.py b/examples/local_models/Ollama/json_scraper_ollama.py new file mode 100644 index 00000000..90c4a151 --- /dev/null +++ b/examples/local_models/Ollama/json_scraper_ollama.py @@ -0,0 +1,63 @@ +""" +Basic example of scraping pipeline using JSONScraperGraph from JSON documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the JSON file +# ************************************************ + +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/mistral", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "model_tokens": 2000, # set context length arbitrarily + "base_url": "http://localhost:11434", + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434", + } +} + +# ************************************************ +# Create the JSONScraperGraph instance and run it +# ************************************************ + +json_scraper_graph = JSONScraperGraph( + prompt="List me all the authors, title and genres of the books", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = json_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = json_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/local_models/Ollama/xml_scraper_ollama.py b/examples/local_models/Ollama/xml_scraper_ollama.py new file mode 100644 index 00000000..4c149a2b --- /dev/null +++ b/examples/local_models/Ollama/xml_scraper_ollama.py @@ -0,0 +1,63 @@ +""" +Basic example of scraping pipeline using XMLScraperGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/mistral", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "model_tokens": 2000, # set context length arbitrarily + "base_url": "http://localhost:11434", + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434", + } +} + +# ************************************************ +# Create the XMLScraperGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperGraph( + prompt="List me all the authors, title and genres of the books", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/local_models/result.json b/examples/local_models/result.json new file mode 100644 index 00000000..8a4e7057 --- /dev/null +++ b/examples/local_models/result.json @@ -0,0 +1 @@ +{"projects": [{"title": "Rotary Pendulum RL", "description": "Open Source project aimed at controlling a real life rotary pendulum using RL algorithms"}, {"title": "DQN Implementation from scratch", "description": "Developed a Deep Q-Network algorithm to train a simple and double pendulum"}, {"title": "Multi Agents HAED", "description": "University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings."}, {"title": "Wireless ESC for Modular Drones", "description": "Modular drone architecture proposal and proof of concept. The project received maximum grade."}]} \ No newline at end of file diff --git a/examples/mixed_models/inputs/example.json b/examples/mixed_models/inputs/example.json new file mode 100644 index 00000000..2263184c --- /dev/null +++ b/examples/mixed_models/inputs/example.json @@ -0,0 +1,182 @@ +{ + "kind":"youtube#searchListResponse", + "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", + "nextPageToken":"CAUQAA", + "regionCode":"NL", + "pageInfo":{ + "totalResults":1000000, + "resultsPerPage":5 + }, + "items":[ + { + "kind":"youtube#searchResult", + "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", + "id":{ + "kind":"youtube#video", + "videoId":"TvWDY4Mm5GM" + }, + "snippet":{ + "publishedAt":"2023-07-24T14:15:01Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T14:15:01Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", + "id":{ + "kind":"youtube#video", + "videoId":"aZM_42CcNZ4" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:09:27Z", + "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", + "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰", + "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"John Nellis", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:09:27Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", + "id":{ + "kind":"youtube#video", + "videoId":"wkP3XS3aNAY" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:00:50Z", + "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", + "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", + "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Shoot for Love", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:00:50Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", + "id":{ + "kind":"youtube#video", + "videoId":"rJkDZ0WvfT8" + }, + "snippet":{ + "publishedAt":"2023-07-24T10:00:39Z", + "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", + "title":"TOP 10 DEFENDERS 2023", + "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Home of Football", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T10:00:39Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"wtuknXTmI1txoULeH3aWaOuXOow", + "id":{ + "kind":"youtube#video", + "videoId":"XH0rtu4U6SE" + }, + "snippet":{ + "publishedAt":"2023-07-21T16:30:05Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-21T16:30:05Z" + } + } + ] +} \ No newline at end of file diff --git a/examples/openai/inputs/example.json b/examples/openai/inputs/example.json new file mode 100644 index 00000000..2263184c --- /dev/null +++ b/examples/openai/inputs/example.json @@ -0,0 +1,182 @@ +{ + "kind":"youtube#searchListResponse", + "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", + "nextPageToken":"CAUQAA", + "regionCode":"NL", + "pageInfo":{ + "totalResults":1000000, + "resultsPerPage":5 + }, + "items":[ + { + "kind":"youtube#searchResult", + "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", + "id":{ + "kind":"youtube#video", + "videoId":"TvWDY4Mm5GM" + }, + "snippet":{ + "publishedAt":"2023-07-24T14:15:01Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T14:15:01Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", + "id":{ + "kind":"youtube#video", + "videoId":"aZM_42CcNZ4" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:09:27Z", + "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", + "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰", + "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"John Nellis", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:09:27Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", + "id":{ + "kind":"youtube#video", + "videoId":"wkP3XS3aNAY" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:00:50Z", + "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", + "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", + "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Shoot for Love", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:00:50Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", + "id":{ + "kind":"youtube#video", + "videoId":"rJkDZ0WvfT8" + }, + "snippet":{ + "publishedAt":"2023-07-24T10:00:39Z", + "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", + "title":"TOP 10 DEFENDERS 2023", + "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Home of Football", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T10:00:39Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"wtuknXTmI1txoULeH3aWaOuXOow", + "id":{ + "kind":"youtube#video", + "videoId":"XH0rtu4U6SE" + }, + "snippet":{ + "publishedAt":"2023-07-21T16:30:05Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-21T16:30:05Z" + } + } + ] +} \ No newline at end of file diff --git a/examples/openai/json_scraper_openai.py b/examples/openai/json_scraper_openai.py new file mode 100644 index 00000000..5e271006 --- /dev/null +++ b/examples/openai/json_scraper_openai.py @@ -0,0 +1,57 @@ +""" +Basic example of scraping pipeline using JSONScraperGraph from JSON documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the JSON file +# ************************************************ + +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": "gpt-3.5-turbo", + }, +} + +# ************************************************ +# Create the JSONScraperGraph instance and run it +# ************************************************ + +json_scraper_graph = JSONScraperGraph( + prompt="List me all the authors, title and genres of the books", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = json_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = json_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/openai/result.json b/examples/openai/result.json new file mode 100644 index 00000000..8867c8d6 --- /dev/null +++ b/examples/openai/result.json @@ -0,0 +1 @@ +{"top_5_eyeliner_products_for_gift": [{"product_name": "Tarte Double Take Eyeliner", "type": "Liquid, Gel", "price": "$26", "link": "https://www.sephora.com/product/double-take-eyeliner-P421701"}, {"product_name": "AppleDoll Velvet Liner", "type": "Liquid", "price": "$22", "link": "https://www.appledoll.com/products/velvet-liner"}, {"product_name": "Rare Beauty Perfect Strokes Gel Eyeliner", "type": "Gel", "price": "$19", "link": "https://www.sephora.com/product/perfect-strokes-gel-eyeliner-P468000"}, {"product_name": "Laura Mercier Caviar Tightline Eyeliner", "type": "Gel", "price": "$29", "link": "https://www.sephora.com/product/caviar-tightline-eyeliner-P448800"}, {"product_name": "Ilia Clean Line Liquid Eyeliner", "type": "Liquid", "price": "$28", "link": "https://www.amazon.com/ILIA-Clean-Line-Liquid-Eyeliner/dp/B08Z7JZQZP"}, {"brand": "Tom Ford", "product_name": "Eye Defining Pen", "price": "$62", "type": "Liquid", "colors": 1, "retailer": "Nordstrom"}, {"brand": "Fenty Beauty", "product_name": "Flyliner", "price": "$24", "type": "Liquid", "colors": 2, "retailer": "Sephora"}, {"brand": "Lanc\u00f4me", "product_name": "Le Crayon Kh\u00f4l Smoky Eyeliner", "price": "$28", "type": "Kohl", "colors": 2, "retailer": "Macy's"}, {"brand": "Jillian Dempsey", "product_name": "Kh\u00f4l Eyeliner", "price": "$20", "type": "Kohl", "colors": 6, "retailer": "Amazon"}, {"brand": "R\u00f3en", "product_name": "Eyeline Define Eyeliner Pencil", "price": "$26", "type": "Kohl", "colors": 4, "retailer": "Credo Beauty"}]} \ No newline at end of file diff --git a/examples/openai/xml_scraper_openai.py b/examples/openai/xml_scraper_openai.py new file mode 100644 index 00000000..32b79981 --- /dev/null +++ b/examples/openai/xml_scraper_openai.py @@ -0,0 +1,57 @@ +""" +Basic example of scraping pipeline using XMLScraperGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "gpt-3.5-turbo", + }, +} + +# ************************************************ +# Create the XMLScraperGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperGraph( + prompt="List me all the authors, title and genres of the books", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index a8ee6ac5..d943a4dc 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -6,3 +6,5 @@ from .speech_graph import SpeechGraph from .search_graph import SearchGraph from .script_creator_graph import ScriptCreatorGraph +from .xml_scraper_graph import XMLScraperGraph +from .json_scraper_graph import JSONScraperGraph diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py new file mode 100644 index 00000000..02092544 --- /dev/null +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -0,0 +1,77 @@ +""" +Module for creating the smart scraper +""" +from .base_graph import BaseGraph +from ..nodes import ( + FetchNode, + ParseNode, + RAGNode, + GenerateAnswerNode +) +from .abstract_graph import AbstractGraph + + +class JSONScraperGraph(AbstractGraph): + """ + SmartScraper is a comprehensive web scraping tool that automates the process of extracting + information from web pages using a natural language model to interpret and answer prompts. + """ + + def __init__(self, prompt: str, source: str, config: dict): + """ + Initializes the JsonScraperGraph with a prompt, source, and configuration. + """ + super().__init__(prompt, config, source) + + self.input_key = "json" if source.endswith("json") else "json_dir" + + def _create_graph(self): + """ + Creates the graph of nodes representing the workflow for web scraping. + """ + fetch_node = FetchNode( + input="json_dir", + output=["doc"], + ) + parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={"chunk_size": self.model_token} + ) + rag_node = RAGNode( + input="user_prompt & (parsed_doc | doc)", + output=["relevant_chunks"], + node_config={ + "llm": self.llm_model, + "embedder_model": self.embedder_model + } + ) + generate_answer_node = GenerateAnswerNode( + input="user_prompt & (relevant_chunks | parsed_doc | doc)", + output=["answer"], + node_config={"llm": self.llm_model}, + ) + + return BaseGraph( + nodes=[ + fetch_node, + parse_node, + rag_node, + generate_answer_node, + ], + edges=[ + (fetch_node, parse_node), + (parse_node, rag_node), + (rag_node, generate_answer_node) + ], + entry_point=fetch_node + ) + + def run(self) -> str: + """ + Executes the web scraping process and returns the answer to the prompt. + """ + inputs = {"user_prompt": self.prompt, self.input_key: self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py new file mode 100644 index 00000000..0dad83e3 --- /dev/null +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -0,0 +1,77 @@ +""" +Module for creating the smart scraper +""" +from .base_graph import BaseGraph +from ..nodes import ( + FetchNode, + ParseNode, + RAGNode, + GenerateAnswerNode +) +from .abstract_graph import AbstractGraph + + +class XMLScraperGraph(AbstractGraph): + """ + SmartScraper is a comprehensive web scraping tool that automates the process of extracting + information from web pages using a natural language model to interpret and answer prompts. + """ + + def __init__(self, prompt: str, source: str, config: dict): + """ + Initializes the XmlScraperGraph with a prompt, source, and configuration. + """ + super().__init__(prompt, config, source) + + self.input_key = "xml" if source.endswith("xml") else "xml_dir" + + def _create_graph(self): + """ + Creates the graph of nodes representing the workflow for web scraping. + """ + fetch_node = FetchNode( + input="xml_dir", + output=["doc"], + ) + parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={"chunk_size": self.model_token} + ) + rag_node = RAGNode( + input="user_prompt & (parsed_doc | doc)", + output=["relevant_chunks"], + node_config={ + "llm": self.llm_model, + "embedder_model": self.embedder_model + } + ) + generate_answer_node = GenerateAnswerNode( + input="user_prompt & (relevant_chunks | parsed_doc | doc)", + output=["answer"], + node_config={"llm": self.llm_model}, + ) + + return BaseGraph( + nodes=[ + fetch_node, + parse_node, + rag_node, + generate_answer_node, + ], + edges=[ + (fetch_node, parse_node), + (parse_node, rag_node), + (rag_node, generate_answer_node) + ], + entry_point=fetch_node + ) + + def run(self) -> str: + """ + Executes the web scraping process and returns the answer to the prompt. + """ + inputs = {"user_prompt": self.prompt, self.input_key: self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 76d80929..9b6cb52f 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -70,9 +70,12 @@ def execute(self, state): input_data = [state[key] for key in input_keys] source = input_data[0] - + if self.input == "json_dir" or self.input == "xml_dir": + compressed_document = [Document(page_content=source, metadata={ + "source": "local_dir" + })] # if it is a local directory - if not source.startswith("http"): + elif not source.startswith("http"): compressed_document = [Document(page_content=remover(source), metadata={ "source": "local_dir" })] diff --git a/tests/graphs/inputs/example.json b/tests/graphs/inputs/example.json new file mode 100644 index 00000000..2263184c --- /dev/null +++ b/tests/graphs/inputs/example.json @@ -0,0 +1,182 @@ +{ + "kind":"youtube#searchListResponse", + "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", + "nextPageToken":"CAUQAA", + "regionCode":"NL", + "pageInfo":{ + "totalResults":1000000, + "resultsPerPage":5 + }, + "items":[ + { + "kind":"youtube#searchResult", + "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", + "id":{ + "kind":"youtube#video", + "videoId":"TvWDY4Mm5GM" + }, + "snippet":{ + "publishedAt":"2023-07-24T14:15:01Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T14:15:01Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", + "id":{ + "kind":"youtube#video", + "videoId":"aZM_42CcNZ4" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:09:27Z", + "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", + "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰", + "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"John Nellis", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:09:27Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", + "id":{ + "kind":"youtube#video", + "videoId":"wkP3XS3aNAY" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:00:50Z", + "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", + "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", + "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Shoot for Love", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:00:50Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", + "id":{ + "kind":"youtube#video", + "videoId":"rJkDZ0WvfT8" + }, + "snippet":{ + "publishedAt":"2023-07-24T10:00:39Z", + "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", + "title":"TOP 10 DEFENDERS 2023", + "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Home of Football", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T10:00:39Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"wtuknXTmI1txoULeH3aWaOuXOow", + "id":{ + "kind":"youtube#video", + "videoId":"XH0rtu4U6SE" + }, + "snippet":{ + "publishedAt":"2023-07-21T16:30:05Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-21T16:30:05Z" + } + } + ] +} \ No newline at end of file