From f6077d1f98023ac3bf0c89ef6b3d67dde4818df7 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Sat, 27 Apr 2024 13:31:53 +0200 Subject: [PATCH 1/2] feat: add new proxy rotation function --- scrapegraphai/nodes/fetch_node.py | 12 ++++++++-- scrapegraphai/utils/__init__.py | 1 + scrapegraphai/utils/proxy_rotation.py | 32 +++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 scrapegraphai/utils/proxy_rotation.py diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index ff5674e2..bdbabfb1 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -7,6 +7,7 @@ from langchain_core.documents import Document from .base_node import BaseNode from ..utils.remover import remover +from ..utils.proxy_rotation import proxy_rotation class FetchNode(BaseNode): @@ -37,13 +38,16 @@ class FetchNode(BaseNode): to succeed. """ - def __init__(self, input: str, output: List[str], node_name: str = "Fetch"): + def __init__(self, input: str, output: List[str], num_prox: int = True, + node_name: str = "Fetch"): """ Initializes the FetchHTMLNode with a node name and node type. 
Arguments: node_name (str): name of the node + num_prox (int): number of proxies to rotate through; rotation is used only when greater than 1 """ super().__init__(node_name, "node", input, output, 1) + self.num_prox = num_prox def execute(self, state): """ @@ -78,7 +82,11 @@ def execute(self, state): # if it is a URL else: - loader = AsyncHtmlLoader(source) + if self.num_prox > 1: + loader = AsyncHtmlLoader( + source, proxies=proxy_rotation(self.num_prox)) + else: + loader = AsyncHtmlLoader(source) document = loader.load() compressed_document = [ Document(page_content=remover(str(document)))] diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index 2ea30cf1..4d339ed4 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -5,3 +5,4 @@ from .convert_to_csv import convert_to_csv from .convert_to_json import convert_to_json from .prettify_exec_info import prettify_exec_info +from .proxy_rotation import proxy_rotation diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py new file mode 100644 index 00000000..62f9f1de --- /dev/null +++ b/scrapegraphai/utils/proxy_rotation.py @@ -0,0 +1,32 @@ +""" +Module for rotating proxies +""" +from fp.fp import FreeProxy + + +def proxy_rotation(num_ips: int): + """ + Rotates through a specified number of proxy IPs using the FreeProxy library. + + Args: + num_ips (int): The number of proxy IPs to rotate through. + + Returns: + dict: A dictionary containing the rotated proxy IPs, indexed by their position in rotation. 
+ + Example: + >>> proxy_rotation(5) + { + 0: '192.168.1.1:8080', + 1: '103.10.63.135:8080', + 2: '176.9.75.42:8080', + 3: '37.57.216.2:8080', + 4: '113.20.31.250:8080' + } + """ + res = {} + + for i in range(0, num_ips): + res[i] = FreeProxy().get() + + return res From b754dd909cd2aa2d5b5d94d9c7879ba3da58adc4 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Sat, 27 Apr 2024 14:40:50 +0200 Subject: [PATCH 2/2] fix: changed proxy function --- scrapegraphai/nodes/fetch_node.py | 4 ++-- scrapegraphai/utils/__init__.py | 2 +- scrapegraphai/utils/proxy_rotation.py | 9 ++++----- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index bdbabfb1..2564d44d 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -7,7 +7,7 @@ from langchain_core.documents import Document from .base_node import BaseNode from ..utils.remover import remover -from ..utils.proxy_rotation import proxy_rotation +from ..utils.proxy_generator import proxy_generator class FetchNode(BaseNode): @@ -84,7 +84,7 @@ def execute(self, state): else: if self.num_prox > 1: loader = AsyncHtmlLoader( - source, proxies=proxy_rotation(self.num_prox)) + source, proxies=proxy_generator(self.num_prox)) else: loader = AsyncHtmlLoader(source) document = loader.load() diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py index 4d339ed4..3fd1d884 100644 --- a/scrapegraphai/utils/__init__.py +++ b/scrapegraphai/utils/__init__.py @@ -5,4 +5,4 @@ from .convert_to_csv import convert_to_csv from .convert_to_json import convert_to_json from .prettify_exec_info import prettify_exec_info -from .proxy_rotation import proxy_rotation +from .proxy_generator import proxy_generator diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py index 62f9f1de..0019b421 100644 --- a/scrapegraphai/utils/proxy_rotation.py +++ b/scrapegraphai/utils/proxy_rotation.py @@ -4,7 
+4,7 @@ from fp.fp import FreeProxy -def proxy_rotation(num_ips: int): +def proxy_generator(num_ips: int): """ Rotates through a specified number of proxy IPs using the FreeProxy library. @@ -15,7 +15,7 @@ def proxy_rotation(num_ips: int): dict: A dictionary containing the rotated proxy IPs, indexed by their position in rotation. Example: - >>> proxy_rotation(5) + >>> proxy_generator(5) { 0: '192.168.1.1:8080', 1: '103.10.63.135:8080', @@ -24,9 +24,8 @@ def proxy_rotation(num_ips: int): 4: '113.20.31.250:8080' } """ - res = {} + res = [] for i in range(0, num_ips): - res[i] = FreeProxy().get() - + res.append(FreeProxy().get()) return res