Added geonodedaili.py to crawl proxies (#186)
* add geonodedaili.py

* add headers via the crawl function
MGMCN committed Mar 1, 2023
1 parent 78325d0 commit 4c50711
Showing 1 changed file with 71 additions and 0 deletions.
71 changes: 71 additions & 0 deletions proxypool/crawlers/public/geonodedaili.py
@@ -0,0 +1,71 @@
import time
from retrying import RetryError
from loguru import logger
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import json

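# Paginated API: up to 500 proxies per page, sorted by last-checked
# time, newest first (see the query parameters in BASE_URL).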
BASE_URL = 'https://proxylist.geonode.com/api/proxy-list?limit=500&page={page}&sort_by=lastChecked&sort_type=desc'
MAX_PAGE = 18


class GeonodeCrawler(BaseCrawler):
    """
    Geonode crawler, https://proxylist.geonode.com/
    """
    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]

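    # The API responds with JSON of roughly this shape (illustrative values):
    #   {"data": [{"ip": "1.2.3.4", "port": "8080", ...}, ...]}
    # parse() below reads the host and port from each entry in "data".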
    def parse(self, html):
        """
        parse the JSON response body to extract proxies
        :param html: response text (JSON)
        :return: generator of Proxy objects
        """
        try:
            result = json.loads(html)
            proxy_list = result['data']
            for proxy_item in proxy_list:
                host = proxy_item['ip']
                port = proxy_item['port']
                yield Proxy(host=host, port=port)
        except json.JSONDecodeError:
            logger.error('failed to decode JSON response')
            return

    def crawl(self):
        """
        override the main crawl method to send browser-like headers
        """
        headers = {
            'authority': 'proxylist.geonode.com',
            'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
            'accept': 'application/json, text/plain, */*',
            'sec-ch-ua-mobile': '?0',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36',
            'sec-ch-ua-platform': '"macOS"',
            'origin': 'https://geonode.com',
            'sec-fetch-site': 'same-site',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'referer': 'https://geonode.com/',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
            'if-none-match': 'W/"c25d-BXjLTmP+/yYXtIz4OEcmdOWSv88"',
        }
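        # Note: the hard-coded 'if-none-match' ETag may make the server reply
        # 304 Not Modified with an empty body; it could arguably be dropped.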
        try:
            for url in self.urls:
                logger.info(f'fetching {url}')
                html = self.fetch(url, headers=headers)
                if not html:
                    continue
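                # brief pause between pages to avoid hammering the API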
                time.sleep(.5)
                yield from self.process(html, url)
        except RetryError:
            logger.error(
                f'crawler {self} failed to crawl proxies, '
                'please check whether the target url is valid or the network is reachable')


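# quick manual check: run this module directly to print crawled proxies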
if __name__ == '__main__':
    crawler = GeonodeCrawler()
    for proxy in crawler.crawl():
        print(proxy)
