Added geonodedaili.py to crawl proxies (#186)
* add geonodedaili.py

* add headers via the crawl function
MGMCN committed Mar 1, 2023
1 parent 78325d0 commit 4c50711
Showing 1 changed file with 71 additions and 0 deletions.
71 changes: 71 additions & 0 deletions proxypool/crawlers/public/geonodedaili.py
@@ -0,0 +1,71 @@
import time
from retrying import RetryError
from loguru import logger
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import json

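# Paginated API: up to 500 proxies per page, sorted by last-checked
# time, newest first (see the query parameters in BASE_URL).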
BASE_URL = 'https://proxylist.geonode.com/api/proxy-list?limit=500&page={page}&sort_by=lastChecked&sort_type=desc'
MAX_PAGE = 18


class GeonodeCrawler(BaseCrawler):
    """
    Geonode crawler, https://proxylist.geonode.com/
    """
    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]

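    # The API responds with JSON of roughly this shape (illustrative values):
    #   {"data": [{"ip": "1.2.3.4", "port": "8080", ...}, ...]}
    # parse() below reads the host and port from each entry in "data".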
    def parse(self, html):
        """
        parse the JSON response body to extract proxies
        :param html: response text (JSON)
        :return: generator of Proxy objects
        """
        try:
            result = json.loads(html)
            proxy_list = result['data']
            for proxy_item in proxy_list:
                host = proxy_item['ip']
                port = proxy_item['port']
                yield Proxy(host=host, port=port)
        except json.JSONDecodeError:
            logger.error('failed to decode JSON response')
            return

    def crawl(self):
        """
        override the main crawl method to send browser-like headers
        """
        headers = {
            'authority': 'proxylist.geonode.com',
            'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
            'accept': 'application/json, text/plain, */*',
            'sec-ch-ua-mobile': '?0',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36',
            'sec-ch-ua-platform': '"macOS"',
            'origin': 'https://geonode.com',
            'sec-fetch-site': 'same-site',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'referer': 'https://geonode.com/',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
            'if-none-match': 'W/"c25d-BXjLTmP+/yYXtIz4OEcmdOWSv88"',
        }
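        # Note: the hard-coded 'if-none-match' ETag may make the server reply
        # 304 Not Modified with an empty body; it could arguably be dropped.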
        try:
            for url in self.urls:
                logger.info(f'fetching {url}')
                html = self.fetch(url, headers=headers)
                if not html:
                    continue
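                # brief pause between pages to avoid hammering the API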
                time.sleep(.5)
                yield from self.process(html, url)
        except RetryError:
            logger.error(
                f'crawler {self} failed to crawl proxies, '
                'please check whether the target url is valid or the network is reachable')


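# quick manual check: run this module directly to print crawled proxies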
if __name__ == '__main__':
    crawler = GeonodeCrawler()
    for proxy in crawler.crawl():
        print(proxy)
