Some optimizations #126

Open · wants to merge 16 commits into master
12 changes: 12 additions & 0 deletions examples/usage2.py
@@ -39,6 +39,17 @@ def run(self):
# Suppress the warning about disabled certificate verification
urllib3.disable_warnings()
headers = Headers(headers=True).generate()
# headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
headers['Pragma'] = 'no-cache'
# headers['Host'] = 'bb.cf08tp.cn'
# headers['x-forward-for'] = pure_ip_address
headers['Cookie'] = 'PHPSESSID={}'.format(
''.join(str(uuid.uuid1()).split('-')))
# print(headers)
html = requests.get(headers=headers, url=targetUrl, proxies={
"http": 'http://' + self.proxyip}, verify=False, timeout=12).content.decode()
# Stop timing
end = time.time()
# Print the result
95 changes: 95 additions & 0 deletions examples/usage3.py
@@ -0,0 +1,95 @@
# -*- coding: UTF-8 -*-

'''
Fetch proxy IPs from the local proxy pool API and use each of them to request the target URL.
'''
import requests
import time
import threading
import urllib3
from fake_headers import Headers
from geolite2 import geolite2

ips = []

# Thread class that crawls data

def getChinaIP(ip='127.0.0.1'):
"""Return True if the IP geolocates to China (CN)."""
reader = geolite2.reader()
ip_info = reader.get(ip)
geolite2.close()
# print(ip_info)
return bool(ip_info) and ip_info.get('country', {}).get('iso_code') == 'CN'



class CrawlThread(threading.Thread):
def __init__(self, proxyip):
super(CrawlThread, self).__init__()
self.proxyip = proxyip

def run(self):
# Start timing
pure_ip_address = self.proxyip.split(':')[0]
# Verify that the IP is located in China
if not getChinaIP(pure_ip_address):
pass
# raise ValueError('Not a valid IP')
start = time.time()
# Suppress the warning about disabled certificate verification
urllib3.disable_warnings()
headers = Headers(headers=True).generate()
headers['Referer'] = 'http://ga.314300.cn/toupiao/user40.html'
headers['Pragma'] = 'no-cache'
# headers['Host'] = 'ga.314300.cn'
# headers['x-forward-for'] = pure_ip_address
headers['Cookie'] = 'ASPSESSIONIDSAACBBBS=HOPLOAJDCHIIHBFNLIODPLJL'
# print(headers)
headers['User-Agent'] = 'Mozilla/5.0 (Linux; U; Android 2.3.6; zh-cn; GT-S5660 Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MicroMessenger/5.3'
html = requests.get(headers=headers, url=targetUrl, proxies={
"http": 'http://' + self.proxyip}, verify=False, timeout=12).content.decode()
# Stop timing
end = time.time()
# Print the result
print(threading.current_thread().getName() + " used proxy IP " + self.proxyip + ", took " + str(end - start) +
" seconds, got the following HTML:\n" + html + "\n*************")

# Thread class that fetches proxy IPs


class GetIpThread(threading.Thread):
def __init__(self, fetchSecond):
super(GetIpThread, self).__init__()
self.fetchSecond = fetchSecond

def run(self):
global ips
while True:
# Fetch the IP list
res = requests.get(apiUrl).content.decode()
# Split the returned IPs on newlines
ips = res.split('\n')
# Use each IP in turn
for proxyip in ips:
if proxyip.strip():
# Start a thread
# CrawlThread(proxyip).start()
try:
CrawlThread(proxyip).run()
time.sleep(1.5)
except Exception as e:
print(e)
# Sleep before fetching the next batch
time.sleep(len(ips) / self.fetchSecond)


if __name__ == '__main__':
# API endpoint that returns proxy IPs
# apiUrl = "http://127.0.0.1:5555/all"
apiUrl = "http://127.0.0.1:5555/random"
# Target site URL to crawl
# targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
targetUrl = 'http://ga.314300.cn/toupiao/json/?id=40&s=tp'
fetchSecond = 5
# Start fetching IPs automatically
GetIpThread(fetchSecond).start()
31 changes: 31 additions & 0 deletions proxypool/crawlers/public/fanqieip.py
@@ -0,0 +1,31 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
from pyquery import PyQuery as pq

BaseUrl = 'https://www.fanqieip.com/free/{num}'
MAX_PAGE = 5 * 100


class FanqieIPCrawler(BaseCrawler):
"""
FanqieIP crawler, https://www.fanqieip.com
"""
urls = [BaseUrl.format(num=i) for i in range(1, MAX_PAGE)]

def parse(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
trs = doc('.layui-table tbody tr').items()
for tr in trs:
host = tr.find('td div')[0].text
port = tr.find('td div')[1].text
yield Proxy(host=host, port=port)


if __name__ == '__main__':
crawler = FanqieIPCrawler()
for proxy in crawler.crawl():
print(proxy)
2 changes: 1 addition & 1 deletion proxypool/crawlers/public/taiyangdaili.py
@@ -3,7 +3,7 @@
from pyquery import PyQuery as pq

BaseUrl = 'http://www.taiyanghttp.com/free/page{num}'
MAX_PAGE = 5
MAX_PAGE = 5 * 2


class TaiyangdailiCrawler(BaseCrawler):
11 changes: 6 additions & 5 deletions proxypool/processors/server.py
@@ -1,6 +1,6 @@
from flask import Flask, g
from flask import Flask, g, request, jsonify
from proxypool.storages.redis import RedisClient
from proxypool.setting import API_HOST, API_PORT, API_THREADED
from proxypool.setting import API_HOST, API_PORT, API_THREADED, PROXY_SCORE_MIN, PROXY_SCORE_MAX


__all__ = ['app']
@@ -40,11 +40,12 @@ def get_proxy():
@app.route('/all')
def get_proxy_all():
"""
get a random proxy
:return: get a random proxy
get proxies with scores between min_score and max_score
:return: list of proxies
"""
args = request.args
conn = get_conn()
proxies = conn.all()
proxies = conn.all(args.get('min_score', PROXY_SCORE_MIN), args.get('max_score', PROXY_SCORE_MAX))
proxies_string = ''
for proxy in proxies:
proxies_string += str(proxy) + '\n'
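For reference, a minimal sketch of how the updated /all endpoint can be exercised once the API is running locally on the default port; the min_score and max_score parameter names come from this diff, while the score values below are just illustrative:

import requests

# Ask the pool only for proxies scored between 80 and 100
resp = requests.get('http://127.0.0.1:5555/all',
                    params={'min_score': 80, 'max_score': 100})
print(resp.text)  # newline-separated host:port entries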
4 changes: 2 additions & 2 deletions proxypool/processors/tester.py
@@ -6,7 +6,7 @@
from proxypool.setting import TEST_TIMEOUT, TEST_BATCH, TEST_URL, TEST_VALID_STATUS, TEST_ANONYMOUS
from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError
from asyncio import TimeoutError

import requests

EXCEPTIONS = (
ClientProxyConnectionError,
@@ -43,7 +43,7 @@ async def test(self, proxy: Proxy):
# if TEST_ANONYMOUS is True, make sure that
# the proxy has the effect of hiding the real IP
if TEST_ANONYMOUS:
url = 'https://httpbin.org/ip'
url = 'http://www.nghttp2.org/httpbin/ip'
async with session.get(url, timeout=TEST_TIMEOUT) as response:
resp_json = await response.json()
origin_ip = resp_json['origin']
6 changes: 6 additions & 0 deletions proxypool/schemas/proxy.py
@@ -8,6 +8,12 @@ class Proxy(object):
"""
host = attr(type=str, default=None)
port = attr(type=int, default=None)
location = attr(type=str, default=None)
isp = attr(type=str, default=None)
country = attr(type=str, default=None)
anonymous = attr(type=bool, default=None)
protocol = attr(type=str, default=None)
alive_time = attr(type=int, default=None)

def __str__(self):
"""
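A minimal sketch of how the new optional fields on Proxy might be populated; the values below are purely illustrative, and every added attribute defaults to None, so existing callers that only pass host and port keep working:

from proxypool.schemas.proxy import Proxy

# Only host and port were available before; the rest is new optional metadata
proxy = Proxy(host='127.0.0.1', port=1080, location='Hangzhou', isp='China Telecom',
              country='CN', anonymous=True, protocol='http', alive_time=3600)
print(proxy)  # __str__ is unchanged by this diff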
4 changes: 2 additions & 2 deletions proxypool/storages/redis.py
@@ -103,12 +103,12 @@ def count(self) -> int:
"""
return self.db.zcard(REDIS_KEY)

def all(self) -> List[Proxy]:
def all(self, min_score=PROXY_SCORE_MIN, max_score=PROXY_SCORE_MAX) -> List[Proxy]:
"""
get proxies whose scores fall between min_score and max_score
:return: list of proxies
"""
return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX))
return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, min_score, max_score))

def batch(self, cursor, count) -> List[Proxy]:
"""
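And a minimal sketch of calling the updated RedisClient.all() directly, assuming a Redis instance reachable with the project's default connection settings; the 90/100 bounds are illustrative:

from proxypool.storages.redis import RedisClient

conn = RedisClient()
# min_score and max_score fall back to PROXY_SCORE_MIN / PROXY_SCORE_MAX when omitted
for proxy in conn.all(min_score=90, max_score=100):
    print(proxy)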