Some optimizations #126

Open · wants to merge 16 commits into master
12 changes: 12 additions & 0 deletions examples/usage2.py
@@ -39,6 +39,17 @@ def run(self):
# Suppress the warning about disabled certificate verification
urllib3.disable_warnings()
headers = Headers(headers=True).generate()
# headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
headers['Pragma'] = 'no-cache'
# headers['Host'] = 'bb.cf08tp.cn'
# headers['x-forward-for'] = pure_ip_address
headers['Cookie'] = 'PHPSESSID={}'.format(
''.join(str(uuid.uuid1()).split('-')))
# print(headers)
html = requests.get(headers=headers, url=targetUrl, proxies={
"http": 'http://' + self.proxyip}, verify=False, timeout=12).content.decode()
# Stop timing
end = time.time()
# Print the result
95 changes: 95 additions & 0 deletions examples/usage3.py
@@ -0,0 +1,95 @@
# -*- coding: UTF-8 -*-

'''
Fetch proxy IPs from the local proxy pool API and use each of them to request the target URL.
'''
import requests
import time
import threading
import urllib3
from fake_headers import Headers
from geolite2 import geolite2

ips = []

# Thread class that crawls data

def getChinaIP(ip='127.0.0.1'):
"""Return True if the IP geolocates to China (CN)."""
reader = geolite2.reader()
ip_info = reader.get(ip)
geolite2.close()
# print(ip_info)
return bool(ip_info) and ip_info.get('country', {}).get('iso_code') == 'CN'



class CrawlThread(threading.Thread):
def __init__(self, proxyip):
super(CrawlThread, self).__init__()
self.proxyip = proxyip

def run(self):
# Start timing
pure_ip_address = self.proxyip.split(':')[0]
# Verify that the IP is located in China
if not getChinaIP(pure_ip_address):
pass
# raise ValueError('Not a valid IP')
start = time.time()
# Suppress the warning about disabled certificate verification
urllib3.disable_warnings()
headers = Headers(headers=True).generate()
headers['Referer'] = 'http://ga.314300.cn/toupiao/user40.html'
headers['Pragma'] = 'no-cache'
# headers['Host'] = 'ga.314300.cn'
# headers['x-forward-for'] = pure_ip_address
headers['Cookie'] = 'ASPSESSIONIDSAACBBBS=HOPLOAJDCHIIHBFNLIODPLJL'
# print(headers)
headers['User-Agent'] = 'Mozilla/5.0 (Linux; U; Android 2.3.6; zh-cn; GT-S5660 Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MicroMessenger/5.3'
html = requests.get(headers=headers, url=targetUrl, proxies={
"http": 'http://' + self.proxyip}, verify=False, timeout=12).content.decode()
# Stop timing
end = time.time()
# Print the result
print(threading.current_thread().getName() + " used proxy IP " + self.proxyip + ", took " + str(end - start) +
" seconds, got the following HTML:\n" + html + "\n*************")

# Thread class that fetches proxy IPs


class GetIpThread(threading.Thread):
def __init__(self, fetchSecond):
super(GetIpThread, self).__init__()
self.fetchSecond = fetchSecond

def run(self):
global ips
while True:
# Fetch the IP list
res = requests.get(apiUrl).content.decode()
# Split the returned IPs on newlines
ips = res.split('\n')
# Use each IP in turn
for proxyip in ips:
if proxyip.strip():
# Start a thread
# CrawlThread(proxyip).start()
try:
CrawlThread(proxyip).run()
time.sleep(1.5)
except Exception as e:
print(e)
# Sleep before fetching the next batch
time.sleep(len(ips) / self.fetchSecond)


if __name__ == '__main__':
# API endpoint that returns proxy IPs
# apiUrl = "http://127.0.0.1:5555/all"
apiUrl = "http://127.0.0.1:5555/random"
# Target site URL to crawl
# targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
targetUrl = 'http://ga.314300.cn/toupiao/json/?id=40&s=tp'
fetchSecond = 5
# Start fetching IPs automatically
GetIpThread(fetchSecond).start()
31 changes: 31 additions & 0 deletions proxypool/crawlers/public/fanqieip.py
@@ -0,0 +1,31 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
from pyquery import PyQuery as pq

BaseUrl = 'https://www.fanqieip.com/free/{num}'
MAX_PAGE = 5 * 100


class FanqieIPCrawler(BaseCrawler):
"""
FanqieIP crawler, https://www.fanqieip.com
"""
urls = [BaseUrl.format(num=i) for i in range(1, MAX_PAGE)]

def parse(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
trs = doc('.layui-table tbody tr').items()
for tr in trs:
host = tr.find('td div')[0].text
port = tr.find('td div')[1].text
yield Proxy(host=host, port=port)


if __name__ == '__main__':
crawler = FanqieIPCrawler()
for proxy in crawler.crawl():
print(proxy)
2 changes: 1 addition & 1 deletion proxypool/crawlers/public/taiyangdaili.py
@@ -3,7 +3,7 @@
from pyquery import PyQuery as pq

BaseUrl = 'http://www.taiyanghttp.com/free/page{num}'
MAX_PAGE = 5
MAX_PAGE = 5 * 2


class TaiyangdailiCrawler(BaseCrawler):
11 changes: 6 additions & 5 deletions proxypool/processors/server.py
@@ -1,6 +1,6 @@
from flask import Flask, g
from flask import Flask, g, request, jsonify
from proxypool.storages.redis import RedisClient
from proxypool.setting import API_HOST, API_PORT, API_THREADED
from proxypool.setting import API_HOST, API_PORT, API_THREADED, PROXY_SCORE_MIN, PROXY_SCORE_MAX


__all__ = ['app']
@@ -40,11 +40,12 @@ def get_proxy():
@app.route('/all')
def get_proxy_all():
"""
get a random proxy
:return: get a random proxy
get proxies with scores between min_score and max_score
:return: list of proxies
"""
args = request.args
conn = get_conn()
proxies = conn.all()
proxies = conn.all(args.get('min_score', PROXY_SCORE_MIN), args.get('max_score', PROXY_SCORE_MAX))
proxies_string = ''
for proxy in proxies:
proxies_string += str(proxy) + '\n'
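For reference, a minimal sketch of how the updated /all endpoint can be exercised once the API is running locally on the default port; the min_score and max_score parameter names come from this diff, while the score values below are just illustrative:

import requests

# Ask the pool only for proxies scored between 80 and 100
resp = requests.get('http://127.0.0.1:5555/all',
                    params={'min_score': 80, 'max_score': 100})
print(resp.text)  # newline-separated host:port entries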
4 changes: 2 additions & 2 deletions proxypool/processors/tester.py
@@ -6,7 +6,7 @@
from proxypool.setting import TEST_TIMEOUT, TEST_BATCH, TEST_URL, TEST_VALID_STATUS, TEST_ANONYMOUS
from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError
from asyncio import TimeoutError

import requests

EXCEPTIONS = (
ClientProxyConnectionError,
@@ -43,7 +43,7 @@ async def test(self, proxy: Proxy):
# if TEST_ANONYMOUS is True, make sure that
# the proxy has the effect of hiding the real IP
if TEST_ANONYMOUS:
url = 'https://httpbin.org/ip'
url = 'http://www.nghttp2.org/httpbin/ip'
async with session.get(url, timeout=TEST_TIMEOUT) as response:
resp_json = await response.json()
origin_ip = resp_json['origin']
6 changes: 6 additions & 0 deletions proxypool/schemas/proxy.py
@@ -8,6 +8,12 @@ class Proxy(object):
"""
host = attr(type=str, default=None)
port = attr(type=int, default=None)
location = attr(type=str, default=None)
isp = attr(type=str, default=None)
country = attr(type=str, default=None)
anonymous = attr(type=bool, default=None)
protocol = attr(type=str, default=None)
alive_time = attr(type=int, default=None)

def __str__(self):
"""
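A minimal sketch of how the new optional fields on Proxy might be populated; the values below are purely illustrative, and every added attribute defaults to None, so existing callers that only pass host and port keep working:

from proxypool.schemas.proxy import Proxy

# Only host and port were available before; the rest is new optional metadata
proxy = Proxy(host='127.0.0.1', port=1080, location='Hangzhou', isp='China Telecom',
              country='CN', anonymous=True, protocol='http', alive_time=3600)
print(proxy)  # __str__ is unchanged by this diff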
4 changes: 2 additions & 2 deletions proxypool/storages/redis.py
@@ -103,12 +103,12 @@ def count(self) -> int:
"""
return self.db.zcard(REDIS_KEY)

def all(self) -> List[Proxy]:
def all(self, min_score=PROXY_SCORE_MIN, max_score=PROXY_SCORE_MAX) -> List[Proxy]:
"""
get proxies whose scores fall between min_score and max_score
:return: list of proxies
"""
return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX))
return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, min_score, max_score))

def batch(self, cursor, count) -> List[Proxy]:
"""
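And a minimal sketch of calling the updated RedisClient.all() directly, assuming a Redis instance reachable with the project's default connection settings; the 90/100 bounds are illustrative:

from proxypool.storages.redis import RedisClient

conn = RedisClient()
# min_score and max_score fall back to PROXY_SCORE_MIN / PROXY_SCORE_MAX when omitted
for proxy in conn.all(min_score=90, max_score=100):
    print(proxy)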