In [212]:
import requests

In [213]:
def get_request_info():
    res = requests.get('http://ifconfig.co/json')
    return res.json()

In [214]:
j = get_request_info()
j

{'city': 'Taipei',
 'country': 'Taiwan',
 'hostname': '36-231-24-206.dynamic-ip.hinet.net',
 'ip': '36.231.24.206',
 'ip_decimal': 619124942}

In [215]:
res = requests.get('http://www.ip181.com/')
res.encoding = 'gbk'
res.text[:2000]



In [216]:
import pandas as pd
from bs4 import BeautifulSoup

In [218]:
dfs = pd.read_html(res.text)

In [219]:
from fake_useragent import UserAgent
ua = UserAgent()

In [220]:
url = 'http://www.ip181.com/'
res = requests.get(url, {'user-agent': ua.random})
res.encoding = 'gbk'

In [221]:
df = pd.read_html(res.text)[0]
df.columns = df.iloc[0]
df = df.iloc[1:]
df.head(3)

Unnamed: 0,IP地址,端口,匿名等级,代理类型,响应时间,地理位置,最近验证时间
1,118.193.107.62,80,普匿,HTTP,16.44 秒,北京市 CNISP未分配地址资源,1分钟2秒前
2,118.178.227.171,80,高匿,HTTP,12.69 秒,浙江省杭州市 阿里云计算有限公司,1分钟8秒前
3,119.87.238.51,8123,高匿,"HTTP,HTTPS",8.77 秒,重庆市 电信,1分钟9秒前


In [222]:
df['匿名等级'].value_counts()

普匿    58
透明    22
高匿    20
Name: 匿名等级, dtype: int64

In [223]:
df.columns = ['ip', 'port', 'level', 'http_s', 
              'response_time', 'location', 'last_check_time']

df.head(3)

Unnamed: 0,ip,port,level,http_s,response_time,location,last_check_time
1,118.193.107.62,80,普匿,HTTP,16.44 秒,北京市 CNISP未分配地址资源,1分钟2秒前
2,118.178.227.171,80,高匿,HTTP,12.69 秒,浙江省杭州市 阿里云计算有限公司,1分钟8秒前
3,119.87.238.51,8123,高匿,"HTTP,HTTPS",8.77 秒,重庆市 电信,1分钟9秒前


In [224]:
proxy_list = list(df['ip'] + ':' + df['port'])

In [225]:
proxy_list[:10]

['118.193.107.62:80',
 '118.178.227.171:80',
 '119.87.238.51:8123',
 '118.193.107.109:80',
 '125.62.12.68:80',
 '121.31.193.14:8123',
 '112.114.98.136:8118',
 '118.193.107.30:80',
 '210.26.125.142:8080',
 '221.7.255.168:8080']

In [226]:
for p in proxy_list[:3]:
    print(p)

118.193.107.62:80
118.178.227.171:80
119.87.238.51:8123


In [227]:
from datetime import datetime

def get_request_info_with_proxy(proxy):
    start = datetime.now()
    
    try:
        r = requests.get('http://ifconfig.co/json', 
                         proxies={'http': proxy},
                         headers={'user-agent': ua.random},
                         timeout=10)

        rs_info = r.json()
        rs_info['response_time'] = (datetime.now() - start).total_seconds()
    except Exception as e:
        rs_info = {
            'exception': type(e),
            'ip': None,
            'ip_decimal': None,
            'country': None,
            'city': None,
            'response_time': None
        }
        
    rs_info['proxy'] = proxy
    rs_info['proxy_available'] = rs_info['ip'] == rs_info['proxy'].split(':')[0]
    return rs_info

In [228]:
get_request_info_with_proxy(proxies_list[90])

{'city': 'Beijing',
 'country': 'China',
 'ip': '116.196.119.138',
 'ip_decimal': 1959032714,
 'proxy': '116.196.119.138:3128',
 'proxy_available': True,
 'response_time': 6.901867}

In [229]:
import time
from tqdm import tqdm


frame = []
for proxy in tqdm(proxies_list):
    info = get_request_info_with_proxy(proxy)
    frame.append(info)
    time.sleep(1)

100%|██████████| 100/100 [12:19<00:00, 10.91s/it]


In [161]:
get_request_info_with_proxy(p)

{'city': None,
 'country': None,
 'exception': requests.exceptions.ReadTimeout(urllib3.exceptions.ReadTimeoutError("HTTPConnectionPool(host='111.56.5.41', port=80): Read timed out. (read timeout=10)")),
 'ip': None,
 'ip_decimal': None,
 'proxy': '111.56.5.41:80',
 'proxy_available': False,
 'response_time': None}

In [163]:
df = pd.DataFrame(frame)
df.query('response_time == response_time').sort_values('response_time').head(10)

Unnamed: 0,city,country,exception,hostname,ip,ip_decimal,proxy,proxy_available,response_time
21,Chongqing,China,,,113.207.27.84,1909398000.0,113.207.27.84:3128,True,0.659555
45,Guangzhou,China,,,116.199.2.208,1959199000.0,116.199.2.208:80,True,0.665322
10,Chongqing,China,,,123.147.165.143,2073274000.0,123.147.165.143:8080,True,0.755805
47,Changsha,China,,,183.214.162.196,3084297000.0,119.36.92.46:80,False,0.813416
51,Unknown,China,,,120.237.91.34,2028821000.0,120.237.91.34:9797,True,0.848075
11,Guangzhou,China,,,116.199.2.209,1959199000.0,116.199.2.209:80,True,0.862768
40,Hangzhou,China,,,121.43.178.58,2032907000.0,121.43.178.58:3128,True,0.90937
61,Yichang,China,,,219.139.130.49,3683353000.0,219.139.130.49:80,True,0.92545
41,Changsha,China,,,183.214.162.188,3084297000.0,119.36.92.47:80,False,0.950258
0,Guangzhou,China,,,219.135.164.245,3683100000.0,219.135.164.245:3128,True,1.018491


In [182]:
cols = ['proxy', 'proxy_available', 'response_time']

(df.query('response_time == response_time')
   .sort_values('response_time')
   .query('proxy_available == True')[cols].pipe(len))

44

In [179]:
type(df['exception'].dropna().tolist().pop())

requests.exceptions.ReadTimeout

In [181]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
city               48 non-null object
country            48 non-null object
exception          52 non-null object
hostname           4 non-null object
ip                 48 non-null object
ip_decimal         48 non-null float64
proxy              100 non-null object
proxy_available    100 non-null bool
response_time      48 non-null float64
dtypes: bool(1), float64(2), object(6)
memory usage: 32.9 KB


In [193]:
df.query('response_time < 5').sort_values('response_time')[['proxy', 'response_time']].head(3)

Unnamed: 0,proxy,response_time
21,113.207.27.84:3128,0.659555
45,116.199.2.208:80,0.665322
10,123.147.165.143:8080,0.755805


In [200]:
get_request_info_with_proxy('113.207.27.84:3128')

{'city': 'Chongqing',
 'country': 'China',
 'ip': '113.207.27.84',
 'ip_decimal': 1909398356,
 'proxy': '113.207.27.84:3128',
 'proxy_available': True,
 'response_time': 0.679468}