In [43]:
import requests, bs4, proxyscrape
import pandas as pd
from proxyscrape import create_collector, get_collector

### Proxy Scraping Function:

In [44]:
def scrape_proxies(collector, timeout_filtering, url_testing):
    """Scrape anonymous US/Canadian HTTPS proxies and keep the ones that respond.

    Parameters
    ----------
    collector : object
        A proxyscrape collector; only ``refresh_proxies(force=True)`` and
        ``get_proxies(filter_dict)`` are used.
    timeout_filtering : int or float
        Timeout in seconds handed to ``requests.get`` for every proxy test.
    url_testing : str
        URL downloaded through each proxy to decide whether the proxy works.

    Returns
    -------
    tuple of (list, list)
        ``good_proxies`` as ``"ip:port"`` strings and ``good_proxies_country``
        holding the country of each working proxy, in the same order.
    """
    #Refresh list of proxies to scrape
    collector.refresh_proxies(force=True)

    #Scrape HTTPS proxies that provide anonymity for specific countries. See proxyscrape documentation for more info.
    #Other filters that can be used instead:
    #   collector.get_proxy({'code': ('us', 'ca'), 'anonymous': True, 'type': 'https'})     -> one proxy
    #   collector.get_proxies({'code': ('us'), 'anonymous': True, 'type': 'https'})         -> all USA proxies
    #   collector.get_proxies({'code': ('us', 'ca', 'de', 'fr'), 'anonymous': True, 'type': 'https'}) -> USA+Canada+Germany+France
    #`or []` guards against a None/empty result when nothing matches the filter.
    proxy_list = collector.get_proxies({'code': ('us', 'ca'), 'anonymous': True, 'type': 'https'}) or []

    #Collect (ip, port, country) for every distinct proxy. The scraped list may contain duplicates.
    #A proxy is identified by its (ip, port) PAIR: checking the ip and the port separately would
    #wrongly discard a new proxy whose ip and port each appeared on different earlier entries.
    seen_endpoints = set()
    unique_proxies = []
    for proxy in proxy_list:
        endpoint = (proxy[0], proxy[1]) #index 0 = ip address, index 1 = port
        if endpoint in seen_endpoints:
            continue
        seen_endpoints.add(endpoint)
        unique_proxies.append((proxy[0], proxy[1], proxy[3])) #index 3 = country

    #Now let's filter out the working (good) proxies
    good_proxies = []
    good_proxies_country = []

    for ip_address, port, country in unique_proxies:
        full_proxy = ip_address + ":" + str(port)
        proxy_url = "https://" + full_proxy

        #requests picks the proxy by the scheme of the requested URL, so register the proxy for
        #both schemes; with only an "https" entry an http:// test URL would bypass the proxy.
        #NOTE(review): many "https"-capable proxies expect to be addressed as http://ip:port;
        #confirm the proxy URL scheme against the requests/urllib3 version in use.
        proxy_temp = {"http": proxy_url, "https": proxy_url}
        print("Trying Proxy: " + proxy_url)

        #Testing proxy by downloading content in URL with a timeout
        try:
            requests.get(url_testing, proxies=proxy_temp, timeout=timeout_filtering)
        except requests.exceptions.RequestException:
            #Only request failures are skipped; KeyboardInterrupt still stops the loop.
            print("Connection error. Trying next proxy.")
        else:
            print("Success! Added to List.") #Successful downloading means that the proxy works
            good_proxies.append(full_proxy)
            good_proxies_country.append(country)

    return good_proxies, good_proxies_country

### Initialize here:

In [45]:
url_testing = 'http://www.foo.com/' #URL we'll use to check if a proxy works. Put your desired url to scrape here for best results.
#Note: requests selects a proxy by the scheme of the requested URL, so use the same scheme (http/https) as the site you intend to scrape.
timeout_filtering = 4 #seconds, the timer we'll set on checking url_testing with proxies

### We will use the 'proxyscrape' module for scraping proxies. Documentation here: https://pypi.org/project/proxyscrape/ ###

#Reuse the collector if this cell already ran in the current kernel, otherwise create it.
#Calling create_collector again with an already-registered name fails, so looking the
#collector up first makes the cell safe to re-run.
#NOTE(review): relies on get_collector returning None for an unknown name, per the proxyscrape docs.
collector = get_collector('my-collector')
if collector is None:
    collector = create_collector('my-collector', 'https')

#Outputs a list of working proxies and their corresponding geolocation (country)
good_proxies, good_proxies_country = scrape_proxies(collector, timeout_filtering, url_testing)

Trying Proxy: https://173.236.38.106:80
Success! Added to List.
Trying Proxy: https://104.45.188.43:3128
Success! Added to List.
Trying Proxy: https://34.105.59.26:80
Success! Added to List.
Trying Proxy: https://139.99.105.5:80
Success! Added to List.
Trying Proxy: https://24.172.34.114:49920
Success! Added to List.
Trying Proxy: https://199.91.203.210:3128
Success! Added to List.
Trying Proxy: https://64.71.145.122:3128
Success! Added to List.
Trying Proxy: https://185.243.56.133:80
Success! Added to List.
Trying Proxy: https://69.65.65.178:58389
Success! Added to List.
Trying Proxy: https://173.46.67.172:58517
Success! Added to List.
Trying Proxy: https://54.214.52.181:80
Success! Added to List.
Trying Proxy: https://144.217.101.245:3129
Success! Added to List.
Trying Proxy: https://38.126.254.114:58768
Success! Added to List.
Trying Proxy: https://205.158.57.2:53281
Success! Added to List.
Trying Proxy: https://173.236.38.109:80
Success! Added to List.
Trying Proxy: https://136.25.

#### Display the collected proxies in a DataFrame

In [46]:
#One row per working proxy: its "ip:port" string next to the country it is located in.
pd.DataFrame(
    data=[(proxy, country) for proxy, country in zip(good_proxies, good_proxies_country)],
    columns=['Proxy', 'Country'],
)

Unnamed: 0,Proxy,Country
0,173.236.38.106:80,united states
1,104.45.188.43:3128,united states
2,34.105.59.26:80,united states
3,139.99.105.5:80,canada
4,24.172.34.114:49920,united states
5,199.91.203.210:3128,united states
6,64.71.145.122:3128,united states
7,185.243.56.133:80,united states
8,69.65.65.178:58389,united states
9,173.46.67.172:58517,united states
