In [25]:
import webbrowser
import requests, bs4
import pandas as pd
import proxyscrape
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from proxyscrape import create_collector, get_collector

### Proxy Scraping Function:

In [26]:
def scrape_proxies(collector, timeout_filtering, url_testing):
    
    #Refresh list of proxies to scrape 
    collector.refresh_proxies(force=True)
    
    #Scrape HTTPS proxies that provide anonymity for specific countries. See proxyscrape documentation for more info.
#     proxy_list = collector.get_proxy({'code': ('us'), 'anonymous': True, 'type': 'https'}) #Get one USA proxy
#     proxy_list = collector.get_proxies({'code': ('us'), 'anonymous': True, 'type': 'https'}) #Get ALL USA proxies
    proxy_list = collector.get_proxies({'code': ('us', 'ca'), 'anonymous': True, 'type': 'https'}) #Get ALL USA + Canadian proxies
#     proxy_list = collector.get_proxies({'code': ('us', 'ca', 'de', 'fr'), 'anonymous': True, 'type': 'https'}) ALL USA+Canada+German+French
    
    #Let's extract useful information from this scraped list of proxies
    #Note that scraped list may contain duplicates
    
    ip_addresses = []
    ports = []
    countries = []

    #Collecting proxy information
    for proxy in proxy_list:
        if proxy[0] in ip_addresses and proxy[1] in ports: #preventing duplicates from being recorded
            continue

        ip_addresses.append(proxy[0]) #Collect the ip addresses
        ports.append(proxy[1]) #Collect the port numbers
        countries.append(proxy[3]) #Collect their corresponding countries
    
    #Now let's filter out the working (good) proxies
    
    good_proxies = [] 
    good_proxies_country = []

    for ip_address, port, country in zip(ip_addresses, ports, countries):
        full_proxy = ip_address + ":" + port

        proxy_temp = {"https": "https://" + full_proxy} #putting it in correct format
        print("Trying Proxy: " + "https://" + full_proxy)

        #Filtering proxies
        try:
            requests.get(url_testing, proxies=proxy_temp, timeout=timeout_filtering) #Testing proxy by downloading content in URL with a timeout 
            print("Success! Added to List.") #Successful downloading means that the proxy works
            good_proxies.append(full_proxy)
            good_proxies_country.append(country)
        except:
            print("Connection error. Trying next proxy.")
    
    return good_proxies, good_proxies_country

### Initialize Proxy Collection here:

In [28]:
url_testing = 'http://www.foo.com/' #URL we'll use to check if a proxy works. Put your desired url to scrape here for best results.
timeout_filtering = 4 #seconds, the timer we'll set on checking url_testing with proxies

### We will use the 'proxyscrape' module for scraping proxies. Documentation here: https://pypi.org/project/proxyscrape/ ###

#Initialize collector for proxies. RUN THIS ONCE.
collector = create_collector('my-collector', 'https')

#Retrieve a collector if already initialized
# collector = get_collector('my-collector')

#Outputs a list of working proxies and their corresponding geolocation (country)
good_proxies, good_proxies_country  = scrape_proxies(collector, timeout_filtering, url_testing)

Trying Proxy: https://139.99.105.5:80
Success! Added to List.
Trying Proxy: https://54.214.52.181:80
Success! Added to List.
Trying Proxy: https://69.195.157.162:8100
Success! Added to List.
Trying Proxy: https://209.190.32.28:3128
Success! Added to List.
Trying Proxy: https://144.217.101.245:3129
Success! Added to List.
Trying Proxy: https://50.233.228.147:8080
Success! Added to List.
Trying Proxy: https://198.55.103.233:80
Success! Added to List.
Trying Proxy: https://64.71.145.122:3128
Success! Added to List.
Trying Proxy: https://188.227.58.58:3128
Success! Added to List.
Trying Proxy: https://24.172.34.114:49920
Success! Added to List.
Trying Proxy: https://107.178.4.215:35330
Success! Added to List.
Trying Proxy: https://136.25.2.43:56726
Success! Added to List.
Trying Proxy: https://173.82.177.226:5836
Success! Added to List.
Trying Proxy: https://198.204.253.114:3128
Success! Added to List.
Trying Proxy: https://192.3.31.67:9998
Success! Added to List.
Trying Proxy: https://162

#### Display collected proxies in dataframe

In [29]:
pd.DataFrame(list(zip(good_proxies, good_proxies_country)), 
               columns =['Proxy', 'Country']) 

Unnamed: 0,Proxy,Country
0,139.99.105.5:80,canada
1,54.214.52.181:80,united states
2,69.195.157.162:8100,united states
3,209.190.32.28:3128,united states
4,144.217.101.245:3129,canada
5,50.233.228.147:8080,united states
6,198.55.103.233:80,united states
7,64.71.145.122:3128,united states
8,188.227.58.58:3128,united states
9,24.172.34.114:49920,united states


### Robust web surfing Function with Selenium:

In [30]:
def robust_get(browser, url, current_proxy, good_proxies, timeout_sel): 
    
    #Put current proxy on top of list for iteration
    if current_proxy in good_proxies:
        good_proxies.insert(0, good_proxies.pop(good_proxies.index(current_proxy)))
    else:
        good_proxies.insert(0,current_proxy) 
        
    for proxy in good_proxies:
        
        print("===================== SURFING =====================") 
        
        try:
            if proxy == current_proxy:
                print("Current Proxy: " + proxy)
            
            if proxy != current_proxy:
                print("Trying proxy: " + proxy)
                chrome_options = Options() 
                chrome_options.add_argument('--proxy-server=%s' % proxy)
                browser = webdriver.Chrome(chromedriver, options=chrome_options)
            
            browser.set_page_load_timeout(timeout_sel) #Setting the timer    
            browser.get(url)

            #error thrown if timer runs out or the browser shows an error is thrown
            errorElems = browser.find_elements_by_class_name('neterror')
            if len(errorElems) > 0:
                raise Exception()
                
            #Proxy works!
            if proxy == current_proxy:
                print("Still works: " + proxy)
            else:
                print("Found new working proxy: " + proxy)
                
            checker = True
            current_proxy = proxy
            break
        except:
            checker = False
            print("Connection error. Trying next proxy.")
            browser.quit() #Have to close browser in order to change proxy
            continue 
            
    if checker == False:
        raise Exception("No working proxies!") #Error thrown if none of the proxies are working
    else:
        print("Final proxy: " + current_proxy)
        
    return browser, current_proxy #return browser with new working proxy

### Robust Selenium web surfing example:

In [31]:
#Set up the web driver with an initial/preferred proxy
preferred_proxy = random.choice(good_proxies) #Just picking a random one from list
chromedriver = r'C:/Users/talha/Documents/Chrome_Webdriver/chromedriver.exe'
chrome_options = Options()   
chrome_options.add_argument('--proxy-server=%s' % preferred_proxy) #with preferred proxy
browser = webdriver.Chrome(chromedriver, options=chrome_options)

url = 'http://www.foo.com/' #url you want to navigate to
timeout_sel = 30 #seconds, the timer set on checking if you can navigate to the url with a proxy

#Instead of browser.get(url), you will run this:
browser, preferred_proxy = robust_get(browser, url, preferred_proxy, good_proxies, timeout_sel) 

Current Proxy: 162.17.252.5:43764
Connection error. Trying next proxy.
Trying proxy: 139.99.105.5:80
Connection error. Trying next proxy.
Trying proxy: 54.214.52.181:80
Found new working proxy: 54.214.52.181:80
Final proxy: 54.214.52.181:80
