# DOWNLOAD PROXIES FOR WEBSCRAPING

This notebook is a demo for how this scraper can be used to scrape the free proxies from *https://free-proxy-list.net/* to be used in further webscraping tasks.

In [1]:
# Import the Scraper functionality
import Scraper

# Define the routine for obtaining proxies from the *https://free-proxy-list.net/*
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup

# The link that we will scrape.
link = 'https://free-proxy-list.net/'

# Define the scraper routine that you will use at the link
# INPUT: the selenium browser instance that will be used to read the link.
# RETURN: a pandas dataframe containing the data you want to collect.
def scrape_routine_proxies(browser):
    
    # Select the option for the most numerous proxies
    select = Select(browser.find_element_by_xpath('/html/body/section[1]/div/div[2]/div/div[1]/div[1]/div/label/select'))
    elem = select.select_by_visible_text('80')

    # Read the table and return a dataframe
    soup = BeautifulSoup(browser.page_source,'html.parser')
    table = soup.find('table', {'id' :"proxylisttable"})
    
    headers = [h.text for h in table.find_all('th',{'aria-controls':"proxylisttable"})]
    rows = table.find_all('tr', {'class': "odd"}) + table.find_all('tr', {'class': "even"})
    row_data = zip(*[[r.text for r in R.find_all('td')] for R in rows])
    
    return pd.DataFrame({a:b for (a,b) in zip(headers,row_data)})

In [2]:
import pandas as pd

# Scrape the web proxies, then close the scraper object
scr = Scraper.Scraper(isHeadless=True)
df = scr.scrape_link(link,scrape_routine_proxies)
scr.close()

# properly format the columns for postgresql (i.e. lower case headers, no spaces.)
df.columns = [header.lower().replace(" ","_") for header in df.columns]

In [3]:
df.head(5)

Unnamed: 0,ip_address,port,code,country,anonymity,google,https,last_checked
0,14.207.102.103,8080,TH,Thailand,transparent,no,no,1 minute ago
1,51.158.102.56,80,FR,France,anonymous,no,yes,3 minutes ago
2,203.107.135.125,80,TH,Thailand,anonymous,no,no,3 minutes ago
3,159.203.20.110,8080,CA,Canada,anonymous,no,no,3 minutes ago
4,104.248.115.226,8080,US,United States,anonymous,no,no,3 minutes ago


In [4]:
# CREATE A POSTGRES TABLE TO STORE THESE PROXIES FOR USE LATER
from sqlalchemy import create_engine, Column, Integer, String, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.engine.url import URL
import db_configuration

DB_CONFIG_DICT = db_configuration.DB_CONFIG_DICT
DB_CONFIG_DICT['database'] = 'wills_db'

DB_CONN_FORMAT = "postgresql://{user}:{password}@{host}:{port}/{database}"
DB_CONN_URI_DEFAULT = (DB_CONN_FORMAT.format(**DB_CONFIG_DICT))

# Create a function for a quick connection to the database
def db_connect():
    return create_engine(DB_CONN_URI_DEFAULT)

In [5]:
# Use pandas functionality to upload the proxies to a postgresDB
conn = db_connect()
conn.execute("DROP TABLE IF EXISTS proxies;")
df.to_sql('proxies',conn,index=None)

In [7]:
# Confirm that the proxies are in the postgres DB:
conn.execute("SELECT * FROM proxies WHERE anonymity = 'anonymous';").fetchall()[0:10]

[('51.158.102.56', '80', 'FR', 'France', 'anonymous', 'no', 'yes', '3 minutes ago'),
 ('203.107.135.125', '80', 'TH', 'Thailand', 'anonymous', 'no', 'no', '3 minutes ago'),
 ('159.203.20.110', '8080', 'CA', 'Canada', 'anonymous', 'no', 'no', '3 minutes ago'),
 ('104.248.115.226', '8080', 'US', 'United States', 'anonymous', 'no', 'no', '3 minutes ago'),
 ('157.230.149.54', '80', 'US', 'United States', 'anonymous', 'no', 'no', '3 minutes ago'),
 ('157.230.157.60', '8080', 'US', 'United States', 'anonymous', 'no', 'no', '3 minutes ago'),
 ('212.126.120.170', '8080', 'IQ', 'Iraq', 'anonymous', 'no', 'yes', '3 minutes ago'),
 ('68.183.39.251', '8080', 'GB', 'United Kingdom', 'anonymous', 'no', 'no', '3 minutes ago'),
 ('157.230.236.97', '8080', 'US', 'United States', 'anonymous', 'no', 'no', '3 minutes ago'),
 ('67.198.189.239', '8888', 'US', 'United States', 'anonymous', 'no', 'no', '3 minutes ago')]

With the execution of this notebook, I have a new set of proxy IP addresses in a postgres database that I can use to scrape data. 

*Don't be a jerk. Don't scrape too much, too fast: Throttle your scraper*