In [1]:
# Import dependencies
import pandas as pd
from lxml import html
from config import db_password, ACCESS_ID, ACCESS_KEY
import logging
from io import StringIO
import boto3
from botocore.exceptions import ClientError

In [2]:
# Helper: normalise a single raw cell of the scraped third column
def _clean_zip(raw):
    """Reduce one raw cell of the scraped third column to its leading characters.

    Steps, applied to string cells only:
      1. if the cell itself is at least 10 characters long, drop its last 4;
      2. remove every '/' character;
      3. keep at most the first 5 characters.

    Non-string cells (e.g. NaN from an empty table cell) are returned unchanged
    rather than raising on the slice.
    """
    if not isinstance(raw, str):
        return raw
    # The length guard applies to THIS cell; short values are left intact so a
    # bare 5-character code is not truncated down to a single character.
    if len(raw) >= 10:
        raw = raw[:-4]
    return raw.replace('/', '')[:5]


# Create a function to Extract the data
def total_func(url, name, bucket="databootcamp-csvfiles", out_dir=r'../Resources'):
    """Scrape a ranking table, tidy it, save it as CSV and upload the CSV to S3.

    Parameters
    ----------
    url : str
        Page passed to ``pd.read_html``; every table found is concatenated.
        The combined frame must have exactly three columns.
    name : str
        Label for the value column; also the CSV file stem and the S3 object
        key (``<name>.csv``).
    bucket : str, optional
        Target S3 bucket. Defaults to the bucket originally hard-coded here.
    out_dir : str, optional
        Directory the CSV is written to (must already exist).

    Returns
    -------
    pandas.DataFrame
        The first five rows of the cleaned frame (columns ``name`` and 'zip').

    Raises
    ------
    ValueError
        If the scraped tables do not combine into exactly three columns.
    """
    # Read the URL
    tables = pd.read_html(url)
    # Format the Dataframe
    df = pd.concat(tables)
    df.columns = ['rank', name, 'zip']
    df = df.drop(columns='rank')
    df['zip'] = df['zip'].map(_clean_zip)
    # Drop the first row -- presumably the page's header row, which read_html
    # returned as data (TODO confirm against the page layout).
    df = df.iloc[1:]
    # Save as CSV (the index is written too, as before)
    csv_path = out_dir + '/' + name + '.csv'
    df.to_csv(csv_path)
    # Upload to AWS S3 Bucket
    s3 = boto3.client('s3',
                      aws_access_key_id=ACCESS_ID,
                      aws_secret_access_key=ACCESS_KEY)
    with open(csv_path, "rb") as f:
        s3.upload_fileobj(f, bucket, name + '.csv')
    return df.head()

In [3]:
# Run the ETL for avg_edu_2000
# (California average education index by ZIP code, yr=1000 query)
total_func(
    url=(
        'http://www.usa.com/rank/california-state--average-education-index--zip-code-rank.htm'
        '?yr=1000&dis=50&wist=&plow=&phigh='
    ),
    name='avg_edu_2000',
)

Unnamed: 0,avg_edu_2000,zip
1,17.76,94305
2,17.12,95041
3,17.01,94708
4,17.0,94304
5,16.83,94707


In [4]:
# Run the ETL for avg_edu_2014
# (California average education index by ZIP code, yr=9000 query)
total_func(
    url=(
        'http://www.usa.com/rank/california-state--average-education-index--zip-code-rank.htm'
        '?yr=9000&dis=50&wist=&plow=&phigh='
    ),
    name='avg_edu_2014',
)

Unnamed: 0,avg_edu_2014,zip
1,19.0,95463
2,19.0,95721
3,17.69,94305
4,17.68,95736
5,17.38,92617


In [5]:
# Run the ETL for house_median_value_2000
# (California house median value by ZIP code, yr=1000 query)
total_func(
    url=(
        'http://www.usa.com/rank/california-state--house-median-value--zip-code-rank.htm'
        '?hl=&hlst=&wist=&yr=1000&dis=50&sb=DESC&plow=&phigh=&ps='
    ),
    name='house_median_value_2000',
)

Unnamed: 0,house_median_value_2000,zip
1,"$1,000,001",90210
2,"$1,000,001",90402
3,"$1,000,001",90743
4,"$1,000,001",92067
5,"$1,000,001",93108


In [6]:
# Run the ETL for house_median_value_2014
# (California house median value by ZIP code, yr=9000 query)
total_func(
    url=(
        'http://www.usa.com/rank/california-state--house-median-value--zip-code-rank.htm'
        '?yr=9000&dis=50&wist=&plow=&phigh='
    ),
    name='house_median_value_2014',
)

Unnamed: 0,house_median_value_2014,zip
1,"$1,000,001",90024
2,"$1,000,001",90049
3,"$1,000,001",90077
4,"$1,000,001",90210
5,"$1,000,001",90211


In [7]:
# Run the ETL for median_income_2000
# (California median household income by ZIP code, yr=1000 query)
total_func(
    url=(
        'http://www.usa.com/rank/california-state--median-household-income--zip-code-rank.htm'
        '?yr=1000&dis=50&wist=&plow=&phigh='
    ),
    name='median_income_2000',
)

Unnamed: 0,median_income_2000,zip
1,"$200,001",94027
2,"$196,298",92067
3,"$164,479",94028
4,"$145,425",94022
5,"$142,459",94506


In [8]:
# Run the ETL for median_income_2014
# (California median household income by ZIP code, yr=9000 query)
total_func(
    url=(
        'http://www.usa.com/rank/california-state--median-household-income--zip-code-rank.htm'
        '?yr=9000&dis=50&wist=&plow=&phigh='
    ),
    name='median_income_2014',
)

Unnamed: 0,median_income_2014,zip
1,"$236,912",94027
2,"$228,587",92145
3,"$200,325",91980
4,"$187,857",94957
5,"$182,750",94022


In [9]:
# Run the ETL for median_rent_2000
# (California median rental price by ZIP code, yr=1000 query)
total_func(
    url=(
        'http://www.usa.com/rank/california-state--median-rental-price--zip-code-rank.htm'
        '?yr=1000&dis=50&wist=&plow=&phigh='
    ),
    name='median_rent_2000',
)

Unnamed: 0,median_rent_2000,zip
1,"$2,001",92067
2,"$2,001",92091
3,"$2,001",92602
4,"$2,001",93953
5,"$2,001",94027


In [10]:
# Run the ETL for median_rent_2014
# (California median rental price by ZIP code, yr=9000 query)
total_func(
    url=(
        'http://www.usa.com/rank/california-state--median-rental-price--zip-code-rank.htm'
        '?yr=9000&dis=50&wist=&plow=&phigh='
    ),
    name='median_rent_2014',
)

Unnamed: 0,median_rent_2014,zip
1,"$2,001",90067
2,"$2,001",90077
3,"$2,001",90094
4,"$2,001",90210
5,"$2,001",90265


In [11]:
# Run the ETL for public_transportation_2000
# (California share of workers commuting by public transportation, by ZIP code, yr=1000 query)
total_func(
    url=(
        'http://www.usa.com/rank/'
        'california-state--take-public-transportation-to-work-population-percentage--zip-code-rank.htm'
        '?yr=1000&dis=50&wist=&plow=&phigh='
    ),
    name='public_transportation_2000',
)

Unnamed: 0,public_transportation_2000,zip
1,58.9%,90017
2,44.4%,90057
3,43.0%,95431
4,42.6%,94102
5,39.9%,94103


In [12]:
# Run the ETL for public_transportation_2014
# (California share of workers commuting by public transportation, by ZIP code, yr=9000 query)
total_func(
    url=(
        'http://www.usa.com/rank/'
        'california-state--take-public-transportation-to-work-population-percentage--zip-code-rank.htm'
        '?yr=9000&dis=50&wist=&plow=&phigh='
    ),
    name='public_transportation_2014',
)

Unnamed: 0,public_transportation_2014,zip
1,100.0%,92304
2,100.0%,94128
3,72.7%,93634
4,47.8%,90073
5,47.6%,94130


In [13]:
# Run the ETL for total_pop_2000
# (California total population by ZIP code, yr=1000 query)
total_func(
    url=(
        'http://www.usa.com/rank/california-state--total-population--zip-code-rank.htm'
        '?yr=1000&dis=50&wist=&plow=&phigh='
    ),
    name='total_pop_2000',
)

Unnamed: 0,total_pop_2000,zip
1,105275,90201
2,103211,90650
3,101214,90011
4,98226,92054
5,97300,91331


In [14]:
# Run the ETL for total_pop_2014
# (California total population by ZIP code, yr=9000 query)
total_func(
    url=(
        'http://www.usa.com/rank/california-state--total-population--zip-code-rank.htm'
        '?yr=9000&dis=50&wist=&plow=&phigh='
    ),
    name='total_pop_2014',
)

Unnamed: 0,total_pop_2014,zip
1,106521,90650
2,102926,90011
3,102515,90201
4,102367,91331
5,99580,92335


In [15]:
# Run the ETL for white_pop_2000
# (California white population percentage by ZIP code, yr=1000 query)
total_func(
    url=(
        'http://www.usa.com/rank/california-state--white-population-percentage--zip-code-rank.htm'
        '?yr=1000&dis=50&wist=&plow=&phigh='
    ),
    name='white_pop_2000',
)

Unnamed: 0,white_pop_2000,zip
1,100.0%,90263
2,100.0%,91743
3,100.0%,92338
4,100.0%,95232
5,100.0%,95735


In [16]:
# Run the ETL for white_pop_2014
# (California white population percentage by ZIP code, yr=9000 query)
total_func(
    url=(
        'http://www.usa.com/rank/california-state--white-population-percentage--zip-code-rank.htm'
        '?yr=9000&dis=50&wist=&plow=&phigh='
    ),
    name='white_pop_2014',
)

Unnamed: 0,white_pop_2014,zip
1,100.0%,91948
2,100.0%,92060
3,100.0%,92266
4,100.0%,92304
5,100.0%,92332


## Extra Datasets

In [17]:
# Optional extra datasets. Every call below is commented out, so running this
# cell extracts nothing; uncomment a call to run total_func for that dataset.
#
# NOTE(review): the two crime_index URLs point at a "city-rank" page rather than
# a "zip-code-rank" page. total_func labels the third column 'zip' and keeps only
# its first five characters, so those values would presumably be truncated city
# names rather than ZIP codes -- confirm before enabling them.
#
# total_func('http://www.usa.com/rank/california-state--population-density--zip-code-rank.htm?yr=9000&dis=50&wist=&plow=&phigh=',
#            'population_density_2014'
#           )
# total_func('http://www.usa.com/rank/california-state--population-density--zip-code-rank.htm?yr=1000&dis=50&wist=&plow=&phigh=',
#            'population_density_2000'
#           )
# total_func('http://www.usa.com/rank/california-state--per-capita-income--zip-code-rank.htm?yr=9000&dis=50&wist=&plow=&phigh=',
#            'per_capita_income_2014'
#           )
# total_func('http://www.usa.com/rank/california-state--per-capita-income--zip-code-rank.htm?yr=1000&dis=50&wist=&plow=&phigh=',
#            'per_capita_income_2000'
#           )
# total_func('http://www.usa.com/rank/california-state--crime-index--city-rank.htm?yr=9000&dis=50&wist=&plow=&phigh=',
#            'crime_index_2014'
#           )
# total_func('http://www.usa.com/rank/california-state--crime-index--city-rank.htm?yr=1000&dis=50&wist=&plow=&phigh=',
#            'crime_index_2000'
#           )
# total_func('http://www.usa.com/rank/california-state--public-school-performance--zip-code-rank.htm?yr=9000&dis=50&wist=&plow=&phigh=',
#            'public_school_2014'
#           )
# total_func('http://www.usa.com/rank/california-state--public-school-performance--zip-code-rank.htm?yr=1000&dis=50&wist=&plow=&phigh=',
#            'public_school_2000'
#           )