In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import os
import pprint
import googlemaps
import time
import pickle
from random import randint
from collections import defaultdict
import selenium
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
%matplotlib inline

# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to override the original author's local default without editing the notebook.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Users/brenner/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

%load_ext dotenv
%dotenv

%load_ext autoreload
%autoreload 2

In [2]:
# Use Seaborn whitegrid styling, because I like it
import matplotlib.style as style

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' (the old
# names were later removed), so use whichever name this installation provides.
# If neither is listed, fall through to the original name so the error stays explicit.
_whitegrid_candidates = ('seaborn-whitegrid', 'seaborn-v0_8-whitegrid')
style.use(next((name for name in _whitegrid_candidates if name in style.available),
               'seaborn-whitegrid'))

# Import private Google Maps API key
MAPS_KEY = os.environ.get('MAPS_KEY')

# Change format of charts to .svg
%config InlineBackend.figure_format = 'svg'

In [4]:
%xmode

Exception reporting mode: Plain


In [5]:
# This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
# Read the data into pandas dataframe
df = pd.read_csv('kc_house_data.csv')

In [6]:
# Let's take a look at the data
df.head(3)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062


In [7]:
# Recast price column as integer, for simplicity
df['price'] = df.price.astype('int')

In [26]:
coords = list(zip(df['lat'].astype(float), df['long'].astype(float)))

In [30]:
len(coords)

21613

In [31]:
formatted_addresses = []
neighborhoods_list = []
address_dictionary = defaultdict(str)

def get_addresses(coords_list, client=None):
    """Reverse-geocode (lat, long) tuples and record the results in the module-level stores.

    For every coordinate exactly one entry is appended to both
    ``formatted_addresses`` and ``neighborhoods_list`` (``None`` when the lookup
    or the parsing of its result fails), so both lists stay the same length as
    ``coords_list`` and can be assigned directly onto the dataframe. Successful
    lookups are also stored in ``address_dictionary`` keyed by the coordinate tuple.

    Parameters
    ----------
    coords_list : iterable of (lat, long) tuples
        Coordinates to look up, in dataframe row order.
    client : object, optional
        Anything exposing ``reverse_geocode(coord_tuple)``. When omitted, a
        module-level ``gmaps`` session is reused if one exists; otherwise a
        ``googlemaps.Client`` is created from ``MAPS_KEY``.

    Returns
    -------
    None
        Results are written to the module-level lists and dictionary.
    """
    if client is None:
        # Reuse a session created elsewhere in the notebook, if there is one
        client = globals().get('gmaps')
    if client is None:
        client = googlemaps.Client(key=MAPS_KEY)

    for coord_tuple in coords_list:
        try:
            address = client.reverse_geocode(coord_tuple)

            # Parse the JSON results before touching any of the stores, so a
            # partial failure cannot leave the lists with different lengths
            formatted_address = address[1]['formatted_address']
            neighborhood = address[0]['address_components'][2]['short_name']

        except Exception:
            # `Exception` (not a bare except) so Ctrl-C can still stop a long run
            print('Could not get address.')
            formatted_addresses.append(None)
            neighborhoods_list.append(None)
            continue

        formatted_addresses.append(formatted_address)
        neighborhoods_list.append(neighborhood)

        # Store a dictionary of lat/long and formatted addresses, in case we need to map to dataframe later
        address_dictionary[coord_tuple] = formatted_address

        print('Success')

In [None]:
# Calls the Google API
get_addresses(coords)

In [34]:
len(formatted_addresses)

21613

In [39]:
# # Pickling original list so not to lose my work
# with open('address_list_final.pkl', 'wb') as f:
#     pickle.dump(list_of_addresses, f)

# with open('formatted_addresses.pkl', 'wb') as f:
#     pickle.dump(formatted_addresses, f)

# # Pickling like crazy because I don't want to lose my data.
# # Yes, I know pickling isn't always the best. Works for now.
# with open('json_addresses.pkl', 'wb') as f:
#     pickle.dump(json_addresses, f)

# with open('neighborhoods.pkl', 'wb') as f:
#     pickle.dump(neighborhoods_list, f)

# with open('address_dictionary.pkl', 'wb') as f:
#     pickle.dump(address_dictionary, f)

In [71]:
# Add formatted addresses from Google Maps API to our dataframe (!)
df['Address'] = formatted_addresses

In [None]:
# Adding neighborhood to dataframe
df['Neighborhood'] = neighborhoods_list

**Start here once data is already scraped** <a name='bookmark' />

In [8]:
# Now that the data is already scraped, we can pick up from here and add 'Address' column to main dataframe
addresses_df = pd.read_csv('formatted_addresses_csv.csv')
df['Address'] = addresses_df['Formatted Addresses']

In [20]:
neighborhood_df = pd.read_csv('Census Geocode Data/neighborhood_csv.csv', header=None)
df['Neighborhood'] = neighborhood_df

The code in the cell below imputes ZIP codes from the Google-formatted addresses. It is commented out because the dataset already provides that information in the `zipcode` column; I may return to it later.

In [95]:
# # Grab a list of all the zip codes from the formatted addresses list
# # I love list comprehensions!
# zip_list = [x.split(',')[-2][-5:] for x in formatted_addresses]

# df['Imputed Zip Code'] = zip_list

# def get_zips(x):
#     if x == ' WA' or x == 'ngton':
#         return 0
#     else:
#         return x

# df['Imputed Zip Code'] = df['Imputed Zip Code'].apply(get_zips)

# df['Imputed Zip Code'] = df['Imputed Zip Code'].astype(int)

In [47]:
# 205 Unique Neighborhoods. Should be sufficient for Walk Score/Transit Score/Bike Score etc. for our purposes.
df['Neighborhood'].nunique()

205

In [165]:
adds = df['Address']

In [60]:
adds = adds.apply(lambda x: x.split(','))

In [68]:
adds = pd.DataFrame(adds)

In [163]:
# Parsing from the address column to get individual street numbers, cities, etc. to feed into Census
street_add = adds['Address'].apply(lambda x: x[0])

city = adds['Address'].apply(lambda x: x[1])

state = 'WA'

zip = adds['Address'].apply(lambda x: x[2][4:])

In [142]:
# Add a state column that doesn't have any wrong values
state = adds['Address'].apply(lambda x: x[2][1:3])

state = pd.DataFrame(state)

state['Address'] = 'WA'

state['Address'].value_counts()

WA    21613
Name: Address, dtype: int64

In [145]:
output = pd.concat([street_add, city, state, zip], axis=1, names=None)

In [289]:
output1 = output.iloc[8000:16000,:]

In [291]:
output1.head()

Unnamed: 0,Address,Address.1,Address.2,Address.3
8000,10261 39th Ave SW,Seattle,WA,98146
8001,17526 47th Ave NE,Lake Forest Park,WA,98155
8002,16116 NE 107th Ct,Redmond,WA,98052
8003,7024 126th Ave NE,Kirkland,WA,98033
8004,17242 164th Way SE,Renton,WA,98058


In [258]:
# Bringing in all the Geocodes from the Census Data into one place, so we can add it to our df
census_1 = pd.read_csv('Census Geocode Data/8000.csv', header=None)
census_2 = pd.read_csv('Census Geocode Data/16000.csv', header=None)
census_3 = pd.read_csv('Census Geocode Data/24000.csv', header=None)

In [280]:
census = pd.concat([census_1, census_2, census_3], ignore_index=False)

In [295]:
names=['index_num', 'Street', 'Match', 'Precision', 'Full address', 'loc', 'loc2', 'A', 'B', 'C', 'D', 'E']
# The Census CSVs are read with header=None, so attach the column names that
# later cells reference (e.g. 'index_num' for sorting and 'D' for the tract number)
census.columns = names

In [303]:
census.sort_values('index_num', inplace=True)

## Walk Score Web Scraping

In [127]:
def _walkscore_url(address):
    """Return the walkscore.com URL for a formatted street address."""
    url_address = address.lower().replace(",", "").replace('.', '').replace(" ", "-")
    return f'https://www.walkscore.com/score/{url_address}'


def _score_from_badge_src(src):
    """Return the score embedded in a badge image URL: its file name without the extension.

    Slicing a fixed two characters off the end turned a score of 100 into '00'
    and a single-digit score such as 5 into '/5'.
    """
    return src.rsplit('/', 1)[-1].rsplit('.', 1)[0]


def get_walk_scores(start, num_records_to_fetch):
    '''Scrape walkscore.com for a run of rows of the module-level ``df``.

    Takes the row position to start from and the number of rows to scrape, and
    writes Walk Score, Bike Score, Transit Score, Personal Crime Grade, and
    Property Crime Grade into ``df`` in place (columns 'walk_score',
    'bike_score', 'transit_score', 'pers_crime_score', 'prop_crime_score',
    which must already exist). Scores are stored as strings.

    Parameters
    ----------
    start : int
        Positional (``iloc``) index of the first row to scrape.
    num_records_to_fetch : int
        Number of rows to scrape. The range is clamped to the end of ``df``,
        so an over-large value simply means "through the last row".

    Side effects
    ------------
    Opens Chrome via the module-level ``chromedriver`` path, writes
    'backup_df_from_scraping.csv' every 25 scraped pages, and after 2000 pages
    pauses for a random 1-10 minutes and restarts the browser. The browser is
    always closed when the function exits.
    '''
    # Never index past the last row of the dataframe
    stop = min(start + num_records_to_fetch, len(df))

    # Column positions do not change during the run, so resolve them once
    address_col = df.columns.get_loc('Address')
    score_cols = {
        'walk/': df.columns.get_loc('walk_score'),
        'transit/': df.columns.get_loc('transit_score'),
        'bike/': df.columns.get_loc('bike_score'),
    }
    pers_crime_col = df.columns.get_loc('pers_crime_score')
    prop_crime_col = df.columns.get_loc('prop_crime_score')

    count = 0
    driver = webdriver.Chrome(chromedriver)
    new_session = True

    try:
        for i in range(start, stop):
            address = df.iloc[i, address_col]

            # Rows whose geocoding failed have no usable address string
            if not isinstance(address, str):
                print(f'No address at index {i}; skipping.')
                continue

            try:
                # Take address from df, transform it and enter it into URL
                driver.get(_walkscore_url(address))
                if new_session:
                    # Give a freshly started browser time to settle on its first page
                    time.sleep(5)
                    new_session = False

            except Exception:
                # Use the search bar if direct URL doesn't yield a page.
                input_element = driver.find_element_by_id('addrbar-street')
                input_element.clear()
                input_element.click()
                input_element.send_keys(address)
                input_element.send_keys(Keys.ENTER)

                print('Direct URL didn\'t work; used the search bar.')

            # Read the html
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Getting Walk, Transit, and Bike Scores from the badge images
            found_score = False
            for image_tag in soup.find_all('img'):
                if "Score of" not in str(image_tag):
                    continue

                src = image_tag.get('src', '')

                # Figure out which score (Walk/Transit/Bike) this is
                for marker, column in score_cols.items():
                    if marker in src:
                        score = _score_from_badge_src(src)
                        df.iloc[i, column] = score
                        found_score = True
                        if marker == 'walk/':
                            print(f'Walk Score: {score}')

            if not found_score:
                print(f'No Walk/Bike/Transit Score for {address}')

            # Getting Personal & Property Crime Grades
            parsed_grades = [grade.text for grade in soup.find_all("div", {"class" : "crime-grade"})]

            try:
                # The grade is the third character of each block's text
                df.iloc[i, pers_crime_col] = parsed_grades[0][2]
                df.iloc[i, prop_crime_col] = parsed_grades[1][2]
            except IndexError:
                # Page has no (or incomplete) crime grades; leave the cells as they are
                pass

            print(f'{count} pages scraped. On index {i}.')

            count += 1

            # Backup data every 25 records
            if count % 25 == 0:
                df.to_csv('backup_df_from_scraping.csv')
                print('Backed up dataframe.')

            # Sleep for a bit if you've scraped 2000 records, and reinitialize.
            # (No `continue` after the backup above: 2000 is a multiple of 25,
            # so skipping ahead would make this branch unreachable.)
            if count % 2000 == 0:
                time.sleep(randint(60, 600))
                driver.quit()
                driver = webdriver.Chrome(chromedriver)
                new_session = True
                count = 0
    finally:
        # Always release the browser, even when scraping is interrupted
        driver.quit()

            

In [None]:
# Use this function call to scrape the data and pick up where you left off
# get_walk_scores indexes rows by position (.iloc), so locate the first missing
# Walk Score by position rather than by index label, and do nothing if none is missing
missing_walk_score = df['walk_score'].isnull().values
if missing_walk_score.any():
    most_recent_record = int(missing_walk_score.argmax())  # position of the first True
    get_walk_scores(most_recent_record, 22000)
else:
    print('Every row already has a Walk Score; nothing left to scrape.')
# get_walk_scores(520,22000)

In [165]:
df['walk_score'].isnull().value_counts()

False    21588
True        25
Name: walk_score, dtype: int64

In [159]:
df['bike_score'].isnull().value_counts()

True     13277
False     8336
Name: bike_score, dtype: int64

**Adding GEOID and income data to dataframe <a name='bookmark2' />**

In [393]:
# Fill NAs with float of 0.0 so we can run operations on the whole column to transform into full GEOID
# Assign the result back: an in-place fillna on a column selection may act on a
# temporary copy and leave `census` unchanged
census['D'] = census['D'].fillna(0.0)

In [398]:
# Add leading zeros to transform Census Tract ID into format we can use to find GEOID to match up with Census data
def add_zeros(x):
    """Return ``x`` truncated to an integer and rendered as a string zero-padded to six characters."""
    whole_number = int(x)
    return f"{whole_number:06d}"

census['tract'] = census['D'].apply(add_zeros)

In [407]:
# Add the leading characters for GEOID to our tract number
# Note: '53' denotes Washington State, and '033' denotes King County.
census['GEOID'] = '53033' + census['tract']

In [411]:
# Add 'GEOID' column to our main dataframe
# `census` has no `.df` attribute; the target is the main dataframe.
# NOTE(review): positional assignment assumes `census` is sorted by 'index_num'
# and has exactly one row per row of `df` - confirm before relying on it.
df['GEOID'] = census['GEOID'].values

Unnamed: 0,index_num,Street,Match,Precision,Full address,loc,loc2,A,B,C,D,E,tract,GEOID
55,0,"6101 S Cooper St, Seattle, WA, 98118",Match,Non_Exact,"6101 S COOPER ST, SEATTLE, WA, 98118","-122.25696,47.51166",239766052.0,R,53.0,33.0,11900.0,,11900,53033011900
53,1,"860 NE 127th St, Seattle, WA, 98125",Match,Non_Exact,"860 NE 127TH ST, SEATTLE, WA, 98125","-122.31958,47.721283",186716651.0,L,53.0,33.0,200.0,,200,53033000200
51,2,"15098 81st Ave NE, Kenmore, WA, 98028",Match,Exact,"15098 81ST AVE NE, KENMORE, WA, 98028","-122.23289,47.73783",239770253.0,R,53.0,33.0,22102.0,,22102,53033022102
49,3,"9247 Fauntleroy Way SW, Seattle, WA, 98136",Match,Exact,"9247 FAUNTLEROY WAY SW, SEATTLE, WA, 98136","-122.3935,47.520912",186661942.0,R,53.0,33.0,11600.0,,11600,53033011600
73,4,"757 222nd Pl NE, Sammamish, WA, 98074",Match,Exact,"757 222ND PL NE, SAMMAMISH, WA, 98074","-122.0446,47.61503",187001283.0,L,53.0,33.0,32317.0,,32317,53033032317


In [590]:
median_income = pd.read_csv('Census Geocode Data/median_income.csv')

In [438]:
# Remove the last entry because it contains a null value
median_income = median_income.iloc[:-1, :]

In [479]:
# Found another incorrect value. Finding the index so I can drop it.
median_income[median_income['HC02_EST_VC02'] == '(X)']

Unnamed: 0,GEO.id2,HC02_EST_VC02
54,53033005302,(X)


In [484]:
median_income.iloc[54]

GEO.id2          53033005302
HC02_EST_VC02            (X)
Name: 54, dtype: object

In [488]:
# Dropping incorrect row.
median_income = median_income.drop(54)

In [583]:
# Reset index in order to make zipping into dicts possible without errors
# reset_index(inplace=True) returns None, which would replace the frame with None.
# drop=True discards the old index instead of adding it as an extra 'index' column.
median_income = median_income.reset_index(drop=True)

In [556]:
# Recasting income variable as an int to make things easier down the road
# Cast to int as the cell intends ('str' was a mistake, and astype takes no `inplace` argument)
median_income['HC02_EST_VC02'] = median_income['HC02_EST_VC02'].astype(int)

In [15]:
# Create dictionary to map Census median HH income to df based on each house's GEOID
# Built with pandas rather than dict(zip(...)) so it does not depend on the
# built-in `zip`, which an earlier cell rebinds to a Series
income_tracts = median_income.set_index('GEO.id2')['HC02_EST_VC02'].to_dict()

# Do the mapping
# GEOID is assembled as a string ('53033' + tract) while the GEO.id2 keys are
# numeric, so cast to integer first; otherwise every lookup misses and income is all NaN
df['income'] = df['GEOID'].astype('int64').map(income_tracts)

In [21]:
# Sanity checking the GEOID of where I used to live - checks out
income_tracts[53033022102]

96863

In [None]:
# Save the enriched dataframe for the 'Data Cleaning' notebook. to_csv is a
# DataFrame method (pandas has no module-level to_csv) and returns None when
# given a path, so its result must not be assigned back to `df`.
df.to_csv('backups/backup_df_from_scraping.csv')

## Head over to 'Data Cleaning' notebook to pick up from here.