## Setup

In [1]:
#libraries needed to run code
from bs4 import BeautifulSoup

import pandas as pd, requests, time, random

from selenium import webdriver

## Initial Prep

In [7]:
# Load the spreadsheet whose rows will be tagged with a metro area.
# Set `workbook` to the Excel file name (resolved relative to the notebook).
workbook = 'Hector_ExcelTest.xlsx'
table = pd.read_excel(workbook)

# True when the corresponding column contains at least one blank cell.
city = table['City'].isnull().any()
state = table['State'].isnull().any()

# Stop here when either column has blanks: the search key built below needs
# both parts, and the following cells rely on `unique_locations` and
# `number_of_locations` being defined by this cell. Raising (instead of only
# printing) prevents a confusing NameError, or a silent reuse of stale values
# from an earlier run, further down the notebook.
if city or state:
    raise ValueError("Check 'City', 'State' columns for blanks")

# Combine the two columns into the 'City, State' form used as the search query.
table['Location'] = table['City'] + ', ' + table['State']
# Each distinct location only needs to be looked up once.
unique_locations = table['Location'].unique()
number_of_locations = len(unique_locations)
print(f'There are {number_of_locations} locations to search for.')

There are 75 locations to search for.


## Webscraping portion

In [8]:
# Random pause ranges (seconds, inclusive) applied after each page load.
# The runtime estimate below is derived from the same bounds.
SEARCH_DELAY = (1, 3)
PROFILE_DELAY = (3, 5)
SEARCH_URL = "https://censusreporter.org/search/?q="
# Marker stored in `sites` when a search produced no usable result link.
NOT_FOUND = 'not found'


def _pick_area(labels):
    """Choose the label to report for a location from its header link texts.

    Each label is classified by the first of 'Metro', 'Micro', 'County' that
    occurs in it as a substring. The result is the Metro label if any, else
    the Micro label, else the County label; when several labels share a kind
    the last one wins. If no label matches, all labels are returned joined
    by commas (an empty string for an empty list).
    """
    kinds = ('Metro', 'Micro', 'County')
    chosen = {}
    for label in labels:
        for kind in kinds:
            if kind in label:
                chosen[kind] = label
                break
    for kind in kinds:
        if kind in chosen:
            return chosen[kind]
    return ",".join(labels)


# Estimated runtime: one search page plus one profile page per location.
shortest = round(number_of_locations * (SEARCH_DELAY[0] + PROFILE_DELAY[0]) / 60, 1)
longest = round(number_of_locations * (SEARCH_DELAY[1] + PROFILE_DELAY[1]) / 60, 1)

print(f'Estimated runtime between {shortest} and {longest} minutes.')

# Maps each 'City, State' to the URL of its first search result (or NOT_FOUND).
sites = {}
# Maps each 'City, State' to the area label that was scraped for it.
location_info = {}
# Raw anchor tags per location, kept for inspection while debugging.
test = {}

# NOTE(review): newer Selenium releases expect a Service object rather than a
# driver path here -- confirm against the installed Selenium version.
wd = webdriver.Chrome('./chromedriver')
try:
    for location in unique_locations:
        # Run the location through the Census Reporter search page.
        wd.get(SEARCH_URL + location)
        time.sleep(random.randint(*SEARCH_DELAY))
        html = wd.execute_script("return document.documentElement.outerHTML")
        # Name the parser explicitly so parsing does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(html, 'html.parser')

        # No <h3> (or no link inside it) means the search returned nothing
        # for this city/state; otherwise take the first result's link.
        heading = soup.find('h3')
        link = heading.find('a') if heading is not None else None
        href = link.get('href') if link is not None else None
        sites[location] = href if href else NOT_FOUND

    for location, site in sites.items():
        if site == NOT_FOUND:
            location_info[location] = f'{location} was not found'
            continue

        wd.get(site)
        time.sleep(random.randint(*PROFILE_DELAY))
        html = wd.execute_script("return document.documentElement.outerHTML")
        soup = BeautifulSoup(html, 'html.parser')

        # The links in the first paragraph of the page header are where the
        # metro/micro/county names are read from.
        header = soup.find('header', class_='column-full')
        paragraph = header.find('p') if header is not None else None
        if paragraph is None:
            # Unexpected page layout: record the miss instead of aborting
            # the whole run with an AttributeError.
            location_info[location] = f'{location} was not found'
            continue

        data = paragraph.find_all('a')
        test[location] = data
        location_info[location] = _pick_area([anchor.text for anchor in data])
finally:
    # Always shut the browser down, even if a page load fails part-way.
    wd.quit()

#dictionary containing all the locations and their metro areas
location_info

Estimated runtime between 5.0 and 10.0 minutes.


{'Placentia, CA': 'Los Angeles-Long Beach-Anaheim, CA Metro Area',
 'Costa Mesa, CA': 'Los Angeles-Long Beach-Anaheim, CA Metro Area',
 'Garden Grove, CA': 'Los Angeles-Long Beach-Anaheim, CA Metro Area',
 'Santa Ana, CA': 'Los Angeles-Long Beach-Anaheim, CA Metro Area',
 'Haymaket, VA': 'Haymaket, VA was not found',
 'El Monte, CA': 'Los Angeles-Long Beach-Anaheim, CA Metro Area',
 'Beaumont, CA': 'Riverside-San Bernardino-Ontario, CA Metro Area',
 'Corona, CA': 'Riverside-San Bernardino-Ontario, CA Metro Area',
 'Newport Beach, CA': 'Los Angeles-Long Beach-Anaheim, CA Metro Area',
 'Westminster, CA': 'Los Angeles-Long Beach-Anaheim, CA Metro Area',
 'Lake Elsinore, CA': 'Riverside-San Bernardino-Ontario, CA Metro Area',
 'Laguna Hills, CA': 'Los Angeles-Long Beach-Anaheim, CA Metro Area',
 'Lake Forest, CA': 'Los Angeles-Long Beach-Anaheim, CA Metro Area',
 'Bellflower, CA': 'Los Angeles-Long Beach-Anaheim, CA Metro Area',
 'Moreno Valley, CA': 'Riverside-San Bernardino-Ontario, CA M

## Creating new column with metro areas

In [13]:
#function that maps metro areas to all locations (City,states) in the data.
def metro_area(loc, loc_info):
    """Return the area recorded for ``loc``, or its state when none is recorded.

    Parameters
    ----------
    loc : str
        Location in 'City, State' form.
    loc_info : dict
        Mapping from location string to area label.

    Returns
    -------
    str
        ``loc_info[loc]`` when ``loc`` is a key of ``loc_info``; otherwise the
        text after the last ', ' in ``loc`` (the whole string when ``loc``
        contains no ', ').
    """
    if loc in loc_info:
        return loc_info[loc]
    # Take the part after the LAST separator so a city name that itself
    # contains ', ' still yields the state rather than a piece of the city.
    return loc.rpartition(', ')[2]

# Look up every row's location and store the result in a new 'Metro Area' column.
table['Metro Area'] = table['Location'].apply(metro_area, loc_info=location_info)
# Display only the columns needed to check the mapping by eye.
preview_columns = ['City', 'State', 'Metro Area']
table.loc[:, preview_columns]

Unnamed: 0,City,State,Metro Area
0,Placentia,CA,"Los Angeles-Long Beach-Anaheim, CA Metro Area"
1,Costa Mesa,CA,"Los Angeles-Long Beach-Anaheim, CA Metro Area"
2,Garden Grove,CA,"Los Angeles-Long Beach-Anaheim, CA Metro Area"
3,Santa Ana,CA,"Los Angeles-Long Beach-Anaheim, CA Metro Area"
4,Costa Mesa,CA,"Los Angeles-Long Beach-Anaheim, CA Metro Area"
...,...,...,...
393,Garden Grove,CA,"Los Angeles-Long Beach-Anaheim, CA Metro Area"
394,Costa Mesa,CA,"Los Angeles-Long Beach-Anaheim, CA Metro Area"
395,Riverside,CA,"Riverside-San Bernardino-Ontario, CA Metro Area"
396,Costa Mesa,CA,"Los Angeles-Long Beach-Anaheim, CA Metro Area"


## Output

In [11]:
# Write a copy of the data with the metro area column added. The intermediate
# 'Location' search key is dropped from the written copy so the file holds the
# original columns plus 'Metro Area'; `table` itself is left unchanged.
table.drop(columns=['Location'], errors='ignore').to_excel("output.xlsx", index=False)