# ETL Project - EXTRACTION
## Terrence Cummings
Data: Racial composition of Minneapolis neighborhoods and communities, scraped from mncompass.org

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import json
import requests
import pymongo
from splinter import Browser
from selenium import webdriver
import time
import sys
import os
import datetime
from selenium.webdriver.chrome.options import Options
#Chrome options shared by every webdriver.Chrome(...) call in this notebook.
#The "--headless" argument asks Chrome to run without opening a visible browser window.
chrome_options = Options()
chrome_options.add_argument("--headless")

In [2]:

#Initialize list to hold neighborhood URL's and neighborhood names
nbhd_links_lst = []
nbhd_names_lst = []

#URL for the MSP neighborhoods page at MN Compass. This page contains all the links to the individual neighborhood pages.
msp_nbhds_url = 'http://www.mncompass.org/profiles/neighborhoods/minneapolis-saint-paul'

#Create Chrome web driver
driver = webdriver.Chrome(options=chrome_options)

try:
    #Send the webdriver to the link
    driver.get(msp_nbhds_url)

    #Extract an object containing all the neighborhood links. Uses Selenium Xpath.
    #find_elements('xpath', ...) is used instead of find_elements_by_xpath, which was removed in Selenium 4.3;
    #the generic form works on both Selenium 3 and Selenium 4.
    nbhd_links = driver.find_elements('xpath', '//*[@id="list"]/div/ul[2]/li[*]/a')

    #Go through each Selenium item and extract the link for the neighborhood.
    for nbhd in nbhd_links:
        href = nbhd.get_attribute('href')
        #Skip anchors that carry no href so both lists stay aligned and no slicing is attempted on None
        if not href:
            continue
        #Add the link to the list of neighborhood links
        nbhd_links_lst.append(href)
        #The neighborhood name is the last path segment of the URL. Taking the last segment (rather than a
        #fixed character offset) keeps working if the scheme or base path of the site changes.
        nbhd_names_lst.append(href.rstrip('/').rsplit('/', 1)[-1])
finally:
    #Always shut the browser down, even if the scrape fails. quit() ends the whole session
    #(browser and chromedriver process); close() would only close the current window.
    driver.quit()


In [3]:

#Put the neighborhood URLs in a one-column table, write it to CSV without the index, and echo the raw list.
nbhd_links_df = pd.DataFrame({"URL": nbhd_links_lst})
nbhd_links_csv = 'nbhd_links.csv'
nbhd_links_df.to_csv(nbhd_links_csv, index=False)
nbhd_links_lst

['http://www.mncompass.org/profiles/neighborhoods/minneapolis/armatage',
 'http://www.mncompass.org/profiles/neighborhoods/minneapolis/audubon-park',
 'http://www.mncompass.org/profiles/neighborhoods/minneapolis/bancroft',
 'http://www.mncompass.org/profiles/neighborhoods/minneapolis/cedar-isles-dean',
 'http://www.mncompass.org/profiles/neighborhoods/minneapolis/cedar-riverside',
 'http://www.mncompass.org/profiles/neighborhoods/minneapolis/central',
 'http://www.mncompass.org/profiles/neighborhoods/minneapolis/como',
 'http://www.mncompass.org/profiles/neighborhoods/minneapolis/cooper',
 'http://www.mncompass.org/profiles/neighborhoods/minneapolis/corcoran',
 'http://www.mncompass.org/profiles/neighborhoods/minneapolis/diamond-lake',
 'http://www.mncompass.org/profiles/neighborhoods/minneapolis/downtown-west',
 'http://www.mncompass.org/profiles/neighborhoods/minneapolis/ecco',
 'http://www.mncompass.org/profiles/neighborhoods/minneapolis/east-harriet',
 'http://www.mncompass.org/pro

In [4]:
#Open the webdriver
driver = webdriver.Chrome(options=chrome_options)

#Initialize lists to hold the count and percentage for each race. The list is in the same order as the neighborhoods above.
white_cnt_lst = []
white_pct_lst = []
black_cnt_lst = []
black_pct_lst = []
native_cnt_lst = []
native_pct_lst = []
asian_cnt_lst = []
asian_pct_lst = []
other_cnt_lst = []
other_pct_lst = []
twoormore_cnt_lst = []
twoormore_pct_lst = []
hispanic_cnt_lst = []
hispanic_pct_lst = []
ofcolor_cnt_lst = []
ofcolor_pct_lst = []

#Each entry pairs the element id used on the profile page with the (count, percent) lists it feeds.
#Driving the scrape from this table replaces eight copy-pasted stanzas with a single loop.
race_targets = [
    ('totalWhite', white_cnt_lst, white_pct_lst),
    ('totalBlack', black_cnt_lst, black_pct_lst),
    ('totalNative', native_cnt_lst, native_pct_lst),
    ('totalAsian', asian_cnt_lst, asian_pct_lst),
    ('totalOther', other_cnt_lst, other_pct_lst),
    ('twoOrMoreRaces', twoormore_cnt_lst, twoormore_pct_lst),
    ('totalHispanic', hispanic_cnt_lst, hispanic_pct_lst),
    ('totalOfColor', ofcolor_cnt_lst, ofcolor_pct_lst),
]

try:
    #For each neighborhood
    for nbhd_link in nbhd_links_lst:
        #Go to the link for the neighborhood
        driver.get(nbhd_link)
        #A short timer to give the browser time to respond
        time.sleep(1)
        for element_id, cnt_lst, pct_lst in race_targets:
            #Use Selenium Xpath to collect every element carrying this id.
            #find_elements('xpath', ...) replaces find_elements_by_xpath, which was removed in Selenium 4.3.
            race_cells = driver.find_elements('xpath', '//*[@id="{}"]'.format(element_id))
            #Element 0 is read as the count and element 2 as the percent.
            #An IndexError here means the page did not contain the expected elements.
            cnt_lst.append(race_cells[0].text)
            pct_lst.append(race_cells[2].text)
finally:
    #Close the browser. quit() (rather than close()) also ends the chromedriver session,
    #and the finally block guarantees this happens even when a page fails mid-scrape.
    driver.quit()


In [5]:

#Column layout of the final table: neighborhood name, a count/percent pair per race category, then the source URL.
nbhd_race_columns = [
    'neighborhood',
    'white_cnt', 'white_pct',
    'black_cnt', 'black_pct',
    'native_cnt', 'native_pct',
    'asian_cnt', 'asian_pct',
    'other_cnt', 'other_pct',
    'two_or_more_cnt', 'two_or_more_pct',
    'hispanic_cnt', 'hispanic_pct',
    'of_color_cnt', 'of_color_pct',
    'URL',
]

#Zip the parallel lists (same order as the columns above) so that each tuple is one neighborhood row.
nbhd_race_rows = list(zip(
    nbhd_names_lst,
    white_cnt_lst, white_pct_lst,
    black_cnt_lst, black_pct_lst,
    native_cnt_lst, native_pct_lst,
    asian_cnt_lst, asian_pct_lst,
    other_cnt_lst, other_pct_lst,
    twoormore_cnt_lst, twoormore_pct_lst,
    hispanic_cnt_lst, hispanic_pct_lst,
    ofcolor_cnt_lst, ofcolor_pct_lst,
    nbhd_links_lst,
))

#Build the final dataframe where each row is a neighborhood.
nbhd_race_df = pd.DataFrame(nbhd_race_rows, columns=nbhd_race_columns)


In [6]:
#Write the neighborhood race table to CSV (index column omitted), then show the table as the cell output.
nbhd_race_csv = 'nbhd_race.csv'
nbhd_race_df.to_csv(nbhd_race_csv, index=False)
nbhd_race_df

Unnamed: 0,neighborhood,white_cnt,white_pct,black_cnt,black_pct,native_cnt,native_pct,asian_cnt,asian_pct,other_cnt,other_pct,two_or_more_cnt,two_or_more_pct,hispanic_cnt,hispanic_pct,of_color_cnt,of_color_pct,URL
0,armatage,4316.0,84.5%,suppressed,,suppressed,,suppressed,,suppressed,,185,3.6%,suppressed,,792,15.5%,http://www.mncompass.org/profiles/neighborhood...
1,audubon-park,4034.0,78.8%,386,7.5%,suppressed,,suppressed,,suppressed,,303,5.9%,449,8.8%,1083,21.2%,http://www.mncompass.org/profiles/neighborhood...
2,bancroft,2601.0,73.4%,413,11.6%,suppressed,,74,2.1%,suppressed,,278,7.8%,521,14.7%,943,26.6%,http://www.mncompass.org/profiles/neighborhood...
3,cedar-isles-dean,2927.0,91.6%,suppressed,,suppressed,,suppressed,,suppressed,,suppressed,,suppressed,,suppressed,,http://www.mncompass.org/profiles/neighborhood...
4,cedar-riverside,3219.0,32.0%,5146,51.1%,suppressed,,1156,11.5%,suppressed,,396,3.9%,270,2.7%,6846,68.0%,http://www.mncompass.org/profiles/neighborhood...
5,central,3567.0,43.3%,2116,25.7%,suppressed,,194,2.4%,1738,21.1%,513,6.2%,3204,38.9%,4670,56.7%,http://www.mncompass.org/profiles/neighborhood...
6,como,4314.0,72.2%,552,9.2%,suppressed,,893,14.9%,suppressed,,106,1.8%,427,7.1%,1663,27.8%,http://www.mncompass.org/profiles/neighborhood...
7,cooper,3009.0,81.4%,suppressed,,suppressed,,suppressed,,suppressed,,113,3.1%,suppressed,,686,18.6%,http://www.mncompass.org/profiles/neighborhood...
8,corcoran,2849.0,63.6%,760,17.0%,suppressed,,115,2.6%,352,7.9%,253,5.6%,920,20.6%,1630,36.4%,http://www.mncompass.org/profiles/neighborhood...
9,diamond-lake,4429.0,76.8%,651,11.3%,suppressed,,suppressed,,suppressed,,suppressed,,485,8.4%,1335,23.2%,http://www.mncompass.org/profiles/neighborhood...


In [7]:
#Initialize list to hold community URL's and community names
cmnty_links_lst = []
cmnty_names_lst = []

#URL for the MSP community page at MN Compass. This page contains all the links to the individual community pages.
msp_cmnty_url = 'http://www.mncompass.org/profiles/neighborhoods/minneapolis-saint-paul'

#Create Chrome web driver
driver = webdriver.Chrome(options=chrome_options)

try:
    #Send the webdriver to the link
    driver.get(msp_cmnty_url)

    #Extract an object containing all the community links. Uses Selenium Xpath.
    #find_elements('xpath', ...) is used instead of find_elements_by_xpath, which was removed in Selenium 4.3;
    #the generic form works on both Selenium 3 and Selenium 4.
    cmnty_links = driver.find_elements('xpath', '//*[@id="list"]/div/ul[3]/li[*]/a')

    #Go through each Selenium item and extract the link for the community.
    for cmnty in cmnty_links:
        href = cmnty.get_attribute('href')
        #Skip anchors that carry no href so both lists stay aligned and no slicing is attempted on None
        if not href:
            continue
        #Add the link to the list of community links
        cmnty_links_lst.append(href)
        #The community name is the last path segment of the URL. Taking the last segment (rather than a
        #fixed character offset) keeps working if the scheme or base path of the site changes.
        cmnty_names_lst.append(href.rstrip('/').rsplit('/', 1)[-1])
finally:
    #Always shut the browser down, even if the scrape fails. quit() ends the whole session
    #(browser and chromedriver process); close() would only close the current window.
    driver.quit()

cmnty_links_lst

['http://www.mncompass.org/profiles/communities/minneapolis/calhoun-isles',
 'http://www.mncompass.org/profiles/communities/minneapolis/camden',
 'http://www.mncompass.org/profiles/communities/minneapolis/central',
 'http://www.mncompass.org/profiles/communities/minneapolis/longfellow',
 'http://www.mncompass.org/profiles/communities/minneapolis/near-north',
 'http://www.mncompass.org/profiles/communities/minneapolis/nokomis',
 'http://www.mncompass.org/profiles/communities/minneapolis/northeast',
 'http://www.mncompass.org/profiles/communities/minneapolis/phillips',
 'http://www.mncompass.org/profiles/communities/minneapolis/powderhorn',
 'http://www.mncompass.org/profiles/communities/minneapolis/southwest',
 'http://www.mncompass.org/profiles/communities/minneapolis/university']

In [8]:
#Put the community URLs in a one-column table, write it to CSV without the index, and display the table.
cmnty_links_df = pd.DataFrame({"URL": cmnty_links_lst})
cmnty_links_csv = 'cmnty_links.csv'
cmnty_links_df.to_csv(cmnty_links_csv, index=False)
cmnty_links_df

Unnamed: 0,URL
0,http://www.mncompass.org/profiles/communities/...
1,http://www.mncompass.org/profiles/communities/...
2,http://www.mncompass.org/profiles/communities/...
3,http://www.mncompass.org/profiles/communities/...
4,http://www.mncompass.org/profiles/communities/...
5,http://www.mncompass.org/profiles/communities/...
6,http://www.mncompass.org/profiles/communities/...
7,http://www.mncompass.org/profiles/communities/...
8,http://www.mncompass.org/profiles/communities/...
9,http://www.mncompass.org/profiles/communities/...


In [9]:
#Open the webdriver
driver = webdriver.Chrome(options=chrome_options)

#Initialize lists to hold the count and percentage for each race. The list is in the same order as the communities above.
white_cnt_lst = []
white_pct_lst = []
black_cnt_lst = []
black_pct_lst = []
native_cnt_lst = []
native_pct_lst = []
asian_cnt_lst = []
asian_pct_lst = []
other_cnt_lst = []
other_pct_lst = []
twoormore_cnt_lst = []
twoormore_pct_lst = []
hispanic_cnt_lst = []
hispanic_pct_lst = []
ofcolor_cnt_lst = []
ofcolor_pct_lst = []

#Each entry pairs the element id used on the profile page with the (count, percent) lists it feeds.
#Driving the scrape from this table replaces eight copy-pasted stanzas with a single loop.
race_targets = [
    ('totalWhite', white_cnt_lst, white_pct_lst),
    ('totalBlack', black_cnt_lst, black_pct_lst),
    ('totalNative', native_cnt_lst, native_pct_lst),
    ('totalAsian', asian_cnt_lst, asian_pct_lst),
    ('totalOther', other_cnt_lst, other_pct_lst),
    ('twoOrMoreRaces', twoormore_cnt_lst, twoormore_pct_lst),
    ('totalHispanic', hispanic_cnt_lst, hispanic_pct_lst),
    ('totalOfColor', ofcolor_cnt_lst, ofcolor_pct_lst),
]

try:
    #For each community
    for cmnty_link in cmnty_links_lst:
        #Go to the link for the community
        driver.get(cmnty_link)
        #A short timer to give the browser time to respond
        time.sleep(1)
        for element_id, cnt_lst, pct_lst in race_targets:
            #Use Selenium Xpath to collect every element carrying this id.
            #find_elements('xpath', ...) replaces find_elements_by_xpath, which was removed in Selenium 4.3.
            race_cells = driver.find_elements('xpath', '//*[@id="{}"]'.format(element_id))
            #Element 0 is read as the count and element 2 as the percent.
            #An IndexError here means the page did not contain the expected elements.
            cnt_lst.append(race_cells[0].text)
            pct_lst.append(race_cells[2].text)
finally:
    #Close the browser. quit() (rather than close()) also ends the chromedriver session,
    #and the finally block guarantees this happens even when a page fails mid-scrape.
    driver.quit()


In [10]:
#Column layout of the final table: community name, a count/percent pair per race category, then the source URL.
cmnty_race_columns = [
    'community',
    'white_cnt', 'white_pct',
    'black_cnt', 'black_pct',
    'native_cnt', 'native_pct',
    'asian_cnt', 'asian_pct',
    'other_cnt', 'other_pct',
    'two_or_more_cnt', 'two_or_more_pct',
    'hispanic_cnt', 'hispanic_pct',
    'of_color_cnt', 'of_color_pct',
    'URL',
]

#Zip the parallel lists (same order as the columns above) so that each tuple is one community row.
cmnty_race_rows = list(zip(
    cmnty_names_lst,
    white_cnt_lst, white_pct_lst,
    black_cnt_lst, black_pct_lst,
    native_cnt_lst, native_pct_lst,
    asian_cnt_lst, asian_pct_lst,
    other_cnt_lst, other_pct_lst,
    twoormore_cnt_lst, twoormore_pct_lst,
    hispanic_cnt_lst, hispanic_pct_lst,
    ofcolor_cnt_lst, ofcolor_pct_lst,
    cmnty_links_lst,
))

#Build the final dataframe where each row is a community.
cmnty_race_df = pd.DataFrame(cmnty_race_rows, columns=cmnty_race_columns)

In [11]:
#Write the community race table to CSV (index column omitted), then show the table as the cell output.
cmnty_race_csv = 'cmnty_race.csv'
cmnty_race_df.to_csv(cmnty_race_csv, index=False)
cmnty_race_df

Unnamed: 0,community,white_cnt,white_pct,black_cnt,black_pct,native_cnt,native_pct,asian_cnt,asian_pct,other_cnt,other_pct,two_or_more_cnt,two_or_more_pct,hispanic_cnt,hispanic_pct,of_color_cnt,of_color_pct,URL
0,calhoun-isles,27911,86.1%,1591,4.9%,suppressed,,1080,3.3%,416,1.3%,1309,4.0%,1166,3.6%,4512,13.9%,http://www.mncompass.org/profiles/communities/...
1,camden,13565,44.1%,10677,34.7%,343,1.1%,3500,11.4%,691,2.2%,1953,6.4%,2360,7.7%,17164,55.9%,http://www.mncompass.org/profiles/communities/...
2,central,23384,66.2%,7133,20.2%,405,1.1%,2410,6.8%,410,1.2%,1564,4.4%,1645,4.7%,11928,33.8%,http://www.mncompass.org/profiles/communities/...
3,longfellow,20399,70.3%,5521,19.0%,290,1.0%,815,2.8%,901,3.1%,1072,3.7%,2083,7.2%,8613,29.7%,http://www.mncompass.org/profiles/communities/...
4,near-north,8264,23.0%,17163,47.9%,629,1.8%,5493,15.3%,1864,5.2%,2448,6.8%,3784,10.5%,27604,77.0%,http://www.mncompass.org/profiles/communities/...
5,nokomis,30885,79.6%,3507,9.0%,629,1.6%,776,2.0%,1214,3.1%,1779,4.6%,2160,5.6%,7905,20.4%,http://www.mncompass.org/profiles/communities/...
6,northeast,28051,74.9%,3921,10.5%,674,1.8%,1035,2.8%,1999,5.3%,1793,4.8%,3989,10.6%,9422,25.1%,http://www.mncompass.org/profiles/communities/...
7,phillips,6310,28.6%,9606,43.5%,1255,5.7%,757,3.4%,3400,15.4%,750,3.4%,5709,25.9%,15768,71.4%,http://www.mncompass.org/profiles/communities/...
8,powderhorn,33738,59.0%,11305,19.8%,787,1.4%,1387,2.4%,7056,12.3%,2876,5.0%,13036,22.8%,23438,41.0%,http://www.mncompass.org/profiles/communities/...
9,southwest,42747,86.0%,2289,4.6%,suppressed,,1650,3.3%,959,1.9%,1842,3.7%,2021,4.1%,6986,14.0%,http://www.mncompass.org/profiles/communities/...
