In [1]:
# Imports needed for the methods in this py file.
from bs4 import BeautifulSoup
import requests
import time

In [2]:
# Main method that does the entire web scraping and then saving of the content to a file
def _save_progress(path, cities, data):
    """Write one ``(city, metrics)`` tuple per line to ``path``, overwriting it."""
    with open(path, 'w') as f:
        for row in zip(cities, data):
            f.write(str(row) + "\n")


def main(url, loop=0):
    """Scrape every city listed on the index page and save the results to disk.

    Parameters
    ----------
    url : str
        Address of the index page that lists all cities.
    loop : int, optional
        Index into the city list at which to start scraping. Use a non-zero
        value to resume a run that was interrupted.

    Side effects
    ------------
    Prints one progress line per city, sleeps one second between city pages,
    writes a checkpoint file ``files/output<i>.py`` whenever the city index
    ``i`` is a positive multiple of 150, and writes ``files/output.py`` at the
    end. Each file holds every result collected so far in this run.
    """
    import os  # local import: only needed to make sure the output directory exists

    timeout = 60
    retry_delay = 5  # seconds to wait before re-requesting the index page
    # Had some blacklist issues so added in this retry loop to try and help
    while True:
        try:
            web_input = requests.get(url, stream=True, timeout=timeout)
            break  # If it downloads, get out and get on with the scraping
        except requests.exceptions.RequestException as e:
            print(e)
            # Back off before retrying instead of hitting the server in a tight loop
            time.sleep(retry_delay)
    soup = BeautifulSoup(web_input.content, 'html.parser')
    # Collect all cities from the website that we need to scrape
    all_cities = allcities(soup)
    # Results are collected starting at index `loop`, so they pair up with this slice
    # (for loop == 0 the slice is simply the whole list).
    scraped_cities = all_cities[loop:]
    data = list()
    # Create the output directory up front so a missing folder cannot lose a long scrape
    os.makedirs('files', exist_ok=True)
    # This is the main loop that goes through the city names and grabs the data from the individual web pages
    for i in range(loop, len(all_cities)):
        population = scrapernoscraping(all_cities[i])
        print(str(i) + ": " + str(all_cities[i]))
        time.sleep(1)
        data.append(population)
        # Save off files every 150 web pages due to some of the connection issues I was having
        if i > 0 and i % 150 == 0:
            _save_progress('files/output' + str(i) + '.py', scraped_cities, data)
    # Final file creation step that writes everything collected in this run
    _save_progress('files/output.py', scraped_cities, data)

In [3]:
# Method to collect all cities from the website that we need to scrape
def allcities(soup):
    """Return the city names found in the table rows of the index page.

    A row counts as a city row when the first entry of its ``class``
    attribute is ``rT``, ``rS`` or ``rB``. The name is the text of the row's
    first link, cut at the first comma.

    Rows without a ``class`` attribute, with an empty class list, or without
    a usable link are skipped instead of aborting the whole scrape.

    Parameters
    ----------
    soup : object
        Parsed index page; must provide ``find_all('tr')``.

    Returns
    -------
    list of str
        City names in page order.
    """
    city_row_classes = ('rT', 'rS', 'rB')
    all_cities = list()
    for row in soup.find_all('tr'):
        try:
            if row['class'][0] not in city_row_classes:
                continue
            link_text = str(row.select('a')[0].contents[0])
        except (KeyError, IndexError):
            # KeyError: no class attribute; IndexError: empty class list or no link/text
            continue
        all_cities.append(link_text.split(",")[0])
    return all_cities

In [4]:
# Method to be able to rerun main method with specific start parameter
def rerun(line):
    """Restart the Pennsylvania scrape, beginning at city index ``line``."""
    index_url = 'http://www.city-data.com/city/Pennsylvania.html'
    main(index_url, loop=line)

In [5]:
# Helper method to load in data file of city info
def loadindata(file):
    """Read ``file`` as text and return its lines as a list.

    Each entry keeps its trailing newline exactly as stored in the file.
    """
    with open(file, 'r') as handle:
        return handle.readlines()

In [6]:
# Helper methods for grabbing certain types of data from City-Data website, the comment after the return statement
# tells us what each method returns
def zipscrape(zips):
    """Return the first child of the first child of the third ``li`` in a breadcrumb.

    Every element of ``zips`` is tried in order and the last one that can be
    read wins. Elements that do not have the expected structure are skipped.

    NOTE(review): the original trailing comment said this returns zip codes,
    while the caller records the value as the county - confirm which it is.

    Parameters
    ----------
    zips : iterable
        Elements providing ``select('li')``.

    Returns
    -------
    str
        The extracted value, or ``''`` when nothing could be extracted
        (previously this case raised ``UnboundLocalError``).
    """
    tup9 = ''
    for crumb in zips:  # renamed loop variable so the builtin ``zip`` is not shadowed
        try:
            items = crumb.select('li')
            tup9 = items[2].contents[0].contents[0]
        except Exception:
            # Best effort: skip malformed breadcrumbs, but let KeyboardInterrupt through
            pass
    return tup9 #returns city zip codes

In [7]:
def citypopulation(all_sections_cp):
    """Extract the population text from the city-population section(s).

    For each section the second child is split on ``"."`` and the part
    before the first dot is kept. The last section that can be read wins.

    Parameters
    ----------
    all_sections_cp : iterable
        Elements exposing a ``contents`` sequence.

    Returns
    -------
    str
        The population text, or ``''`` when no section could be read
        (previously this case raised ``UnboundLocalError``).
    """
    tup1 = ''
    for sect in all_sections_cp:
        try:
            tup1 = str(sect.contents[1].split(".")[0]) #population
        except (KeyError, IndexError, AttributeError, TypeError):
            # Section does not have the expected layout; keep whatever we already have
            pass
    return tup1 #returns city population

In [8]:
def racebr(race):
    """Collect the racial breakdown entries from the given list sections.

    In each section the first child of the second ``li`` is taken as the
    container of the breakdown. Every second child of that container
    (indices 0, 2, 4, ...) is read as one entry, and the ``contents`` of its
    third and first children are stored as a tuple, in that order.

    Sections or entries that do not have this layout are skipped. Previously
    a section with fewer than two ``li`` elements raised ``IndexError``.

    Parameters
    ----------
    race : iterable
        Elements providing ``select('li')``.

    Returns
    -------
    list of tuple
        ``(third_child.contents, first_child.contents)`` for every entry read.
    """
    tup8 = list()
    for sect in race:
        try:
            # Look the container up once instead of re-running select() for every entry
            entries = sect.select('li')[1].contents[0].contents
        except (KeyError, AttributeError, IndexError):
            continue
        for entry in entries[::2]:
            try:
                tup8.append((entry.contents[2].contents, entry.contents[0].contents)) #racial breakdown
            except (KeyError, AttributeError, IndexError):
                pass
    return tup8 #returns city breakdown by race

In [9]:
def men_women(pbs):
    """Extract the male and female share from the population-by-sex section(s).

    The last child of the second ``td`` is read as the male value and the
    last child of the fourth ``td`` as the female value. The last section
    that can be read wins.

    Parameters
    ----------
    pbs : iterable
        Elements providing ``select('td')``.

    Returns
    -------
    tuple of (str, str)
        ``(male, female)``. A value that could not be read is ``''``
        (previously this case raised ``UnboundLocalError``).
    """
    tup4 = ''
    tup5 = ''
    for sect in pbs:
        try:
            cells = sect.select('td')
            tup4 = str(cells[1].contents[-1]) #% male in city
            tup5 = str(cells[3].contents[-1]) #% female in city
        except (KeyError, IndexError, AttributeError):
            # Section does not have the expected table cells; keep what we already have
            pass
    return tup4, tup5    #returns city % of men and women

In [10]:
def moneystuff(ms):
    """Extract income and house-value figures from the median-income section(s).

    Per section, in this order:

    * median household income - the stripped second child of the section;
    * per capita income - the second child of the second ``br``;
    * median house value - the fourth child of the second child of the
      fourth ``br``, falling back to the eleventh child of the second ``br``.

    Values are assigned as they are read, so a section that fails part-way
    still contributes the values read before the failure.

    Parameters
    ----------
    ms : iterable
        Elements exposing ``contents`` and ``select('br')``.

    Returns
    -------
    tuple of (str, str, str)
        ``(median household income, per capita income, median house value)``.
        A value that could not be read is ``''`` (previously this case raised
        ``UnboundLocalError``).
    """
    tup3 = ''
    tup6 = ''
    tup7 = ''
    for sect in ms:
        try:
            tup3 = str(sect.contents[1].strip()) #median-household-income
            tup6 = str(sect.select('br')[1].contents[1]) #per capita income
            try:
                tup7 = str(sect.select('br')[3].contents[1].contents[3]) #median house value
            except Exception:
                # Alternate page layout: the house value sits further along the second <br>
                tup7 = str(sect.select('br')[1].contents[10])
        except (KeyError, IndexError, AttributeError, TypeError):
            pass
    return tup3, tup6, tup7 #returns a tuple of median household income, per capita income, median house value

In [11]:
def medianage(mage):
    """Extract the median age text from the median-age section(s).

    The last child of the second ``td`` in each section is read; the last
    section that can be read wins.

    Parameters
    ----------
    mage : iterable
        Elements providing ``select('td')``.

    Returns
    -------
    str
        The median age text, or ``''`` when no section could be read
        (previously this case raised ``UnboundLocalError``).
    """
    tup2 = ''
    for sect in mage:
        try:
            tup2 = str(sect.select('td')[1].contents[-1]) #median-age info
        except (KeyError, IndexError, AttributeError):
            # Section does not have the expected table cells; keep what we already have
            pass
    return tup2 #returns city median age

In [12]:
# Method that does the individual web page scraping for all the data we were collecting
def scrapernoscraping(name):
    """Download one city's page and return its metrics as a ``' | '``-separated string.

    Parameters
    ----------
    name : str
        City name as it appears on the index page.

    Returns
    -------
    str
        ``population | median age | median household income | % of men |
        % of women | per capita income | median house value |
        racial breakdown | county``.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be downloaded; this function does not retry.
    """
    # Cities whose page address cannot be derived from the name by the general rule
    special_urls = {
        "O'Hara Township": "http://www.city-data.com/city/O-Hara-Township-Pennsylvania.html",
        'Penn State Erie (Behrend)': 'http://www.city-data.com/city/Penn-State-Erie-Behrend-Pennsylvania.html',
        'Tharptown (Uniontown)': 'http://www.city-data.com/city/Tharptown-Uniontown-Pennsylvania.html',
    }
    if name in special_urls:
        url = special_urls[name]
    else:
        # General rule: spaces in the city name become hyphens
        url = 'http://www.city-data.com/city/' + "-".join(name.split(" ")) + '-Pennsylvania.html'

    # Initialize all core variables for use with the Beautiful Soup object
    timeout = 60
    web_input = requests.get(url, stream=True, timeout=timeout)
    more_soup = BeautifulSoup(web_input.content, 'html.parser')
    all_sections_cp = more_soup.find_all('section', {'id': 'city-population'})
    all_section_ma = more_soup.find_all('section', {'class': 'median-age'})
    # These two filters were written as set literals ({'class', ...}); they are
    # attribute dicts like the other lookups.
    all_section_mhi = more_soup.find_all('section', {'class': 'median-income'})
    all_section_pbs = more_soup.find_all('section', {'class': 'population-by-sex'})
    all_ol_county = more_soup.find_all('ol', {'class': 'breadcrumb'})
    all_ul_race = more_soup.find_all('ul', {'class': 'list-group'})

    # Grab each data point that we are looking for from the website
    tup1 = citypopulation(all_sections_cp)
    tup2 = medianage(all_section_ma)
    tup3, tup6, tup7 = moneystuff(all_section_mhi)
    tup4, tup5 = men_women(all_section_pbs)
    tup8 = racebr(all_ul_race)
    tup9 = zipscrape(all_ol_county)
    # Structure is (population, median age, median household income, % of men, % of women, per capita income, median house value, racial breakdown, county)
    metrics = ' | '.join([tup1, tup2, tup3, tup4, tup5, tup6, tup7, str(tup8), tup9])
    return metrics #Altered the return to make things easier for me to parse and clean later

In [13]:
if __name__ == "__main__":
    # Full run: scrape every city listed on the Pennsylvania index page.
    main('http://www.city-data.com/city/Pennsylvania.html')
    # Resume an interrupted run from a given city index instead:
    # rerun(1651)
    # Debug a single city that gave trouble while scraping:
    # scrapernoscraping("Wyalusing")

0: Aaronsburg
1: Abbottstown
2: Ackermanville
3: Adamstown
4: Addison
5: Akron
6: Albion
7: Albrightsville
8: Alburtis


KeyboardInterrupt: 