# The data
First, let's just gather all the data we'll need, then work from there.

In [122]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
import requests

In [171]:
# wiki page with list of Pittsburgh neighborhoods
wiki = "https://en.wikipedia.org/wiki/List_of_Pittsburgh_neighborhoods"

# open the page and create soup object; the HTTP response is closed as soon
# as it has been parsed
with urlopen(wiki) as page:
    soup = BeautifulSoup(page, 'html.parser')

# find our list of neighborhoods
neighborhoods_div = soup.find('div', attrs={"class": "div-col columns column-width"})
if neighborhoods_div is None:
    # fail with a readable message instead of an AttributeError on None
    raise RuntimeError("could not find the neighborhood list on " + wiki)

# get all the "li" tags and collect the name and url of each neighborhood.
# Rows are gathered in a plain list and turned into a DataFrame once:
# DataFrame.append re-copied the whole frame on every call and was removed
# in pandas 2.0.
lis = neighborhoods_div.find_all('li')
records = []
for li in lis:
    a = li.find('a')
    if a is None or not a.get("href"):
        # a list item without a link has no page to read coordinates from
        continue
    text = a.text.strip()
    url = "https://en.wikipedia.org" + a.get("href")
    records.append({"Neighborhood": text, "wiki": url})

# create the dataframe we'll store our data in; latitude/longitude start out
# empty (NaN). dtype=object keeps every column as object dtype, the same
# dtype a frame grown from an empty DataFrame ends up with.
df = pd.DataFrame(
    records,
    columns=["Neighborhood", "wiki", "latitude", "longitude"],
    dtype=object,
)

df.head()

Unnamed: 0,Neighborhood,wiki,latitude,longitude
0,Allegheny Center,https://en.wikipedia.org/wiki/Allegheny_Center...,,
1,Allegheny West,https://en.wikipedia.org/wiki/Allegheny_West_(...,,
2,Allentown,https://en.wikipedia.org/wiki/Allentown_(Pitts...,,
3,Arlington,https://en.wikipedia.org/wiki/Arlington_(Pitts...,,
4,Arlington Heights,https://en.wikipedia.org/wiki/Arlington_Height...,,


In [210]:
# helpers to transform lat and long coordinates from degrees/minutes/seconds
# text to signed decimal degrees
def _dms_to_decimal(coord, default_sign):
    """Convert one degrees/minutes/seconds string to signed decimal degrees.

    coord        -- text such as "40°27′11″N" or "79°58′W"; minutes and
                    seconds are optional.
    default_sign -- +1 or -1, used only when coord carries no hemisphere
                    letter (N, S, E or W).

    Returns the value rounded to 6 decimal places; S and W are negative.
    Raises ValueError when the text has no numeric part, more than three
    numeric parts, or a part that is not a number.
    """
    # split on the degree/minute/second marks, whitespace and hemisphere
    # letters, then drop the empty strings the split leaves behind
    parts = [p for p in re.split("[\u00b0\u2032\u2033\\sNSEW]+", coord) if p]
    if not 1 <= len(parts) <= 3:
        raise ValueError("cannot parse coordinate: %r" % (coord,))

    hemisphere = re.search("[NSEW]", coord)
    if hemisphere is None:
        sign = default_sign
    else:
        sign = -1 if hemisphere.group() in "SW" else 1

    # degrees + minutes/60 + seconds/3600, using only the parts that exist
    magnitude = float(parts[0])
    for part, divisor in zip(parts[1:], (60, 3600)):
        magnitude += float(part) / divisor

    return sign * round(magnitude, 6)


def toDegrees(lat, long):
    """Convert a latitude/longitude pair from DMS text to decimal degrees.

    lat, long -- strings such as "40°27′11″N" and "80°00′18″W". Each one is
                 parsed on its own, so one may include seconds while the
                 other stops at minutes.

    Returns (lat_dec, long_dec), each rounded to 6 decimal places. The
    hemisphere letter sets the sign (S and W negative). Without a letter the
    latitude is taken as north and the longitude as west.
    Raises ValueError if either string cannot be parsed.
    """
    lat_dec = _dms_to_decimal(lat, 1)
    long_dec = _dms_to_decimal(long, -1)

    return lat_dec, long_dec

In [218]:
for index, row in df.iterrows():
    # get the latitude and longitude from each wikipedia page; name the
    # parser explicitly so the result does not depend on which parsers are
    # installed, and close the response once it has been parsed
    with urlopen(row["wiki"]) as location_page:
        location_soup = BeautifulSoup(location_page, 'html.parser')
    lat_span = location_soup.find("span", attrs={"class":"latitude"})
    long_span = location_soup.find("span", attrs={"class":"longitude"})
    if lat_span is None or long_span is None:
        # page has no coordinates: report it and leave this row empty rather
        # than aborting the whole run with an AttributeError on None
        print("No coordinates found for {}; leaving latitude/longitude empty".format(row["Neighborhood"]))
        continue
    lat = lat_span.text.strip()
    long = long_span.text.strip()

    # convert to decimal and write through the frame itself: the `row`
    # yielded by iterrows() may be a copy, so assigning to it is not
    # guaranteed to update df
    lat_dec, long_dec = toDegrees(lat, long)
    df.at[index, "latitude"] = lat_dec
    df.at[index, "longitude"] = long_dec

In [219]:
df.head()

Unnamed: 0,Neighborhood,wiki,latitude,longitude
0,Allegheny Center,https://en.wikipedia.org/wiki/Allegheny_Center...,40.4531,-80.005
1,Allegheny West,https://en.wikipedia.org/wiki/Allegheny_West_(...,40.4521,-80.0158
2,Allentown,https://en.wikipedia.org/wiki/Allentown_(Pitts...,40.4211,-79.9939
3,Arlington,https://en.wikipedia.org/wiki/Arlington_(Pitts...,40.415,-79.97
4,Arlington Heights,https://en.wikipedia.org/wiki/Arlington_Height...,40.415,-79.97


In [None]:
# TODO: merge rows whose latitude/longitude match (e.g. Arlington and Arlington Heights)