# The data
First, let's just gather all the data we'll need, then work from there

### The wiki data
Let's get our list of neighborhoods and coordinates

In [1]:
import os
import re
from urllib.request import urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# wiki page with list of Pittsburgh neighborhoods
wiki = "https://en.wikipedia.org/wiki/List_of_Pittsburgh_neighborhoods"

# open the page and create soup object
# (the context manager closes the HTTP response as soon as the page has been parsed)
with urlopen(wiki) as page:
    soup = BeautifulSoup(page, 'html.parser')

# find our list of neighborhoods
neighborhoods_div = soup.find('div', attrs={"class": "div-col columns column-width"})
if neighborhoods_div is None:
    # fail with a readable message instead of an AttributeError on None further down
    raise RuntimeError("could not find the neighborhood list on {}; the page layout may have changed".format(wiki))

# get all the "li" tags, and get the names and urls.
# Collect plain records first and build the dataframe once: growing a frame row by row
# copies it on every iteration, and DataFrame.append no longer exists in pandas 2.x
lis = neighborhoods_div.find_all('li')
records = []
for li in lis:
    a = li.find('a')
    if a is None or not a.attrs.get("href"):
        # list entry without a link: there is no page to read coordinates from
        continue
    text = a.text.strip()
    url = "https://en.wikipedia.org" + a.attrs.get("href")
    records.append({"Neighborhood": text, "wiki": url})

# create the dataframe we'll store our data in; latitude/longitude start out empty (NaN).
# dtype=object keeps every column the same dtype, as the row-by-row version produced
df = pd.DataFrame(records, columns=["Neighborhood", "wiki", "latitude", "longitude"], dtype=object)

    
df.head()

Unnamed: 0,Neighborhood,wiki,latitude,longitude
0,Allegheny Center,https://en.wikipedia.org/wiki/Allegheny_Center...,,
1,Allegheny West,https://en.wikipedia.org/wiki/Allegheny_West_(...,,
2,Allentown,https://en.wikipedia.org/wiki/Allentown_(Pitts...,,
3,Arlington,https://en.wikipedia.org/wiki/Arlington_(Pitts...,,
4,Arlington Heights,https://en.wikipedia.org/wiki/Arlington_Height...,,


In [3]:
# method to transform lat and long coordinates from degrees/minutes/seconds to decimal format
def _dms_to_decimal(coordinate, default_hemisphere):
    """Convert one degrees/minutes/seconds string to signed decimal degrees, rounded to 6 places."""
    # numeric components in order: degrees, then (optionally) minutes and seconds
    parts = [float(x) for x in re.findall(r"\d+(?:\.\d+)?", coordinate)[0:3]]
    if not parts:
        raise ValueError("no numeric component in coordinate: {!r}".format(coordinate))
    # some neighborhoods only provide degrees and minutes; missing components count as zero
    parts += [0.0] * (3 - len(parts))

    # trailing hemisphere letter decides the sign; fall back to the caller's default when absent
    match = re.search(r"([NSEW])\s*$", coordinate.upper())
    hemisphere = match.group(1) if match else default_hemisphere
    sign = -1 if hemisphere in ("S", "W") else 1

    # the math part
    return sign * round(parts[0] + (parts[1]/60) + (parts[2]/3600), 6)


def toDegrees(lat, long):
    """Convert a latitude/longitude pair from DMS text to decimal degrees.

    Parameters
    ----------
    lat : str
        Latitude such as "40°27′11″N" (seconds may be omitted).
    long : str
        Longitude such as "80°0′18″W" (seconds may be omitted).

    Returns
    -------
    tuple of float
        (latitude, longitude), each rounded to 6 decimal places. Southern
        latitudes and western longitudes are negative. A coordinate with no
        hemisphere letter is treated as N (latitude) or W (longitude).

    Raises
    ------
    ValueError
        If either string contains no numeric component.

    Each coordinate is parsed on its own, so one value lacking seconds does
    not discard the seconds of the other.
    """
    return _dms_to_decimal(lat, "N"), _dms_to_decimal(long, "W")

In [4]:
for index, row in df.iterrows():
    # get the latitude and longitude from each wikipedia page
    # (the context manager closes the HTTP response once the page has been parsed)
    with urlopen(row["wiki"]) as location_page:
        # name the parser explicitly so the result does not depend on which parsers are installed
        location_soup = BeautifulSoup(location_page, 'html.parser')
    lat = location_soup.find("span", attrs={"class":"latitude"}).text.strip()
    long = location_soup.find("span", attrs={"class":"longitude"}).text.strip()
    
    # convert to decimal and write through the frame itself:
    # the row yielded by iterrows() may be a copy, so assigning to it is not guaranteed to update df
    df.at[index, "latitude"], df.at[index, "longitude"] = toDegrees(lat, long)

In [10]:
# there are some neighborhoods that have smaller neighborhoods within them. They therefore have matching latitude and longitude
# let's drop the duplicates
# and we won't need the wiki links anymore, so let's drop those too just to clean up our data frame
# (rebinding instead of inplace=True, plus errors="ignore", keeps this cell safe to re-run
#  after the wiki column is already gone)
df = (
    df
    .drop_duplicates(subset=["latitude", "longitude"])
    .drop(columns="wiki", errors="ignore")
)

In [11]:
# preview the frame after de-duplicating on coordinates and dropping the wiki column
df.head()

Unnamed: 0,Neighborhood,latitude,longitude
0,Allegheny Center,40.4531,-80.005
1,Allegheny West,40.4521,-80.0158
2,Allentown,40.4211,-79.9939
3,Arlington,40.415,-79.97
5,Banksville,40.4119,-80.0389


### Now add the foursquare data
We'll get the venues for each of these neighborhoods using Foursquare's API

In [12]:
# API credentials
# read from the environment so the keys are never saved in (or shared with) the notebook:
# set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE: the key pair that used to be hard-coded in this cell should be treated as leaked and regenerated
client_id = os.environ["FOURSQUARE_CLIENT_ID"]
client_secret = os.environ["FOURSQUARE_CLIENT_SECRET"]
# sent as the "v" (API version date) parameter of every request
version = "20180605"
# sent as the "limit" parameter: maximum number of venues requested per neighborhood
limit = 100

In [13]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=30):
    """Query Foursquare's venues/explore endpoint once per neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighborhood names and the coordinates to search around.
    radius : int, default 500
        Value sent as the request's "radius" parameter.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns Neighborhood, Neighborhood Latitude,
        Neighborhood Longitude, Venue, Venue Latitude, Venue Longitude and
        Venue Category. The frame has these columns even when no venue was
        found. Venue Category is None for a venue without any category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Uses the module-level client_id, client_secret, version and limit.
    """
    columns = ['Neighborhood', 
               'Neighborhood Latitude', 
               'Neighborhood Longitude', 
               'Venue', 
               'Venue Latitude', 
               'Venue Longitude', 
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name) # because this method takes a while to run, this will help us see the progress made during runtime

        # make the GET request; requests builds and escapes the query string from params
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': client_id,
                'client_secret': client_secret,
                'v': version,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=timeout)
        # surface HTTP errors here rather than as a KeyError on the JSON below
        response.raise_for_status()

        groups = response.json()["response"]['groups']
        results = groups[0]['items'] if groups else []

        # keep only relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name, 
                lat, 
                lng, 
                venue['name'], 
                venue['location']['lat'], 
                venue['location']['lng'],  
                # a venue may come back without any category
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor keeps the schema when there are no rows at all
    return pd.DataFrame(venue_rows, columns=columns)

In [14]:
# look up the venues around every neighborhood's coordinates
venues_df = getNearbyVenues(
    names=df["Neighborhood"],
    latitudes=df["latitude"],
    longitudes=df["longitude"],
)
venues_df.head()

Allegheny Center
Allegheny West
Allentown
Arlington
Banksville
Bedford Dwellings
Beechview
Beltzhoover
Bloomfield
Bluff
Bon Air
Brighton Heights
Brookline
California-Kirkbride
Carrick
Central Business District
Chinatown
Cultural District
Central Lawrenceville
Central Northside
Mexican War Streets
Central Oakland
Chartiers
Chateau
Crafton Heights
Duquesne Heights
East Allegheny
East Carnegie
East Hills
East Liberty
Elliott
Esplen
Fairywood
Fineview
Friendship
Garfield
Glen Hazel
Greenfield
Four Mile Run
Hays
Hazelwood
Highland Park
Homewood North
Knoxville
Larimer
Lincoln–Lemington–Belmar
Lincoln Place
Lower Lawrenceville
Manchester
Marshall-Shadeland
Brunot Island
Morningside
Mount Oliver
Mount Washington
Chatham Village
New Homestead
North Point Breeze
North Shore
Northview Heights
Oakwood
Overbrook
Perry North
Perry South
Point Breeze
Polish Hill
Regent Square
Ridgemont
Saint Clair
Shadyside
Sheraden
Panther Hollow
Southshore
Station Square
South Side Flats
SouthSide Works
South Side

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Allegheny Center,40.453056,-80.005,Children's Museum of Pittsburgh,40.452793,-80.006569,Museum
1,Allegheny Center,40.453056,-80.005,Federal Galley,40.451605,-80.006045,Comfort Food Restaurant
2,Allegheny Center,40.453056,-80.005,Park House,40.453284,-80.001504,Bar
3,Allegheny Center,40.453056,-80.005,El Burro,40.45586,-80.006689,Mexican Restaurant
4,Allegheny Center,40.453056,-80.005,Bistro To Go,40.45345,-80.000995,Deli / Bodega


In [17]:
# (rows, columns) of the venues table
venues_df.shape

(1407, 7)

### Create dummy variables, then restrict to top 5 venue types

In [20]:
# one-hot encode the venue categories; the empty prefix names each column after its category
onehot = pd.get_dummies(
    venues_df[["Venue Category"]],
    prefix="",
    prefix_sep="",
)

# put the neighborhood names back in as the first column
onehot.insert(0, "Neighborhood", venues_df["Neighborhood"])

# average the indicator columns within each neighborhood
df = onehot.groupby(by="Neighborhood").mean()

df.head()

Unnamed: 0_level_0,American Restaurant,Antique Shop,Arcade,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,...,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Water Park,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Allegheny Center,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.0
Allegheny West,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0
Allentown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Arlington,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Banksville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
