# Capstone Project

## Watts Dietrich

Welcome! This is my notebook for the capstone course of the IBM Data Science program. 

In [1]:
import pandas as pd
import numpy as np
import requests # library to handle requests
import random # library for random number generation
import fnmatch # regular expressions
from  more_itertools import unique_everseen

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Folium installed')
print('Libraries imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Folium installed
Libraries imported.


# Importing neighborhood data

In [84]:
# Import data
neighborhoods = pd.read_csv(r"C:\Users\Watts\Desktop\Data Science\Capstone\Project\Portland_Neighborhoods.csv")
neighborhoods.drop(["Unnamed: 4"], axis=1, inplace=True)
neighborhoods

Unnamed: 0,Neighborhood,Latitude,Longitude,Income
0,Alameda,45.548200,-122.630700,105242
1,Arbor Lodge,45.573540,-122.692470,71234
2,Ardenwald-Johnson Creek,45.455400,-122.629800,88349
3,Argay,45.554750,-122.521140,47028
4,Arlington Heights,45.521100,-122.710450,107676
5,Arnold Creek,45.441340,-122.699830,124401
6,Ashcreek,45.456090,-122.737180,93686
7,Beaumont-Wilshire,45.550300,-122.622400,103577
8,Boise,45.550790,-122.671220,70682
9,Brentwood-Darlington,45.468630,-122.597150,54202


In [28]:
neighborhoods.shape

(95, 3)

# Map of neighborhoods

In [29]:
# Get geo coordinates of Toronto

address = 'Portland, OR'
geolocator = Nominatim(user_agent="portland_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Portland are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Portland are 45.5202471, -122.6741949.


In [33]:
# create map of Portland
map_portland = folium.Map(location=[latitude, longitude], zoom_start=11)

# add neighborhood markers to map
for lat, lng, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_portland)  
    
map_portland

# Retrieval and analysis of Foursquare data

In [34]:
# Define function for retrieving venue information
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [38]:
#Obtain venue data
portland_venues = getNearbyVenues(names=neighborhoods['Neighborhood'],
                                   latitudes=neighborhoods['Latitude'],
                                   longitudes=neighborhoods['Longitude']
                                  )

Alameda
Arbor Lodge
Ardenwald-Johnson Creek
Argay
Arlington Heights
Arnold Creek
Ashcreek
Beaumont-Wilshire
Boise
Brentwood-Darlington
Bridgeton
Bridlemile
Brooklyn
Buckman
Cathedral Park
Centennial
Collins View
Concordia
Creston-Kenilworth
Crestwood
Cully
Downtown
East Columbia
Eastmoreland
Eliot
Far Southwest
Forest Park
Foster-Powell
Glenfair
Goose Hollow
Grant Park
Hayden Island
Hayhurst
Hazelwood
Healy Heights
Hillsdale
Hillside
Hollywood
Homestead
Hosford-Abernethy
Humboldt
Irvington
Kenton
Kerns
King
Laurelhurst
Lents
Linnton
Lloyd District
Madison South
Maplewood
Markham
Marshall Park
Mill Park
Montavilla
Mt. Scott Arleta
Mt. Tabor
Multnomah
North Tabor
Northwest District
Northwest Heights
Northwest Industrial
Old Town Chinatown
Overlook
Parkrose
Parkrose Heights
Pearl District
Piedmont
Pleasant Valley
Portsmouth
Powelhurst-Gilbert
Reed
Richmond
Rose City Park
Roseway
Russel
Sabin
Sellwood-Moreland
South Burlingame
South Portland
South Tabor
Southwest Hills
St. Johns
Sullivan's

In [56]:
print(portland_venues.shape)
portland_venues.head()
portland_venues.to_csv(r'C:\Users\Watts\Desktop\Data Science\Capstone\Project\venues.csv', index = False)

(1252, 7)


In [40]:
# group venue data by neighborhood
portland_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alameda,3,3,3,3,3,3
Arbor Lodge,14,14,14,14,14,14
Ardenwald-Johnson Creek,5,5,5,5,5,5
Argay,4,4,4,4,4,4
Arlington Heights,14,14,14,14,14,14
Arnold Creek,1,1,1,1,1,1
Ashcreek,3,3,3,3,3,3
Beaumont-Wilshire,14,14,14,14,14,14
Boise,30,30,30,30,30,30
Brentwood-Darlington,4,4,4,4,4,4


In [41]:
print('There are {} uniques categories.'.format(len(portland_venues['Venue Category'].unique())))

There are 258 uniques categories.


In [42]:
# one hot encoding
portland_onehot = pd.get_dummies(portland_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
portland_onehot['Neighborhood'] = portland_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [portland_onehot.columns[-1]] + list(portland_onehot.columns[:-1])
portland_onehot = portland_onehot[fixed_columns]

portland_onehot.head()

Unnamed: 0,Zoo Exhibit,ATM,Accessories Store,African Restaurant,American Restaurant,Amphitheater,Antique Shop,Arcade,Art Gallery,Arts & Crafts Store,...,Video Game Store,Video Store,Vietnamese Restaurant,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
portland_onehot.shape

(1252, 258)

In [45]:
# Group data by neighborhood
portland_grouped = portland_onehot.groupby('Neighborhood').mean().reset_index()
portland_grouped

Unnamed: 0,Neighborhood,Zoo Exhibit,ATM,Accessories Store,African Restaurant,American Restaurant,Amphitheater,Antique Shop,Arcade,Art Gallery,...,Video Game Store,Video Store,Vietnamese Restaurant,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Alameda,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
1,Arbor Lodge,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.071429,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
2,Ardenwald-Johnson Creek,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
3,Argay,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
4,Arlington Heights,0.00,0.000000,0.000000,0.00,0.000000,0.071429,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
5,Arnold Creek,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
6,Ashcreek,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
7,Beaumont-Wilshire,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
8,Boise,0.00,0.000000,0.000000,0.00,0.033333,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
9,Brentwood-Darlington,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000


In [97]:
#Create dataframe specific to Thai Restaurants
thai_data = portland_grouped[["Neighborhood", "Thai Restaurant"]]
thai_data.head()

Unnamed: 0,Neighborhood,Thai Restaurant
0,Alameda,0.0
1,Arbor Lodge,0.0
2,Ardenwald-Johnson Creek,0.0
3,Argay,0.25
4,Arlington Heights,0.0


# Clustering of neighborhoods

In [61]:
# set number of clusters
kclusters = 3

thai_data_clustering = thai_data.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(thai_data_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 1, 0, 0, 0, 2, 2, 0])

In [None]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

portland_merged = thai_data.copy() #
portland_merged["Cluster"] = kmeans.labels_

In [93]:
# add relevant data to cluster dataframe
portland_merged["Latitude"] = neighborhoods["Latitude"]
portland_merged["Longitude"] = neighborhoods["Longitude"]
portland_merged["Income"] = neighborhoods["Income"]
portland_merged

Unnamed: 0,Neighborhood,Thai Restaurant,Cluster,Latitude,Longitude,Income
0,Alameda,0.000000,0,45.548200,-122.630700,105242
1,Arbor Lodge,0.000000,0,45.573540,-122.692470,71234
2,Ardenwald-Johnson Creek,0.000000,0,45.455400,-122.629800,88349
3,Argay,0.250000,1,45.554750,-122.521140,47028
4,Arlington Heights,0.000000,0,45.521100,-122.710450,107676
5,Arnold Creek,0.000000,0,45.441340,-122.699830,124401
6,Ashcreek,0.000000,0,45.456090,-122.737180,93686
7,Beaumont-Wilshire,0.071429,2,45.550300,-122.622400,103577
8,Boise,0.066667,2,45.550790,-122.671220,70682
9,Brentwood-Darlington,0.000000,0,45.468630,-122.597150,54202


# Map of clustered neighborhoods

In [71]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(portland_merged['Latitude'], portland_merged['Longitude'], portland_merged['Neighborhood'], portland_merged['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# Cross-referencing cluster results with median household income

In [98]:
# Generate list of neighborhoods that have no thai restaurants, ordered by median income

recommendations = portland_merged.loc[portland_merged['Cluster'] == 0].copy()
recommendations.drop(["Thai Restaurant", "Cluster", "Latitude", "Longitude"], axis=1, inplace=True)
recommendations.sort_values(by=['Income'], ascending=False)

Unnamed: 0,Neighborhood,Income
87,Wilkes,161886
61,Parkrose,135612
5,Arnold Creek,124401
81,Sunderland,123356
53,Mt. Scott Arleta,119629
37,Hosford-Abernethy,117896
46,Lloyd District,108654
4,Arlington Heights,107676
0,Alameda,105242
16,Concordia,105158
