### This notebook is used for capstone project for IBM Data Science Professional certificate

In [51]:
import pandas as pd  
import numpy as np
import requests 
from bs4 import BeautifulSoup
import pprint
import folium

import matplotlib.cm as cm
import matplotlib.colors as colors

Make a dictionary of all Toronto Boroughs and Neighborhoods with Postal Codes

In [5]:

#neighborhoods_dict = {'M5G': [['Downtown Toronto'],['Central', 'Bay Street']]}

neighborhoods_dict = {}

#I can't control wikipedia, so I downloaded the page and put in my github repo

with open('List of postal codes of Canada_ M - Wikipedia.html') as html_file:
    soup = BeautifulSoup(html_file, 'lxml')
    
match = soup.find('table', class_='wikitable sortable jquery-tablesorter')
match = match.find('tbody')

table_rows = match.find_all('tr')


#process the table founded in the wiki page and make a dictionary of following form:
#{'Code': [['Borough'],['NeighborhoodS',...']]}
for item in table_rows:
    table_data = item.find_all('td')
        
    for i, data in enumerate(table_data):
        table_data[i] = data.text.rstrip()
  
    #process the table_data to dict: {'Code': [['Borough'],['NeighborhoodS',...']]}
    code = table_data[0]
    borough = table_data[1]
    neighborhood = table_data[2]
    
    if borough == 'Not assigned':  #filter all entries where borough is undefined
        pass
    elif code not in neighborhoods_dict: 
        if neighborhood != 'Not assigned':
            neighborhoods_dict[code]=[[borough],[neighborhood]]
        else:
            neighborhoods_dict[code]=[[borough],[borough]]
    elif code in neighborhoods_dict:
            neighborhoods_dict[code][1].append(neighborhood)
    else:
        print('something wrong with data') #check if something inconsistent in data
   

#### Let's transform our dictionary to panda's dataframe


In [6]:
# define the dataframe columns
column_names = ['PostalCode','Borough','Neighborhood']

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns = column_names)

#Take a look at the empty dataframe to confirm that the columns are as intended.
neighborhoods


Unnamed: 0,PostalCode,Borough,Neighborhood


In [7]:
for k, v in neighborhoods_dict.items():
 
    borough = v[0][0]
    Neighborhood = v[1]
  
    neighborhoods = neighborhoods.append({'PostalCode': k, 'Borough': borough, 'Neighborhood': Neighborhood} , ignore_index=True)

neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,[Parkwoods]
1,M4A,North York,[Victoria Village]
2,M5A,Downtown Toronto,"[Harbourfront, Regent Park]"
3,M6A,North York,"[Lawrence Heights, Lawrence Manor]"
4,M7A,Queen's Park,[Queen's Park]


In [8]:
print(neighborhoods.shape)

(103, 3)


In [9]:
coord = pd.read_csv('Geospatial_Coordinates.csv')
coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
neighborhoods = neighborhoods.set_index('PostalCode').join(coord.set_index('Postal Code'))
neighborhoods.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,[Parkwoods],43.753259,-79.329656
M4A,North York,[Victoria Village],43.725882,-79.315572
M5A,Downtown Toronto,"[Harbourfront, Regent Park]",43.65426,-79.360636
M6A,North York,"[Lawrence Heights, Lawrence Manor]",43.718518,-79.464763
M7A,Queen's Park,[Queen's Park],43.662301,-79.389494


In [11]:
neighborhoods = neighborhoods.reset_index(level=None)
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,[Parkwoods],43.753259,-79.329656
1,M4A,North York,[Victoria Village],43.725882,-79.315572
2,M5A,Downtown Toronto,"[Harbourfront, Regent Park]",43.65426,-79.360636
3,M6A,North York,"[Lawrence Heights, Lawrence Manor]",43.718518,-79.464763
4,M7A,Queen's Park,[Queen's Park],43.662301,-79.389494


In [53]:
toronto_latitue = 43.653908
toronto_longitude = -79.384293
map_toronto = folium.Map(location=[toronto_latitue, toronto_longitude], zoom_start = 12)

for lat, long, borough, neib in zip(neighborhoods['Latitude'],neighborhoods['Longitude'],neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(borough, neib)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat, long],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color = '#3186cc',
    fill_opacity = 0.7).add_to(map_toronto)
    
    
map_toronto

In [54]:
CLIENT_ID = 'NONE' # your Foursquare ID
CLIENT_SECRET = 'NONE' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version


## Explore Neighborhoods in Downtown

In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    LIMIT = 100
    radius = 500
    
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    print('Get Nearby Venuest: DONE!')
    return(nearby_venues)

In [15]:
downtown_data = neighborhoods[neighborhoods['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
downtown_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"[Harbourfront, Regent Park]",43.65426,-79.360636
1,M5B,Downtown Toronto,"[Ryerson, Garden District]",43.657162,-79.378937
2,M5C,Downtown Toronto,[St. James Town],43.651494,-79.375418
3,M5E,Downtown Toronto,[Berczy Park],43.644771,-79.373306
4,M5G,Downtown Toronto,[Central Bay Street],43.657952,-79.387383


In [16]:
downtown_data.shape

(18, 5)

In [21]:
downtown_venues = getNearbyVenues(names=downtown_data['Neighborhood'],
                                  latitudes=downtown_data['Latitude'],
                                  longitudes=downtown_data['Longitude']
                                  )


Get Nearby Venuest: DONE!


In [26]:
print(downtown_venues.shape)
downtown_venues.head()

(1283, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"['Harbourfront', 'Regent Park']",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"['Harbourfront', 'Regent Park']",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"['Harbourfront', 'Regent Park']",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
3,"['Harbourfront', 'Regent Park']",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
4,"['Harbourfront', 'Regent Park']",43.65426,-79.360636,Cooper Koo YMCA,43.653191,-79.357947,Gym / Fitness Center


In [27]:
downtown_venues['Neighborhood'] = downtown_venues['Neighborhood'].astype(str)
downtown_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"['Adelaide', 'King', 'Richmond']",100,100,100,100,100,100
['Berczy Park'],53,53,53,53,53,53
"['CN Tower', 'Bathurst Quay', 'Island airport', 'Harbourfront West', 'King and Spadina', 'Railway Lands', 'South Niagara']",14,14,14,14,14,14
"['Cabbagetown', 'St. James Town']",48,48,48,48,48,48
['Central Bay Street'],82,82,82,82,82,82
"['Chinatown', 'Grange Park', 'Kensington Market']",100,100,100,100,100,100
['Christie'],16,16,16,16,16,16
['Church and Wellesley'],88,88,88,88,88,88
"['Commerce Court', 'Victoria Hotel']",100,100,100,100,100,100
"['Design Exchange', 'Toronto Dominion Centre']",100,100,100,100,100,100


In [28]:
#one hot encoding
downtown_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix = "", prefix_sep="")

downtown_onehot['Neighborhood'] = downtown_venues['Neighborhood']

fixed_columns = [downtown_onehot.columns[-1]] + list(downtown_onehot.columns[:-1])
fixed_columns.remove('Neighborhood')
fixed_columns.insert(0, 'Neighborhood')
downtown_onehot = downtown_onehot[fixed_columns]

downtown_onehot.head()


Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Theater,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store
0,"['Harbourfront', 'Regent Park']",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"['Harbourfront', 'Regent Park']",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"['Harbourfront', 'Regent Park']",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"['Harbourfront', 'Regent Park']",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"['Harbourfront', 'Regent Park']",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
downtown_onehot.shape

(1283, 204)

In [30]:
downtown_grouped = downtown_onehot.groupby('Neighborhood').mean().reset_index()
downtown_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Theater,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store
0,"['Adelaide', 'King', 'Richmond']",0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.01
1,['Berczy Park'],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"['CN Tower', 'Bathurst Quay', 'Island airport'...",0.0,0.0,0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"['Cabbagetown', 'St. James Town']",0.020833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,['Central Bay Street'],0.012195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.012195,0.0,0.0,0.012195,0.0
5,"['Chinatown', 'Grange Park', 'Kensington Market']",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.01,0.0,0.0,0.06,0.0,0.04,0.01,0.0
6,['Christie'],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,['Church and Wellesley'],0.011364,0.0,0.011364,0.011364,0.0,0.0,0.0,0.0,0.0,...,0.011364,0.0,0.0,0.0,0.0,0.011364,0.011364,0.011364,0.0,0.0
8,"['Commerce Court', 'Victoria Hotel']",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
9,"['Design Exchange', 'Toronto Dominion Centre']",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0


In [31]:
downtown_grouped.shape

(18, 204)

Let's print each neighborhood along with the top 5 most common venues

In [32]:
num_top_venues = 5

for hood in downtown_grouped['Neighborhood']:
    print('---'+hood+' ---')
    temp = downtown_grouped[downtown_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['Venue', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq':2})
    print(temp.sort_values('freq',ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---['Adelaide', 'King', 'Richmond'] ---
                 Venue  freq
0          Coffee Shop  0.07
1                 Café  0.06
2      Thai Restaurant  0.04
3           Steakhouse  0.04
4  American Restaurant  0.04


---['Berczy Park'] ---
                Venue  freq
0         Coffee Shop  0.09
1        Cocktail Bar  0.06
2            Beer Bar  0.04
3  Seafood Restaurant  0.04
4                Café  0.04


---['CN Tower', 'Bathurst Quay', 'Island airport', 'Harbourfront West', 'King and Spadina', 'Railway Lands', 'South Niagara'] ---
              Venue  freq
0  Airport Terminal  0.14
1   Airport Service  0.14
2    Airport Lounge  0.14
3     Boat or Ferry  0.07
4      Airport Gate  0.07


---['Cabbagetown', 'St. James Town'] ---
                Venue  freq
0         Coffee Shop  0.10
1          Restaurant  0.08
2  Chinese Restaurant  0.04
3              Bakery  0.04
4   Indian Restaurant  0.04


---['Central Bay Street'] ---
                Venue  freq
0         Coffee Shop  0.15
1     

In [45]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [46]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = downtown_grouped['Neighborhood']

for ind in np.arange(downtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"['Adelaide', 'King', 'Richmond']",Coffee Shop,Café,American Restaurant,Steakhouse,Thai Restaurant,Cosmetics Shop,Restaurant,Hotel,Bar,Gym
1,['Berczy Park'],Coffee Shop,Cocktail Bar,Bakery,Seafood Restaurant,Farmers Market,Cheese Shop,Café,Beer Bar,Steakhouse,Restaurant
2,"['CN Tower', 'Bathurst Quay', 'Island airport'...",Airport Terminal,Airport Lounge,Airport Service,Boat or Ferry,Harbor / Marina,Airport,Airport Food Court,Airport Gate,Boutique,Plane
3,"['Cabbagetown', 'St. James Town']",Coffee Shop,Restaurant,Café,Pub,Bakery,Pizza Place,Park,Chinese Restaurant,Indian Restaurant,Italian Restaurant
4,['Central Bay Street'],Coffee Shop,Café,Italian Restaurant,Sandwich Place,Japanese Restaurant,Bar,Bubble Tea Shop,Burger Joint,Falafel Restaurant,Spa
5,"['Chinatown', 'Grange Park', 'Kensington Market']",Café,Vegetarian / Vegan Restaurant,Chinese Restaurant,Bar,Vietnamese Restaurant,Bakery,Mexican Restaurant,Dumpling Restaurant,Coffee Shop,Caribbean Restaurant
6,['Christie'],Grocery Store,Café,Park,Italian Restaurant,Coffee Shop,Diner,Nightclub,Convenience Store,Restaurant,Baby Store
7,['Church and Wellesley'],Japanese Restaurant,Coffee Shop,Gay Bar,Sushi Restaurant,Burger Joint,Restaurant,Gastropub,Mediterranean Restaurant,Men's Store,Fast Food Restaurant
8,"['Commerce Court', 'Victoria Hotel']",Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Deli / Bodega,Gastropub,Steakhouse,Italian Restaurant,Seafood Restaurant
9,"['Design Exchange', 'Toronto Dominion Centre']",Coffee Shop,Hotel,Café,American Restaurant,Restaurant,Gastropub,Gym,Sports Bar,Deli / Bodega,Italian Restaurant


### Cluster Neighborhoods

In [33]:
# import k-means from clustering stage
from sklearn.cluster import KMeans


In [36]:

kclusters = 5

dowtown_grouped_clustering = downtown_grouped.drop('Neighborhood', 1) 

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dowtown_grouped_clustering)

kmeans.labels_[0:10]

array([0, 0, 2, 0, 0, 3, 4, 0, 0, 0])

In [48]:
downtown_data['Neighborhood'] = downtown_data['Neighborhood'].astype(str)

downtown_merged = downtown_data
downtown_merged['Cluster Labels'] = kmeans.labels_

downtown_merged = downtown_data.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
downtown_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"['Harbourfront', 'Regent Park']",43.65426,-79.360636,0,Coffee Shop,Bakery,Park,Café,Restaurant,Pub,Mexican Restaurant,Breakfast Spot,Theater,Health Food Store
1,M5B,Downtown Toronto,"['Ryerson', 'Garden District']",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Bar,Japanese Restaurant,Ramen Restaurant,Restaurant,Movie Theater,Sandwich Place
2,M5C,Downtown Toronto,['St. James Town'],43.651494,-79.375418,2,Coffee Shop,Café,Restaurant,Hotel,Clothing Store,Cocktail Bar,Bakery,Gastropub,Cosmetics Shop,Italian Restaurant
3,M5E,Downtown Toronto,['Berczy Park'],43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Bakery,Seafood Restaurant,Farmers Market,Cheese Shop,Café,Beer Bar,Steakhouse,Restaurant
4,M5G,Downtown Toronto,['Central Bay Street'],43.657952,-79.387383,0,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Japanese Restaurant,Bar,Bubble Tea Shop,Burger Joint,Falafel Restaurant,Spa


In [55]:

map_clusters = folium.Map(location=[toronto_latitue, toronto_longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []

for lat, lon, poi, cluster in zip(downtown_merged['Latitude'], downtown_merged['Longitude'], downtown_merged['Neighborhood'], downtown_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters