# PART 1

Importing required Libraries

In [1]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.io.html import read_html

Using pandas to scrape the table from the Wikipedia page

In [2]:
# Read every HTML table with the "wikitable" class from the Wikipedia page.
# read_html returns a list of DataFrames, so `table` is a list here.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
table = pd.read_html(url, attrs={'class': 'wikitable'})

In [3]:
table[0].head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


In [4]:
table[0].shape

(287, 3)

In [5]:
table[0].Postcode.value_counts()

M8Y    8
M9V    8
M5V    7
M8Z    5
M4V    5
      ..
M2Y    1
M6X    1
M6Z    1
M1W    1
M4G    1
Name: Postcode, Length: 180, dtype: int64

In [6]:
df=table[0]

In [7]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


Grouping postcodes and their corresponding neighbourhoods

In [8]:
# Collect the neighbourhoods of each (Postcode, Borough) pair into one Python list;
# the result is a Series indexed by that pair.
postcode_grp=df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(list)

In [9]:
# Wrap the grouped Series in a one-column DataFrame (the index is still Postcode/Borough).
postcode_grp_df = postcode_grp.to_frame()

In [10]:
# Turn the Postcode/Borough index levels back into ordinary columns.
# The frame is rebuilt from the grouped Series rather than reset in place, so
# re-running this cell always yields the same three columns (an in-place reset
# would push the previous RangeIndex into an extra "index" column on a second run).
postcode_grp_df = postcode_grp.reset_index()

In [11]:
postcode_grp_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,[Not assigned]
1,M1B,Scarborough,"[Rouge, Malvern]"
2,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
3,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
4,M1G,Scarborough,[Woburn]
...,...,...,...
175,M9V,Etobicoke,"[Albion Gardens, Beaumond Heights, Humbergate,..."
176,M9W,Etobicoke,[Northwest]
177,M9X,Not assigned,[Not assigned]
178,M9Y,Not assigned,[Not assigned]


In [12]:
# Join each list of neighbourhoods into one comma-separated string.
# Only list values are joined: a value that is already a string (cell re-run)
# or NaN is passed through unchanged, because joining a plain string would
# insert a comma between each of its characters.
postcode_grp_df['Neighbourhood'] = postcode_grp_df['Neighbourhood'].apply(
    lambda names: ','.join(names) if isinstance(names, list) else names
)

In [13]:
postcode_grp_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M1B,Scarborough,"Rouge,Malvern"
2,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
3,M1E,Scarborough,"Guildwood,Morningside,West Hill"
4,M1G,Scarborough,Woburn
...,...,...,...
175,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."
176,M9W,Etobicoke,Northwest
177,M9X,Not assigned,Not assigned
178,M9Y,Not assigned,Not assigned


Converting 'Not assigned' to NaN and dropping rows whose Borough is NaN

In [14]:
# Treat the literal placeholder 'Not assigned' as a missing value in every column.
postcode_grp_df = postcode_grp_df.replace('Not assigned', np.nan)

In [16]:
postcode_grp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
Postcode         180 non-null object
Borough          103 non-null object
Neighbourhood    102 non-null object
dtypes: object(3)
memory usage: 4.3+ KB


In [17]:
# Drop every row whose Borough is NaN (postcodes that were 'Not assigned').
# This modifies postcode_grp_df in place; the original row labels are kept.
postcode_grp_df.dropna(axis=0,subset=['Borough'], inplace=True)

Filling NaN Neighbourhoods with the Borough of the same row

In [18]:
postcode_grp_df[postcode_grp_df.isnull().any(axis=1)]

Unnamed: 0,Postcode,Borough,Neighbourhood
160,M9A,Queen's Park,


In [19]:
 # Disabled attempt, kept for reference. fillna with a Series aligns on row labels,
 # and df is the raw scraped table, so its labels do not correspond to the rows of
 # the grouped frame; the fill below would take the Borough from an unrelated row.
 # postcode_grp_df['Neighbourhood'].fillna(df['Borough'],inplace=True)

In [20]:
postcode_grp_df['Neighbourhood'][160]

nan

In [23]:
postcode_grp_df['Neighbourhood']=postcode_grp_df['Neighbourhood'].fillna(value=postcode_grp_df['Borough'])

In [24]:
postcode_grp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 1 to 176
Data columns (total 3 columns):
Postcode         103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
dtypes: object(3)
memory usage: 8.2+ KB


In [25]:
postcode_grp_df.loc[160,:]

Postcode                  M9A
Borough          Queen's Park
Neighbourhood    Queen's Park
Name: 160, dtype: object

# PART 2

Importing the Geospatial Co-ordinates csv

In [31]:
geo=pd.read_csv('Geospatial_Coordinates.csv')

In [39]:
# Rename the key column so it matches postcode_grp_df and the two frames can be
# merged on 'Postcode'.
geo = geo.rename(columns={'Postal Code': 'Postcode'})

In [40]:
geo

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


Joining the Postcode Dataframe to the co-ordinates dataframe 

In [41]:
# Attach latitude/longitude to each postcode. The default inner join keeps only
# the postcodes present in both frames.
# NOTE: this rebinds `table`, which until now held the list returned by read_html.
table = postcode_grp_df.merge(geo, on='Postcode')

In [42]:
table

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437


# PART 3

In [64]:
from geopy.geocoders import Nominatim
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
try:
    # Public location of json_normalize in current pandas releases.
    from pandas import json_normalize
except ImportError:
    # Older pandas releases only expose it under pandas.io.json.
    from pandas.io.json import json_normalize


Exploring Toronto's Neighbourhoods

In [73]:
# Keep only the rows whose borough name contains "Toronto"
# (e.g. "East Toronto", "Central Toronto").
is_toronto_borough = table['Borough'].str.contains("Toronto")
toronto_data = table[is_toronto_borough]
toronto_data

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


In [44]:
address = 'Toronto'

# Look up the coordinates used to centre the map.
geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [45]:
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

In [48]:
# Add one blue circle marker per row of `table`, with a popup reading
# "<neighbourhoods>, <borough>".
for lat, lng, borough, neighbourhood in zip(table['Latitude'], table['Longitude'], table['Borough'], table['Neighbourhood']):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    # parse_html belongs to Popup only; it is not a CircleMarker option.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_Toronto)
map_Toronto

In [49]:
# Foursquare credentials are read from the environment so they are never stored
# in the notebook source or echoed into its saved output.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Report only whether the credentials are present, never their values.
if CLIENT_ID and CLIENT_SECRET:
    print('Foursquare credentials loaded from the environment.')
else:
    print('WARNING: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
          'before calling the Foursquare API.')


Explore other Neighborhoods in Toronto

In [76]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000, limit=None):
    """Query the Foursquare "explore" endpoint around each given location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated together with ``zip``; each triple is a neighbourhood name
        and the coordinates to search around.
    radius : int, default 1000
        Sent to the API as the ``radius`` query parameter.
    limit : int, optional
        Sent to the API as the ``limit`` query parameter. When omitted, a
        module-level ``LIMIT`` is used if one is defined, otherwise 100.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    if limit is None:
        # Stay compatible with sessions that configured a global LIMIT.
        limit = globals().get('LIMIT', 100)

    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status surfaces API errors instead of a later KeyError.
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may carry no category at all
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows.
    return pd.DataFrame(rows, columns=columns)

In [78]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

The Beaches
The Danforth West,Riverdale
The Beaches West,India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Roselawn
Forest Hill North,Forest Hill West
The Annex,North Midtown,Yorkville
Harbord,University of Toronto
Chinatown,Grange Park,Kensington Market
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Christie
Dovercourt Village,Dufferin
Little Portugal,Trinity
Brockton,Exhibition Place,Parkdale Village
High Park,The Junction South
Parkdale,Roncesvalles
Runnymede

In [79]:
toronto_venues.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
2,The Beaches,43.676357,-79.293031,Tori's Bakeshop,43.672114,-79.290331,Vegetarian / Vegan Restaurant
3,The Beaches,43.676357,-79.293031,The Fox Theatre,43.672801,-79.287272,Indie Movie Theater
4,The Beaches,43.676357,-79.293031,Beaches Bake Shop,43.680363,-79.289692,Bakery


In [96]:
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",100,100,100,100,100,100
Berczy Park,100,100,100,100,100,100
"Brockton,Exhibition Place,Parkdale Village",100,100,100,100,100,100
Business Reply Mail Processing Centre 969 Eastern,47,47,47,47,47,47
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",15,15,15,15,15,15
"Cabbagetown,St. James Town",36,36,36,36,36,36
Central Bay Street,100,100,100,100,100,100
"Chinatown,Grange Park,Kensington Market",100,100,100,100,100,100
Christie,100,100,100,100,100,100
Church and Wellesley,100,100,100,100,100,100


In [104]:
# One-hot encode the venue category: one indicator column per category, named
# exactly after the category (empty prefix and separator).
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# Put the neighbourhood name first so the rows can be grouped by it.
toronto_onehot.insert(0, 'Neighbourhood', toronto_venues['Neighbourhood'])
toronto_onehot.shape

(3186, 277)

In [106]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Yoga Studio,Zoo
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,...,0.0,0.01,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton,Exhibition Place,Parkdale Village",0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.066667,0.066667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [107]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first entry of ``row`` is skipped. The remaining entries are sorted in
    descending order and the leading index labels are returned as a NumPy array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [108]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in range(num_top_venues):
    # ranks 1-3 take 'st'/'nd'/'rd'; every later rank takes 'th'
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# rank the venue categories of every neighbourhood, then build the frame once
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
neighbourhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=toronto_grouped.index)
neighbourhoods_venues_sorted.insert(0, 'Neighbourhood', toronto_grouped['Neighbourhood'])

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Café,Hotel,Coffee Shop,Restaurant,Theater,Sushi Restaurant,Pub,Beer Bar,Bar,Bakery
1,Berczy Park,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Beer Bar,Bakery,BBQ Joint,Park,Cocktail Bar
2,"Brockton,Exhibition Place,Parkdale Village",Café,Coffee Shop,Bar,Bakery,Furniture / Home Store,Restaurant,Gift Shop,Tibetan Restaurant,Park,Soccer Stadium
3,Business Reply Mail Processing Centre 969 Eastern,Coffee Shop,Park,Pizza Place,Brewery,Fast Food Restaurant,Sushi Restaurant,Italian Restaurant,Pet Store,Snack Place,Electronics Store
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Coffee Shop,Café,Harbor / Marina,Dance Studio,Garden,Dog Run,Sculpture Garden,Park,Track,Scenic Lookout


Making clusters

In [109]:
# set number of clusters
kclusters = 5

# features only: drop the name column (keyword form, because recent pandas
# releases no longer accept the axis as a positional argument of drop)
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering; n_init is set explicitly so the number of restarts
# does not depend on the installed scikit-learn version's default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 2, 1, 2, 4, 2, 2, 1, 1, 2])

In [None]:
# add clustering labels; a column left behind by a previous run is dropped first,
# because DataFrame.insert raises when the column already exists
neighbourhoods_venues_sorted = neighbourhoods_venues_sorted.drop(columns='Cluster_Labels', errors='ignore')
neighbourhoods_venues_sorted.insert(0, 'Cluster_Labels', kmeans.labels_)

# add the cluster label and top venues to toronto_data, which carries the
# latitude/longitude of each neighbourhood (left join: rows of toronto_data
# without a match get NaN in the added columns)
toronto_merged = toronto_data.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')



In [121]:
toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,Coffee Shop,Pub,Pizza Place,Beach,Japanese Restaurant,Breakfast Spot,Park,Caribbean Restaurant,Sandwich Place,Café
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,2,Greek Restaurant,Café,Coffee Shop,Pub,Fast Food Restaurant,Italian Restaurant,Ice Cream Shop,Grocery Store,Yoga Studio,Spa
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,2,Indian Restaurant,Coffee Shop,Café,Beach,Sandwich Place,Brewery,Italian Restaurant,Park,Bakery,Butcher
43,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Coffee Shop,Bar,Café,Brewery,Italian Restaurant,Vietnamese Restaurant,American Restaurant,Bakery,French Restaurant,Diner
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Café,Coffee Shop,Pharmacy,Bookstore,Gym / Fitness Center,College Quad,College Gym,Park,Trail,Dumpling Restaurant


In [122]:
neighbourhoods_venues_sorted.head()

Unnamed: 0,Cluster_Labels,Cluster Label,Cluster Labels,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,2,2,2,"Adelaide,King,Richmond",Café,Hotel,Coffee Shop,Restaurant,Theater,Sushi Restaurant,Pub,Beer Bar,Bar,Bakery
1,2,2,2,Berczy Park,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Beer Bar,Bakery,BBQ Joint,Park,Cocktail Bar
2,1,1,1,"Brockton,Exhibition Place,Parkdale Village",Café,Coffee Shop,Bar,Bakery,Furniture / Home Store,Restaurant,Gift Shop,Tibetan Restaurant,Park,Soccer Stadium
3,2,2,2,Business Reply Mail Processing Centre 969 Eastern,Coffee Shop,Park,Pizza Place,Brewery,Fast Food Restaurant,Sushi Restaurant,Italian Restaurant,Pet Store,Snack Place,Electronics Store
4,4,4,4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Coffee Shop,Café,Harbor / Marina,Dance Studio,Garden,Dog Run,Sculpture Garden,Park,Track,Scenic Lookout


In [123]:
address = 'Toronto'

# Same lookup as for the first map; the user agent now names this project.
geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [124]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# one colour per cluster, evenly spaced along the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# rows without a cluster label (NaN after the join) cannot be coloured; skip them
clustered = toronto_merged.dropna(subset=['Cluster_Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighbourhood'], clustered['Cluster_Labels'].astype(int)):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels run from 0 to kclusters-1, so they index the palette directly
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters