# 1. First stage

## Scrape wikipedia page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [16]:
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py): started
  Building wheel for wikipedia (setup.py): finished with status 'done'
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11689 sha256=27b9c36c16d69b933c6747c35feb859ecb0f7ef2a9bd684b89be52640c1f25c2
  Stored in directory: c:\users\rafal\appdata\local\pip\cache\wheels\15\93\6d\5b2c68b8a64c7a7a04947b4ed6d89fb557dcc6bc27d1d7f3ba
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [17]:
import pandas as pd
import wikipedia as wp

html = wp.page("List_of_postal_codes_of_Canada:_M").html().encode("UTF-8")

canada_pc_df = pd.read_html(html)[0]
canada_pc_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Create dataframe with thre columns: PostalCode, Borought, Neighborhood

In [18]:
df = canada_pc_df.rename(columns={'Postal Code':'PostalCode'})
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Ignore cells with a borough that is Not assigned.

In [20]:
df = df[canada_pc_df.Neighbourhood != 'Not assigned']
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the  neighborhoods separated with a comma as shown in row 11 in the above table.

In [21]:
df_group = df.groupby(['PostalCode', 'Borough'], sort = False).agg( ','.join)
df_final = df_group.reset_index()
df_final.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [22]:
indexNum = df_final[df_final['Borough'] == 'Not assigned'].index
df_final.drop(indexNum, inplace = True)
df_final.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [23]:
print(df_final.shape)

(103, 3)


# 2. Second stage

## Get the latitude and the longitude coordinates of each neighborhood

In [24]:
!pip install geocoder



## Tried to use geocoder without any success

In [34]:
#import geocoder # import geocoder

#lat_lng_coords = None

#while(lat_lng_coords is None):
#    g = geocoder.google('{}, Toronto, Ontario'.format(df_final.PostalCode))
#    lat_lng_coords = g.latlng
      
        
#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]
        
     

## I got geo coordinates from a file

In [35]:
geo_tor_df = pd.read_csv('http://cocl.us/Geospatial_data')
geo_tor_df.rename(columns={'Postal Code':'PostalCode'},inplace=True)
geo_tor_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [39]:
df_final_geo = pd.merge(geo_tor_df, df_final, on='PostalCode')[['PostalCode','Borough','Neighbourhood','Latitude','Longitude']]
df_final_geo.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# 3.Third stage

## Explore and cluster the neighborhoods in Toronto

In [40]:
!pip install folium
!pip install geopy

Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
Collecting branca>=0.3.0
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0


In [103]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json
from geopy.geocoders import Nominatim 
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium 

In [42]:
address = 'Toronto, TO'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geo coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.65238435, -79.38356765.


## Create map of Toronto using latitude and longitude values

In [45]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

df_final_geo_center = df_final_geo[df_final_geo['Borough'].str.contains("Toronto")].reset_index()

# add markers to map
for lat, lng, borough, neighbourhood in zip(df_final_geo_center['Latitude'], df_final_geo_center['Longitude'], df_final_geo_center['Borough'], df_final_geo_center['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Use foursquare API to check the places in Toronto

In [55]:
radius = 500
df_final_geo_center.loc[0, 'Neighbourhood']
LIMIT = 30
VERSION = 20201214
CLIENT_ID = 'TNAFJUD1UPMN1ZIQOEZY3HP5QE2I3PWTXURFWXU5PQ4PUSAB'
CLIENT_SECRET = '4311VIHETAV2MROP14AU1WCJUTGDCY4AMSKOS1W2JWJEFLFV'

neighbourhood_latitude = df_final_geo_center.loc[0, 'Latitude'] 
neighbourhood_longitude = df_final_geo_center.loc[0, 'Longitude']
neighbourhood_name = df_final_geo_center.loc[0, 'Neighbourhood']

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)

url

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


'https://api.foursquare.com/v2/venues/explore?&client_id=TNAFJUD1UPMN1ZIQOEZY3HP5QE2I3PWTXURFWXU5PQ4PUSAB&client_secret=4311VIHETAV2MROP14AU1WCJUTGDCY4AMSKOS1W2JWJEFLFV&v=20201214&ll=43.67635739999999,-79.2930312&radius=500&limit=30'

In [56]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5fd6a3d878c4fb260aaf6a47'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'labe

In [None]:
#Create a function that extracts the category of the venue

In [57]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

## Extract nearby venues in Toronto

In [65]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

toronto_venues = getNearbyVenues(names=df_final_geo_center['Neighbourhood'],
                                   latitudes=df_final_geo_center['Latitude'],
                                   longitudes=df_final_geo_center['Longitude']
                                  )

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West, Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High 

In [66]:
print(toronto_venues.shape)
toronto_venues.head()

(851, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [67]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 196 uniques categories.


## Group all the venues by Neighbourhood

In [68]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()

In [69]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [70]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Beer Bar,Cocktail Bar,Seafood Restaurant,Farmers Market,Park,Restaurant,Basketball Stadium,Breakfast Spot,Jazz Club,Liquor Store
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Nightclub,Coffee Shop,Pet Store,Bar,Italian Restaurant,Bakery,Convenience Store,Climbing Gym
2,"Business reply mail Processing Centre, South C...",Garden,Fast Food Restaurant,Skate Park,Brewery,Light Rail Station,Burrito Place,Restaurant,Recording Studio,Butcher,Farmers Market
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry,Rental Car Location,Bar,Harbor / Marina
4,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Yoga Studio,Comic Shop,Sandwich Place,Dessert Shop,Seafood Restaurant,Bubble Tea Shop,Ramen Restaurant


## Cluster Neighborhood
## Using K-means

In [89]:
kclusters = 5
toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
kmeans.labels_[0:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [90]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = df_final_geo_center
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
toronto_merged.head()

Unnamed: 0,index,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Neighborhood,Trail,Pub,Health Food Store,Yoga Studio,Coworking Space,Distribution Center,Discount Store,Diner,Dessert Shop
1,41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1,Greek Restaurant,Ice Cream Shop,Italian Restaurant,Yoga Studio,Pizza Place,Bookstore,Brewery,Bubble Tea Shop,Restaurant,Pub
2,42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,1,Park,Coffee Shop,Pizza Place,Brewery,Restaurant,Italian Restaurant,Burrito Place,Ice Cream Shop,Liquor Store,Pub
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Coffee Shop,Café,American Restaurant,Bakery,Yoga Studio,Pet Store,Brewery,Italian Restaurant,Bookstore,Seafood Restaurant
4,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Park,Swim School,Bus Line,Dumpling Restaurant,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop


In [92]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

toronto_merge_no_na = toronto_merged.dropna()
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merge_no_na['Latitude'], toronto_merge_no_na['Longitude'], toronto_merge_no_na['Neighbourhood'], toronto_merge_no_na['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [104]:
cl_num = 2
toronto_merged.loc[toronto_merged['Cluster Labels'] == cl_num, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,M5N,-79.416936,2,Garden,Music Venue,Yoga Studio,Creperie,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store
