In [1]:
import numpy as np 
import pandas as pd 
from pandas.io.json import json_normalize 
import json 
import requests 
from bs4 import BeautifulSoup 

from sklearn.cluster import KMeans # clustering algorithm

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Collecting package metadata: done
Solving environment: \ 
  - anaconda::ca-certificates-2018.03.07-0, anaconda::openssl-1.0.2o-h26aff7b_0
  - anaconda::openssl-1.0.2o-h26aff7b_0, defaults::ca-certificates-2018.03.07-0
  - anaconda::ca-certificates-2018.03.07-0, defaults::openssl-1.0.2o-h26aff7b_0
  - defaults::ca-certificates-2018.03.07-0, defaults::openssl-1.0.2o-h26aff7bdone

## Package Plan ##

  environment location: /anaconda3

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-2.3.0               |        py36_1001         526 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    certifi-2018.4.16          |           py36_0         142 KB  conda-forge
    conda-4.6.2                |           py36_0         875 KB  conda-forge
    folium-0.5.0               |             py_0          45

In [2]:
url ='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
urlPage= requests.get(url).text

# using beautiful soup to parse the HTML/XML codes.
soup = BeautifulSoup(urlPage,'xml')


In [5]:
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

text_result = requests.get(link).text 
html_parsed_result = BeautifulSoup(text_result, 'html.parser') 

neighborhood_table = html_parsed_result.find('table', class_ = 'wikitable')
neighborhood_table_rows = neighborhood_table.find_all('tr')


neighborhood_info = []
for row in neighborhood_table_rows:
    info = row.text.split('\n')[1:-1] 
    neighborhood_info.append(info)
    
neighborhood_info[0:5]

[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village']]

In [6]:
neighborhood_info[0][-1] = 'Neighborhood'
neighborhood_df = pd.DataFrame(neighborhood_info[1:], columns=neighborhood_info[0])

neighborhood_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [7]:
not_assigned_boroughs = neighborhood_df.index[neighborhood_df['Borough'] == 'Not assigned']
not_assigned_neighborhoods = neighborhood_df.index[neighborhood_df['Neighborhood'] == 'Not assigned']

In [8]:
not_assigned_neighborhoods_and_borough = not_assigned_boroughs & not_assigned_neighborhoods
neighborhood_df.drop(neighborhood_df.index[not_assigned_boroughs], inplace=True)
neighborhood_df.reset_index(drop=True, inplace=True)

neighborhood_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [9]:
not_assigned_neighborhoods = neighborhood_df.index[neighborhood_df['Neighborhood'] == 'Not assigned'] # run this again because the indexes on the dataframe where reset

for idx in not_assigned_neighborhoods:
    neighborhood_df['Neighborhood'][idx] = neighborhood_df['Borough'][idx]
    
neighborhood_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [10]:
print('After cleaning the DataFrame, its new shape is {}'.format(neighborhood_df.shape),'\n')
print('There are:')
print('  {} Postal codes'.format(neighborhood_df['Postcode'].unique().shape[0]))
print('  {} Boroughs'.format(neighborhood_df['Borough'].unique().shape[0]))
print('  {} Neighborhoods'.format(neighborhood_df['Neighborhood'].unique().shape[0]))

After cleaning the DataFrame, its new shape is (212, 3) 

There are:
  103 Postal codes
  11 Boroughs
  210 Neighborhoods


In [11]:
group = neighborhood_df.groupby('Postcode')
grouped_neighborhoods = group['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
grouped_boroughs = group['Borough'].apply(lambda x: set(x).pop())
grouped_df = pd.DataFrame(list(zip(grouped_boroughs.index, grouped_boroughs, grouped_neighborhoods)))
grouped_df.columns = ['Postcode', 'Borough', 'Neighborhood']

grouped_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [12]:
coordinates_df = pd.read_csv('https://cocl.us/Geospatial_data') # transform the csv file into a dataframe

print('The coordinates dataframe shape is', coordinates_df.shape)
coordinates_df.head()

The coordinates dataframe shape is (103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
postcodes_with_coordinates_df = grouped_df.join(coordinates_df.set_index('Postal Code'), on='Postcode')

postcodes_with_coordinates_df.head(16)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [15]:
latitude = 43.806686
longitude = -79.194353
venues_map = folium.Map(location=[latitude,longitude], zoom_start=13)

# add a red circle marker to represent the Conrad Hotel
folium.features.CircleMarker(
    [latitude, longitude],
    radius=10,
    color='red',
    popup='Conrad Hotel',
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(venues_map)

# add the Italian restaurants as blue circle markers
for lat, lng,pcode in zip(postcodes_with_coordinates_df.Latitude, postcodes_with_coordinates_df.Longitude,postcodes_with_coordinates_df.Postcode):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='green',
        popup=pcode,
        fill = True,
        fill_color='red',
        fill_opacity=0.6
    ).add_to(venues_map)

# display map
venues_map

In [17]:
CLIENT_ID = 'YZRYNO1GETKTERVB5AI0EFMYZ3NEJJM5BVGWW11BCPG201P0' # your Foursquare ID
CLIENT_SECRET = 'UFGXPNYU1K0EWVJGVKG1H51UB454SXMWX1QU2O35HMHDVEV2' # your Foursquare Secret
VERSION = '20190101'
LIMIT = 30
print('My credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

My credentails:
CLIENT_ID: YZRYNO1GETKTERVB5AI0EFMYZ3NEJJM5BVGWW11BCPG201P0
CLIENT_SECRET:UFGXPNYU1K0EWVJGVKG1H51UB454SXMWX1QU2O35HMHDVEV2


In [18]:
def format_url(lat, lng, radius=500, limit=100):
    return 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, 
        lat, lng, VERSION, radius, limit)

In [19]:
first_postcode = postcodes_with_coordinates_df.iloc[2, :]
url = format_url(first_postcode['Latitude'], first_postcode['Longitude'])

url

'https://api.foursquare.com/v2/venues/explore?client_id=YZRYNO1GETKTERVB5AI0EFMYZ3NEJJM5BVGWW11BCPG201P0&client_secret=UFGXPNYU1K0EWVJGVKG1H51UB454SXMWX1QU2O35HMHDVEV2&ll=43.7635726,-79.1887115&v=20190101&radius=500&limit=100'

In [20]:
import json
from pandas.io.json import json_normalize
def get_venues(url):
    results = requests.get(url).json()
    venues = results['response']['groups'][0]['items']
    print(venues)
    return json_normalize(venues)

In [21]:
venues_df = get_venues(url)

venues_df.head()

[{'reasons': {'count': 0, 'items': [{'summary': 'This spot is popular', 'type': 'general', 'reasonName': 'globalInteractionReason'}]}, 'venue': {'id': '4b6074e3f964a5200fe729e3', 'name': 'Swiss Chalet Rotisserie & Grill', 'location': {'address': '4410 Kingston Rd', 'lat': 43.76769708292701, 'lng': -79.1899135003439, 'labeledLatLngs': [{'label': 'display', 'lat': 43.76769708292701, 'lng': -79.1899135003439}], 'distance': 469, 'postalCode': 'M1E 2N5', 'cc': 'CA', 'city': 'Scarborough', 'state': 'ON', 'country': 'Canada', 'formattedAddress': ['4410 Kingston Rd', 'Scarborough ON M1E 2N5', 'Canada']}, 'categories': [{'id': '4bf58dd8d48988d1ca941735', 'name': 'Pizza Place', 'pluralName': 'Pizza Places', 'shortName': 'Pizza', 'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/pizza_', 'suffix': '.png'}, 'primary': True}], 'photos': {'count': 0, 'groups': []}}, 'referralId': 'e-0-4b6074e3f964a5200fe729e3-0'}, {'reasons': {'count': 0, 'items': [{'summary': 'This spot is popular', '

Unnamed: 0,reasons.count,reasons.items,referralId,venue.categories,venue.id,venue.location.address,venue.location.cc,venue.location.city,venue.location.country,venue.location.crossStreet,venue.location.distance,venue.location.formattedAddress,venue.location.labeledLatLngs,venue.location.lat,venue.location.lng,venue.location.postalCode,venue.location.state,venue.name,venue.photos.count,venue.photos.groups
0,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4b6074e3f964a5200fe729e3-0,"[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",4b6074e3f964a5200fe729e3,4410 Kingston Rd,CA,Scarborough,Canada,,469,"[4410 Kingston Rd, Scarborough ON M1E 2N5, Can...","[{'label': 'display', 'lat': 43.76769708292701...",43.767697,-79.189914,M1E 2N5,ON,Swiss Chalet Rotisserie & Grill,0,[]
1,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4c62f34bde1b2d7fec89e370-1,"[{'id': '4bf58dd8d48988d122951735', 'name': 'E...",4c62f34bde1b2d7fec89e370,4371 kingston road,CA,Toronto,Canada,,298,"[4371 kingston road, Toronto ON M1E 2M9, Canada]","[{'label': 'display', 'lat': 43.765309, 'lng':...",43.765309,-79.191537,M1E 2M9,ON,G & G Electronics,0,[]
2,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-522deb21abdf65cfbab70655-2,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",522deb21abdf65cfbab70655,4379 Kingston Road,CA,Scarborough,Canada,,326,"[4379 Kingston Road, Scarborough ON, Canada]","[{'label': 'display', 'lat': 43.766, 'lng': -7...",43.766,-79.191,,ON,Marina Spa,0,[]
3,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-5411f741498e9ebd5e35d8bd-3,"[{'id': '4bf58dd8d48988d1c1941735', 'name': 'M...",5411f741498e9ebd5e35d8bd,4383 Kingston rd.,CA,Scarborough,Canada,,343,"[4383 Kingston rd., Scarborough ON, Canada]","[{'label': 'display', 'lat': 43.76629908447079...",43.766299,-79.19072,,ON,Big Bite Burrito,0,[]
4,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4c1c7f9bb306c9288f0464b7-4,"[{'id': '4bf58dd8d48988d1ef941735', 'name': 'R...",4c1c7f9bb306c9288f0464b7,4304 Kingston Rd,CA,Scarborough,Canada,,381,"[4304 Kingston Rd, Scarborough ON M1E 2M8, Can...","[{'label': 'display', 'lat': 43.7640757, 'lng'...",43.764076,-79.193406,M1E 2M8,ON,Enterprise Rent-A-Car,0,[]


In [22]:
def clean_df(df):
    relevant_columns = ['venue.categories', 'venue.location.lat', 'venue.location.lng', 'venue.name']
    clean_df = df.loc[:,relevant_columns]

    #rename columns
    clean_df.columns = ['Category', 'Lat', 'Lng', 'Name']
    return clean_df 

venues_df = clean_df(venues_df)
venues_df.head()

Unnamed: 0,Category,Lat,Lng,Name
0,"[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",43.767697,-79.189914,Swiss Chalet Rotisserie & Grill
1,"[{'id': '4bf58dd8d48988d122951735', 'name': 'E...",43.765309,-79.191537,G & G Electronics
2,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",43.766,-79.191,Marina Spa
3,"[{'id': '4bf58dd8d48988d1c1941735', 'name': 'M...",43.766299,-79.19072,Big Bite Burrito
4,"[{'id': '4bf58dd8d48988d1ef941735', 'name': 'R...",43.764076,-79.193406,Enterprise Rent-A-Car


In [23]:
def get_category_name(row):
    category_json = row['Category']
    try:
        return category_json[0]['name']
    except:        
        return None

venues_df['Category'] = venues_df.apply(get_category_name, axis=1)

venues_df

Unnamed: 0,Category,Lat,Lng,Name
0,Pizza Place,43.767697,-79.189914,Swiss Chalet Rotisserie & Grill
1,Electronics Store,43.765309,-79.191537,G & G Electronics
2,Spa,43.766,-79.191,Marina Spa
3,Mexican Restaurant,43.766299,-79.19072,Big Bite Burrito
4,Rental Car Location,43.764076,-79.193406,Enterprise Rent-A-Car
5,Medical Center,43.766631,-79.192286,Woburn Medical Centre
6,Breakfast Spot,43.7678,-79.190466,Eggsmart


In [27]:
!conda install -c conda-forge geopy --yes
from geopy.distance import great_circle


def closest_postcode(postcode, postcodes):
    postcode = np.asarray(postcode).reshape(1,-1)
    postcodes = np.asarray(postcodes)
    distances = [great_circle(postcode, point).meters for point in postcodes]
    closest_postcode_index = np.argmin(distances)
    return [round(distances[closest_postcode_index]), closest_postcode_index]

Collecting package metadata: done
Solving environment: - 
  - anaconda::ca-certificates-2018.03.07-0, anaconda::openssl-1.0.2o-h26aff7b_0
  - anaconda::openssl-1.0.2o-h26aff7b_0, defaults::ca-certificates-2018.03.07-0
  - anaconda::ca-certificates-2018.03.07-0, defaults::openssl-1.0.2o-h26aff7b_0
  - defaults::ca-certificates-2018.03.07-0, defaults::openssl-1.0.2o-h26aff7bdone

## Package Plan ##

  environment location: /anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    geopy-1.18.1               |             py_0          51 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          84 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geogr

In [28]:
coordinates = postcodes_with_coordinates_df[['Latitude', 'Longitude']]
import matplotlib.pyplot as plt

X = postcodes_with_coordinates_df[['Longitude', 'Latitude']]

distance, closest_point_index = closest_postcode(coordinates[:1], coordinates[~coordinates.index.isin([0])])
close_points = X[:1].append(X[closest_point_index+1:closest_point_index+2])
close_points

Unnamed: 0,Longitude,Latitude
0,-79.194353,43.806686
16,-79.205636,43.836125


In [29]:
from math import isnan

for lat, lng, idx in zip(postcodes_with_coordinates_df['Latitude'], postcodes_with_coordinates_df['Longitude'], postcodes_with_coordinates_df.index):
    distance, closest_point_index = closest_postcode([lat,lng], coordinates[~coordinates.index.isin([idx])])
    postcodes_with_coordinates_df.at[idx,'Distance'] = np.int64(distance//2) #use the half of the disstance to avoid overlapping
    
postcodes_with_coordinates_df.head(15)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Distance
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,1698.0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,1625.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1205.0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,913.0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,913.0
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,1301.0
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029,1112.0
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577,1052.0
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476,1112.0
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848,1297.0


In [30]:
def get_all_venues(postcodes, lat, lng, radius):
    
    venues_list=[]
    for postcode, lat, lng, radius in zip(postcodes, lat, lng, radius):
        url= format_url(lat, lng, radius)
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        venues_list.append([(
            postcode, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'], 
            v['venue']['categories'][0]['name'])
            for v in results])
    all_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    all_venues.columns = ['Postcode', 
                  'Postcode Latitude', 
                  'Postcode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude',
                  'Venue Category'
                  ]
    
    return all_venues

In [32]:
all_venues = get_all_venues(postcodes_with_coordinates_df['Postcode'], postcodes_with_coordinates_df['Latitude'], postcodes_with_coordinates_df['Longitude'], postcodes_with_coordinates_df['Distance'])

print('The total number of venues returned is ', all_venues.shape[0])

all_venues.head(10)

The total number of venues returned is  3314


Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,African Rainforest Pavilion,43.817725,-79.183433,Zoo Exhibit
1,M1B,43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
2,M1B,43.806686,-79.194353,Canadiana exhibit,43.817962,-79.193374,Zoo Exhibit
3,M1B,43.806686,-79.194353,penguin exhibit,43.819435,-79.185959,Zoo Exhibit
4,M1B,43.806686,-79.194353,Lion Exhibit,43.819228,-79.186977,Zoo Exhibit
5,M1B,43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant
6,M1B,43.806686,-79.194353,Gorilla Exhibit,43.81908,-79.184235,Zoo Exhibit
7,M1B,43.806686,-79.194353,Orangutan Exhibit,43.818413,-79.182548,Zoo Exhibit
8,M1B,43.806686,-79.194353,LCBO,43.796671,-79.204586,Liquor Store
9,M1B,43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant


In [33]:
postcodes_diff = np.setdiff1d(postcodes_with_coordinates_df['Postcode'].values,all_venues['Postcode'].unique())
postcodes_with_coordinates_df[postcodes_with_coordinates_df['Postcode']==postcodes_diff[0]]

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Distance
56,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,111.0
