## Business case
John and Anne are moving to Toronto. John works mainly from home and traveling to the office only one day per week. Anne has not found a job yet and she is a trained teacher. After moving to Toronto she would start looking for work. They are also planing to have children. Therefore, they decided to purchase a house in a neighborhood where there is a high density of schools. This would increase the chance that Anne would find a job near the house so she can minimize traveling. Then when they have children there is a high probability that they can find a school near the house. 

In [66]:
#Import the necessary libraries
import numpy as np
import requests # library to handle requests
from pandas.io.json import json_normalize 
from pandas import json_normalize 
import pandas as pd 
import folium 
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [101]:
#Load the foursquare credentials from file
f = open("keys", "r")
CLIENT_ID=f.read(49)
CLIENT_SECRET=f.read(49) 

VERSION = '20200825' # Foursquare API version

#print('Your credentails:')
#print('CLIENT_ID: ' + CLIENT_ID)
#print('CLIENT_SECRET:' + CLIENT_SECRET)

In [68]:
#Get a list postcodes from Wikipedia 
wiki_url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
toronto_wiki  = pd.read_html(wiki_url)
toronto_df=toronto_wiki[0]
toronto_df.head(5)

Unnamed: 0,Postal Code,District,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [102]:
#Get a list of schools and their locations using foursquare
schools="4bf58dd8d48988d13b941735"
preschool="52e81612bcbc57f1066b7a45"
version='20200823'
location='Toronto,ON,Canada'
radius='40000'
limit='1000'
url='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&near={}&limit={}&categoryId={}'.format(CLIENT_ID, CLIENT_SECRET,version,location,limit,schools)
print(url)
foursquare_respo=requests.get(url).json()["response"]


https://api.foursquare.com/v2/venues/explore?&client_id=5IOZ0WK1G13QTMKG0GH0IOTHBQOHH4MPKZMQBIK0PJQPY505
&client_secret=4UQUSIBM520YLUYPAUABP3URJG1FJPREFKVFLF3A0WGVDM35
&v=20200823&near=Toronto,ON,Canada&limit=1000&categoryId=4bf58dd8d48988d13b941735


In [104]:
#Extracte the school information from the Json resutls
venues = foursquare_respo['groups'][0]['items']
nearby_venues = json_normalize(venues)
#nearby_venues.keys()
column_names = ['Name', 'Latitude', 'Longitude','PostalCode'] 
schools = pd.DataFrame(columns=column_names)
schools
for name, lat, lng, pc in zip(nearby_venues.get('venue.name'), 
                          nearby_venues.get('venue.location.lat'),
                          nearby_venues.get('venue.location.lng'),
                          nearby_venues.get('venue.location.postalCode')
                          ):
    if len(str(pc).split(" ")) == 2:
        schools=schools.append({'Name':name,
                            'Latitude':lat,
                            'Longitude':lng,
                            'PostalCode':str(pc).split(" ")[0]
                             
                           },ignore_index=True)
    #print(name,' lat=',lat, 'long=',lng)
print(schools.size)
schools.head(3)

208


Unnamed: 0,Name,Latitude,Longitude,PostalCode
0,SCHOOL Restaurant,43.637775,-79.424297,M6K
1,Nella Cucina,43.667122,-79.412022,M5R
2,ILSC,43.654743,-79.387793,M5G


In [105]:
#Count number of schools in each postcode 
schools_g=schools.filter(['PostalCode','Name'], axis=1)
schools_g=schools_g.groupby('PostalCode').count()
print(len(schools_g))
schools_g.tail(10).head(3)

36


Unnamed: 0_level_0,Name
PostalCode,Unnamed: 1_level_1
M6G,4
M6H,2
M6K,1


In [106]:
# set number of clusters
kclusters = 3

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(schools_g)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0], dtype=int32)

In [107]:
#Include lables
schools_g.insert(0, 'ClusterLabels', kmeans.labels_)
print(len(schools_g))
schools_g.tail(10).head(3)

36


Unnamed: 0_level_0,ClusterLabels,Name
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M6G,2,4
M6H,1,2
M6K,0,1


In [108]:
#Prepare the original data frame to include Latitude, Logitude and cluster lables
toronto_df['Latitude']=0
toronto_df['Longitude']=0
toronto_df['ClusterLabels']=-1
toronto_df.rename(columns={"Postal Code":"PostalCode"}, inplace=True)
toronto_df.head(3)

Unnamed: 0,PostalCode,District,Neighbourhood,ClusterLabels,Latitude,Longitude
26,M9C,Etobicoke,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",-1,0,0
32,M6E,York,Caledonia-Fairbanks,-1,0,0
40,M5G,Downtown Toronto,Central Bay Street,-1,0,0


In [109]:
#Drop the duplicate postal codes
schools.drop_duplicates(subset=['PostalCode'], inplace=True)
schools.set_index('PostalCode', inplace=True)
print(schools.size)
schools.head(3)

108


Unnamed: 0_level_0,Name,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M6K,SCHOOL Restaurant,43.637775,-79.424297
M5R,Nella Cucina,43.667122,-79.412022
M5G,ILSC,43.654743,-79.387793


In [110]:
#Transfrer the data from data frame used to evaluate the lables to the data frame used to create the map
for pcod in toronto_df.index:
    c_pcod=toronto_df['PostalCode'][pcod]
    if c_pcod in schools_g.index:
        toronto_df.loc[pcod, 'ClusterLabels'] = schools_g.loc[c_pcod].ClusterLabels
        if c_pcod in schools.index:
            toronto_df.loc[pcod, 'Latitude'] = schools.loc[c_pcod].Latitude
            toronto_df.loc[pcod, 'Longitude'] = schools.loc[c_pcod].Longitude
         #else:
        #    print('NA')

In [111]:
#Drop rows where there were not cluster labesl
#These are the postal codes where foursquare filed to find any schools
toronto_df.drop(toronto_df[toronto_df.ClusterLabels==-1].index, inplace=True)
print(toronto_df.size)
toronto_df.tail(100).head(3)


216


Unnamed: 0,PostalCode,District,Neighbourhood,ClusterLabels,Latitude,Longitude
26,M9C,Etobicoke,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",0,43.637892,-79.580519
32,M6E,York,Caledonia-Fairbanks,0,43.679217,-79.44173
40,M5G,Downtown Toronto,Central Bay Street,1,43.654743,-79.387793


In [116]:
# create map
cent_lati='43.637775'
cent_long='-79.424297'
map_clusters = folium.Map(location=[cent_lati, cent_long], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.brg(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#print(rainbow)
# add markers to the map
print("One schools -blue ")
print("Two schools - green ")
print("Four schools - Red ")
markers_colors = []
for lat, lon, poc, cluster in zip(toronto_df['Latitude'], 
                                  toronto_df['Longitude'], 
                                  toronto_df['PostalCode'],
                                  toronto_df['ClusterLabels']):
        label = folium.Popup('Postcode='+str(poc) + '|Cluster=' + str(cluster+1), parse_html=True)
        folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

One schools -blue 
Two schools - green 
Four schools - Red 
