# Canada neiborhood task

## Step 1: Read the table from web and convert it to dataframe

In [9]:
## 1.1 load data from page and reformat
import pandas as pd
url = r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
tables = pd.read_html(url) # Returns list of all tables on page
neiborh_table = tables[0]
header=neiborh_table.iloc[0]
neiborh_table = neiborh_table[1:]
neiborh_table.columns=header
neiborh_table = neiborh_table[neiborh_table.Borough!= "Not assigned"] ## remove rows with not assigned Borough
nn = neiborh_table[neiborh_table.Neighbourhood == "Not assigned"]["Borough"] ## get values for not assigned nbh
neiborh_table.loc[ neiborh_table.Neighbourhood == "Not assigned", "Neighbourhood" ] = nn ## assgin value to nbh
neiborh_tableg=neiborh_table.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x:  ', '.join(x)).reset_index(name='Neighbourhood')
neiborh_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor


In [88]:
##1.2 check the data
neiborh_tableg.shape

(103, 3)

## Step 2: add geographical location

In [10]:
## 2.1 export the data
import pandas as pd
url='https://cocl.us/Geospatial_data'
data_gs = pd.read_csv(url,sep=",")  
data_gs = data_gs.rename(columns={'Postal Code': 'Postcode'})
data_gs.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
## 2.2 check the result 
neiborh_gs = pd.merge(neiborh_tableg, data_gs, on='Postcode')
neiborh_gs.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [None]:
## install folium 
!conda install -c conda-forge folium=0.5.0 --yes

In [None]:
## 2.3 create toronto folium map with labels
import folium
# create map of toronto using latitude and longitude values
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neiborh_gs['Latitude'], neiborh_gs['Longitude'], neiborh_gs['Borough'], neiborh_gs['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
# create map of New York using latitude and longitude values
    
map_toronto

## Step 3: make a foresquare request and extract the data

In [26]:
### 3.1 set credentials
CLIENT_ID = '1S55TLM4HKPAUAX2APKQSAARFGXHBOPQL412PDV4HBFCTU55' # Foursquare ID
CLIENT_SECRET = 'HSCKQUGRN5QDZVYZXG405G1UDW1WTVH45QMCMIYVRQYZXJ5O' #Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 1S55TLM4HKPAUAX2APKQSAARFGXHBOPQL412PDV4HBFCTU55
CLIENT_SECRET:HSCKQUGRN5QDZVYZXG405G1UDW1WTVH45QMCMIYVRQYZXJ5O


In [27]:
## 3.2 make function to extract data from the result
def getNearbyVenues(names, latitudes, longitudes, radius=500,LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [25]:
import requests
## 3.3 extract information
toronto_venues = getNearbyVenues(names=neiborh_gs['Neighbourhood'], 
                                    latitudes=neiborh_gs['Latitude'],
                                   longitudes=neiborh_gs['Longitude'] #neiborh_gs
                                  )

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, 

In [45]:
## 3.4 count number of venues for each neibborhood
toronto_grouped=toronto_venues.groupby('Neighborhood').count().reset_index('Neighborhood')
neiborh_table.head()
toronto_grouped

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Adelaide, King, Richmond",100,100,100,100,100,100
1,Agincourt,4,4,4,4,4,4
2,"Agincourt North, L'Amoreaux East, Milliken, St...",3,3,3,3,3,3
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",9,9,9,9,9,9
4,"Alderwood, Long Branch",10,10,10,10,10,10
5,"Bathurst Manor, Downsview North, Wilson Heights",19,19,19,19,19,19
6,Bayview Village,4,4,4,4,4,4
7,"Bedford Park, Lawrence Manor East",25,25,25,25,25,25
8,Berczy Park,58,58,58,58,58,58
9,"Birch Cliff, Cliffside West",4,4,4,4,4,4


In [32]:
## 3.5 reformat the table
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
### 3.6  create list of top venues

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

import numpy as np
num_top_venues = 4

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted.head()
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
0,"Adelaide, King, Richmond",Venue Category,Venue Longitude,Venue Latitude,Venue
1,Agincourt,Venue Category,Venue Longitude,Venue Latitude,Venue
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Venue Category,Venue Longitude,Venue Latitude,Venue
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Venue Category,Venue Longitude,Venue Latitude,Venue
4,"Alderwood, Long Branch",Venue Category,Venue Longitude,Venue Latitude,Venue


In [48]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 3, 3, 3, 3, 0, 3, 0, 4, 3])

In [49]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = neiborh_gs
toronto_merged = toronto_merged.rename(columns={'Neighbourhood': 'Neighborhood'})
#merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_merged.head() # check the last columns

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,3.0,Venue Category,Venue Longitude,Venue Latitude,Venue
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,3.0,Venue Category,Venue Longitude,Venue Latitude,Venue
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,3.0,Venue Category,Venue Longitude,Venue Latitude,Venue
3,M1G,Scarborough,Woburn,43.770992,-79.216917,3.0,Venue Category,Venue Longitude,Venue Latitude,Venue
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,3.0,Venue Category,Venue Longitude,Venue Latitude,Venue


In [None]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        #color=rainbow[cluster-1],
        fill=True,## look at the cluster
        #fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [50]:
## look at the cluster
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
13,Scarborough,0.0,Venue Category,Venue Longitude,Venue Latitude,Venue
15,Scarborough,0.0,Venue Category,Venue Longitude,Venue Latitude,Venue
27,North York,0.0,Venue Category,Venue Longitude,Venue Latitude,Venue
28,North York,0.0,Venue Category,Venue Longitude,Venue Latitude,Venue
35,East York,0.0,Venue Category,Venue Longitude,Venue Latitude,Venue
39,East York,0.0,Venue Category,Venue Longitude,Venue Latitude,Venue
42,East Toronto,0.0,Venue Category,Venue Longitude,Venue Latitude,Venue
46,Central Toronto,0.0,Venue Category,Venue Longitude,Venue Latitude,Venue
49,Central Toronto,0.0,Venue Category,Venue Longitude,Venue Latitude,Venue
62,North York,0.0,Venue Category,Venue Longitude,Venue Latitude,Venue
