# Importing Modules

In [None]:
!conda install -c conda-forge geopy --yes

In [None]:
!conda install -c conda-forge folium=0.5.0 --yes

In [1]:
import pandas as pd
import numpy as np
import json
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
from bs4 import BeautifulSoup



# Creating Dataframe

In [2]:
canada_data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [97]:
# Scraping the Data

parse = BeautifulSoup(canada_data, 'html.parser')
Postal_Codes = []
Boroughs = []
Neighborhoods = []

find_table = parse.find('table').find_all('tr')
for row in find_table:
    cells = row.find_all('td')
    if(len(cells) > 0):
        Postal_Codes.append(cells[0].text)
        Boroughs.append(cells[1].text)
        Neighborhoods.append(cells[2].text.rstrip('\n'))
        
toronto_data = pd.DataFrame({"Postal_Codes": Postal_Codes,
                           "Boroughs": Boroughs,
                           "Neighborhoods": Neighborhoods})

# Preprocessing Steps

toronto_data_dropna = toronto_data[toronto_data.Boroughs != "Not assigned"].reset_index(drop=True)

toronto_data_grouped = toronto_data_dropna.groupby(["Postal_Codes", "Boroughs"], as_index=False).agg(lambda x: ", ".join(x))

for index, row in toronto_data_grouped.iterrows():
    if row["Neighborhoods"] == "Not assigned":
        row["Neighborhoods"] = row["Boroughs"]
        
# Now that we're done, let's display the data!        

toronto_data_grouped.shape

(103, 3)

In [5]:
coordinates = pd.read_csv("Geospatial_Coordinates.csv")
coordinates.rename(columns={"Postal Code": "Postal_Codes"}, inplace=True)

coordinates.head()

Unnamed: 0,Postal_Codes,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [88]:
toronto_data_final = toronto_data_grouped.merge(coordinates, on="Postal_Codes", how="left")
toronto_data_final

(103, 5)

# Getting the latitude and longitude of Toronto from the Geopy library

In [7]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The coordinates of Toronto are: ', latitude, longitude)

The coordinates of Toronto are:  43.653963 -79.387207


# Mapping via  Folium

In [99]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(toronto_data_final['Latitude'], toronto_data_final['Longitude'], toronto_data_final['Boroughs'], toronto_data_final['Neighborhoods']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

In [9]:
CLIENT_ID = 'I1COYOEIRSOHTSPOLCEF5REJ0SF1RDSFVGXADZLTGXSJEAL0'
CLIENT_SECRET = 'CTQZW0ZRRJYIJOOILIYJZGHZE0HT4NAOWJYDNO3PB0GQGQRH'
VERSION = '20180605'

## Let's see the top 50 venues within a radius of 500 meters..

In [101]:
radius = 500
LIMIT = 10

venues = []

for lat, long, post, borough, neighborhood in zip(toronto_data_final['Latitude'], toronto_data_final['Longitude'], toronto_data_final['Postal_Codes'], toronto_data_final['Boroughs'], toronto_data_final['Neighborhoods']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))
print(len(venues))        

695


In [117]:
venues_data = pd.DataFrame(venues)
venues_data.columns = ['Postal_Codes', 'Boroughs', 'Neighborhoods', 'Borough_Latitude', 'Borough_Longitude', 'Venue_Name', 'Venue_Latitude', 'Venue_Longitude', 'Venue_Category']
venues_data.head()

Unnamed: 0,Postal_Codes,Boroughs,Neighborhoods,Borough_Latitude,Borough_Longitude,Venue_Name,Venue_Latitude,Venue_Longitude,Venue_Category
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


# Analysing individual areas

### One Hot Encoding

In [103]:
# One hot encoding
toronto_one_hot = pd.get_dummies(venues_data[['Venue_Category']], prefix="", prefix_sep="")

# add postal, borough and neighborhood column back to dataframe
toronto_one_hot['Postal_Codes'] = venues_df['Postal_Codes'] 
toronto_one_hot['Boroughs'] = venues_df['Boroughs'] 
toronto_one_hot['Neighborhoods'] = venues_df['Neighborhoods'] 

# Moving postal, borough and neighborhood column to the initial column
fixed_columns = list(toronto_one_hot.columns[-3:]) + list(toronto_one_hot.columns[:-3])
toronto_one_hot = toronto_one_hot[fixed_columns]

toronto_one_hot.shape

(695, 188)

### Grouping rows by neighbourhood..

In [104]:
toronto_one_hot_grouped = toronto_one_hot.groupby(["Postal_Codes", "Boroughs", "Neighborhoods"]).mean().reset_index()
toronto_one_hot_grouped.shape

(41, 188)

In [105]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

areaColumns = ['Postal_Codes', 'Boroughs', 'Neighborhoods']
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postal_Codes'] = toronto_one_hot_grouped['Postal_Codes']
neighborhoods_venues_sorted['Boroughs'] = toronto_one_hot_grouped['Boroughs']
neighborhoods_venues_sorted['Neighborhoods'] = toronto_one_hot_grouped['Neighborhoods']

for ind in np.arange(toronto_one_hot_grouped.shape[0]):
    row_categories = toronto_one_hot_grouped.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

neighborhoods_venues_sorted.shape

(41, 13)

# Applying k-means clustering on the data

In [121]:
kclusters = 5
toronto_one_hot_grouped_clustered = toronto_one_hot_grouped.drop(["Postal_Codes", "Boroughs", "Neighborhoods"], 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_one_hot_grouped_clustered)
len(kmeans.labels_)

41

In [122]:
toronto_merged_final = toronto_one_hot_grouped.copy()

toronto_merged_final["Cluster Labels"] = kmeans.labels_
toronto_merged_final["Latitude"] =venues_data["Venue_Latitude"]
toronto_merged_final["Longitude"] =venues_data["Venue_Longitude"]

toronto_merged_final = toronto_merged_final.join(neighborhoods_venues_sorted.drop(["Boroughs", "Neighborhoods"], 1).set_index("Postal_Codes"), on="Postal_Codes")
# toronto_merged_final.shape
toronto_merged_final.head()

Unnamed: 0,Postal_Codes,Boroughs,Neighborhoods,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,...,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Pizza Place,Intersection,Coffee Shop,Print Shop,Rental Car Location,Mexican Restaurant,Electronics Store,Medical Center,Fast Food Restaurant,Bar
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Caribbean Restaurant,Hakka Restaurant,Coffee Shop,Korean Restaurant,Athletics & Sports,Diner,Falafel Restaurant,Event Space,Empanada Restaurant,Electronics Store
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Discount Store,Department Store,Gas Station,Bakery,Thai Restaurant,Fried Chicken Joint,Ice Cream Shop,Coffee Shop,Playground,Lounge
3,M1G,Scarborough,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Bus Line,Metro Station,Bus Station,Intersection,Yoga Studio,Discount Store,Event Space,Empanada Restaurant,Electronics Store,Eastern European Restaurant
4,M1H,Scarborough,Cedarbrae,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,...,Bakery,Indian Restaurant,General Entertainment,Park,College Stadium,Pet Store,Motel,Middle Eastern Restaurant,Chinese Restaurant,Café


In [126]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# adding relevant markers to the map
markers_colors = []
for lat, lon, post, bor, poi, cluster in zip(toronto_merged_final['Latitude'], toronto_merged_final['Longitude'], toronto_merged_final['Postal_Codes'], toronto_merged_final['Boroughs'], toronto_merged_final['Neighborhoods'], toronto_merged_final['Cluster Labels']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters