## Scraping Toronto's Postal Codes from Wikipedia 

In [1]:
import pandas as pd
import numpy as np

The HTML tables on a web page can be read directly into a list of DataFrame objects using the pd.read_html() function

In [2]:
# Wikipedia page that holds the postal-code table
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns one DataFrame per parsed table; keep the first table and
# use its first row as the column header
tables = pd.read_html(url, header=0)
df = tables[0]

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Dropping all the rows which do not have a Borough assigned to them

In [3]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0
has_borough = df['Borough'] != "Not assigned"
df = df.loc[has_borough].reset_index(drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [4]:
# (rows, columns) of the table after the unassigned boroughs were dropped
df.shape

(103, 3)

## Adding Latitudes and Longitudes to the dataframe

In [5]:
# Load the postal code -> latitude/longitude lookup table from a local CSV file
cor = pd.read_csv('Geospatial_coordinates.csv')

# Preview the first rows
cor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Performing a left join on the two dataframes df and cor (containing postal code info and latitude/longitudes respectively) on the "Postal Code" Column

In [6]:
# Left join: every row of df is kept and gets the coordinates of its postal code
dft = pd.merge(df, cor, how="left", on="Postal Code")
dft

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


## Clustering Neighborhoods in Toronto

Importing KMeans to cluster data and folium to visualize the same

In [7]:
from sklearn.cluster import KMeans
import folium

In [8]:
# Distinct borough names present in the merged table
dft['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

#### Working with the borough of Scarborough and storing its data in a separate table

In [9]:
# Rows belonging to Scarborough only, with a fresh 0-based index
is_scarborough = dft['Borough'] == 'Scarborough'
scar = dft.loc[is_scarborough].reset_index(drop=True)
scar.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Since Geocoder doesn't seem to work properly, I just looked up the coordinates of Scarborough on Google 

In [10]:
# Coordinates of Scarborough, looked up manually; used as the centre of the maps below
scar_lat = 43.7764  # latitude
scar_lon = -79.2318  # longitude

In [11]:
# Base map centred on the Scarborough coordinates
map_scar = folium.Map(location=[scar_lat, scar_lon], zoom_start=11)

# One blue circle marker per neighborhood; its popup shows the neighborhood name
marker_rows = zip(scar['Latitude'], scar['Longitude'], scar['Neighborhood'])
for lat, lng, hood_name in marker_rows:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(hood_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_scar)

map_scar

Storing API Keys:

In [12]:
import os

# The Foursquare credentials are read from the environment so that no secret is
# stored in the notebook and the cell works on a fresh kernel (the previous
# self-assignment `CLIENT_ID = CLIENT_ID` raised NameError after a restart).
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment '
        'variables before running this notebook.'
    )

VERSION = '20180605'  # sent as the `v` query parameter of every API request
LIMIT = 50  # sent as the `limit` query parameter (venues requested per location)

Method to get all the nearby venues for all of the neighborhoods in Scarborough

In [13]:
import requests

In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=5000):
    """Query the Foursquare ``venues/explore`` endpoint once per location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences giving the label and the coordinates of each
        location; they are consumed together with ``zip``.
    radius : int, default 5000
        Value sent as the ``radius`` query parameter
        (presumably metres -- confirm against the Foursquare documentation).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when no venue
        was found. 'Venue Category' is None for venues without a category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. invalid credentials).
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting it by hand
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # never hang the notebook on a stalled connection
        )
        # Fail with a clear HTTP error instead of a KeyError on the JSON payload
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        for item in items:
            venue = item['venue']
            # A venue may come back without any category
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names here keeps the schema stable even with zero rows
    return pd.DataFrame(rows, columns=columns)

In [15]:
def get_category_type(row):
    """Return the name of the first category of a venue record.

    The category list is read from ``row['categories']``; when that key is
    missing it is read from ``row['venue.categories']`` instead.

    Returns
    -------
    The ``'name'`` entry of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be silently swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

Getting all nearby venues

In [16]:
# Fetch the venues around every Scarborough neighborhood (default search radius)
scar_venues = getNearbyVenues(
    scar['Neighborhood'],
    scar['Latitude'],
    scar['Longitude'],
)

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge


In [17]:
# Preview the first venue rows returned by getNearbyVenues
scar_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Toronto Pan Am Sports Centre,43.790623,-79.193869,Athletics & Sports
1,"Malvern, Rouge",43.806686,-79.194353,African Rainforest Pavilion,43.817725,-79.183433,Zoo Exhibit
2,"Malvern, Rouge",43.806686,-79.194353,Polar Bear Exhibit,43.823372,-79.185145,Zoo
3,"Malvern, Rouge",43.806686,-79.194353,Toronto Zoo,43.820582,-79.181551,Zoo
4,"Malvern, Rouge",43.806686,-79.194353,Orangutan Exhibit,43.818413,-79.182548,Zoo Exhibit


In [18]:
# Number of non-null entries per column for each neighborhood (i.e. venues found per neighborhood)
scar_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,50,50,50,50,50,50
"Birch Cliff, Cliffside West",50,50,50,50,50,50
Cedarbrae,50,50,50,50,50,50
"Clarks Corners, Tam O'Shanter, Sullivan",50,50,50,50,50,50
"Cliffside, Cliffcrest, Scarborough Village West",50,50,50,50,50,50
"Dorset Park, Wexford Heights, Scarborough Town Centre",50,50,50,50,50,50
"Golden Mile, Clairlea, Oakridge",50,50,50,50,50,50
"Guildwood, Morningside, West Hill",50,50,50,50,50,50
"Kennedy Park, Ionview, East Birchmount Park",50,50,50,50,50,50
"Malvern, Rouge",50,50,50,50,50,50


## Analysing each Neighborhood

In [19]:
# One indicator column per venue category; the empty prefix and separator keep
# the bare category name as the column name
category_dummies = pd.get_dummies(scar_venues[['Venue Category']], prefix="", prefix_sep="")

# Tag every venue row with the neighborhood it was found for
scar_onehot = category_dummies.assign(Neighborhood=scar_venues['Neighborhood'])

Fixing the 'Neighborhood' column to the first place

In [20]:
# Every column except 'Neighborhood', in their current order
cols = [c for c in scar_onehot.columns if c != 'Neighborhood']

# Put 'Neighborhood' first, followed by the category columns
scar_onehot = scar_onehot[['Neighborhood'] + cols]
scar_onehot

Unnamed: 0,Neighborhood,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Bagel Shop,Bakery,Bank,Beach,...,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wings Joint,Xinjiang Restaurant,Zoo,Zoo Exhibit
0,"Malvern, Rouge",0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
845,Upper Rouge,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
846,Upper Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
847,Upper Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
848,Upper Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# (venue rows, 'Neighborhood' column + one column per venue category)
scar_onehot.shape

(850, 110)

In [22]:
# Mean of each indicator column per neighborhood, i.e. the share of that
# neighborhood's venues falling in each category; 'Neighborhood' stays a column
scar_grouped = scar_onehot.groupby('Neighborhood', as_index=False).mean()
scar_grouped

Unnamed: 0,Neighborhood,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Bagel Shop,Bakery,Bank,Beach,...,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wings Joint,Xinjiang Restaurant,Zoo,Zoo Exhibit
0,Agincourt,0.02,0.02,0.02,0.0,0.0,0.0,0.04,0.0,0.0,...,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.0
1,"Birch Cliff, Cliffside West",0.0,0.0,0.0,0.0,0.04,0.02,0.02,0.0,0.12,...,0.02,0.02,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0
2,Cedarbrae,0.02,0.02,0.04,0.02,0.0,0.0,0.02,0.0,0.0,...,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0
3,"Clarks Corners, Tam O'Shanter, Sullivan",0.02,0.02,0.02,0.0,0.02,0.0,0.06,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0
4,"Cliffside, Cliffcrest, Scarborough Village West",0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.02,0.02,0.0,0.02,0.0,0.02,0.0,0.0
5,"Dorset Park, Wexford Heights, Scarborough Town...",0.02,0.02,0.02,0.0,0.0,0.0,0.04,0.0,0.0,...,0.02,0.0,0.02,0.0,0.02,0.0,0.0,0.02,0.0,0.0
6,"Golden Mile, Clairlea, Oakridge",0.02,0.0,0.0,0.0,0.02,0.02,0.04,0.0,0.04,...,0.0,0.0,0.02,0.02,0.02,0.02,0.0,0.0,0.0,0.0
7,"Guildwood, Morningside, West Hill",0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0
8,"Kennedy Park, Ionview, East Birchmount Park",0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.02,...,0.0,0.0,0.02,0.0,0.02,0.02,0.0,0.02,0.0,0.0
9,"Malvern, Rouge",0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.02,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.22


In [23]:
# (neighborhoods, 'Neighborhood' column + one column per venue category)
scar_grouped.shape

(17, 110)

In [24]:
num_top_venues = 5

# Print the most frequent venue categories of every neighborhood
for hood in scar_grouped['Neighborhood']:
    print("----"+hood+"----")

    # Transpose this neighborhood's row into a two-column (venue, freq) table
    hood_freqs = scar_grouped[scar_grouped['Neighborhood'] == hood].T.reset_index()
    hood_freqs.columns = ['venue','freq']

    top_venues = (
        hood_freqs.iloc[1:]  # the first row holds the 'Neighborhood' value, not a frequency
        .assign(freq=lambda t: t['freq'].astype(float))
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
        .head(num_top_venues)
    )
    print(top_venues)
    print('\n')

----Agincourt----
                  venue  freq
0  Caribbean Restaurant  0.10
1           Coffee Shop  0.06
2     Indian Restaurant  0.04
3      Greek Restaurant  0.04
4          Burger Joint  0.04


----Birch Cliff, Cliffside West----
            venue  freq
0           Beach  0.12
1            Park  0.10
2     Coffee Shop  0.08
3  Breakfast Spot  0.04
4            Café  0.04


----Cedarbrae----
                  venue  freq
0           Coffee Shop  0.08
1  Caribbean Restaurant  0.06
2                  Park  0.06
3      Asian Restaurant  0.04
4     Indian Restaurant  0.04


----Clarks Corners, Tam O'Shanter, Sullivan----
                       venue  freq
0       Caribbean Restaurant  0.12
1                     Bakery  0.06
2                Coffee Shop  0.06
3  Middle Eastern Restaurant  0.06
4              Burrito Place  0.04


----Cliffside, Cliffcrest, Scarborough Village West----
          venue  freq
0          Park  0.12
1           Gym  0.06
2  Burger Joint  0.06
3   Coffee Sho

In [25]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are sorted by
    value in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [26]:
num_top_venues = 10

# Ordinal suffixes for ranks 1, 2 and 3; every later rank gets 'th'
# (correct for the 10 ranks used here -- 21, 22, ... would need extra handling)
indicators = ['st', 'nd', 'rd']

# Column headers: 'Neighborhood', '1st Most Common Venue', '2nd Most Common Venue', ...
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # Explicit bounds check instead of a bare `except:` around the list lookup
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = scar_grouped['Neighborhood']

# Fill each neighborhood's row with its most frequent venue categories
for ind in range(scar_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(scar_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Caribbean Restaurant,Coffee Shop,Greek Restaurant,Bakery,Bubble Tea Shop,Burger Joint,Indian Restaurant,Chinese Restaurant,Park,Noodle House
1,"Birch Cliff, Cliffside West",Beach,Park,Coffee Shop,Ice Cream Shop,Café,Breakfast Spot,Pub,BBQ Joint,Hungarian Restaurant,Filipino Restaurant
2,Cedarbrae,Coffee Shop,Caribbean Restaurant,Park,Burger Joint,Gym,Bookstore,Indian Restaurant,Pub,Asian Restaurant,Fried Chicken Joint
3,"Clarks Corners, Tam O'Shanter, Sullivan",Caribbean Restaurant,Middle Eastern Restaurant,Coffee Shop,Bakery,Supermarket,Burrito Place,Grocery Store,Burger Joint,Filipino Restaurant,Korean Restaurant
4,"Cliffside, Cliffcrest, Scarborough Village West",Park,Gym,Coffee Shop,Burger Joint,Grocery Store,Golf Course,Pub,Beach,Indian Restaurant,Distribution Center


## Clustering Neighborhoods

In [27]:
kclusters = 5

# Feature matrix: the per-category frequencies without the neighborhood label.
# `columns=` is used because the positional axis argument (`drop(label, 1)`)
# is no longer accepted by current pandas releases.
scar_grouped_clustering = scar_grouped.drop(columns='Neighborhood')

# n_init is pinned explicitly because scikit-learn's default for it has changed
# between releases; together with random_state this keeps the labels reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(scar_grouped_clustering)

# One cluster label per row of scar_grouped
kmeans.labels_

array([0, 2, 4, 0, 2, 3, 2, 4, 2, 1, 0, 1, 4, 0, 1, 3, 4])

In [28]:
# Attach each neighborhood's cluster label as the first column. insert() raises
# when the column already exists, so overwrite it in that case to keep this
# cell re-runnable.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Add the cluster label and the top venues to the Scarborough table, matched on
# the neighborhood name; join() returns a new frame, so `scar` is left untouched.
scar_merged = scar.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

scar_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,1,Zoo Exhibit,Burger Joint,Pharmacy,Breakfast Spot,Fried Chicken Joint,Caribbean Restaurant,Grocery Store,Zoo,Restaurant,Chocolate Shop
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,1,Zoo Exhibit,Park,Breakfast Spot,Grocery Store,Zoo,Fast Food Restaurant,Pharmacy,Fried Chicken Joint,Bakery,Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,4,Coffee Shop,Park,Pharmacy,Indian Restaurant,Breakfast Spot,Grocery Store,Gym,Pub,Restaurant,Burger Joint
3,M1G,Scarborough,Woburn,43.770992,-79.216917,4,Park,Coffee Shop,Caribbean Restaurant,Restaurant,Breakfast Spot,Gym,Indian Restaurant,Fried Chicken Joint,Pizza Place,Pub
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,4,Coffee Shop,Caribbean Restaurant,Park,Burger Joint,Gym,Bookstore,Indian Restaurant,Pub,Asian Restaurant,Fried Chicken Joint


In [29]:
import matplotlib.cm as cm
import matplotlib.colors as colors
map_clusters = folium.Map(location=[scar_lat, scar_lon], zoom_start=11)

# One distinct colour per cluster, sampled evenly from matplotlib's rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one marker per neighborhood, coloured by its cluster
for lat, lon, poi, cluster in zip(scar_merged['Latitude'], scar_merged['Longitude'],scar_merged['Neighborhood'], scar_merged['Cluster Labels']):
    if pd.isna(cluster):
        # The join found no cluster label for this neighborhood; nothing to colour
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # Labels run from 0 to kclusters - 1, so they index the palette directly
        # (the former `cluster-1` made label 0 wrap around to the last colour)
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters