# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
from sklearn.cluster import KMeans

## Part 1

### Scrape the Data

Scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes.

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Bound the wait with a timeout and fail loudly on an HTTP error status, so an
# error page is never handed to the parser as if it were the postal-code page.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

In [3]:
# The downloaded document is an HTML page, so parse it with an HTML parser rather
# than the XML one. 'html.parser' is the parser bundled with the standard library.
soup = BeautifulSoup(source, 'html.parser')

In [4]:
# Take the first <table> element found in the parsed page
table = soup.find('table')

The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood.

In [5]:
# Column headers for the dataframe, in the order they are filled from each table row
column_names = ['PostalCode', 'Borough', 'Neighborhood']

In [6]:
# Start from an empty dataframe that has only the column headers
df = pd.DataFrame(columns = column_names)

In [7]:
# Fill in the dataframe.
# Collect one record per table row first and build the dataframe in a single
# step: this avoids growing the frame row by row, and re-running the cell
# rebuilds `df` instead of appending the same rows a second time.
rows = []
for table_row in table.find_all('tr'):
    # Text of every <td> cell in the row, with leading/trailing whitespace removed
    cells = [cell.text.strip() for cell in table_row.find_all('td')]
    # Keep only rows that supply exactly one value per column (header rows have no <td>)
    if len(cells) == len(column_names):
        rows.append(cells)

df = pd.DataFrame(rows, columns=column_names)

In [8]:
# Preview the first rows to check the scrape
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Data Preprocessing

Only process the cells that have an assigned borough. Ignore cells with a borough that is *Not assigned*.

In [9]:
# Keep only the postal codes whose Borough is something other than 'Not assigned'
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


More than one neighborhood can exist in one postal code area. For example, *M5A* has two neighborhoods: *Harbourfront* and *Regent Park*. In the table as scraped above, such neighborhoods already appear in a single row, separated by a comma (see the *M5A* row in the preview), so no separate grouping step is performed in this notebook.

If a cell has a borough but a *Not assigned* neighborhood, then the neighborhood will be the same as the borough.

In [10]:
# Where the neighborhood is 'Not assigned', use the borough name from the same row.
# `DataFrame.replace` matches on cell values, not on a row mask, so the fallback
# is done with an explicit boolean mask on the Neighborhood column instead.
# `assign` returns a new frame, so the filtered frame is not mutated in place.
unnamed = df['Neighborhood'] == 'Not assigned'
df = df.assign(Neighborhood=df['Neighborhood'].mask(unnamed, df['Borough']))
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [11]:
# (rows, columns) of the cleaned dataframe
df.shape

(103, 3)

## Part 2

Now that a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name has been built, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.

#### Option 1.

Use the Geocoder Python package: https://geocoder.readthedocs.io/index.html.

The problem with this package is that it does not always return a result on the first try: a call for a given postal code may return None, while repeating the same call may return the coordinates. To make sure coordinates are obtained for all of the neighborhoods, the lookup for each postal code is therefore repeated in a while loop until a result comes back.

In [12]:
import geocoder

In [13]:
def get_geocoder(postal_code, max_attempts=10, delay=1.0):
    """Look up the coordinates of a single Toronto postal code.

    The query '<postal_code>, Toronto, Ontario' is sent to ``geocoder.google``
    and repeated while the result's ``latlng`` is None.

    Parameters
    ----------
    postal_code : str
        One postal code (not a column or array of codes).
    max_attempts : int, optional
        Upper bound on the number of lookups, so a service that never answers
        cannot hang the notebook.
    delay : float, optional
        Seconds to wait between failed attempts.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the first non-None ``latlng``.

    Raises
    ------
    RuntimeError
        If no coordinates were returned within ``max_attempts`` lookups.
    """
    import time

    query = '{}, Toronto, Ontario'.format(postal_code)

    for attempt in range(max_attempts):
        g = geocoder.google(query)
        lat_lng_coords = g.latlng

        if lat_lng_coords is not None:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude

        # Pause before retrying, except after the final attempt
        if delay and attempt + 1 < max_attempts:
            time.sleep(delay)

    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts)
    )

In [14]:
# get_geocoder looks up one postal code per call, so map it over the column:
#df['Latitude'], df['Longitude'] = zip(*df['PostalCode'].map(get_geocoder))
#df.head()

#### Option 2.

Given that the Geocoder Python package can be very unreliable, in case you are not able to get the geographical coordinates of the neighborhoods using the package, here is a link to a csv file that has the geographical coordinates of each postal code: http://cocl.us/Geospatial_data

In [15]:
# Read the postal code coordinates csv straight from its URL
geo_csv_url = 'http://cocl.us/Geospatial_data'
geo_df = pd.read_csv(geo_csv_url)
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Now, merge the two dataframes together.

In [16]:
# Give the join key the same name in both frames ('Postal Code' -> 'PostalCode')
geo_df = geo_df.rename(columns={'Postal Code': 'PostalCode'})

# Join the coordinates onto the neighborhood table by postal code
geo_data = df.merge(geo_df, on='PostalCode')

geo_data.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Part 3

Explore and cluster the neighborhoods in Toronto. 

We will work with only boroughs that contain the word *Toronto*.

In [17]:
# Plain substring match on the borough name (the pattern is not treated as a regex)
in_toronto = geo_data['Borough'].str.contains('Toronto', regex=False)
toronto = geo_data[in_toronto]
toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


#### Create a map centered around Toronto.

In [18]:
# Base map: centre point as [latitude, longitude] plus the initial zoom level
toronto_center = [43.651070, -79.347015]
toronto_map = folium.Map(location=toronto_center, zoom_start=10)
toronto_map

Now, make a visualization of all the neighborhoods from boroughs that contain the word *Toronto*.

In [19]:
# One blue circle marker per row; the popup text is "<neighborhood>, <borough>"
marker_fields = toronto[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(toronto_map)

toronto_map

### Cluster Neighborhoods

Run k-means to cluster the neighborhoods into 5 clusters.

In [20]:
# Set number of clusters
kclusters = 5

# Cluster on the coordinates only. The two feature columns are selected by name
# instead of dropping the others with a positional axis argument (rejected by
# recent pandas), so the feature set stays the same when the cell is re-run
# after a 'Cluster Labels' column has been added to `toronto`.
toronto_grouped_clustering = toronto[['Latitude', 'Longitude']]

# Run k-means clustering. random_state fixes the initialisation; n_init is given
# explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# Check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 0, 1, 0, 0, 3, 0, 2, 0, 3, 1, 0, 3, 1, 0, 1, 4, 4, 4, 4,
       2, 4, 3, 2, 4, 3, 2, 4, 3, 4, 0, 0, 0, 0, 0, 0, 1], dtype=int32)

In [21]:
# Add clustering labels as the first column.
# Any 'Cluster Labels' column left by a previous run is dropped first, because
# DataFrame.insert raises if the column already exists. `drop` also returns a
# new frame, so the filtered frame is not modified in place.
toronto = toronto.drop(columns=['Cluster Labels'], errors='ignore')
toronto.insert(0, 'Cluster Labels', kmeans.labels_)
toronto

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,1,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,3,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,2,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


Finally, let's visualize the resulting clusters on a map, with one color per cluster.

In [22]:
# Create map
map_clusters = folium.Map(location=[43.651070,-79.347015], zoom_start=11)

# Set color scheme for the clusters: one evenly spaced rainbow color per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers to the map
for lat, lon, neighborhood, cluster in zip(toronto['Latitude'], toronto['Longitude'], toronto['Neighborhood'], toronto['Cluster Labels']):
    label = folium.Popup(str(neighborhood) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the color list directly
    # (no reliance on rainbow[-1] wrapping around for cluster 0)
    cluster_color = rainbow[cluster]
    folium.CircleMarker([lat, lon],
                        radius=5,
                        popup=label,
                        color=cluster_color,
                        fill=True,
                        fill_color=cluster_color,
                        fill_opacity=0.7).add_to(map_clusters)

map_clusters