# Segmenting and Clustering Neighborhoods in the city of Toronto, Canada
## Part 1- Data Scraping from Wikipedia

In [4]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url,'lxml')



In [5]:
My_table = soup.find('table',{'class':'wikitable sortable'})

In [6]:

table_rows = My_table.find_all('tr')

PostCode =[]
Borough = []
Neighbourhood = []

for tr in table_rows:
    td = tr.find_all('td')
    if (len(td) > 0 and td[1].text != 'Not assigned'): #check if it valid row and if borough!="Not Assigned"
        if td[0].text not in PostCode: #if postcode not present before then create new
            PostCode.append((td[0].text)) #append postcode in array
            a = td[1].find_all('a')
            if(len(a) > 0):
                Borough.append((((td[1].find('a')).get('title')).split(','))[0]) #append borough in array for hyperlink
            else:
                Borough.append((((td[1].text)).split(','))[0]) #append borough in array for non-hyperlink
            a = td[2].find_all('a')
            if(len(a) > 0):
                Neighbourhood.append((((td[2].find('a')).get('title')).split(','))[0]) #append neighbourhood in array for hyperlink
            else:   #append neighbourhood in array for non-hyperlink
                if td[2].text == 'Not assigned\n': 
                    Neighbourhood.append(((td[1].text).split(','))[0])#append borough if non assigned
                else:
                    Neighbourhood.append((td[2].text).replace('\n', '')) # append neighbourhood if non-hyperlink
        else: #if postcode is present in the array
            ind = PostCode.index(td[0].text)
            a = td[2].find_all('a')
            if(len(a) > 0):
                testneighbourhood = (((td[2].find('a')).get('title')).split(','))[0] #append neighbourhood in array for hyperlink
            else:   #append neighbourhood in array for non-hyperlink
                if td[2].text != 'Not assigned\n':
                    testneighbourhood = (td[2].text).replace('\n', '') # append neighbourhood if non-hyperlink
            if Borough[ind] == Neighbourhood[ind]:
                Neighbourhood[ind] = testneighbourhood
            else:
                Neighbourhood[ind] = Neighbourhood[ind] + ', ' + testneighbourhood           
        
df=pd.DataFrame()
df['PostCode'] = PostCode
df['Borough'] = Borough
df['Neighbourhood'] = Neighbourhood

In [7]:
df.shape

(103, 3)

## Part 2 - Geocoding

##  I used the 'Geospatial_Coordinates.csv' and got geocoding data from that csv file

In [12]:

latitude = []
longitude = []
latlong = pd.read_csv('http://cocl.us/Geospatial_data')
#print((latlong.loc[latlong['Postal Code'] == 'M3A'])['Postal Code'])
for index, row in df.iterrows():
    templatlong = latlong[latlong['Postal Code'].str.match(row['PostCode'])]
    latitude.append((templatlong['Latitude'].values)[0])
    longitude.append((templatlong['Longitude'].values)[0])
df['Latitude'] = latitude
df['Longitude'] = longitude

In [15]:
df.shape

(103, 5)

In [14]:
df.head()

Unnamed: 0,PostCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park (Toronto),Queen's Park,43.662301,-79.389494


# Part 3 - Clustering Toronto Canada
## Select Toronto boroughs with word "Toronto" in the name.

In [16]:
toronto_data = df[df['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,PostCode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
1,M7A,Queen's Park (Toronto),Queen's Park,43.662301,-79.389494
2,M9A,Downtown Toronto,Queen's Park (Toronto),43.667856,-79.532242
3,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418


In [17]:
toronto_data_grouped = toronto_data.groupby('Neighbourhood').mean().reset_index()
toronto_data_grouped.head()

Unnamed: 0,Neighbourhood,Latitude,Longitude
0,"Adelaide, King, Richmond",43.650571,-79.384568
1,Berczy Park,43.644771,-79.373306
2,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191
3,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442


## K Means Clustering

In [18]:
# set number of clusters
from sklearn.cluster import KMeans
kclusters = 5

toronto_grouped_clustering = toronto_data_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([3, 3, 2, 1, 3, 3, 3, 3, 2, 3], dtype=int32)

## Insert cluster information to original dataframe

In [19]:
toronto_data['Cluster Labels']=kmeans.labels_
toronto_data.head()

Unnamed: 0,PostCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
0,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,3
1,M7A,Queen's Park (Toronto),Queen's Park,43.662301,-79.389494,3
2,M9A,Downtown Toronto,Queen's Park (Toronto),43.667856,-79.532242,2
3,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,1
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3


In [20]:

!conda install -c conda-forge folium=0.5.0 --yes 
import folium 
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    altair-4.0.0               |             py_0         606 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.1 MB

The following NEW packages will be 

In [21]:
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geographical coordinate of Toronto are 43.653963, -79.387207.


In [22]:

import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighbourhood'], toronto_data['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters