# Segmenting and Clustering Neighbourhoods in Toronto


## This project takes the postal code data for Canada from its Wikipedia page, then processes and cleans the data for clustering.
## K-means is used for clustering, and the clusters are plotted using the Folium library. First the 'Toronto' boroughs are plotted as they are; then they are clustered, and the clusters are plotted in the last map.

In [5]:
#importing libraries needed!

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
!pip install scikit-learn==0.23.1
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: - 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/noarch::ibm-wsrt-py37main-keep==0.0.0=1937
  - conda-forge/linux-64::pytorch==1.8.0=cpu_py37hafa7651_0
  - defaults/noarch::ibm-wsrt-py37main-main==custom=1937
done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: - 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/noarch::ibm-wsrt-py37main-keep==0.0.0=1937
  - conda-forge/linux-64::pytorch==1.8.0=cpu_py37hafa7651_0
  - defaults/noarch::ibm-wsrt-py37main-main==custom=1937
done

# All requested packages already installed.

Libraries imported.


### Data from the Wikipedia page for the list of postal codes of Canada

In [15]:
# Save a local copy of the Wikipedia postal-code page with wget (-q: quiet, -O: output file).
# NOTE(review): the URL is a wiki article, so the saved file presumably contains HTML even
# though it is named 'canada_data.json'. No cell visible in this notebook reads the file (the
# table is loaded straight from the URL with pd.read_html) — confirm whether this download is
# still needed.
!wget -q -O 'canada_data.json' https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
print('Data downloaded!')

Data downloaded!


In [9]:
# Parse every HTML table found on the Wikipedia page and keep the first one.
postal_codes_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_tables = pd.read_html(postal_codes_url)
df = wiki_tables[0]

In [46]:
# (rows, columns) of the postal-code table
df.shape

(180, 3)

In [10]:
# Column labels of the postal-code table
df.columns

Index(['Postal Code', 'Borough', 'Neighbourhood'], dtype='object')

### Data Processing and Cleaning of the list of postal codes of Canada

In [11]:
# Keep only the postal codes whose borough is something other than 'Not assigned'.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]


In [12]:
# Plain-text preview of the first five rows of the postal-code table
print(df.head(n=5))

  Postal Code           Borough                                Neighbourhood
2         M3A        North York                                    Parkwoods
3         M4A        North York                             Victoria Village
4         M5A  Downtown Toronto                    Regent Park, Harbourfront
5         M6A        North York             Lawrence Manor, Lawrence Heights
6         M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government


In [55]:
# (rows, columns) of the postal-code table at this point in the cleaning
df.shape

(103, 3)

In [13]:
# Where a neighbourhood is 'Not assigned', fall back to the borough name.
# df comes from a boolean filter, so take an explicit copy and assign with a single
# .loc[rows, column] call. The chained form `df['Neighbourhood'].loc[mask] = ...` writes to an
# intermediate Series, which pandas flags with SettingWithCopyWarning and does not guarantee
# to propagate back to df.
df = df.copy()
unnamed = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']

In [14]:
# Shape check after filling neighbourhood names; the assignment above does not add or drop rows
df.shape

(103, 3)

### Data from the CSV file of geospatial data

In [15]:

# Read the geospatial CSV straight from its URL.
geospatial_url = 'https://cocl.us/Geospatial_data'
df_geo = pd.read_csv(geospatial_url)

In [16]:
# Plain-text preview of the first five coordinate rows
print(df_geo.head(n=5))

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476


###  Data Processing and Cleaning of the geospatial data

In [None]:
# Rename the postal code column to 'Postcode' in both tables before merging them.
# rename() returns a new frame; rebinding the names (instead of inplace=True) avoids mutating
# objects that earlier cells already displayed, and re-running the cell is harmless because
# labels missing from the mapping are simply left alone.
postcode_label = {'Postal Code': 'Postcode'}
df_geo = df_geo.rename(columns=postcode_label)
df = df.rename(columns=postcode_label)
print(df_geo.head())

In [18]:
# Plain-text preview of the first five rows of the postal-code table
print(df.head(n=5))


  Postcode           Borough                                Neighbourhood
2      M3A        North York                                    Parkwoods
3      M4A        North York                             Victoria Village
4      M5A  Downtown Toronto                    Regent Park, Harbourfront
5      M6A        North York             Lawrence Manor, Lawrence Heights
6      M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government


In [19]:
# Attach coordinates to each postal code: inner join of the two tables on 'Postcode'
df2 = df.merge(df_geo, on='Postcode', how='inner')
print(df2.head())
df2.shape

  Postcode           Borough                                Neighbourhood  \
0      M3A        North York                                    Parkwoods   
1      M4A        North York                             Victoria Village   
2      M5A  Downtown Toronto                    Regent Park, Harbourfront   
3      M6A        North York             Lawrence Manor, Lawrence Heights   
4      M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government   

    Latitude  Longitude  
0  43.753259 -79.329656  
1  43.725882 -79.315572  
2  43.654260 -79.360636  
3  43.718518 -79.464763  
4  43.662301 -79.389494  


(103, 5)

In [32]:
# Keep every borough whose name contains the literal text 'Toronto'
# (a plain substring match, not a regular expression).
in_toronto = df2['Borough'].str.contains('Toronto', regex=False)
df3 = df2[in_toronto]
print(df3.head())
print(df3.shape)

   Postcode           Borough                                Neighbourhood  \
2       M5A  Downtown Toronto                    Regent Park, Harbourfront   
4       M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government   
9       M5B  Downtown Toronto                     Garden District, Ryerson   
15      M5C  Downtown Toronto                               St. James Town   
19      M4E      East Toronto                                  The Beaches   

     Latitude  Longitude  
2   43.654260 -79.360636  
4   43.662301 -79.389494  
9   43.657162 -79.378937  
15  43.651494 -79.375418  
19  43.676357 -79.293031  
(40, 5)


### Displaying the Toronto map using Folium before clustering

In [38]:


# Base map with one blue circle marker per selected neighbourhood.
toronto_map = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

for lat, lng, borough, neighbourhood in zip(df3['Latitude'], df3['Longitude'], df3['Borough'], df3['Neighbourhood']):
    # Popup text shown when a marker is clicked: "<neighbourhood>, <borough>"
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    # parse_html is an option of folium.Popup, not of CircleMarker, so it is not passed
    # to the marker itself.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(toronto_map)

# Leave the map as the cell's last expression so the notebook renders it;
# print() would only emit the object's repr.
toronto_map

<folium.folium.Map object at 0x7f84867c3d50>


### Clustering Toronto using K-means

In [33]:

k = 5  # number of clusters

# Cluster on the coordinates only. Selecting the two columns by name (rather than dropping
# the others) keeps this cell correct when it is re-run after 'Cluster Labels' has been
# added to df3, and avoids the positional `axis` argument of DataFrame.drop, which
# pandas 2.0 no longer accepts.
toronto_cluster = df3[['Latitude', 'Longitude']]

# n_init=10 is the default of the scikit-learn release pinned in the install cell; it is
# spelled out so newer releases (whose default differs) use the same setting.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_cluster)
kmeans.labels_


array([3, 3, 3, 3, 2, 3, 3, 4, 3, 4, 3, 4, 2, 3, 4, 2, 3, 2, 0, 0, 1, 0,
       0, 1, 0, 4, 1, 0, 4, 1, 0, 4, 0, 3, 3, 3, 3, 3, 3, 2], dtype=int32)

In [34]:
# Put the cluster label of each row in the first column of df3.
# drop(..., errors='ignore') returns a fresh frame with any earlier 'Cluster Labels' column
# removed, so the cell can be re-run (DataFrame.insert raises ValueError when the column
# already exists) and the insert no longer modifies a filtered slice of df2 in place.
df3 = df3.drop(columns='Cluster Labels', errors='ignore')
df3.insert(0, 'Cluster Labels', kmeans.labels_)

### Displaying the Toronto map using Folium after clustering with K-means

In [35]:
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# Colour scheme: k evenly spaced colours from matplotlib's rainbow colormap, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per neighbourhood, coloured by its cluster
for lat, lon, cluster in zip(df3['Latitude'], df3['Longitude'], df3['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # K-means labels run from 0 to k-1, so they index the palette directly;
    # `cluster - 1` would send cluster 0 to the last colour via a negative index.
    cluster_colour = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7,
    ).add_to(map_clusters)

# Leave the map as the cell's last expression so the notebook renders it;
# print() would only emit the object's repr.
map_clusters

<folium.folium.Map object at 0x7f848683a050>


#### Please note: the maps may not display on GitHub