# Segmenting and Clustering Neighborhoods in Toronto
## IBM Data Science Professional Certification Program with Coursera
#### Princeton Brooke, Data Scientist
###### This Jupyter notebook will be used to explore and cluster neighbourhoods in Toronto

## Part 1

In [261]:
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
# from HTMLParser import HTMLParser
import requests # library to handle requests
import pandas as pd # library for data analysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# library for transforming a JSON file into a pandas dataframe
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library
from folium import plugins

print('Folium installed')
print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Folium installed
Libraries imported.


In [2]:
# Wikipedia article listing the Canadian postal codes that start with "M".
wikipedia_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
# Download the article. A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of letting an error page flow into the table parser below.
raw_random_wikipedia_page = requests.get(wikipedia_link, timeout=30)
raw_random_wikipedia_page.raise_for_status()

In [148]:
# Decoded HTML body of the response; the table is sliced out of this string
# in the next cell.
page = raw_random_wikipedia_page.text

In [213]:
# Cut the first sortable wikitable out of the page HTML and parse it.
table_tag_start = '<table class="wikitable sortable">'
table_tag_end = '</table>'

# Look for the closing tag *after* the opening tag. Searching from the start
# of the page (as before) returns an empty or wrong slice whenever another
# table closes before the wikitable begins. The closing tag is included so
# the fragment handed to the parser is a complete <table> element.
table_start = page.index(table_tag_start)
table_end = page.index(table_tag_end, table_start) + len(table_tag_end)
table = page[table_start:table_end]

dfs = pd.read_html(table)
df = dfs[0]
# Use the first parsed row as the column labels. The row itself stays in the
# frame here and is removed in the following cell.
df.columns = df.iloc[0]
# Discard postcodes that have no borough assigned.
df = df[df.Borough != 'Not assigned']


In [214]:
# Caution: this cell rebinds `df`, so running it more than once keeps
# removing rows. Re-run the table-parsing cell above first to start again
# from the original data.
df = (
    df.reset_index(drop=True)  # renumber rows from 0 without keeping the old index
      .drop(index=0)           # remove the row labelled 0
)

In [215]:
# Rows whose neighbourhood is 'Not assigned' take the borough name instead.
# Selecting by index labels handles zero, one, or several matching rows; the
# previous `.index.item()` raised unless exactly one row matched.
neighbourhood_not_assigned = df.index[df['Neighbourhood'] == 'Not assigned']
# Assign through a single .loc call: chained indexing (df[col][row] = ...)
# may write to a temporary copy and leave `df` unchanged.
df.loc[neighbourhood_not_assigned, 'Neighbourhood'] = df.loc[neighbourhood_not_assigned, 'Borough']

In [262]:
# Collapse to one row per (Postcode, Borough) pair, joining the names of all
# neighbourhoods that share it into a single comma-separated string.
neighbourhoods_by_postcode = df.groupby(['Postcode', 'Borough'])['Neighbourhood']
df = neighbourhoods_by_postcode.apply(
    lambda names: ','.join(names.astype(str))
).reset_index()
df.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [217]:
# (rows, columns) of the grouped frame: one row per Postcode/Borough pair.
df.shape

(103, 3)

## Part 2

In [218]:
# Load the postcode coordinates and relabel the columns so the key column is
# called 'Postcode', matching the neighbourhood frame.
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
df_zip = pd.read_csv(geospatial_csv_url)
df_zip.columns = ['Postcode', 'Latitude', 'Longitude']
df_zip.head(15)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [275]:
# Attach latitude/longitude to every postcode. The coordinates frame is
# `df_zip`; the previous `dfs_zip` is never defined in this notebook and
# raises NameError on a fresh kernel. The inner join keeps only postcodes
# present in both frames.
df_merged = df.merge(df_zip, on="Postcode", how='inner')
df_merged.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [282]:
# (rows, columns) of the merged frame; compare the row count with df.shape
# above to see whether the inner join dropped any postcodes.
df_merged.shape

(103, 5)

## Part 3
#### Analyze the data and create maps to visualize clusters

In [283]:
# Per-borough count of non-null values in each remaining column, i.e. how
# many postcode rows each borough has.
df_merged.groupby('Borough').count()

Unnamed: 0_level_0,Postcode,Neighbourhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,18,18,18,18
East Toronto,5,5,5,5
East York,5,5,5,5
Etobicoke,12,12,12,12
Mississauga,1,1,1,1
North York,24,24,24,24
Queen's Park,1,1,1,1
Scarborough,17,17,17,17
West Toronto,6,6,6,6


In [289]:
# Map centre: coordinates used for Toronto (also reused by the clustered map below).
latitude, longitude = 43.65, -79.38

# Base map centred on those coordinates.
toronto_map = folium.Map(
    location=[latitude, longitude],
    zoom_start=11,
)

# Render the empty base map as the cell output.
toronto_map

In [286]:
# Distinct borough names, in order of first appearance in the merged frame.
boroughs_arr = df_merged['Borough'].unique()
boroughs_arr

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

In [287]:
# Draw every postcode as a circle marker, grouped and coloured by borough.

# Palette for the borough markers. These strings are used as the circle
# stroke/fill colour, so they need to be valid CSS colour names.
colors = [
    'green',
    'darkgreen',
    'lightgreen',
    'darkblue',
    'lightblue',
    'purple',
    'darkviolet',  # was 'darkpurple': a folium icon colour name, not a CSS colour
    'pink',
    'cadetblue',
    'lightgray',
    'black'
]

# Walk the palette in order rather than drawing a random entry per borough:
# random picks could give several boroughs the same colour and changed the
# map on every run. The modulo wraps around if there are ever more boroughs
# than colours, and removes the hard-coded upper index.
for borough_index, borough_x in enumerate(boroughs_arr):
    borough_df = df_merged[df_merged.Borough == borough_x]
    borough_feature = folium.map.FeatureGroup()
    borough_color = colors[borough_index % len(colors)]

    # one circle marker per postcode row of this borough
    for lat, lng in zip(borough_df.Latitude, borough_df.Longitude):
        borough_feature.add_child(
            folium.features.CircleMarker(
                [lat, lng],
                radius=5, # marker size
                color=borough_color,
                fill=True,
                fill_color=borough_color,
                fill_opacity=0.6
            )
        )

    # add this borough's markers to the map created in the earlier cell
    toronto_map.add_child(borough_feature)

toronto_map

In [290]:

# A second map with the same centre, where nearby markers collapse into clusters.
toronto_map_clustered = folium.Map(location=[latitude, longitude], zoom_start=11)

# Cluster layer that will hold one marker per row of the merged frame.
borough_marker = plugins.MarkerCluster()
borough_marker.add_to(toronto_map_clustered)

# Each row becomes a default-icon marker whose popup shows the borough name.
for row in df_merged.itertuples(index=False):
    row_marker = folium.Marker(
        location=[row.Latitude, row.Longitude],
        icon=None,
        popup=row.Borough,
    )
    row_marker.add_to(borough_marker)

# Render the clustered map as the cell output.
toronto_map_clustered
