# Assignment: Segmenting and Clustering Neighborhoods in Toronto
## First Part: Scrape Data from the Website and Clean It

In [1]:
#import package
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

Use `requests` to download the page at the given URL.
<br>Create a BeautifulSoup object from the page source using the 'lxml' parser.

In [2]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (404, 5xx, ...) right here instead of
# letting an error page be parsed as if it contained the postal-code table.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

# Build the parse tree with the lxml parser
soup = BeautifulSoup(source, 'lxml')

Locate the table by walking the page: body → div (class `mw-parser-output`) → table.
<br>`pd.read_html` returns a list of DataFrames; our table is at index 0.

In [3]:
# Locate the first <table> inside the article body (div.mw-parser-output)
content = soup.body.find('div', class_ = 'mw-parser-output')
table = content.find('table') if content is not None else None
if table is None:
    # Fail with a clear message rather than an AttributeError / empty parse
    raise ValueError('Postal code table not found; the page layout may have changed.')

# read_html returns a list of DataFrames, one per <table>; only one table is
# passed in, so it sits at index 0.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated since pandas 2.1.
df = pd.read_html(StringIO(str(table)))[0]

In [4]:
# Preview the first rows of the scraped table
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Keep only the rows whose Borough is not 'Not assigned'

In [5]:
# 1. Keep only the rows whose Borough has actually been assigned
borough_is_assigned = df['Borough'] != 'Not assigned'
df_B = df[borough_is_assigned]
df_B.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


Group the rows by Postcode and use an aggregation function to join the neighbourhood names of each group into one comma-separated string

In [6]:
# 2.0 Collapse rows that share a Postcode:
#     - Neighbourhood: join all names of the group with ", "
#     - Borough: keep the first value of the group
aggregations = {'Neighbourhood': ', '.join, 'Borough': 'first'}
df_postcode = df_B.groupby('Postcode', as_index = False).agg(aggregations)
df_postcode.head()

Unnamed: 0,Postcode,Neighbourhood,Borough
0,M1B,"Rouge, Malvern",Scarborough
1,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
2,M1E,"Guildwood, Morningside, West Hill",Scarborough
3,M1G,Woburn,Scarborough
4,M1H,Cedarbrae,Scarborough


Rearrange the columns

In [7]:
# 2.1 Reorder the columns to Postcode, Borough, Neighbourhood.
# .copy() makes this an independent DataFrame, so the in-place .loc assignment
# performed on it further down does not trigger pandas' SettingWithCopyWarning.
new_df_postcode = df_postcode[['Postcode', 'Borough', 'Neighbourhood']].copy()
new_df_postcode.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Check how many rows have a neighbourhood that is 'Not assigned'

In [8]:
# 3.0 Before reassigning them, list the rows whose Neighbourhood is still 'Not assigned'
neighbourhood_missing = new_df_postcode['Neighbourhood'] == 'Not assigned'
check_df = new_df_postcode.loc[neighbourhood_missing]
check_df

Unnamed: 0,Postcode,Borough,Neighbourhood
93,M9A,Queen's Park,Not assigned


Since only one row (Queen's Park) has a 'Not assigned' neighbourhood, we can simply replace it with the borough name

In [9]:
# 3.1 Give every 'Not assigned' neighbourhood the name of its borough.
# Rows are selected by value rather than by a hard-coded row label, so the cell
# stays correct if the source table changes and the affected rows move or multiply.
unnamed = new_df_postcode['Neighbourhood'] == 'Not assigned'
new_df_postcode.loc[unnamed, 'Neighbourhood'] = new_df_postcode.loc[unnamed, 'Borough']

# Show the rows that were just updated
double_check_df = new_df_postcode.loc[unnamed]
double_check_df

Unnamed: 0,Postcode,Borough,Neighbourhood
93,M9A,Queen's Park,Queen's Park


Get the final shape of the dataframe

In [10]:
# 4. Report the final (rows, columns) shape of the cleaned dataframe
new_df_postcode.shape

(103, 3)

## Second Part: Merge Dataset with Latitude and Longitude
Import the second dataframe

In [11]:
# 5.0 Combine Longitude and Latitude with the previous dataframe
# First, load the coordinates from the CSV file (the path is relative to the working directory)

LL = pd.read_csv('Geospatial_Coordinates.csv')
LL.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# 5.1 Give the key column the same name as in new_df_postcode so the two frames can be merged on it
column_renames = {"Postal Code": "Postcode"}
LL = LL.rename(columns=column_renames)
LL.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# 5.2 Inner-join the coordinates onto the cleaned postcode table, matching rows on Postcode
merge_df = new_df_postcode.merge(LL, how="inner", on="Postcode")
merge_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Third Part: Data Clustering and Visualization
Use geopy to get the coordinates of Toronto
<br>Use folium to plot the map

In [14]:
# 6.1 Get the Toronto city coordinates from the geopy package
import folium
from geopy.geocoders import Nominatim

address = 'Toronto City, CA'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City is {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City is 43.653963, -79.387207.


1. Set the Toronto coordinates as the map center
2. Get the data from merge_df
3. Create a circle marker for each neighbourhood around the city of Toronto
4. Display the map

In [15]:
# 6.2 Plot the neighbourhoods around Toronto city
# create the base map of Toronto, centred on the geocoded latitude and longitude

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row, with a "neighbourhood, borough" popup
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighbourhood in merge_df[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

## Third Part - 1: Explore Neighbourhood in Scarborough

In [16]:
# Select the Scarborough rows and renumber them from 0
in_scarborough = merge_df['Borough'] == 'Scarborough'
scarborough_data = merge_df.loc[in_scarborough].reset_index(drop=True)
scarborough_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [17]:
# Geocode Scarborough. NOTE: this overwrites the `latitude` / `longitude`
# variables set earlier; the map cell that follows reads them as its centre.
address = 'Scarborough, CA'

geolocator = Nominatim(user_agent="sc_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Scarborough is {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Scarborough is 43.773077, -79.257774.


In [18]:
# base map centred on the most recently geocoded latitude / longitude
map_scarborough = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per Scarborough row, with a "neighbourhood, borough" popup
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighbourhood in scarborough_data[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_scarborough)

map_scarborough

## Third Part - 2: Explore Neighbourhood in North York

In [19]:
# Select the North York rows and renumber them from 0
in_north_york = merge_df['Borough'] == 'North York'
north_york_data = merge_df.loc[in_north_york].reset_index(drop=True)
north_york_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,"Silver Hills, York Mills",43.75749,-79.374714
4,M2M,North York,"Newtonbrook, Willowdale",43.789053,-79.408493


In [20]:
# Geocode North York. NOTE: this overwrites the `latitude` / `longitude`
# variables set earlier; the map cell that follows reads them as its centre.
address = 'North York, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
# The message previously named Scarborough although North York is geocoded here.
print('The geograpical coordinate of North York is {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Scarborough is 43.7543263, -79.44911696639593.


In [21]:
# base map centred on the most recently geocoded latitude / longitude
map_north_york = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per North York row, with a "neighbourhood, borough" popup
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighbourhood in north_york_data[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_north_york)

map_north_york

## Third Part - 3: Explore Neighbourhood in Downtown Toronto

In [22]:
# Select the Downtown Toronto rows and renumber them from 0
in_downtown = merge_df['Borough'] == 'Downtown Toronto'
downtown_data = merge_df.loc[in_downtown].reset_index(drop=True)
downtown_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [23]:
# Geocode Downtown Toronto. NOTE: this overwrites the `latitude` / `longitude`
# variables set earlier; the map cell that follows reads them as its centre.
address = 'Downtown Toronto, CA'

geolocator = Nominatim(user_agent="dt_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
# The message previously named Scarborough although Downtown Toronto is geocoded here.
print('The geograpical coordinate of Downtown Toronto is {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Scarborough is 43.6563221, -79.3809161.


In [24]:
# base map centred on the most recently geocoded latitude / longitude
map_downtown = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per Downtown Toronto row, with a "neighbourhood, borough" popup
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighbourhood in downtown_data[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_downtown)

map_downtown