# Neighborhoods in Toronto

---
## Question 1
#### Import pandas

In [42]:
import pandas as pd

#### Get raw table from html

In [43]:
# Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns one DataFrame per HTML table found on the page; the
# first one holds the Postal Code / Borough / Neighbourhood listing.
html_tables = pd.read_html(url)
df = html_tables[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Remove records with "Not assigned" borough

In [44]:
# Keep only the postal codes whose borough is something other than the
# 'Not assigned' placeholder.
has_borough = df['Borough'].ne('Not assigned')
df = df[has_borough]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Combine neighborhoods of the same postal code

In [45]:
# Collapse the table to one row per (postal code, borough) pair: the
# neighbourhood names sharing a pair are joined into one ', '-separated string.
neighbourhoods_by_code = df.groupby(['Postal Code', 'Borough'])['Neighbourhood']
df = neighbourhoods_by_code.apply(', '.join).reset_index()
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### If a cell has a borough but a "Not assigned" neighborhood, the neighborhood is set to the borough name

In [46]:
# Rows that have a borough but whose neighbourhood is still the
# 'Not assigned' placeholder take the borough name as their neighbourhood.
# A single .loc assignment is used instead of chained indexing
# (df[col][mask] = ...): the chained form raises SettingWithCopyWarning and
# leaves df unchanged when pandas Copy-on-Write is enabled.
unnamed = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Print number of rows

In [47]:
# Number of rows left in the cleaned table.
len(df)

103

---
## Question 2

#### Load coordinates data

In [48]:
# CSV holding a Latitude and a Longitude column keyed by 'Postal Code'.
geo_csv_url = 'https://cocl.us/Geospatial_data'
df_coor = pd.read_csv(geo_csv_url)
print(df_coor.shape)
df_coor.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merge coordinates to existing data

In [49]:
# Attach the coordinates to the cleaned table. The inner join keeps only the
# postal codes that are present in both frames.
df_full = df.merge(df_coor, how='inner', on=['Postal Code'])
print(df_full.shape)
df_full

(103, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


---
## Question 3

#### Keep only the boroughs whose name contains "Toronto"

In [50]:
# Select the rows whose borough name contains "Toronto" and renumber the
# index from 0 so that later cells can address rows by position.
in_toronto = df_full['Borough'].str.contains("Toronto")
df_Toronto = df_full[in_toronto].reset_index(drop=True)
print(df_Toronto.shape)
df_Toronto.head()

(39, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


#### Cluster the neighborhoods (I chose 5 clusters)

In [51]:
from sklearn.cluster import KMeans

In [52]:
# One marker colour per cluster; the length of this list sets the number of
# clusters. The map cell passes these values to folium.CircleMarker, which
# needs CSS colour names. 'darkpurple' is a folium.Icon colour rather than a
# CSS colour name, so 'purple' is used for the fifth cluster.
colors = ['red', 'green', 'blue', 'cadetblue', 'purple']

# Cluster on the coordinates only. random_state fixes the initialisation, and
# n_init is pinned to 10 (the historical default) so the labels do not depend
# on the installed scikit-learn version, whose default became 'auto' in 1.4.
kmeans = KMeans(n_clusters=len(colors), n_init=10, random_state=0).fit(
    df_Toronto[['Latitude', 'Longitude']]
)
df_Toronto['Cluster'] = kmeans.labels_
print(df_Toronto.shape)
df_Toronto

(39, 6)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,4
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,4
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,4
3,M4M,East Toronto,Studio District,43.659526,-79.340923,4
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197,2
6,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678,2
7,M4S,Central Toronto,Davisville,43.704324,-79.38879,2
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,2
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,2


#### Visualize clusters
#### Observations: of the 5 clusters, the one in downtown Toronto has the highest density of neighborhoods, while the others are sparser.

In [53]:
#!conda install -c conda-forge folium=0.11.0 --yes
!pip install folium
import folium



In [58]:
# Coordinates on which the map is initially centred.
latitude = 43.653226
longitude = -79.383184
Toronto_map = folium.Map(location=[latitude, longitude], zoom_start=12)

# Draw one circle per row of df_Toronto, coloured by its cluster label.
# The columns are iterated in parallel rather than looked up by row label, so
# the loop does not rely on df_Toronto having a 0..n-1 index. The loop
# variables have their own names so that they do not overwrite the map-centre
# values stored in latitude / longitude.
marker_rows = zip(
    df_Toronto['Borough'],
    df_Toronto['Latitude'],
    df_Toronto['Longitude'],
    df_Toronto['Cluster'],
)
for borough, row_lat, row_lon, cluster in marker_rows:
    marker_color = colors[cluster]
    folium.CircleMarker(
        [row_lat, row_lon],
        radius=5,
        # The popup shows the borough name when a marker is clicked.
        popup=folium.Popup(borough, parse_html=True),
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(Toronto_map)

Toronto_map