Requirements for assignment at Week 3:
    
1. Scrape the Wikipedia page
2. Wrangle, clean and read the data into a pandas dataframe (structured like the New York dataset)
3. Replicate the analysis that we did for New York City to explore and cluster the neighborhoods in the city of Toronto. 

### Part one: Data Scraping and Pre-processing

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
# import re
import pandas as pd
import urllib

def get_html(url):
    """Download ``url`` and parse it into a BeautifulSoup tree.

    The decoded markup is stored in the module-level ``html`` variable and the
    parsed tree in the module-level ``soup`` variable; nothing is returned.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Raises
    ------
    urllib.error.URLError
        If the request cannot be completed.
    """
    global html, soup

    # Send a browser-like User-Agent header with the request.
    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
    req = urllib.request.Request(url, headers=headers)

    # The context manager closes the connection even if reading fails.
    with urlopen(req) as response:
        # Decode with the charset announced by the server, falling back to
        # UTF-8. Hard-coding ISO-8859-1 garbles every non-ASCII character of a
        # UTF-8 page. errors='replace' keeps decoding from raising, as the
        # previous single-byte decoding never did.
        charset = response.headers.get_content_charset() or 'utf-8'
        html = response.read().decode(charset, errors='replace')

    soup = BeautifulSoup(html,'html.parser')

# Fetch the Wikipedia list of Canadian postal codes starting with "M"; the call
# fills the module-level `html` and `soup` variables used by the next cell.
get_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [2]:
# Find the first table on the page, go to its tbody section and collect all
# of the "tr" (row) tags.
content_extracted = soup.find("table")
content = content_extracted.tbody.find_all("tr")

res = []
for tr in content:

    td = tr.find_all("td")
    # Raw text of every cell in the row. The comprehension variable gets its
    # own name so it no longer shadows the row variable `tr`.
    data = [cell.text for cell in td]

    # Rows without exactly three cells cannot be mapped onto
    # (PostalCode, Borough, Neighborhood): this covers rows with no <td> at all
    # (previously filtered by `data != []`) and any malformed row, which would
    # otherwise raise an IndexError below or break the DataFrame constructor.
    if len(data) != 3:
        continue

    # according to the requirement:
    # only process the cells that have an assigned borough.
    # Ignore cells with a borough that is Not assigned.
    if data[1].strip() == "Not assigned":
        continue

    # according to another requirement:
    # if a cell has a borough but a Not assigned neighborhood
    # then the neighborhood will be the same as the borough
    if data[2].strip() == "Not assigned":
        data[2] = data[1]

    res.append(data)

# Create the dataframe (cell text is kept verbatim, including trailing newlines)
df = pd.DataFrame(res, columns = ["PostalCode", "Borough", "Neighborhood"])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods\n
1,M4A\n,North York\n,Victoria Village\n
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"


In [3]:
# The scraped cell text still carries newline characters; remove them from
# each of the three text columns in turn.
for column in ("Neighborhood", "Borough", "PostalCode"):
    df[column] = df[column].str.replace("\n","")

print(df.head())
print("Shape: ", df.shape)
# No grouping by postal code is needed here: the Wikipedia page already does it.

  PostalCode           Borough                                 Neighborhood
0        M3A        North York                                    Parkwoods
1        M4A        North York                             Victoria Village
2        M5A  Downtown Toronto                    Regent Park, Harbourfront
3        M6A        North York             Lawrence Manor, Lawrence Heights
4        M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government
Shape:  (103, 3)


### Part two: transfer addresses to lat/lon

In [4]:
# Below is an attempt to retrieve the coordinates with the geocoder package.
# However, it gave no response for a long time, so the csv file provided by this course is used instead.

# import geocoder

# # initialize your variable to None
# lat_lng_coords = None

# # loop until you get the coordinates
# while(lat_lng_coords is None):
#   g = geocoder.google('{}, Toronto, Ontario'.format("M5G"))
#   lat_lng_coords = g.latlng

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

In [5]:
# Read the postal-code coordinates table directly from its online location.
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
lat_lon = pd.read_csv(geospatial_csv_url)
lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
# Left-join the coordinates onto the scraped table by postal code, then discard
# the duplicate key column that comes from the coordinates table.
df_toronto = (
    df.merge(lat_lon, how="left", left_on="PostalCode", right_on="Postal Code")
      .drop(columns="Postal Code")
)
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Part three: Explore and cluster the neighborhoods

In [7]:
# Report how many distinct boroughs and neighborhoods the table contains.
borough_count = len(df_toronto['Borough'].unique())
neighborhood_count = len(df_toronto['Neighborhood'].unique())
summary_template = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary_template.format(borough_count, neighborhood_count))

The dataframe has 10 boroughs and 99 neighborhoods.


In [8]:
# Look up the coordinates of Toronto; the maps below are centred on them.

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium # map rendering library

address = "Toronto, ON"

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match. Fail with a clear
# message here rather than with an AttributeError on the attribute access below.
if location is None:
    raise RuntimeError("Geocoding returned no result for {!r}".format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto city are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto city are 43.6534817, -79.3839347.


In [9]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# The coordinates come from a left join, so a postal code without a match has
# NaN latitude/longitude; folium rejects NaN locations, so skip those rows.
located = df_toronto.dropna(subset=['Latitude', 'Longitude'])

# add one circle marker per postal-code row, labelled "neighborhood, borough"
for lat, lng, borough, neighborhood in zip(
        located['Latitude'],
        located['Longitude'],
        located['Borough'],
        located['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html belongs to the popup (it controls how the label text is
    # rendered); it is not a marker option, so it is only passed here.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

# display the finished map as the cell output
map_toronto

In [10]:
# using K-means clustering for all the neighbourhoods
from sklearn.cluster import KMeans

k = 5 # let's assume the number of clusters is 5

# Make the cell safe to re-run: without this, a second run would feed the old
# labels to KMeans as a feature and insert() would fail on the existing column.
if 'Cluster Labels' in df_toronto.columns:
    df_toronto = df_toronto.drop(columns='Cluster Labels')

# Cluster on what remains after removing the descriptive text columns.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
toronto_clustering = df_toronto.drop(columns=['PostalCode', 'Borough', 'Neighborhood'])

# n_init is pinned because its default differs between scikit-learn versions;
# together with random_state this keeps the labels reproducible.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(toronto_clustering)

# add the clustering information as the first column of the dataframe
df_toronto.insert(0, 'Cluster Labels', kmeans.labels_)
df_toronto

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,4,M3A,North York,Parkwoods,43.753259,-79.329656
1,4,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,0,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,2,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...,...
98,1,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,2,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,4,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,1,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [11]:
# let's visualize the resulting clusters
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: k evenly spaced rainbow colours, as hex
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_toronto['Latitude'],
                                  df_toronto['Longitude'],
                                  df_toronto['Neighborhood'],
                                  df_toronto['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly. The
    # previous `cluster-1` only worked for label 0 through negative-index
    # wraparound and shifted every colour by one.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [12]:
# Examine Clusters

# first cluster: rows whose 'Cluster Labels' value is 0
df_toronto[df_toronto['Cluster Labels'].eq(0)]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
3,0,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
10,0,M6B,North York,Glencairn,43.709577,-79.445073
27,0,M2H,North York,Hillcrest Village,43.803762,-79.363452
28,0,M3H,North York,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259
34,0,M3J,North York,"Northwood Park, York University",43.76798,-79.487262
39,0,M2K,North York,Bayview Village,43.786947,-79.385975
40,0,M3K,North York,Downsview,43.737473,-79.464763
45,0,M2L,North York,"York Mills, Silver Hills",43.75749,-79.374714
52,0,M2M,North York,"Willowdale, Newtonbrook",43.789053,-79.408493
55,0,M5M,North York,"Bedford Park, Lawrence Manor East",43.733283,-79.41975


In [13]:
# second cluster: rows whose 'Cluster Labels' value is 1
df_toronto[df_toronto['Cluster Labels'].eq(1)]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
5,1,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
11,1,M9B,Etobicoke,"West Deane Park, Princess Gardens, Martin Grov...",43.650943,-79.554724
17,1,M9C,Etobicoke,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",43.643515,-79.577201
46,1,M3L,North York,Downsview,43.739015,-79.506944
49,1,M6L,North York,"North Park, Maple Leaf Park, Upwood Park",43.713756,-79.490074
50,1,M9L,North York,Humber Summit,43.756303,-79.565963
53,1,M3M,North York,Downsview,43.728496,-79.495697
56,1,M6M,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",43.691116,-79.476013
57,1,M9M,North York,"Humberlea, Emery",43.724766,-79.532242
60,1,M3N,North York,Downsview,43.761631,-79.520999


In [14]:
# third cluster: rows whose 'Cluster Labels' value is 2
df_toronto[df_toronto['Cluster Labels'].eq(2)]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,2,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
16,2,M6C,York,Humewood-Cedarvale,43.693781,-79.428191
20,2,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
21,2,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512
24,2,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,2,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,2,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568


In [15]:
# fourth cluster: rows whose 'Cluster Labels' value is 3
df_toronto[df_toronto['Cluster Labels'].eq(3)]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
6,3,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
12,3,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
18,3,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
22,3,M1G,Scarborough,Woburn,43.770992,-79.216917
26,3,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
32,3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
51,3,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
78,3,M1S,Scarborough,Agincourt,43.7942,-79.262029
85,3,M1V,Scarborough,"Milliken, Agincourt North, Steeles East, L'Amo...",43.815252,-79.284577
95,3,M1X,Scarborough,Upper Rouge,43.836125,-79.205636


In [16]:
# fifth cluster: rows whose 'Cluster Labels' value is 4
df_toronto[df_toronto['Cluster Labels'].eq(4)]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,4,M3A,North York,Parkwoods,43.753259,-79.329656
1,4,M4A,North York,Victoria Village,43.725882,-79.315572
7,4,M3B,North York,Don Mills,43.745906,-79.352188
8,4,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
13,4,M3C,North York,Don Mills,43.7259,-79.340923
14,4,M4C,East York,Woodbine Heights,43.695344,-79.318389
19,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
23,4,M4G,East York,Leaside,43.70906,-79.363452
29,4,M4H,East York,Thorncliffe Park,43.705369,-79.349372
33,4,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
