# Section 1

In [54]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Show every column and every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# NOTE(review): the install below is active and runs each time this cell is executed, despite what its trailing comment says.
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): newer pandas releases expose this function as pandas.json_normalize -- confirm against the installed version.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


In [55]:
!pip install BeautifulSoup4
!pip install requests



In [56]:
from bs4 import BeautifulSoup

# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an exception here instead of letting an
# error page be parsed as if it were the article.
Webpage = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
Webpage.raise_for_status()
soup = BeautifulSoup(Webpage.text, 'lxml')

# The first element carrying the "wikitable" class is used as the source table.
table = soup.find(class_='wikitable')
if table is None:
    raise ValueError("No element with class 'wikitable' was found on the page")

# One list of cell texts per table row. Header (<th>) and data (<td>) cells are both
# collected, with trailing whitespace removed from each cell's text.
rows = [
    [cell.text.rstrip() for cell in tr.find_all(['th', 'td'])]
    for tr in table.find_all('tr')
]

# The first row supplies the column names; every following row is data.
columns = rows[0] if rows else []
data = rows[1:]

toronto_df = pd.DataFrame(data = data,columns = columns)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


###  Ignore cells with a borough that is Not assigned.

In [57]:
# Dimensions of the scraped table as (rows, columns).
toronto_df.shape

(288, 3)

In [58]:
# Peek at the first Borough values; the "Not assigned" placeholder shows up here.
toronto_df.Borough.head()

0        Not assigned
1        Not assigned
2          North York
3          North York
4    Downtown Toronto
Name: Borough, dtype: object

In [59]:
# Keep only the rows whose Borough is something other than the "Not assigned" placeholder.
has_borough = toronto_df['Borough'].ne('Not assigned')
Torontoclean_df = toronto_df.loc[has_borough]
Torontoclean_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Reset the index

In [60]:
# Renumber the rows from 0; drop=True discards the old index instead of keeping it as a column.
Torontoclean_df = Torontoclean_df.reset_index(drop=True)
Torontoclean_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [61]:
# Dimensions after the "Not assigned" boroughs were removed.
Torontoclean_df.shape

(211, 3)

### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [62]:
# Per-column count of the rows whose Neighbourhood is still the "Not assigned" placeholder.
Torontoclean_df.loc[Torontoclean_df['Neighbourhood'].eq('Not assigned')].count()

Postcode         1
Borough          1
Neighbourhood    1
dtype: int64

In [63]:
# Rows that have a borough but whose neighbourhood is still the "Not assigned" placeholder.
unnamed_rows = Torontoclean_df['Neighbourhood'] == 'Not assigned'

# Copy the borough name into those rows only. Assigning through .loc on the frame itself
# (rather than calling replace(..., inplace=True) on a selected column) guarantees the change
# is written to Torontoclean_df and pairs every affected row with its own borough.
Torontoclean_df.loc[unnamed_rows, 'Neighbourhood'] = Torontoclean_df.loc[unnamed_rows, 'Borough']
Torontoclean_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [64]:
# Re-check: per-column count of rows whose Neighbourhood still equals "Not assigned".
still_unassigned = Torontoclean_df['Neighbourhood'] == 'Not assigned'
Torontoclean_df.loc[still_unassigned].count()

Postcode         0
Borough          0
Neighbourhood    0
dtype: int64

In [65]:
# Number of distinct postcodes, i.e. how many rows the grouped table will have.
# nunique() counts them directly instead of concatenating every neighbourhood string
# per group just to count the groups.
Torontoclean_df["Postcode"].nunique()

103

### Combine the neighbourhoods that share a postcode into one comma-separated entry

In [66]:
# One row per postcode: the neighbourhood names of each group are joined with ", "
# and the result is kept as a one-column DataFrame named "Neighbourhood".
cluster_toronto = (
    Torontoclean_df
    .groupby("Postcode")["Neighbourhood"]
    .agg(', '.join)
    .to_frame()
)

In [67]:
# Preview the joined neighbourhood names, indexed by postcode.
cluster_toronto.head()

Unnamed: 0_level_0,Neighbourhood
Postcode,Unnamed: 1_level_1
M1B,"Rouge, Malvern"
M1C,"Highland Creek, Rouge Hill, Port Union"
M1E,"Guildwood, Morningside, West Hill"
M1G,Woburn
M1H,Cedarbrae


In [68]:
# Join the borough of every row in a postcode group with ", " (one entry per original row),
# then attach the result to cluster_toronto; the values line up on the shared Postcode index.
borough_by_postcode = Torontoclean_df.groupby("Postcode")["Borough"].agg(', '.join)
cluster_toronto['Borough'] = borough_by_postcode
cluster_toronto.head()

Unnamed: 0_level_0,Neighbourhood,Borough
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,"Rouge, Malvern","Scarborough, Scarborough"
M1C,"Highland Creek, Rouge Hill, Port Union","Scarborough, Scarborough, Scarborough"
M1E,"Guildwood, Morningside, West Hill","Scarborough, Scarborough, Scarborough"
M1G,Woburn,Scarborough
M1H,Cedarbrae,Scarborough


In [69]:
# Split each joined borough string at its first comma only (n=1); with expand=True the pieces
# become columns, so column 0 holds the text before that comma.
new = cluster_toronto["Borough"].str.split(',', n = 1, expand = True)

In [70]:
# Keep only the text before the first comma as the postcode's borough.
cluster_toronto["Borough"] = new[0]

In [71]:
# Put Borough ahead of Neighbourhood in the column order.
columnsTitles = ["Borough", "Neighbourhood"]
cluster_toronto = cluster_toronto.reindex(columnsTitles, axis="columns")

In [72]:
# Preview the table with Borough now listed before Neighbourhood.
cluster_toronto.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [73]:
# Dimensions of the grouped table: one row per postcode.
cluster_toronto.shape

(103, 2)

# Section 2
## Reading the latitude and longitude of each neighbourhood

In [74]:
# Load the coordinates file; header=0 discards the file's own header row so that the
# three column names given here are used instead.
lat_long_df = pd.read_csv(
    "Geospatial_Coordinates.csv",
    header=0,
    names=['Postcode', 'Latitude', 'Longitude'],
)
lat_long_df.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Sort the coordinate data and set the postcode as its index

In [75]:
# Order the coordinate rows by postcode, then make Postcode the index so the table can be
# aligned with cluster_toronto, which is indexed the same way.
lat_long_df = lat_long_df.sort_values(by=['Postcode']).set_index('Postcode')
lat_long_df.head()

Unnamed: 0_level_0,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


### Sort the neighbourhood data by postcode as well

In [76]:
# Order the grouped table by Postcode (here the name of its index rather than a column).
cluster_toronto = cluster_toronto.sort_values(by='Postcode')
cluster_toronto.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [77]:
# Add the coordinate columns; each value is matched to its row through the Postcode index
# that both tables share.
cluster_toronto = cluster_toronto.assign(
    Latitude=lat_long_df['Latitude'],
    Longitude=lat_long_df['Longitude'],
)

In [78]:
# Preview the first 11 rows, now including the Latitude and Longitude columns.
cluster_toronto.head(11)

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476
M1J,Scarborough,Scarborough Village,43.744734,-79.239476
M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Section 3

In [85]:
address = 'Toronto, Canada'

# Identify this client to the Nominatim service. Building the geocoder without a user_agent
# is what triggers geopy's deprecation warning, and newer geopy releases refuse to create it.
geolocator = Nominatim(user_agent="toronto_neighbourhood_explorer")
location = geolocator.geocode(address, timeout=20)
if location is None:
    # geocode() gives back None when the service finds no match for the address.
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [86]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per row of cluster_toronto, with a popup reading
# "<neighbourhoods>, <borough>".
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighborhood in zip(*(cluster_toronto[name] for name in marker_columns)):
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Leaving the map as the last expression makes the notebook render it.
map_toronto