# Toronto Web Scraping Code

## Get the Source from WikiPedia page

## Imports

In [9]:
# standard computing libraries
import pandas as pd
import numpy as np
import json

# webscraping
from bs4 import BeautifulSoup
import requests

# geocoder for retrieving coordinates of postcodes
import geocoder

# k-means clustering
from sklearn.cluster import KMeans

# folium for maps
import folium

# geopandas for advanced geojson handling
import geopandas

# library to access overpass api in a more convenient way
import overpy

# ipython command to use matplotlib
%matplotlib inline

ModuleNotFoundError: No module named 'geocoder'

In [10]:
!pip install geocoder


Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 12.2MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [11]:
# standard computing libraries
import pandas as pd
import numpy as np
import json

# webscraping
from bs4 import BeautifulSoup
import requests

# k-means clustering
from sklearn.cluster import KMeans

# folium for maps
import folium

# geopandas for advanced geojson handling
import geopandas

# library to access overpass api in a more convenient way
import overpy

# ipython command to use matplotlib
%matplotlib inline

ModuleNotFoundError: No module named 'folium'

In [12]:
! pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 6.8MB/s eta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [13]:
# geopandas for advanced geojson handling
import geopandas

# library to access overpass api in a more convenient way
import overpy

# ipython command to use matplotlib
%matplotlib inline

ModuleNotFoundError: No module named 'geopandas'

In [14]:
! pip install geopandas

Collecting geopandas
[?25l  Downloading https://files.pythonhosted.org/packages/83/c5/3cf9cdc39a6f2552922f79915f36b45a95b71fd343cfc51170a5b6ddb6e8/geopandas-0.7.0-py2.py3-none-any.whl (928kB)
[K     |████████████████████████████████| 931kB 15.3MB/s eta 0:00:01
Collecting fiona (from geopandas)
[?25l  Downloading https://files.pythonhosted.org/packages/ec/20/4e63bc5c6e62df889297b382c3ccd4a7a488b00946aaaf81a118158c6f09/Fiona-1.8.13.post1-cp36-cp36m-manylinux1_x86_64.whl (14.7MB)
[K     |████████████████████████████████| 14.7MB 27.3MB/s eta 0:00:01
[?25hCollecting pyproj>=2.2.0 (from geopandas)
[?25l  Downloading https://files.pythonhosted.org/packages/ce/37/705ee471f71130d4ceee41bbcb06f3b52175cb89273cbb5755ed5e6374e0/pyproj-2.6.0-cp36-cp36m-manylinux2010_x86_64.whl (10.4MB)
[K     |████████████████████████████████| 10.4MB 12.1MB/s eta 0:00:01
[?25hCollecting shapely (from geopandas)
[?25l  Downloading https://files.pythonhosted.org/packages/20/fa/c96d3461fda99ed8e82ff0b219ac2c83

In [15]:
postcode = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [16]:

postcodesoup = BeautifulSoup(postcode.content)

In [17]:
list_postcodes = list()

for tr in postcodesoup.find_all("tr")[2:-5]:
    
    row = (td.text for td in tr.find_all("td"))
    
    list_postcodes.append(row)

## use pandas to read the table into a pandas dataframe.

In [18]:

df_raw = pd.DataFrame(list_postcodes, columns=["Postcode","Borough","Neighbourhood"])

In [19]:
def concatNeighbourhood(row):
    
    return pd.Series(dict(Borough = row['Borough'].unique()[0],
                        Neighbourhood = ', '.join(row['Neighbourhood'])))

In [20]:
df_post = (df_raw
            .query('Borough != "Not assigned"')
            .assign(Neighbourhood = lambda x: x["Neighbourhood"].str[:-1].replace('Not assigned', x["Borough"]))
            .groupby('Postcode').apply(concatNeighbourhood)
            .reset_index()
)

df_post.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B\n,Scarborough\n,Malvern / Rouge
1,M1C\n,Scarborough\n,Rouge Hill / Port Union / Highland Creek
2,M1E\n,Scarborough\n,Guildwood / Morningside / West Hill
3,M1G\n,Scarborough\n,Woburn
4,M1H\n,Scarborough\n,Cedarbrae


In [21]:
df_post.shape

(178, 3)

In [22]:
df_post.head(30)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B\n,Scarborough\n,Malvern / Rouge
1,M1C\n,Scarborough\n,Rouge Hill / Port Union / Highland Creek
2,M1E\n,Scarborough\n,Guildwood / Morningside / West Hill
3,M1G\n,Scarborough\n,Woburn
4,M1H\n,Scarborough\n,Cedarbrae
5,M1J\n,Scarborough\n,Scarborough Village
6,M1K\n,Scarborough\n,Kennedy Park / Ionview / East Birchmount Park
7,M1L\n,Scarborough\n,Golden Mile / Clairlea / Oakridge
8,M1M\n,Scarborough\n,Cliffside / Cliffcrest / Scarborough Village West
9,M1N\n,Scarborough\n,Birch Cliff / Cliffside West
