In [1]:
from bs4 import BeautifulSoup
import requests

## 1. Web Scrapping 

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(source.content,'lxml')

In [3]:
#print(soup.prettify())

In [4]:
toronto_table = soup.find('table',{'class':'wikitable sortable'}) 
#print(toronto_table.prettify())

In [5]:
with open('toronto.txt','w') as r :
    for header in toronto_table.find_all('th'):
        for row in toronto_table.find_all('tr'):
            for cell in row.find_all('td'):
                r.write(cell.text.ljust(30))
            r.write('\n')

In [6]:
import pandas as pd 
data = pd.read_fwf('toronto.txt')
data.columns = ['postcode','borough','neighbourhood']

In [7]:
print(data)

     postcode           borough        neighbourhood
0         M1A      Not assigned         Not assigned
1         NaN               NaN                  NaN
2         M2A      Not assigned         Not assigned
3         NaN               NaN                  NaN
4         M3A        North York            Parkwoods
5         NaN               NaN                  NaN
6         M4A        North York     Victoria Village
7         NaN               NaN                  NaN
8         M5A  Downtown Toronto         Harbourfront
9         NaN               NaN                  NaN
10        M5A  Downtown Toronto          Regent Park
11        NaN               NaN                  NaN
12        M6A        North York     Lawrence Heights
13        NaN               NaN                  NaN
14        M6A        North York       Lawrence Manor
15        NaN               NaN                  NaN
16        M7A      Queen's Park         Not assigned
17        NaN               NaN               

In [8]:
data = data.dropna()

In [9]:
data.reset_index(drop=True)

Unnamed: 0,postcode,borough,neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [10]:
data = data[data['borough'] != 'Not assigned'].reset_index(drop=True)

In [11]:
data.head()

Unnamed: 0,postcode,borough,neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [12]:
data.shape

(633, 3)

In [13]:
data.loc[data['neighbourhood']=="Not assigned",'neighbourhood']=data.loc[data['neighbourhood']=="Not assigned",'borough']

In [14]:
data['postcode'].value_counts()

M8Y    24
M9V    24
M5V    21
M9B    15
M8Z    15
M4V    15
M1V    12
M9R    12
M6M    12
M9C    12
M1C     9
M5R     9
M2J     9
M5J     9
M1E     9
M6L     9
M5H     9
M1K     9
M5T     9
M1L     9
M1M     9
M8X     9
M3H     9
M6K     9
M1P     9
M8V     9
M1T     9
M6P     6
M5K     6
M8W     6
       ..
M1X     3
M4E     3
M9A     3
M9L     3
M1G     3
M9P     3
M4W     3
M2P     3
M4G     3
M3B     3
M2R     3
M6C     3
M4H     3
M2H     3
M7R     3
M5W     3
M1H     3
M9N     3
M9W     3
M6E     3
M1J     3
M4M     3
M6B     3
M4N     3
M4R     3
M5E     3
M3N     3
M7Y     3
M2N     3
M4Y     3
Name: postcode, Length: 103, dtype: int64

In [15]:
df = data.groupby("postcode").agg(lambda x:','.join(set(x)))
df.head()
print(df.shape)

(103, 2)


## 2. Pandas method

In [16]:
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',headers=header)
tables = pd.read_html(source.text)

In [17]:
data1 = pd.DataFrame(tables[0])
data1.head()
data1 = data1[data1['Borough']!= 'Not assigned']
data1.shape
data1 = data1.groupby('Postcode').agg(lambda x :','.join(set(x)))
data1.loc[data1['Neighbourhood']=="Not assigned",'Neighbourhood']=data1.loc[data1['Neighbourhood']=="Not assigned",'Borough']

In [18]:
data1.shape

(103, 2)

In [19]:
data1 = data1.reset_index()
data1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Highland Creek,Port Union,Rouge Hill"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [20]:
# getting the geo_data file from website since I was unable to extract them from the code given by coursera.
geo_data = pd.read_csv('Geospatial_Coordinates.csv')

In [21]:
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
data1['Latitude'] = geo_data['Latitude']
data1['Longitude'] = geo_data['Longitude']
data1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Port Union,Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Plotting Toronto Map

In [23]:
from geopy.geocoders import Nominatim

In [24]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of toronto are 43.653963, -79.387207.


In [25]:
import folium
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude,longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(data1['Latitude'], data1['Longitude'],data1['Borough']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto