# Extract the neighborhood data of Toronto from Wikipedia

### Last week, we learned to download a dataset from the Foursquare API. This time, let's scrape the information from Wikipedia and group it

First, let's import the libraries we need

In [1]:
import urllib
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
# Confirmation message shown once every import in this cell has succeeded.
print('libraries imported successfully')

libraries imported successfully


Now, we are going to use Beautiful Soup to scrape the data from the URL

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Open the response as a context manager so the HTTP connection is released
# as soon as the HTML has been parsed.
with urllib.request.urlopen(url) as page:
    # make soup
    soup = BeautifulSoup(page, 'html.parser')
tables = soup.select('table')
if not tables:
    # Fail with a clear message instead of an IndexError on tables[0] below.
    raise ValueError('No <table> element found at {}'.format(url))
# Rows of the first table on the page, which the next cell treats as the
# postal-code listing.
table = tables[0].find_all('tr')

In [3]:
postcodes = []  # create lists to hold each column of the table
broughs = []
neighborhoods = []
for row in tables[0].find_all('tr'):  # visit each row of the first table
    cells = row.find_all('td')        # look the cells up once per row
    if len(cells) < 3:
        # Rows without <td> cells (and any row too short to hold the three
        # expected columns) carry no data, so skip them.
        continue
    # get_text() collects the full text of the cell even when it is wrapped
    # in nested markup, and always yields a plain str; strip() removes the
    # trailing '\n' and any surrounding whitespace.
    postcode, brough, neighborhood = (cell.get_text().strip() for cell in cells[:3])
    postcodes.append(postcode)          # the first column is the postcode
    broughs.append(brough)              # the second column is the borough
    neighborhoods.append(neighborhood)  # the third column is the neighborhood
# convert the data into a dataframe
df = pd.DataFrame({'Postcode':postcodes, 'Broughs':broughs, 'Neighborhoods':neighborhoods})

#### Let's clean the data further

In [7]:
# Drop the rows whose borough is 'Not assigned'. The explicit .copy() makes
# the result an independent frame, so the assignment below modifies it
# directly instead of a temporary slice of the original.
df = df[df.Broughs != 'Not assigned'].copy()
nh_mask = df.Neighborhoods == 'Not assigned'  # rows where the neighborhood was not assigned
# Where the neighborhood is missing, fall back to the borough name. A single
# .loc call avoids chained indexing, which raises SettingWithCopyWarning and
# does not write through under pandas copy-on-write.
df.loc[nh_mask, 'Neighborhoods'] = df.loc[nh_mask, 'Broughs']
df.shape

(211, 3)

In [8]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,Postcode,Broughs,Neighborhoods
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


We can see that the two M5A rows (index 4 and 5) have the same postcode and the same borough but different neighborhoods. We are supposed to merge them into a single row, so we are going to use groupby()

In [10]:
# Helper applied to each group produced by groupby. Credit to the forum post at
# https://codeday.me/bug/20171205/104918.html, which showed how to merge
# strings when we just want to join them together.
def f(x):
    """Collapse the rows of one group into a small summary frame.

    x is a DataFrame with 'Postcode', 'Broughs' and 'Neighborhoods' columns.
    The result holds the distinct postcodes, the distinct boroughs and all
    neighborhood names joined into one comma-separated string.
    """
    merged_names = ', '.join(x['Neighborhoods'])  # e.g. 'Rouge, Malvern'
    summary = {
        'Postcode': np.unique(x['Postcode']),  # distinct postcodes in the group
        'Broughs': np.unique(x['Broughs']),    # distinct boroughs in the group
        'Neighborhoods': merged_names,
    }
    return pd.DataFrame(summary)

In [12]:
df_group.shape

(103, 3)

In [11]:
df_group = df.groupby('Postcode').apply(f)

In [13]:
# Discard the index left over from groupby().apply() and renumber the rows
# from 0. Reassigning instead of using inplace=True keeps the step explicit.
df_group = df_group.reset_index(drop=True)

Now we know there are 103 postcodes once we delete the rows with unassigned boroughs and fill in the unassigned neighborhoods

Let's print the first 5 postcodes and neighborhoods

In [16]:
# Preview the first rows of the grouped frame.
df_group.head()

Unnamed: 0,Postcode,Broughs,Neighborhoods
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## The second task is to find the latitude and longitude

First, we would like to use the Google API

In [31]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K    100% |████████████████████████████████| 102kB 6.8MB/s 
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [None]:
# let's import the libraries
import geocoder

# Upper bound on lookups per postal code, so a code that never resolves
# cannot hang the notebook.
MAX_GEOCODE_ATTEMPTS = 3

latitude = []
longitude = []
for postal_code in df_group["Postcode"]:
    lat_lng_coords = None
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        # Store the result under the same name that is checked below; the
        # earlier version assigned to a differently spelled variable, so its
        # retry loop could never finish.
        lat_lng_coords = g.latlng
        if lat_lng_coords:  # a failed lookup is expected to give None or an empty list
            break
    if not lat_lng_coords:
        # Keep both lists aligned with df_group even when the lookup failed.
        lat_lng_coords = (np.nan, np.nan)
    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])

Oops! It seems the geocoder is not stable. Let's explore another way. We once used another API named Nominatim; maybe it works

In [None]:
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Build the geocoder once instead of on every attempt, and throttle it to one
# request per second. RateLimiter also retries failed requests a limited
# number of times and then returns None rather than raising.
geolocator = Nominatim(user_agent="To_explorer")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

latitude = []
longitude = []
for postal_code in df_group["Postcode"]:
    address = '{}, Toronto, Ontario'.format(postal_code)
    location = geocode(address)
    if location is None:
        # The address could not be resolved: record NaN so the lists stay
        # aligned with df_group instead of retrying forever.
        latitude.append(np.nan)
        longitude.append(np.nan)
    else:
        # Append to the lists; the earlier version rebound the list names
        # to single floats and so lost every previous result.
        latitude.append(location.latitude)
        longitude.append(location.longitude)
    print(latitude[-1], longitude[-1])

OK, we failed again. Anyway, we have a backup: we can load the data from https://cocl.us/Geospatial_data

In [43]:
# Fallback source: a CSV with one latitude/longitude pair per postal code.
geo_csv_url = 'https://cocl.us/Geospatial_data'
df_to = pd.read_csv(geo_csv_url)
df_to.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Great. Let's merge these two dataframes

In [44]:
# pd.concat(axis=1) pairs rows purely by index label, never by postal code.
# Check that both frames share the same index and list the same codes in the
# same order; otherwise coordinates would be silently attached to the wrong
# neighborhoods.
assert df_group.index.equals(df_to.index), \
    'df_group and df_to have different indexes; reset them before concatenating'
assert df_group['Postcode'].tolist() == df_to['Postal Code'].tolist(), \
    'Postal codes are not in the same order in df_group and df_to'
df_toronto = pd.concat([df_group, df_to], axis = 1, join = 'inner')
df_toronto.head()

Unnamed: 0,Postcode,Broughs,Neighborhoods,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [48]:
# Both source frames contributed a postal-code column; keep 'Postcode' and
# drop the duplicate 'Postal Code'. Reassigning (instead of inplace=True)
# together with errors='ignore' lets this cell be re-run without a KeyError
# once the column is already gone.
df_toronto = df_toronto.drop(columns=["Postal Code"], errors="ignore")

Let's have a look again

In [49]:
# Show the first ten rows of the combined frame.
df_toronto.head(10)

Unnamed: 0,Postcode,Broughs,Neighborhoods,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [50]:
# (rows, columns) of the combined frame.
df_toronto.shape

(103, 5)

Congrats! We made it. In summary, what we did in this section is combine the two existing dataframes.
The concat method has several parameters: the first is the list of objects to combine; the second is axis, where axis=1 places the frames side by side (adding columns) and axis=0 stacks them on top of each other (adding rows); join selects the intersection ('inner') or the union ('outer') of the labels on the other axis.