In [2]:
#importing some important packages
import requests 
import lxml.html as lh
import bs4 as bs
import urllib.request

import numpy as np
import pandas as pd

In [3]:
url ='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [20]:
# Using BS4 as suggested in Assignment.

def scrape_table_bs4(cname,cols):
    page  = urllib.request.urlopen(url).read()
    soup  = bs.BeautifulSoup(page,'lxml')
    table = soup.find("table",class_=cname)
    header = [head.findAll(text=True)[0].strip() for head in table.find_all("th")]
    data   = [[td.findAll(text=True)[0].strip() for td in tr.find_all("td")]
              for tr in table.find_all("tr")]
    data    = [row for row in data if len(row) == cols]
   
    # Store data to this temporary dataframe
    raw_df = pd.DataFrame(data,columns=header)
    return raw_df


# Parsing using xpath
def scrape_table_lxml(XPATH,cols):
    page = requests.get(url)
    doc = lh.fromstring(page.content)
    table_content = doc.xpath(XPATH)
    for table in table_content:
        headers = [th.text_content().strip() for th in table.xpath('//th')]
        headers = headers[0:3]
        data    = [[td.text_content().strip() for td in tr.xpath('td')] 
                   for tr in table.xpath('//tbody/tr')]
        data    = [row for row in data if len(row) == cols]
        raw_df = pd.DataFrame(data,columns=headers)
        return raw_df

In [21]:
#Test in beautifulSoup
raw_Toronto_Postal_Codes = scrape_table_bs4("wikitable",3)

print("# Toronto Postal codes stored in data")
print(raw_Toronto_Postal_Codes.info(verbose=True))

# Toronto Postal codes stored in data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 3 columns):
Postcode         287 non-null object
Borough          287 non-null object
Neighbourhood    287 non-null object
dtypes: object(3)
memory usage: 6.9+ KB
None


In [22]:
#Test in lxml ( for xpath based extraction)
raw_Toronto_Postal_Codes = scrape_table_lxml("/html/body/div[3]/div[3]/div[4]/div/table[1]",3)

print("# Toronto Postal codes stored in data")
print(raw_Toronto_Postal_Codes.info(verbose=True))

# Toronto Postal codes stored in data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 3 columns):
Postcode         287 non-null object
Borough          287 non-null object
Neighbourhood    287 non-null object
dtypes: object(3)
memory usage: 6.9+ KB
None


In [23]:
# We are expected to process the cells that have an assigned borough. SO we ignore cells with a borough that is Not assigned.

Toronto_Postal_Codes=raw_Toronto_Postal_Codes[~raw_Toronto_Postal_Codes['Borough'].isin(['Not assigned'])]

# Sort and Reset index.
Toronto_Postal_Codes=Toronto_Postal_Codes.sort_values(by=['Postcode','Borough','Neighbourhood'], ascending=[1,1,1]).reset_index(drop=True)

# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

Toronto_Postal_Codes.loc[Toronto_Postal_Codes['Neighbourhood'] == 'Not assigned', ['Neighbourhood']] = Toronto_Postal_Codes['Borough']
check_unassigned_post_state_sample = Toronto_Postal_Codes.loc[Toronto_Postal_Codes['Borough'] == 'Queen\'s Park']

#Displays what was changed
print('DEBUG:',check_unassigned_post_state_sample)



DEBUG:     Postcode       Borough Neighbourhood
182      M9A  Queen's Park  Queen's Park


More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table

In [24]:
Toronto_Postal_Codes = Toronto_Postal_Codes.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()

In [25]:
Toronto_Postal_Codes

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [26]:
Toronto_Postal_Codes.shape

(103, 3)

In [27]:
Toronto_Postal_Codes.to_csv('Toronto.TASK_1_df.csv',index=False)

# Latitudes and Longitudes

Now that you have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.

In [28]:
# Reimporting dataframe from Task1

task1_data_csv = "Toronto.TASK_1_df.csv"

Toronto_Postal_Codes = pd.read_csv(task1_data_csv).set_index("Postcode")
Toronto_Postal_Codes.rename_axis("Postal Code", axis='index', inplace=True)

In [29]:
#Displaying to ensure it has been successfully imported
Toronto_Postal_Codes.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


Retrieving postcode coordinates:

In [30]:
toronto_geocsv = 'https://cocl.us/Geospatial_data'
!wget -q -O 'toronto_m.geospatial_data.csv' toronto_geocsv
geocsv_data = pd.read_csv(toronto_geocsv).set_index("Postal Code")
geocsv_data.head()

'wget' is not recognized as an internal or external command,
operable program or batch file.


Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


Combining our dataframe generated in Task1 with the latitudes and longitudes retireved in this task. The common vairable is PostalCode :

In [32]:
toronto_neighborhoods = TorontoPostalCodes.join(geocsv_data)
toronto_neighborhoods.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",,
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",,
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",,
3,M1G,Scarborough,Woburn,,
4,M1H,Scarborough,Cedarbrae,,


In [35]:
toronto_neighborhoods.shape

(103, 5)

In [36]:
toronto_neighborhoods.to_csv('Toronto.TASK_II_df.csv',index=False)