# IBM Data Science | Capstone Project - Web scraping
Created by: Sangwook Cheon   
Date: June 4, 2019

First let's import important libraries including Beautiful Soup, which will be used for web scraping. 

In [102]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import folium
import requests
from bs4 import BeautifulSoup
import json
# import geopy

Now load the html from a Wikipedia page containing information about postal codes of Canada.

In [1]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M".
# NOTE: the import cell at the top of the notebook must be run first, otherwise
# `requests` and `BeautifulSoup` are undefined (NameError).
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into an explicit exception
# instead of silently parsing it.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

# Uncomment to inspect the full page html.
# print(soup.prettify())

# Grab the html of the postal-code table.
raw_table = soup.find('table', class_='wikitable sortable')
if raw_table is None:
    # Fail here with a clear message rather than later with an AttributeError on None.
    raise ValueError("No table with class 'wikitable sortable' was found on the page")
# print(raw_table.prettify())

NameError: name 'requests' is not defined

Now let's process the html of the table to convert it into a DataFrame.

In [104]:

# Convert every table row (<tr>) into a [PostalCode, Borough, Neighborhood] list.
rows_all = raw_table.find_all('tr')
row_data = []

for table_row in rows_all:
    # Strip surrounding whitespace (e.g. the trailing newline) from every cell,
    # not only the last one.
    cells = [cell.text.strip() for cell in table_row.find_all('td')]
    # Keep only rows with exactly three <td> cells; this skips rows with no
    # <td> cells (presumably the header row) and avoids an IndexError on
    # malformed rows.
    if len(cells) != 3:
        continue
    postal_code, borough, neighborhood = cells
    # Postal codes without a borough are dropped.
    if borough == 'Not assigned':
        continue
    # If the neighborhood is not assigned, it takes the borough's name.
    if neighborhood == 'Not assigned':
        neighborhood = borough
    row_data.append([postal_code, borough, neighborhood])

print(row_data[0])

['M3A', 'North York', 'Parkwoods']


In [105]:
# Long-format table: one row per parsed [PostalCode, Borough, Neighborhood] entry.
data = pd.DataFrame(
    data=row_data,
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

In [106]:
# Preview the first ten parsed rows.
data.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


Now we need to process this table further:

In [107]:
# Spot-check the Neighborhood value of row 6 (postal code M7A in the preview).
# NOTE(review): presumably a row whose neighborhood was 'Not assigned' on the
# page and was filled with the borough name - confirm against the source table.
data.loc[6, 'Neighborhood']

"Queen's Park"

We need to combine neighborhoods that share the same postal code into a single row, with the neighborhood names joined by commas.

In [108]:
# Distinct postal codes, in order of first appearance in `data`.
postal_codes = data['PostalCode'].unique()
print(postal_codes)

['M3A' 'M4A' 'M5A' 'M6A' 'M7A' 'M9A' 'M1B' 'M3B' 'M4B' 'M5B' 'M6B' 'M9B'
 'M1C' 'M3C' 'M4C' 'M5C' 'M6C' 'M9C' 'M1E' 'M4E' 'M5E' 'M6E' 'M1G' 'M4G'
 'M5G' 'M6G' 'M1H' 'M2H' 'M3H' 'M4H' 'M5H' 'M6H' 'M1J' 'M2J' 'M3J' 'M4J'
 'M5J' 'M6J' 'M1K' 'M2K' 'M3K' 'M4K' 'M5K' 'M6K' 'M1L' 'M2L' 'M3L' 'M4L'
 'M5L' 'M6L' 'M9L' 'M1M' 'M2M' 'M3M' 'M4M' 'M5M' 'M6M' 'M9M' 'M1N' 'M2N'
 'M3N' 'M4N' 'M5N' 'M6N' 'M9N' 'M1P' 'M2P' 'M4P' 'M5P' 'M6P' 'M9P' 'M1R'
 'M2R' 'M4R' 'M5R' 'M6R' 'M7R' 'M9R' 'M1S' 'M4S' 'M5S' 'M6S' 'M1T' 'M4T'
 'M5T' 'M1V' 'M4V' 'M5V' 'M8V' 'M9V' 'M1W' 'M4W' 'M5W' 'M8W' 'M9W' 'M1X'
 'M4X' 'M5X' 'M8X' 'M4Y' 'M7Y' 'M8Y' 'M8Z']


In [109]:
# Start from an empty frame that only defines the three output columns.
clean_data = pd.DataFrame(
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)
clean_data

Unnamed: 0,PostalCode,Borough,Neighborhood


In [110]:
#Create a new dataframe with unique ppostal codes.
# Build a new DataFrame with one row per unique postal code.
# Records are collected in a list and the frame is created once at the end:
#   * DataFrame.append was removed in pandas 2.0,
#   * growing a frame row by row copies it on every iteration (quadratic),
#   * rebuilding `clean_data` from scratch makes this cell safe to re-run
#     (the append version duplicated every row on a second run).
records = []
# sort=False keeps the postal codes in order of first appearance in `data`.
for code, group in data.groupby('PostalCode', sort=False):
    records.append({
        'PostalCode': code,
        # The borough is taken from the first row of the group.
        'Borough': group['Borough'].iloc[0],
        # All neighborhoods sharing this postal code, joined into one string.
        'Neighborhood': ', '.join(group['Neighborhood'].tolist()),
    })

clean_data = pd.DataFrame(records, columns=['PostalCode', 'Borough', 'Neighborhood'])

In [111]:
# Preview the first ten rows of the combined table.
clean_data.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [112]:
# (rows, columns) of the combined table.
clean_data.shape

(103, 3)

In [114]:
#Save this as csv so that we can work with this cleaned data later
clean_data.to_csv('postal_codes_canada-cleaned.csv')
print('Saved as csv')

Saved as csv


Now the DataFrame is fully cleaned and is ready to be analyzed! 
