<b>This notebook scrapes data (Toronto neighborhoods) from a Wikipedia page and converts it into a clean pandas dataframe.</b>

<i> (1) Import libraries</i>

In [28]:
# import libraries
 
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup


<i> (2) Scrape the data</i>

In [29]:
# Scrape the data from the wiki page into a BeautifulSoup <table> element.

WIKI_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
res = requests.get(WIKI_URL, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were the article.
res.raise_for_status()

soup = BeautifulSoup(res.content, 'lxml')

# The first <table> in the document is used as the postal-code table.
table = soup.find('table')
if table is None:
    # Without this guard the failure would only surface later as an
    # AttributeError on `table.find_all`.
    raise ValueError("No <table> element found at " + WIKI_URL)

<i> (3) Clean the data</i>

In [30]:
# Clean the table: walk every <tr>, read its first three <td> cells
# (postal code, borough, neighborhood) and keep only the usable rows.

Postcode     = []
Borough      = []
Neighborhood = []

for i in table.find_all('tr'):

    cells = i.find_all('td')

    # Rows without three data cells (e.g. a header row made of <th> cells)
    # cannot be turned into a record.
    if len(cells) < 3:
        continue

    # Strip every field so surrounding whitespace/newlines in the cell text
    # neither end up in the data nor defeat the 'Not assigned' comparison.
    Postcode_     = cells[0].text.strip()
    Borough_      = cells[1].text.strip()
    Neighborhood_ = cells[2].text.strip()

    # Skip the whole row as soon as any field is the placeholder value.
    if 'Not assigned' in (Postcode_, Borough_, Neighborhood_):
        continue

    # Keep only rows whose borough AND neighborhood cells both contain a link.
    # The link lookup is done on this row's own cells, so nothing carries over
    # from a previous row.
    # NOTE(review): this also discards rows whose borough/neighborhood is real
    # but simply not hyperlinked on the page -- confirm that is intended.
    if cells[1].find('a') is None or cells[2].find('a') is None:
        continue

    Postcode.append(Postcode_)
    Borough.append(Borough_)
    Neighborhood.append(Neighborhood_)

# Column-oriented mapping consumed by the dataframe construction step.
raw_boroughs_table = {'PostalCode':Postcode, 'Borough':Borough, 'Neighborhood':Neighborhood}

<i> (4) Wrap the data to a dataframe</i>

In [31]:
# Turn the scraped column lists into a dataframe, save a CSV copy, and preview it.

# The dict keys of raw_boroughs_table become the column names.
df = pd.DataFrame(raw_boroughs_table)

# Written with the default settings, so the row index is stored as the first CSV column.
df.to_csv('toronto_part1.csv')

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


<i> (5) Combine neighborhoods that share a postal code</i>

In [32]:
# Group duplicate postal codes: collapse all rows that share a PostalCode into
# one row whose Neighborhood is the comma-separated list of the originals.

# Every column other than the grouping key keeps the value from the first row
# of its group; Neighborhood is joined instead.
aggregations = {col: 'first' for col in df.columns if col != 'PostalCode'}
aggregations['Neighborhood'] = ', '.join

# sort=False keeps postal codes in order of first appearance, and grouping by
# key (rather than relying on neighbouring rows) merges duplicates correctly
# even when they are not adjacent. as_index=False returns PostalCode as a
# regular column on a fresh 0..n-1 index.
df = df.groupby('PostalCode', sort=False, as_index=False).agg(aggregations)

<i> (6) Describe the output dataframe</i>

In [33]:
# Preview the first rows of df to check the result of the grouping step.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [34]:
# (number of rows, number of columns) of df.
df.shape

(77, 3)

<i>(7) Add geo coordinates </i>

In [54]:
# Add geo coordinates: look up Latitude/Longitude for every postal code in df
# from the geospatial CSV.

csv_path = 'http://cocl.us/Geospatial_data'
geo_df = pd.read_csv(csv_path)

# Index the coordinates by postal code for direct lookup. If a code appears
# more than once, the first occurrence is used.
coords = geo_df.drop_duplicates('Postal Code').set_index('Postal Code')

# Fail with an explicit message naming the offending codes rather than an
# opaque IndexError (or silently missing coordinates).
missing = sorted(set(df['PostalCode']) - set(coords.index))
if missing:
    raise ValueError("No coordinates found for postal codes: {}".format(missing))

# Assigning by column name makes the cell safe to re-run: existing
# Latitude/Longitude columns are simply overwritten.
df['Latitude'] = df['PostalCode'].map(coords['Latitude'])
df['Longitude'] = df['PostalCode'].map(coords['Longitude'])

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
