In [2]:
# install beautifulsoup
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [1]:
# import beautifulsoup library
from bs4 import BeautifulSoup as bs
import requests

In [2]:
# get the table data from wiki page
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url)
soup = bs(response.text, 'html.parser')
print(soup.title)
#soup.find('table').find_all('tr')[1].find_all('td')[0].string

<title>List of postal codes of Canada: M - Wikipedia</title>


In [3]:
# create a empty dataframe with three columns as below
import pandas as pd
column_name =['PostalCode','Borough','Neighborhood']
neighborhoods = pd.DataFrame(columns = column_name)
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood


In [4]:
#  get the table data from wiki page and store it to new data frame
listcode=[]
for row in soup.find('table',{'class': 'wikitable sortable'}).find_all('tr'):
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    
    if len(cols)>0:
        #print (cols)
        listcode.append([ele for ele in cols if ele])

df = pd.DataFrame(listcode)
#print (df)
neighborhoods[['PostalCode','Borough','Neighborhood']]=pd.DataFrame(df.values)

In [5]:
# display new dataframe 
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [6]:
# remove records which with a borough as 'Not assigned' 
neighborhoods = neighborhoods[neighborhoods['Borough'] != 'Not assigned']
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [7]:
# combine neigborhood column data where borough are same with a comma
neighborhoods['Neighborhood']=neighborhoods.groupby('PostalCode')['Neighborhood'].transform(lambda x: ','.join(x))
neighborhoods= neighborhoods[['PostalCode','Borough','Neighborhood']].drop_duplicates()
neighborhoods.reset_index(inplace =True, drop=True)
neighborhoods.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


In [8]:
# select records which has a borough but a Not assigned neighborhood, 
# then the neighborhood will be the same as the borough

for i, rw in neighborhoods.iterrows():
    #print (rw)
    if rw['Neighborhood'] == 'Not assigned':
        rw['Neighborhood']=rw['Borough']

In [9]:
neighborhoods.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [10]:
# Check the datafrome shape
neighborhoods.shape

(103, 3)

In [18]:
import pandas as pd

# Read the dataset given in Geospatial_coordinates.csv file
Geospatial_data = pd.read_csv("Geospatial_Coordinates.csv")
Geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
# Filter the Geospatial_data dataframe with the postal code present in our neighborhoods dataframe
input_data= Geospatial_data[Geospatial_data['Postal Code'].isin(neighborhoods['PostalCode'].tolist())]
input_data.shape

(103, 3)

In [26]:
# Check out the Geospatial_data set after filter
input_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [33]:
# Now merge both data set to get latitude and longitude for each neighborhoods in neighborhoods dataframe.
neighborhoods_data= neighborhoods.merge(input_data, left_on = 'PostalCode', right_on ='Postal Code')
neighborhoods_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,M7A,43.662301,-79.389494


In [35]:
# Drop the Postal Code column as repeats twice
neighborhoods_data.drop(columns='Postal Code',inplace = True)
neighborhoods_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
