In [1]:
#pip install requests BeautifulSoup4

In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

In [2]:
# read the web page
import requests

source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source , 'lxml')

In [3]:
# read the table into dataframe
# after inspecting the webpage, the required table has attrs with class 'wikitable sortable'

import pandas as pd

table = soup.find('table', attrs={'class':'wikitable sortable'})
table_rows = table.find_all('tr')
l = []

for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
    
neighborhoud_table = pd.DataFrame(l, columns=["A", "B","C"]);

In [4]:
# rename the tables
neighborhoud_table.rename(columns={'A':'Postal Code','B':'Borough', "C":'Neighborhoud'}, inplace=True);

In [5]:
print(list(neighborhoud_table.columns))
print (neighborhoud_table.shape)

['Postal Code', 'Borough', 'Neighborhoud']
(181, 3)


In [6]:
neighborhoud_table.head()

Unnamed: 0,Postal Code,Borough,Neighborhoud
0,,,
1,M1A\n,Not assigned\n,\n
2,M2A\n,Not assigned\n,\n
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n


In [7]:
# drop rows that contains all nulls
modified_neighborhoud = neighborhoud_table.dropna(axis=0 , how='all');
print (modified_neighborhoud.shape)

(180, 3)


In [8]:
#neighborhoud_table.drop(labels='0' , axis=0)
modified_neighborhoud.head()

Unnamed: 0,Postal Code,Borough,Neighborhoud
1,M1A\n,Not assigned\n,\n
2,M2A\n,Not assigned\n,\n
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n
5,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n


In [9]:
# drop rows where Borough is not assigned
indexNames = modified_neighborhoud[ modified_neighborhoud['Borough'] == 'Not assigned\n'].index
print(indexNames.shape)
# Delete these row indexes from dataFrame
modified_neighborhoud.drop(indexNames , inplace=True)
print (modified_neighborhoud.shape)

(77,)
(103, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [10]:
modified_neighborhoud.head()

Unnamed: 0,Postal Code,Borough,Neighborhoud
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n
5,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n
6,M6A\n,North York\n,Lawrence Manor / Lawrence Heights\n
7,M7A\n,Downtown Toronto\n,Queen's Park / Ontario Provincial Government\n


In [11]:
# remove '\n' characters from the dataframe
modified_neighborhoud['Postal Code'] = modified_neighborhoud['Postal Code'].str.replace('\n','');
modified_neighborhoud['Borough'] = modified_neighborhoud['Borough'].str.replace('\n','');
modified_neighborhoud['Neighborhoud'] = modified_neighborhoud['Neighborhoud'].str.replace('\n','');
modified_neighborhoud.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,Postal Code,Borough,Neighborhoud
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront
6,M6A,North York,Lawrence Manor / Lawrence Heights
7,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [12]:
# replace / with , in neighborhoud column
modified_neighborhoud['Neighborhoud'] = modified_neighborhoud['Neighborhoud'].str.replace('/',',');
modified_neighborhoud.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Postal Code,Borough,Neighborhoud
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park , Harbourfront"
6,M6A,North York,"Lawrence Manor , Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [13]:
indexNames = modified_neighborhoud[ modified_neighborhoud['Neighborhoud'] == 'Not assigned\n'].index
print(indexNames)

Int64Index([], dtype='int64')


In [14]:
# shape of the dataframe
print (modified_neighborhoud.shape)

(103, 3)


In [15]:
# reading Geospatial_Coordinates file
Geospatial_Coordinates = pd.read_csv('Geospatial_Coordinates.csv')
print(Geospatial_Coordinates.shape)
Geospatial_Coordinates.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
# reordering neighborhoud table to be the same order with Geospatial_Coordinates table
reordered_neighborhoud = modified_neighborhoud.sort_values(by=['Postal Code'])
reordered_neighborhoud.head()

Unnamed: 0,Postal Code,Borough,Neighborhoud
10,M1B,Scarborough,"Malvern , Rouge"
19,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
28,M1E,Scarborough,"Guildwood , Morningside , West Hill"
37,M1G,Scarborough,Woburn
46,M1H,Scarborough,Cedarbrae


In [20]:
# Concatinating Latitude and Longitude columns to the neighborhoud dataframe

Latitude = Geospatial_Coordinates.iloc[:,1];
Longitude = Geospatial_Coordinates.iloc[:,2];


Latitude.reset_index(drop=True, inplace=True)
Longitude.reset_index(drop=True, inplace=True)
reordered_neighborhoud.reset_index(drop=True, inplace=True)


final_neighborhoud = pd.concat([reordered_neighborhoud,Latitude,Longitude], axis=1);
final_neighborhoud.head()


Unnamed: 0,Postal Code,Borough,Neighborhoud,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [21]:
print(final_neighborhoud.shape)

(103, 5)
