# Scraping from the web

### Importing Libraries

In [44]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import numpy as np

### Get the destination page

In [21]:
#Downloading the Wikipedia page that lists the postal codes
#timeout stops the cell from hanging indefinitely on a stalled connection;
#raise_for_status() raises on a 4xx/5xx answer so an error page is never
#handed to the parser
page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
page.raise_for_status()
print(page)

<Response [200]>


### Parsing the web page and getting table

In [22]:
#Parsing the downloaded page content with the 'html.parser' parser
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [50]:
#Getting the table
table = soup.find('table', {"class" : "wikitable sortable"})
#find() returns None when no element matches; failing here gives a clear
#message instead of an AttributeError when the table is first used
if table is None:
    raise ValueError("No 'wikitable sortable' table found on the page")

In [24]:
#Collecting the stripped cell text of every table row that has exactly 3 <td>
#cells; rows with any other number of <td> cells are skipped
Toronto = []
for tr in table.find_all('tr'):
    cells = tr.find_all('td')
    if len(cells) != 3:
        continue
    Toronto.append(tuple(cell.text.strip() for cell in cells))


### Creating arrays and data frame

In [25]:
#Converting the list to an array
#Toronto is a list of 3-tuples of strings at this point; np.asarray turns it
#into a NumPy array and the name Toronto is rebound to that array
Toronto = np.asarray(Toronto)

In [26]:
#Wrapping the array in a data frame; the columns get the default labels 0, 1, 2
Toronto = pd.DataFrame(data=Toronto)
#Printing the (rows, columns) shape, then previewing the first rows
print(Toronto.shape)
Toronto.head()

(289, 3)


Unnamed: 0,0,1,2
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [27]:
#Naming the columns
#The three labels replace the default 0, 1, 2 column labels, in that order;
#assigning to .columns modifies the existing data frame in place
names = ['PostalCode', 'Borough', 'Neighborhood']
Toronto.columns = names
Toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Handling the Not assigned values

In [28]:
#Dropping the values with Not assigned Borough
#.copy() makes the filtered result an independent data frame, so that writing
#to it later with .loc does not raise pandas' SettingWithCopyWarning
Toronto = Toronto[Toronto.Borough != 'Not assigned'].copy()
print(Toronto.shape)
print(Toronto.head())

(212, 3)
  PostalCode           Borough      Neighborhood
2        M3A        North York         Parkwoods
3        M4A        North York  Victoria Village
4        M5A  Downtown Toronto      Harbourfront
5        M5A  Downtown Toronto       Regent Park
6        M6A        North York  Lawrence Heights


In [29]:
#Listing the rows whose Neighborhood is still 'Not assigned'
Toronto[Toronto['Neighborhood'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Not assigned


In [30]:
#Copying the Borough into Neighborhood wherever Neighborhood is 'Not assigned'
#The boolean mask is computed once and used on both sides of the assignment
unassigned = Toronto['Neighborhood'] == 'Not assigned'
Toronto.loc[unassigned, 'Neighborhood'] = Toronto.loc[unassigned, 'Borough']

In [31]:
#Checking for the not assigned row
#Label 8 is the index of the single row listed by the 'Not assigned'
#Neighborhood check earlier in the notebook; it is hard-coded, so this lookup
#only works while that row keeps label 8
Toronto.loc[8]

PostalCode               M7A
Borough         Queen's Park
Neighborhood    Queen's Park
Name: 8, dtype: object

In [32]:
#Collapsing the rows to one per PostalCode: the first Borough of each group is
#kept and the group's Neighborhood values are joined with ', '
Toronto1 = (
    Toronto
    .groupby('PostalCode')
    .agg({'Borough': 'first', 'Neighborhood': ', '.join})
    .reset_index()
)

### Final look of the Data Frame

In [33]:
#Shape of the grouped data frame as (rows, columns)
Toronto1.shape

(103, 3)

In [34]:
#storing the variable to use in other notebooks
#NOTE(review): this stores Toronto, the frame from before the groupby, not the
#grouped Toronto1 - confirm which one the other notebooks expect
%store Toronto

Stored 'Toronto' (DataFrame)


In [35]:
#Previewing the first 12 rows of the grouped data frame
Toronto1.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [37]:
#Shape of the grouped data frame again; same expression as the earlier
#Toronto1.shape cell
Toronto1.shape

(103, 3)