# Coursera Capstone Project 
#### Week 3 part 1 - Web Scraping
<p style="text-align: justify">This notebook scrapes the Wikipedia page <b>https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M</b> in order to obtain the data in its table of postal codes and to transform that data into a pandas DataFrame.</p>

In [1]:
# Importing the required libraries
import time
import pandas as pd
import numpy as np
import requests
from requests import get
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia page and preview the first 500 characters of its HTML.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever if the server never answers.
page = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page later.
page.raise_for_status()
print(page.text[:500])

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className=document.documentElement.className.replace(/(^|\s)client-nojs(\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":900271985,"wgRe


In [3]:
# Parse the downloaded bytes with the html5lib parser, then preview the
# first 500 characters of the pretty-printed document.
soup = BeautifulSoup(markup=page.content, features='html5lib')
print(soup.prettify()[0:500])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className=document.documentElement.className.replace(/(^|\s)client-nojs(\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionI


In [4]:
# Take the first <table> element on the page, flatten it to plain text
# and preview the first 50 characters of that text.
table = soup.find('table')
content = table.get_text()
print(content[:50])



Postcode
Borough
Neighbourhood


M1A
Not assigne


In [5]:
# Save the table text to TableData.txt.
# The encoding is pinned to UTF-8 so the file does not depend on the
# platform's default encoding and can be decoded as UTF-8 when read back.
with open('TableData.txt', 'w', encoding='utf-8') as table_file:
    table_file.write(content)

In [6]:
# Load TableData.txt into a one-column DataFrame (column label 0) holding one
# row per non-blank line of the file.
# pd.read_csv(..., sep="\n") is rejected by recent pandas releases, which do
# not accept the line terminator as a separator, so the lines are read directly.
with open('TableData.txt', encoding='utf-8') as table_file:
    # Blank (or whitespace-only) lines carry no table cell and are skipped.
    table_lines = [line.rstrip('\n') for line in table_file if line.strip()]
# The explicit {0: ...} keeps the column label 0 even if the file is empty.
df = pd.DataFrame({0: table_lines})
df.head(10)

Unnamed: 0,0
0,Postcode
1,Borough
2,Neighbourhood
3,M1A
4,Not assigned
5,Not assigned
6,M2A
7,Not assigned
8,Not assigned
9,M3A


In [7]:
# Pull the frame's single column (label 0) out as a plain Python list
list1 = df.loc[:, 0].tolist()

In [8]:
# Split the flat list back into the table's three columns.
# The cells arrive in row order (Postcode, Borough, Neighbourhood, Postcode, ...),
# so each column is every third item starting at offset 0, 1 and 2 respectively.
if len(list1) % 3 != 0:
    # A ragged tail would give the three columns different lengths.
    raise ValueError(
        "expected a multiple of 3 table cells, got %d" % len(list1)
    )
col1 = list1[0::3]
col2 = list1[1::3]
col3 = list1[2::3]

In [9]:
# Assemble the TableData frame from the three column lists. The first item of
# each list is the header cell scraped from the page, so it is skipped.
# The last line previews the first rows.
TableData = pd.DataFrame({
    'PostalCode': col1[1:],
    'Borough': col2[1:],
    'Neighbourhood': col3[1:],
})
TableData.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [10]:
# Keep only the rows whose Borough is something other than 'Not assigned'
TableData = TableData.loc[TableData['Borough'].ne('Not assigned')]

In [11]:
# Renumber the rows from 0. With drop=True the previous row labels are
# discarded instead of being kept as an extra column.
TableData = TableData.reset_index(drop=True)

In [12]:
# Where a row's Neighbourhood is 'Not assigned', use its Borough name instead
TableData['Neighbourhood'] = TableData['Neighbourhood'].mask(
    TableData['Neighbourhood'] == 'Not assigned',
    TableData['Borough'],
)

In [13]:
# Collect every Neighbourhood sharing a (PostalCode, Borough) pair into one list
output = (
    TableData
    .groupby(['PostalCode', 'Borough'])['Neighbourhood']
    .apply(list)
)

In [14]:
# Turn the grouped Series back into a flat DataFrame: the PostalCode and
# Borough index levels become ordinary columns next to Neighbourhood.
# The last line previews the first rows.
TableData = output.reset_index()
TableData.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]


In [15]:
# Render each row's list of neighbourhoods as one comma-separated string.
# Joining the names directly, rather than stripping "[", "]" and "'" from the
# list's string form, keeps apostrophes that belong to a name (for example a
# name like "Queen's Park") and leaves no stray double quotes behind.
# Values that are already strings pass through unchanged, so re-running this
# cell is harmless.
TableData['Neighbourhood'] = TableData['Neighbourhood'].map(
    lambda names: names if isinstance(names, str) else ', '.join(map(str, names))
)
TableData.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [16]:
# Final size check: .shape is the (rows, columns) pair of the TableData frame
TableData.shape

(103, 3)