## **Scraping a Table from a Wikipedia Page**

**Importing necessary libraries**

In [1]:

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import time
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

**Specifying the required URL/web page for scraping**

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

## **Requesting the page at the URL and storing the HTML response in the `data` variable**


**Using the BeautifulSoup library, we parse the HTML from our URL into the BeautifulSoup parse-tree format and save it in the `soup` variable**

In [3]:
# Download the page. The timeout prevents the cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error response into
# an exception instead of letting an error page be parsed as if it were data.
data = requests.get(url, timeout=30)
data.raise_for_status()

# Parse the returned HTML with the lxml parser.
soup = BeautifulSoup(data.text, "lxml")

**Using the 'find_all' function to bring back all instances of the 'table' tag in the HTML and store in 'all_tables' variable**

In [4]:
# Collect every <table> element found in the parsed page.
all_tables = soup.find_all("table")

# Fail with a clear message (rather than a bare IndexError on the next line)
# when the page contains no tables at all.
assert all_tables, "no <table> elements found in the downloaded page"

# Display the first table for visual inspection.
all_tables[0]

<table cellpadding="2" cellspacing="0" rules="all" style="width:100%; border-collapse:collapse; border:1px solid #ccc;">
<tbody><tr>
<td style="width:11%; vertical-align:top; color:#ccc;">
<p><b>M1A</b><br/><span style="font-size:80%;"><i>Not assigned</i></span>
</p>
</td>
<td style="width:11%; vertical-align:top; color:#ccc;">
<p><b>M2A</b><br/><span style="font-size:80%;"><i>Not assigned</i></span>
</p>
</td>
<td style="width:11%; vertical-align:top;">
<p><b>M3A</b><br/><span style="font-size:80%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>)</span>
</p>
</td>
<td style="width:11%; vertical-align:top;">
<p><b>M4A</b><br/><span style="font-size:80%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>)</span>
</p>
</td>
<td style="width:11%; vertical-align:top;">
<p><b>M5A</b><br/><span style="font-size:80%;"><a hr

**Checking if the table extracted is the correct one by verifying number of rows**

In [5]:
# Count the <tr> elements of the first table as a sanity check on its size.
first_table = all_tables[0]
rows = first_table.find_all("tr")
len(rows)


20

Next, we loop through the rows to get the data. The table is well structured, with 20 rows and 9 columns. I want to scrape the postal code, borough and neighbourhood. 
So, I set up two empty lists (A, B). By inspecting the HTML tree, I found that the postal code is the text under the 'b' tag, and the borough and neighbourhood are the text under the 'span' tag. 

To start with, I used the Beautiful Soup ‘find_all’ function again and set it to look for the string ‘tr’. Then I set up a FOR loop for each row within that array and set Python to loop through the rows, one by one.

Within the loop I used find_all again to search each row for `<td>` tags, using the string ‘td’. I stored these in a variable called ‘col’ and then checked that there are 9 items in the ‘col’ array (i.e. one for each of the 9 columns).

In [6]:
# A collects the postal codes (text of each cell's <b> tag);
# B collects the matching text of each cell's <span> tag.
A = []
B = []
for row in all_tables[0].find_all('tr'):
  col = row.find_all('td')
  # The grid has 9 columns; ignore any row that does not have exactly 9 cells.
  if len(col) != 9:
    continue
  for cell in col:
    code_tag = cell.find('b')
    area_tag = cell.find('span')
    # find() returns None when the tag is absent. Skip such a cell as a whole
    # so that A and B always stay the same length and aligned.
    if code_tag is None or area_tag is None:
      continue
    # Strip surrounding whitespace so exact string comparisons on these
    # values (e.g. against "Not assigned") are not defeated by stray newlines.
    A.append(code_tag.text.strip())
    B.append(area_tag.text.strip())
      


# Converting the extracted data in the list to a dataframe

**Dropping rows with Not Assigned entry in Borough(Neighbourhood) column**

In [7]:
# Pair each postal code in A with its raw "Borough(Neighbourhood)" text in B.
df = pd.DataFrame(A, columns=['PostalCode'])
df['Borough(Neighbourhood)'] = B

# Keep only the assigned postal codes. .copy() makes the filtered result an
# independent frame, so assigning new columns to it afterwards does not
# raise SettingWithCopyWarning or write to a temporary slice.
df = df[df['Borough(Neighbourhood)'] != "Not assigned"].copy()
df.head()

Unnamed: 0,PostalCode,Borough(Neighbourhood)
2,M3A,North York(Parkwoods)
3,M4A,North York(Victoria Village)
4,M5A,Downtown Toronto(Regent Park / Harbourfront)
5,M6A,North York(Lawrence Manor / Lawrence Heights)
6,M7A,Queen's Park / Ontario Provincial Government


**Refining the dataframe to get in desired shape** 

In [8]:
# Split on the first "(" only: the left part is the borough, the right part
# (still carrying the closing parenthesis) is the neighbourhood text.
borough_and_hood = df['Borough(Neighbourhood)'].str.split('(', n=1, expand=True)
df[['Borough', 'Neighbourhood']] = borough_and_hood

# The combined column is no longer needed once it has been split.
df = df.drop(columns=['Borough(Neighbourhood)'])
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods)
3,M4A,North York,Victoria Village)
4,M5A,Downtown Toronto,Regent Park / Harbourfront)
5,M6A,North York,Lawrence Manor / Lawrence Heights)
6,M7A,Queen's Park / Ontario Provincial Government,


In [9]:
# Remove the trailing ")" left over from the split and turn the "/" separators
# into commas. regex=False makes both patterns literal strings explicitly:
# ")" is a regex metacharacter and the default of `regex` differs between
# pandas versions, so relying on the default is fragile.
df['Neighbourhood'] = (
    df['Neighbourhood']
    .str.replace(')', "", regex=False)
    .str.replace('/', ",", regex=False)
)

In [10]:
# Preview the frame after the Neighbourhood text clean-up.
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Queen's Park / Ontario Provincial Government,


**Checking if we have borough which has corresponding neighbourhood empty and copying the borough entry to the empty cell in the neighbourhood column**

In [11]:
# Number of rows whose Neighbourhood value is missing.
df["Neighbourhood"].isna().sum()

1

In [12]:
# Where Neighbourhood is missing, fall back to the Borough value of the same row.
df["Neighbourhood"] = df["Neighbourhood"].fillna(df["Borough"])

In [13]:
# Preview the frame after missing Neighbourhood values were filled from Borough.
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Queen's Park / Ontario Provincial Government,Queen's Park / Ontario Provincial Government


**Final shape of the dataframe obtained**

In [14]:
# (rows, columns) of the cleaned frame.
df.shape

(103, 3)

In [15]:
! pip install geocoder



In [16]:
import geocoder # import geocoder


def get_lat_lng(postal_code, max_attempts=5):
  """Look up the coordinates of a single Toronto postal code.

  Parameters
  ----------
  postal_code : str
      One postal code, e.g. the value of a single cell of df['PostalCode'].
  max_attempts : int
      Upper bound on the number of requests, so an unreachable or unauthorised
      service cannot make the cell loop forever.

  Returns
  -------
  The value of the geocoder result's `latlng` attribute on the first attempt
  that yields a non-empty value, or None if every attempt comes back empty.
  """
  for _ in range(max_attempts):
    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
    coords = g.latlng
    # Truthiness covers both None and an empty list as "no result".
    if coords:
      return coords
  return None


# Query ONE postal code. Formatting the whole df['PostalCode'] Series into the
# address (as this cell did before) sends the Series' printed representation
# to the service instead of a postal code.
lat_lng_coords = get_lat_lng(df['PostalCode'].iloc[0])

# Keep the scalar names this cell has always defined; both are None when the
# lookup failed.
if lat_lng_coords:
  latitude = lat_lng_coords[0]
  longitude = lat_lng_coords[1]
else:
  latitude = None
  longitude = None

**Merging GeoSpatial Coordinates to the main dataset by PostalCode Variable**

In [18]:
# Load the coordinates table from the CSV file in the working directory.
# NOTE(review): the merge on this frame joins on 'PostalCode' - confirm the
# CSV really has a column with exactly that name.
df1 = pd.read_csv('Geospatial_Coordinates.csv')

In [19]:
# Inner join: keep only the postal codes present in both frames.
# Every other merge option is left at its default.
dataframe = df.merge(df1, on='PostalCode', how='inner')

# Final Dataset 

In [20]:
# Preview the merged frame (postal code, borough, neighbourhood, coordinates).
dataframe.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park / Ontario Provincial Government,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


# Visualization of Neighbourhood and how they cluster together

In [25]:
! pip install geopy

Collecting geopy
[?25l  Downloading https://files.pythonhosted.org/packages/53/fc/3d1b47e8e82ea12c25203929efb1b964918a77067a874b2c7631e2ec35ec/geopy-1.21.0-py2.py3-none-any.whl (104kB)
[K     |████████████████████████████████| 112kB 3.9MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.21.0


In [26]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd 
# Show every column and every row when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json 


from geopy.geocoders import Nominatim 

import requests # library to handle requests

# tranform JSON file into a pandas dataframe.
# json_normalize is exposed at the top level of pandas in newer releases and
# the pandas.io.json import path is deprecated; fall back to the old location
# only when the top-level name is not available.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [28]:
address = 'Toronto, Canada'

# Resolve the address to coordinates through the Nominatim geocoder.
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)

# geocode() can come back with no match; fail with a clear message instead of
# an AttributeError on `None.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [31]:
# Base map centred on the latitude/longitude computed for Toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of the merged frame, with a
# "<neighbourhood>, <borough>" popup.
marker_rows = zip(
    dataframe['Latitude'],
    dataframe['Longitude'],
    dataframe['Borough'],
    dataframe['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell: render the map.
map_toronto