## Web-scrape data from an HTML table

In [41]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the coordinates CSV
# loaded later in this notebook is read from a path under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
req = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page later.
req.raise_for_status()

In [0]:
# Parse the downloaded HTML and locate the postal-code table.
soup = BeautifulSoup(req.text, "html.parser")

table = soup.find('table', {'class': 'wikitable sortable'})
# soup.find returns None when nothing matches; fail with a clear message here
# rather than with an AttributeError on None in a later cell.
if table is None:
    raise ValueError("No table with class 'wikitable sortable' found at " + url)

In [0]:
# Collect every <tr> element of the table (the header row is included).
# find_all is the current Beautiful Soup 4 name; findAll is a legacy alias.
rows = table.find_all("tr")

In [0]:
# Build one list of cell strings per table row (header row included).
# Each <th>/<td> is read separately so that a line break inside a cell's text
# cannot be mistaken for a column boundary, which is what splitting the whole
# row text on '\n' would do.
data = [
    [cell.get_text().strip() for cell in row.find_all(['th', 'td'])]
    for row in rows
]
data = np.array(data)

In [0]:
# Turn the scraped array into a DataFrame, promoting its first row to the
# column labels and removing that row from the data.
raw_frame = pd.DataFrame(data)
header = raw_frame.iloc[0]
df = raw_frame.drop(index=raw_frame.index[0])
df.columns = header

#### Clean the data: handle 'Not assigned' values

In [47]:
# Preview the first five rows before cleaning.
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [0]:
# Keep only postcodes that have a borough assigned.
# .copy() makes the result an independent frame, so assigning to its columns
# afterwards does not raise SettingWithCopyWarning.
df = df[df['Borough'] != 'Not assigned'].copy()

In [0]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name;
# all other neighbourhood values are kept as they are.
# assign() returns a new frame, so no column is written onto a filtered slice
# (which would raise SettingWithCopyWarning).
df = df.assign(
    Neighbourhood=df['Neighbourhood'].where(
        df['Neighbourhood'] != 'Not assigned', df['Borough']
    )
)

In [50]:
# Preview the first five rows after cleaning.
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [51]:
# (number of rows, number of columns) of the cleaned table.
df.shape

(211, 3)

## Get latitude and longitude

In [0]:
# Load the CSV of coordinates for each postal code from the mounted Drive.
geo_csv_path = '/content/drive/My Drive/GITHUB REPO/segmenting-and-clustering-NBHD-in-Toronto/Geospatial_Coordinates.csv'
geo_coor = pd.read_csv(geo_csv_path)

In [53]:
# Rename the key column to 'Postcode' so it matches the scraped table,
# then preview the first five rows.
geo_coor = geo_coor.rename(columns={'Postal Code': 'Postcode'})
geo_coor.head(n=5)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [0]:
# Attach coordinates to each row by matching on Postcode.
# Inner join: only postcodes present in both frames are kept.
data = pd.merge(df, geo_coor, how='inner', on='Postcode')

In [59]:
# Preview the first twelve rows of the merged table.
data.head(n=12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763
5,M6A,North York,Lawrence Manor,43.718518,-79.464763
6,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
7,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
8,M1B,Scarborough,Rouge,43.806686,-79.194353
9,M1B,Scarborough,Malvern,43.806686,-79.194353
