# Importing Libraries

In [2]:

import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

import urllib.request

try:
    import folium
except:
    !pip install folium
    import folium
    
print("libraries imported") 

libraries imported


# Scraping data from Wikipedia

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "lxml")
all_tables=soup.find_all("table")
all_tables

right_table=soup.find('table', class_='wikitable sortable')
right_table

postalcode = []
borough = []
neighbourhood = []
for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        postalcode.append(cells[0].find(text = True))
        borough.append(cells[1].find(text = True))
        neighbourhood.append(cells[2].find(text = True))

df=pd.DataFrame(postalcode,columns=['Postal Code'])
df['Borough']=borough
df['Neighbourhood']=neighbourhood
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# Dropping 'Not assigned' rows of Borough

In [4]:
df = df[~df.Borough.str.contains("Not assigned")].reset_index(drop = True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# Grouping the data

In [5]:
df = df.groupby(["Postal Code", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge\n"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek\n"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill\n"
3,M1G,Scarborough,Woburn\n
4,M1H,Scarborough,Cedarbrae\n


# For 'Not assigned' Neighbourhood assign the same value as Borough

In [6]:
for index, row in df.iterrows():
    if row["Neighbourhood"] == "Not assigned":
        row["Neighbourhood"] = row["Borough"]
        
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge\n"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek\n"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill\n"
3,M1G,Scarborough,Woburn\n
4,M1H,Scarborough,Cedarbrae\n


# Number of rows in Dataframe

In [7]:
df.shape

(103, 3)

# Loading the coordinates from csv file

In [8]:
coord = pd.read_csv("http://cocl.us/Geospatial_data")
coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Concatinating the dataframes 

In [12]:
coord_only = coord[['Latitude', 'Longitude']]
new_df = pd.concat([df, coord_only], axis = 1)
new_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge\n",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek\n",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill\n",43.763573,-79.188711
3,M1G,Scarborough,Woburn\n,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476
