In [2]:
#Installing all the required libraries for this assignment
import numpy as np 
import pandas as pd
import json
#Using Beautiful Soup for Webscrapping
from bs4 import BeautifulSoup
import lxml
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
from sklearn.datasets.samples_generator import make_blobs
# library for map rendering
!conda install -c conda-forge folium=0.5.0 --yes
import folium
#Library for getting longititude and latitude of an address
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
#For converting Json file into Dataframe
from pandas.io.json import json_normalize
# library for requests
import requests 

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    altair-3.1.0               |           py36_0         724 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be 

In [4]:
#downloading the data
link = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(link.text)
table=soup.find('table', attrs={'class':'wikitable sortable'})


In [5]:
#getting header of the table and removing <th> from the headings
columns = table.findAll('th')
for i, column in enumerate(columns): 
    columns[i]=str(columns[i]).replace("<th>","").replace("</th>","").replace("\n","")
columns,i

(['Postcode', 'Borough', 'Neighbourhood'], 2)

In [6]:
#getting rows of the table and removing <td> & <tr> from the headings
rows=table.findAll('tr')
rows=rows[1:len(rows)]
for i, row in enumerate(rows): 
    rows[i] = str(rows[i]).replace("\n</td></tr>","").replace("<tr>\n<td>","")

In [8]:
# creating canada_location dataframe, expand rows and drop the old one:
canada_location = pd.DataFrame(rows)
canada_location[columns] = canada_location[0].str.split("</td>\n<td>", n = 2, expand = True) 
canada_location.drop(columns=[0],inplace=True)
canada_location.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,"<a href=""/wiki/North_York"" title=""North York"">...","<a href=""/wiki/Parkwoods"" title=""Parkwoods"">Pa..."
3,M4A,"<a href=""/wiki/North_York"" title=""North York"">...","<a href=""/wiki/Victoria_Village"" title=""Victor..."
4,M5A,"<a href=""/wiki/Downtown_Toronto"" title=""Downto...","<a href=""/wiki/Harbourfront_(Toronto)"" title=""..."


In [9]:
#skipping not assigned rows
canada_location = canada_location.drop(canada_location[(canada_location.Borough == "Not assigned")].index)
#canada_location.head(5)
# give "Not assigned" Neighborhoods same name as Borough:
canada_location.Neighbourhood.replace("Not assigned", canada_location.Borough, inplace=True)
canada_location.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,"<a href=""/wiki/North_York"" title=""North York"">...","<a href=""/wiki/Parkwoods"" title=""Parkwoods"">Pa..."
3,M4A,"<a href=""/wiki/North_York"" title=""North York"">...","<a href=""/wiki/Victoria_Village"" title=""Victor..."
4,M5A,"<a href=""/wiki/Downtown_Toronto"" title=""Downto...","<a href=""/wiki/Harbourfront_(Toronto)"" title=""..."
5,M5A,"<a href=""/wiki/Downtown_Toronto"" title=""Downto...","<a href=""/wiki/Regent_Park"" title=""Regent Park..."
6,M6A,"<a href=""/wiki/North_York"" title=""North York"">...","<a href=""/wiki/Lawrence_Heights"" title=""Lawren..."
7,M6A,"<a href=""/wiki/North_York"" title=""North York"">...","<a href=""/wiki/Lawrence_Manor"" title=""Lawrence..."
8,M7A,"<a href=""/wiki/Queen%27s_Park_(Toronto)"" title...","<a href=""/wiki/Queen%27s_Park_(Toronto)"" title..."
10,M9A,"<a href=""/wiki/Etobicoke"" title=""Etobicoke"">Et...","<a class=""mw-redirect"" href=""/wiki/Islington_A..."
11,M1B,"<a href=""/wiki/Scarborough,_Toronto"" title=""Sc...","<a href=""/wiki/Rouge,_Toronto"" title=""Rouge, T..."
12,M1B,"<a href=""/wiki/Scarborough,_Toronto"" title=""Sc...","<a href=""/wiki/Malvern,_Toronto"" title=""Malver..."


In [10]:
# copy Borough value to Neighborhood if NaN:
canada_location.Neighbourhood.fillna(canada_location.Borough, inplace=True)
# drop duplicate rows:
canada_location=canada_location.drop_duplicates()
canada_location.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,"<a href=""/wiki/North_York"" title=""North York"">...","<a href=""/wiki/Parkwoods"" title=""Parkwoods"">Pa..."
3,M4A,"<a href=""/wiki/North_York"" title=""North York"">...","<a href=""/wiki/Victoria_Village"" title=""Victor..."
4,M5A,"<a href=""/wiki/Downtown_Toronto"" title=""Downto...","<a href=""/wiki/Harbourfront_(Toronto)"" title=""..."
5,M5A,"<a href=""/wiki/Downtown_Toronto"" title=""Downto...","<a href=""/wiki/Regent_Park"" title=""Regent Park..."
6,M6A,"<a href=""/wiki/North_York"" title=""North York"">...","<a href=""/wiki/Lawrence_Heights"" title=""Lawren..."
7,M6A,"<a href=""/wiki/North_York"" title=""North York"">...","<a href=""/wiki/Lawrence_Manor"" title=""Lawrence..."
8,M7A,"<a href=""/wiki/Queen%27s_Park_(Toronto)"" title...","<a href=""/wiki/Queen%27s_Park_(Toronto)"" title..."
10,M9A,"<a href=""/wiki/Etobicoke"" title=""Etobicoke"">Et...","<a class=""mw-redirect"" href=""/wiki/Islington_A..."
11,M1B,"<a href=""/wiki/Scarborough,_Toronto"" title=""Sc...","<a href=""/wiki/Rouge,_Toronto"" title=""Rouge, T..."
12,M1B,"<a href=""/wiki/Scarborough,_Toronto"" title=""Sc...","<a href=""/wiki/Malvern,_Toronto"" title=""Malver..."


In [11]:
# extract titles from columns
canada_location.update(canada_location.Neighbourhood.loc[lambda x: x.str.contains('title')].str.extract('title=\"([^\"]*)',expand=False))

canada_location.update(canada_location.Borough.loc[lambda x: x.str.contains('title')].str.extract('title=\"([^\"]*)',expand=False))

# delete Toronto annotation from Neighbourhood:
canada_location.update(canada_location.Neighbourhood.loc[lambda x: x.str.contains('Toronto')].str.replace(", Toronto",""))
canada_location.update(canada_location.Neighbourhood.loc[lambda x: x.str.contains('Toronto')].str.replace("\(Toronto\)",""))

In [12]:
canada_location.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park (Toronto),Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,"Scarborough, Toronto",Rouge
12,M1B,"Scarborough, Toronto",Malvern


In [13]:
#creating a new data frame named can_code and taking values from dataframe canada_location
#adding unique postal code and matching borough and neighbourhood to the new dataframe can_code for combining neighbourhoods with same post code
can_code = pd.DataFrame({'Postcode':canada_location.Postcode.unique()})
can_code['Borough']=pd.DataFrame(list(set(canada_location['Borough'].loc[canada_location['Postcode'] == x['Postcode']])) for i, x in can_code.iterrows())
can_code['Neighborhood']=pd.Series(list(set(canada_location['Neighbourhood'].loc[canada_location['Postcode'] == x['Postcode']])) for i, x in can_code.iterrows())
can_code['Neighborhood']=can_code['Neighborhood'].apply(lambda x: ', '.join(x))
can_code.rename(columns={'Postcode':'Postalcode'}, inplace=True)
can_code.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront , Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park (Toronto),Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,"Scarborough, Toronto","Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [14]:
#Not able to get the geographical coordinates of the neighborhoods using the Geocoder package 
#Using the link to the csv file given in the course module with coordinates of each postal code:
coardinates = pd.read_csv("http://cocl.us/Geospatial_data")
coardinates.rename(columns={'Postal Code':'Postalcode'}, inplace=True)
#resetting the index for dataframe coardinates and can_code before merging
coardinates.set_index("Postalcode")
can_code.set_index("Postalcode")
tor_address=pd.merge(can_code, coardinates)
tor_address.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront , Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park (Toronto),Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,"Scarborough, Toronto","Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [16]:
tor_address.shape

(103, 5)