## Importing the data from HTML into a pandas DataFrame

In [1]:
# Import every library used by the notebook and configure pandas display options.
import numpy as np # numerical array support

import pandas as pd # tabular data handling (DataFrame)
pd.set_option('display.max_columns',None) # None removes the column limit when a dataframe is displayed
pd.set_option('display.max_rows',None) # None removes the row limit; NOTE(review): displaying a large frame will print every row

# Geocoding support: install geopy from the conda-forge channel, then import its Nominatim geocoder.
# The leading "!" is a notebook shell escape; "--yes" answers conda's confirmation prompt automatically.
# NOTE(review): the install is unpinned and runs on every execution of this cell — consider pinning a version.
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

import requests # HTTP client used to download the web page

from bs4 import BeautifulSoup # HTML parser used to read the downloaded page

# k-means clustering estimator from scikit-learn
from sklearn.cluster import KMeans

# Printed after all imports have run, as a visible confirmation that none of them failed.
print("All the necessary libraries have been imported")

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0           conda-forge
    geopy:          

In [55]:
# Download the Wikipedia page named in the URL below and locate its postal-code table.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # without an explicit timeout the request can hang indefinitely
)
response.raise_for_status()  # stop on a 4xx/5xx reply instead of parsing an error page
source = response.text
soup = BeautifulSoup(source,'lxml')
table  = soup.find('table',{'class':'wikitable sortable'})

# soup.find returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError when the table is iterated later.
if table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

# Empty dataframe with the three target columns; the rows are filled in afterwards.
col_names = ['PostalCode','Borough','Neighborhood']
df = pd.DataFrame(columns = col_names)

In [56]:
#Collecting all the data for Postal Codes, Boroughs and Neighborhoods
#
# Each <tr> of the table contributes one record made of the stripped text of its
# <td> cells. Rows whose cell count differs from the number of target columns
# (e.g. a header row that has no <td> cells) are skipped.
#
# The records are gathered in a list and the dataframe is built once, so
# re-running this cell rebuilds df from scratch instead of appending the same
# rows to it a second time, and avoids growing the frame one row at a time.
records = []
for tr in table.find_all('tr'):
    cells = [td.text.strip() for td in tr.find_all('td')]
    if len(cells) == len(col_names):
        records.append(cells)

df = pd.DataFrame(records, columns=col_names)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Data Wrangling

In [58]:
# Clean the scraped table in one chain:
#   1. keep only the rows whose Borough is not the placeholder 'Not assigned';
#   2. group by postal code and borough (keeping first-seen order) and join the
#      neighborhoods of each group into a single comma-separated string;
#   3. turn the group keys back into ordinary columns.
df = (
    df.loc[df['Borough'].ne('Not assigned')]
    .groupby(['PostalCode', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)

# A neighborhood still marked 'Not assigned' takes the name of its borough
# (the right-hand side is aligned to the selected rows by index).
df.loc[df['Neighborhood'].eq('Not assigned'), 'Neighborhood'] = df['Borough']

df.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [60]:
# Show the dimensions of the cleaned dataframe as a (rows, columns) tuple.
df.shape

(103, 3)