# Segmenting and Clustering Neighborhoods in Toronto – Week 2 – Task 1

In [1]:
# Importing Packages
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup  # for extracting data from HTML and XML files Library

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#### URL from wikipedia - List of postal codes of Canada: M

This is a list of postal codes in Canada where the first letter is M. Postal codes beginning with M are located within the city of Toronto in the province of Ontario. Only the first three characters are listed, corresponding to the Forward Sortation Area.

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [4]:
toronto_wikipedia_data = requests.get(url)

In [5]:
print(toronto_wikipedia_data)

<Response [200]>


In [6]:
print(toronto_wikipedia_data.status_code)

200


## We will follow the guidelines of the problem statement

### Tips for  Webscraping Updated Table in Week3 Peer Graded Assignment

 **After retreiving the URL and creating a Beautiful soup object** 

 **Firstly create a list**  

 **Later after finding the table and table data  create a dictionary called cell having 3 keys PostalCode, Borough and Neighborhood.**

 **As postal code contains upto 3 characters extract that using tablerow.p.text**

 **Next use split ,strip and replace functions for getting Borough and Neighborhood information.**.

 **Append to the list**  

 **Create a dataframe with list**

In [7]:
soup = BeautifulSoup(toronto_wikipedia_data.content, 'html.parser')

In [8]:
tables = soup.find('table')

In [9]:
trs = tables.find_all('tr')

In [10]:
rows = []
for tr in trs:
    i = tr.find_all('td')
    if i:
        rows.append(i)

##### Following instructions:

Only process the cells that have an assigned borough. Ignore cells with a borough that is 'Not assigned'.

In [11]:
trb = []
for row in rows:
    for i in range(9):
        trab = str(row[i].text.strip(' \n'))
        if trab[3:] != 'Not assigned':
            postalcode = trab[:3]
            aaa = ''
            bol1 = True
            for j in range(3, len(trab)):
                if trab[j] == '(':
                    bol1 = True
                elif trab[j] == ')':
                    aaa = aaa + trab[j]
                    bol1 = False
                if bol1:
                    aaa = aaa + trab[j]

            borough = ''
            neighborhood = ''
            bol1 = True
            for j in range(len(aaa)):
                if aaa[j] == '(':
                    bol1 = False
                if bol1:
                    borough = borough + aaa[j]
                else:
                    neighborhood = neighborhood + aaa[j]
            borough = str(str(borough.strip()).replace('YorkEast','York East')).replace('Stn A PO Boxes25 The Esplanade','')
            borough = str(borough).replace('Canada Post Gateway Processing Centre','')
            borough = str(borough).replace('Business reply mail Processing Centre969 Eastern','')
            borough = str(borough).replace('EtobicokeNorthwest','Etobicoke Northwest')
            neighborhood = str(neighborhood.strip(' ()')).replace('/', ',').replace('(',' ').replace(')',',')
            trb.append([postalcode, borough, neighborhood])

In [12]:
# Let's check the trb list before load into a pandas dataframe
trb[:10]

[['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Regent Park , Harbourfront'],
 ['M6A', 'North York', 'Lawrence Manor , Lawrence Heights'],
 ['M7A', "Queen's Park", 'Ontario Provincial Government'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Malvern , Rouge'],
 ['M3B', 'North York', 'Don Mills'],
 ['M4B', 'East York', 'Parkview Hill , Woodbine Gardens'],
 ['M5B', 'Downtown Toronto', 'Garden District, Ryerson']]

##### Let's tranform the loaded data into a pandas dataframe

In [13]:
# define the dataframe columns
df_column_names = ['Postal Code', 'Borough', 'Neighborhood']

In [14]:
toronto_df = pd.DataFrame(trb, columns=df_column_names)

In [15]:
print(toronto_df.shape)

(103, 3)


In [16]:
toronto_df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [17]:
toronto_df = toronto_df.sort_values(by=['Postal Code', 'Borough'], ascending=True).reset_index()

In [18]:
toronto_df.head()

Unnamed: 0,index,Postal Code,Borough,Neighborhood
0,6,M1B,Scarborough,"Malvern , Rouge"
1,12,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,18,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,22,M1G,Scarborough,Woburn
4,26,M1H,Scarborough,Cedarbrae


Reading the downloaded file Geospatial_Coordinates.csv

In [19]:
toronto_geo = pd.read_csv('Geospatial_Coordinates.csv', sep=',')

In [20]:
toronto_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


joining the 2 dataframes - combined the Postal Code Column

In [21]:
toronto_df = pd.merge(toronto_df, toronto_geo, on="Postal Code", how='left')

In [22]:
toronto_df.head(15)

Unnamed: 0,index,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,6,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,12,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,18,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,22,M1G,Scarborough,Woburn,43.770992,-79.216917
4,26,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,32,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,38,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029
7,44,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",43.711112,-79.284577
8,51,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476
9,58,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848


In [23]:
toronto_df.tail(15)

Unnamed: 0,index,Postal Code,Borough,Neighborhood,Latitude,Longitude
88,88,M8V,Etobicoke,"New Toronto , Mimico South , Humber Bay Shores",43.605647,-79.501321
89,93,M8W,Etobicoke,"Alderwood , Long Branch",43.602414,-79.543484
90,98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.653654,-79.506944
91,101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",43.636258,-79.498509
92,102,M8Z,Etobicoke,"Mimico NW , The Queensway West , South of Bloo...",43.628841,-79.520999
93,5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
94,11,M9B,Etobicoke,"West Deane Park , Princess Gardens , Martin Gr...",43.650943,-79.554724
95,17,M9C,Etobicoke,"Eringate , Bloordale Gardens , Old Burnhamthor...",43.643515,-79.577201
96,50,M9L,North York,Humber Summit,43.756303,-79.565963
97,57,M9M,North York,"Humberlea , Emery",43.724766,-79.532242
