# SEGMENTING AND CLUSTERING NEIGHBORHOOD IN TORONTO

##                                                        PART-2

In [2]:
#importing required packages and libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate

#Dataframe

res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0] 
df = pd.read_html(str(table))


df1=pd.DataFrame(df[0].loc[:,:])
df1

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


### Drop the rows which contains 'Not assigned' values 

In [3]:
indexNames = df1[ df1['Borough'] == 'Not assigned' ].index
 
# Delete these row indexes from dataFrame
df1.drop(indexNames , inplace=True)

In [4]:
df1

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


### Shape of the Dataframe

In [5]:
df1.shape

(103, 3)

### Reseting the Index

In [9]:
df1.reset_index(drop=True, inplace=True)
df1

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Importing Packages


In [10]:
import sys
!{sys.executable} -m pip install geocoder

print('Packages installed.')

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 8.6MB/s ta 0:00:011
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6
Packages installed.


In [11]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import geocoder # import geocoder
import requests 
from bs4 import BeautifulSoup 

print('Libraries imported.')

Libraries imported.


### Scrapping Data from website

In [12]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(URL) 
  
soup = BeautifulSoup(r.content, 'html5lib') 
table = soup.find('div', attrs = {'id':'container'}) 

# print(soup.prettify()) 
print('Page Scrapped.')

Page Scrapped.


In [13]:
postalCodes = [];
boroughs= [];
neighborhoods = [];
columnNum = 1;
passVal = False

for row in soup.find_all('td'):
    for cell in row:
        if cell.string and cell.string[0].isalpha() and len(cell.string) > 2:
            passVal = False
            if columnNum == 1:
                if passVal == False and cell.string[1].isdigit():
                    postalCodes.append(cell.string);   
                    columnNum = 2
                else:
                    continue
            elif columnNum == 2 :
                if cell.string == 'Not assigned':
                    passVal = True
                    del postalCodes[-1]
                    columnNum = 1
                    continue
                else:
                    boroughs.append(cell.string);      
                    columnNum = 3
            elif columnNum == 3 :
                if cell.string == 'Not assigned\n':
                    neighborhoods.append(boroughs[-1])
                else:
                    neighborhoods.append(cell.string); 
                columnNum = 1
                
print('Data Collected.')

Data Collected.


### Creating Dataframe

In [14]:
column_names = ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighbors = pd.DataFrame(columns=column_names)

neighbors

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude


In [15]:
lat_lng_coords = None

for data in range(0, len(postalCodes)-1):
    code = postalCodes[data]
    borough = boroughs[data]
    neighborhood_name = neighborhoods[data]
    
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
    lat_lng_coords = g.latlng

    neighbors = neighbors.append({ 'PostalCode': code,
                                   'Borough': borough,
                                   'Neighborhood': neighborhood_name,
                                   'Latitude': lat_lng_coords[0],
                                   'Longitude': lat_lng_coords[1]}, ignore_index=True)

In [16]:
df2=pd.DataFrame(neighbors)
df2

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1A,Not assigned,M2A,43.648690,-79.385440
1,M3A,North York,Parkwoods,43.752935,-79.335641
2,M4A,North York,Victoria Village,43.728102,-79.311890
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
4,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
5,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.661790,-79.389390
6,M8A,Not assigned,M9A,43.648690,-79.385440
7,M1B,Scarborough,"Malvern, Rouge",43.808626,-79.189913
8,M2B,Not assigned,M3B,43.648690,-79.385440
9,M4B,East York,"Parkview Hill, Woodbine Gardens",43.707193,-79.311529


### Removing Non assigned values

In [17]:
indexNames = df2[ df2['Borough'].astype(str) == 'Not assigned\n' ].index
 
# Delete these row indexes from dataFrame
df2.drop(indexNames , inplace=True)
df2

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
1,M3A,North York,Parkwoods,43.752935,-79.335641
2,M4A,North York,Victoria Village,43.728102,-79.311890
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
4,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
5,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.661790,-79.389390
7,M1B,Scarborough,"Malvern, Rouge",43.808626,-79.189913
9,M4B,East York,"Parkview Hill, Woodbine Gardens",43.707193,-79.311529
10,M5B,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529
11,M6B,North York,Glencairn,43.707279,-79.447500
13,M9B,Etobicoke,"West Deane Park, Princess Gardens, Martin Grov...",43.650023,-79.554089
