# Peer-Graded Assignment : Segmenting and Clustering Neighborhoods in Toronto (Part 2)

Build a DataFrame by scraping the Wikipedia page of Canadian postal codes. The DataFrame has three columns: PostalCode, Borough, and Neighborhood.

### Import libraries

In [1]:
import os

import numpy as np
import pandas as pd
import json,requests
from bs4 import BeautifulSoup
from selenium import webdriver
# Reaching this line means every import in this cell succeeded.
print("Libraries imported")

Libraries imported


### Using BeautifulSoup and the Selenium WebDriver for scraping data

In [2]:
# Location of the ChromeDriver binary. Set the CHROMEDRIVER_PATH environment
# variable to run this notebook on another machine; the original local path is
# kept as the fallback so the existing setup keeps working unchanged.
chromedriver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    r'd:\Profiles\sahsrivastava\Downloads\chromedriver.exe',
)

# NOTE(review): `executable_path` is the Selenium 3 style argument; recent
# Selenium 4 releases expect a `Service` object instead — confirm against the
# installed Selenium version.
driver = webdriver.Chrome(executable_path=chromedriver_path)

In [3]:
# Wikipedia page listing the postal codes that start with "M".
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

try:
    driver.get(url)
    # Parse the HTML the browser loaded; the following cells read from `soup`.
    soup=BeautifulSoup(driver.page_source,'lxml')
finally:
    # Always close the browser session, even if loading or parsing fails, so no
    # Chrome/ChromeDriver process is left running. To re-run this cell, re-run
    # the cell that creates `driver` first.
    driver.quit()

In [4]:
# Three independent, initially empty accumulators — one per column of the
# scraped table.
postalCodeList, boroughList, neighborhoodList = [], [], []

In [5]:
# Extract postal code, borough and neighbourhood from the first <table> on the page.
table=soup.find('table')
if table is None:
    raise ValueError("No <table> element found in the scraped page")

# Start from empty lists every time this cell runs, so re-running it does not
# append the same rows a second time.
postalCodeList = []
boroughList = []
neighborhoodList = []

tr=table.find_all('tr')
for row in tr:
    cells=row.find_all('td')
    # Header rows have no <td> cells; rows with fewer than three cells cannot
    # supply all three columns, so both kinds are skipped.
    if len(cells)<3:
        continue
    # strip() removes the trailing newline as well as any other surrounding
    # whitespace, so later comparisons against 'Not assigned' are exact.
    postalCodeList.append(cells[0].text.strip())
    boroughList.append(cells[1].text.strip())
    neighborhoodList.append(cells[2].text.strip())

In [6]:
# Combine the three scraped lists into one frame, one column per list.
scraped_columns = {
    'PostalCode': postalCodeList,
    'Borough': boroughList,
    'NeighborhoodList': neighborhoodList,
}
toronto_df = pd.DataFrame(data=scraped_columns)

In [7]:
# Preview the first rows of the scraped table.
toronto_df.head()

Unnamed: 0,PostalCode,Borough,NeighborhoodList
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Drop rows with a borough that is "Not assigned"

In [8]:
# Keep only the rows whose borough is assigned, then renumber the index from 0.
has_borough = toronto_df['Borough'].ne('Not assigned')
toronto_df_new = toronto_df.loc[has_borough].reset_index(drop=True)
toronto_df_new.head()

Unnamed: 0,PostalCode,Borough,NeighborhoodList
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


We can see in the DataFrame that the neighborhoods are already grouped by postal code and borough, so there is no need to perform a grouping step.

### Check whether any row has a NeighborhoodList value of "Not assigned"

In [9]:
# Rows whose neighbourhood is still unassigned.
toronto_df_new.loc[toronto_df_new['NeighborhoodList'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,NeighborhoodList


### Check whether the result matches what the question asks for

In [11]:
# Postal codes to spot-check, in the order they should be displayed.
list_test = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Collect the matching rows for each code (codes that are not present contribute
# no rows) and concatenate once. This replaces the row-by-row `DataFrame.append`
# loop, which was removed in pandas 2.0. The result keeps the columns of
# `toronto_df_new` as they are; the previous empty seed frame declared a
# "Neighborhood" column that `toronto_df_new` does not have at this point.
matching_rows = [
    toronto_df_new[toronto_df_new["PostalCode"] == postal_code]
    for postal_code in list_test
]
df_test = pd.concat(matching_rows, ignore_index=True)

df_test

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


### Shape of DataFrame

In [12]:
# (number of rows, number of columns) of the cleaned table.
toronto_df_new.shape

(103, 3)

### Adding coordinates csv file into DataFrame

In [14]:
# Load the coordinates table from the CSV file in the working directory.
coordinates = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
coordinates

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [23]:
# Give the key column the same name as in the scraped table so the two frames
# can be merged on it.
coordinates = coordinates.rename(columns={"Postal Code": "PostalCode"})
coordinates.head()

Unnamed: 0,Latitude,Longitude,PostalCode
0,43.806686,-79.194353,M1B
1,43.784535,-79.160497,M1C
2,43.763573,-79.188711,M1E
3,43.770992,-79.216917,M1G
4,43.773136,-79.239476,M1H


### Merge two tables

In [26]:
# Attach latitude/longitude to each scraped postal code.
# - how='left' keeps exactly the rows of `toronto_df_new`, in their existing
#   order. An outer join would also add rows for postal codes that appear only
#   in the coordinates file (with missing Borough/Neighborhood) and may reorder
#   the result by key.
# - validate='many_to_one' raises if a postal code occurs more than once in
#   `coordinates`, which would otherwise silently duplicate rows.
toronto_df_final = pd.merge(
    toronto_df_new,
    coordinates,
    on='PostalCode',
    how='left',
    validate='many_to_one',
).rename(columns={'NeighborhoodList': 'Neighborhood'})
toronto_df_final

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Check whether the merged result matches what the question asks for

In [28]:
# Expected column layout of the spot-check table.
column_names = ["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]

# Postal codes to spot-check, in the order they should be displayed.
list_test = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Collect the matching rows for each code (codes that are not present contribute
# no rows) and concatenate once. This replaces the row-by-row `DataFrame.append`
# loop, which was removed in pandas 2.0.
matching_rows = [
    toronto_df_final[toronto_df_final["PostalCode"] == postal_code]
    for postal_code in list_test
]
# reindex puts the columns in the expected order; an expected column that is
# missing from the data shows up as NaN instead of raising.
df_test = pd.concat(matching_rows, ignore_index=True).reindex(columns=column_names)

df_test

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
