# Segmenting and Clustering Neighborhoods in Toronto

# Part 1

In [1]:
# Import the libraries needed for this project
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
pd.set_option('display.max_colwidth',None)

#### Load and explore the data

In [2]:
# Wikipedia page that lists the Canadian postal codes in the "M" series.
url = (
    "https://en.wikipedia.org/wiki/"
    "List_of_postal_codes_of_Canada:_M"
)

In [3]:
# Download the page. The timeout keeps the cell from hanging indefinitely on
# a stalled connection, and raise_for_status() turns an HTTP error response
# into an exception instead of letting an error page be parsed as data.
result = requests.get(url, timeout=30)
result.raise_for_status()
htmlContent = result.content  # raw response body as bytes

In [4]:
# Parse the downloaded markup with Python's built-in HTML parser.
soup = BeautifulSoup(markup=htmlContent, features='html.parser')

Create three empty lists to hold the data extracted from the parsed page.

In [5]:
# One accumulator per table column; each starts out empty.
postalcodeList, boroughList, neighborhoodList = [], [], []

Then let's loop through the table and fill the lists one row at a time.

In [6]:
# Empty the lists first so that re-running this cell does not append a
# second copy of every row.
postalcodeList.clear()
boroughList.clear()
neighborhoodList.clear()

# Walk every row of the first <table> on the page. Rows without three <td>
# cells (e.g. a header row built from <th>) are skipped; indexing a shorter
# row would otherwise raise IndexError.
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if len(cells) >= 3:
        # Drop the trailing newline that ends each cell's text.
        code, borough, hood = (cell.text.rstrip('\n') for cell in cells[:3])
        postalcodeList.append(code)
        boroughList.append(borough)
        neighborhoodList.append(hood)

#### Transform the data into a *pandas* dataframe

In [7]:
# Assemble the three parallel lists into one table, one column per list.
toronto_df = pd.DataFrame(dict(
    PostalCode=postalcodeList,
    Borough=boroughList,
    Neighborhood=neighborhoodList,
))
toronto_df.head()  # preview the first five rows

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [8]:
# Keep only the rows whose borough is assigned, printing the shape before
# and after so the number of dropped rows is visible.
print('Before Choosing the data : ', toronto_df.shape)
filt = toronto_df['Borough'].ne('Not assigned')
toronto_a_b_df = toronto_df.loc[filt].reset_index(drop=True)
print('After choosing the data : ', toronto_a_b_df.shape)

Before Choosing the data :  (180, 3)
After choosing the data :  (103, 3)


#### Group the data by ['PostalCode', 'Borough'] so that rows sharing a postal code are combined into one row, with the neighborhoods separated by a comma.

In [9]:
# Collapse each (PostalCode, Borough) pair into one row; the remaining
# Neighborhood values of a group are joined into a single comma-separated
# string.
toronto_grouped = (
    toronto_a_b_df
    .groupby(['PostalCode', 'Borough'], as_index=False)
    .agg(','.join)
)
toronto_grouped.head()  # preview the first five rows

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [10]:
# Where the neighborhood is 'Not assigned', fall back to the borough name.
# The write goes through .loc on the frame itself: iterrows() hands back a
# copy of each row, so assigning to that copy never reaches the frame.
unassigned = toronto_grouped['Neighborhood'] == 'Not assigned'
toronto_grouped.loc[unassigned, 'Neighborhood'] = toronto_grouped.loc[unassigned, 'Borough']
toronto_grouped.head()  # preview the first five rows

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Part 2

Now Import the Geographical Coordinates of each postal code

In [11]:
# Load the postal-code coordinates table from the local CSV file.
df = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
df.head()  # preview the first five rows

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Now rename the 'Postal Code' column to 'PostalCode', because the two dataframes will be merged on that column.

In [12]:
# Give the key column the same name it has in the scraped table
# ('PostalCode', no space).
df = df.rename(columns={'Postal Code': 'PostalCode'})
df.head()  # preview the first five rows

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Now use the merge method to combine both dataframes with an inner join.

In [13]:
# Inner join: only rows whose key appears in both frames are kept. No key is
# named, so pandas joins on the column(s) the two frames have in common.
neighborhood = pd.merge(left=toronto_grouped, right=df, how='inner')

In [14]:
# Lift the row-display limit so that frames are shown without truncation.
pd.options.display.max_rows = None
neighborhood.head()  # preview the first five rows

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [15]:
# Column layout of the lookup result.
column_names = ["PostalCode", "Borough", "Neighborhood","Latitude","Longitude"]

# Postal codes to look up; the result keeps this order.
test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Collect the matching rows for each code, then concatenate once.
# DataFrame.append was removed in pandas 2.0, and growing a frame one
# append at a time copies the accumulated rows on every iteration.
matches = [neighborhood[neighborhood['PostalCode'] == postcode] for postcode in test_list]
test_df = pd.concat(matches, ignore_index=True).reindex(columns=column_names)
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamestown, Mount Olive, Beaumond Heights, Thistletown, Albion Gardens",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",43.628947,-79.39442


In [16]:
# Size of the merged table as a (rows, columns) tuple.
neighborhood.shape

(103, 5)