# Segmenting and Clustering Neighborhoods in Toronto

## Part 1

In [1]:
# Import the libraries needed for this project
import numpy as np
import pandas as pd
import requests
pd.set_option('display.max_colwidth',None)
from bs4 import BeautifulSoup

#### Load and explore the data

In [2]:
# Source page: Wikipedia's list of Canadian postal codes beginning with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Download the page. The timeout keeps this cell from hanging indefinitely on a
# stalled connection, and raise_for_status() surfaces HTTP errors (4xx/5xx)
# here instead of letting an error page be parsed as if it were the data.
result = requests.get(url, timeout=30)
result.raise_for_status()
htmlContent = result.content  # raw response body (bytes), handed to the parser

In [4]:
# Build a navigable parse tree from the downloaded page using the
# 'html.parser' backend.
soup = BeautifulSoup(markup=htmlContent, features='html.parser')

Create three empty lists to hold the data extracted from the parsed page.

In [5]:
# One accumulator per table column; each name gets its own fresh list object.
postalcodeList, boroughList, neighborhoodList = [], [], []

Then let's loop through the table rows and fill the lists one row at a time.

In [6]:
# Walk every <tr> of the first <table> on the page and copy the text of its
# first three <td> cells into the three accumulator lists.
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if not cells:
        # Row has no <td> cells (presumably the header row) - nothing to record.
        continue
    # Strip the trailing newline that follows each cell's text.
    postalcodeList.append(cells[0].text.rstrip('\n'))
    boroughList.append(cells[1].text.rstrip('\n'))
    neighborhoodList.append(cells[2].text.rstrip('\n'))

#### Transform the data into a *pandas* dataframe

In [7]:
# Assemble the three parallel lists into one table; the dict's key order
# determines the column order.
toronto_df = pd.DataFrame.from_dict(
    {
        'PostalCode': postalcodeList,
        'Borough': boroughList,
        'Neighborhood': neighborhoodList,
    }
)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [8]:
# Boolean mask: True for rows whose borough is anything other than 'Not assigned'.
filt = toronto_df['Borough'].ne('Not assigned')

print("Before Choosing the data : ",toronto_df.shape)
# Keep only the masked rows and renumber the index from 0.
toronto_a_b_df = toronto_df.loc[filt].reset_index(drop=True)
print("After Choosing the data : ",toronto_a_b_df.shape)

Before Choosing the data :  (180, 3)
After Choosing the data :  (103, 3)


#### Group the data by ['PostalCode', 'Borough'] so that rows sharing a postal code are combined into one row, with the neighborhoods separated by a comma.

In [9]:
# Collapse rows that share a (PostalCode, Borough) pair into a single row; the
# remaining column (Neighborhood) is aggregated by joining its values with ",".
by_code_and_borough = toronto_a_b_df.groupby(['PostalCode','Borough'],as_index=False)
toronto_grouped = by_code_and_borough.agg(",".join)
toronto_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [10]:
# Where a row has a borough but its neighborhood is 'Not assigned', reuse the
# borough name as the neighborhood.
# The write goes through .loc on the DataFrame itself: assigning to the `row`
# Series yielded by iterrows() modifies a per-row object that may be a copy,
# so the table could be left unchanged.
unnamed = toronto_grouped['Neighborhood'] == 'Not assigned'
toronto_grouped.loc[unnamed, 'Neighborhood'] = toronto_grouped.loc[unnamed, 'Borough']
toronto_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [11]:
column_names = ["PostalCode", "Borough", "Neighborhood"]

# Postal codes to spot-check, in the order they should appear in the result.
test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Select the matching row(s) for each code, then concatenate once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing
# a frame one append at a time re-copies the accumulated rows on every pass.
# Codes with no match contribute an empty slice and therefore no rows.
matches = [toronto_grouped[toronto_grouped['PostalCode'] == postcode] for postcode in test_list]
test_df = pd.concat(matches, ignore_index=True)[column_names]
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamestown, Mount Olive, Beaumond Heights, Thistletown, Albion Gardens"
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport"


In [12]:
# Display the (rows, columns) dimensions of the grouped table.
toronto_grouped.shape

(103, 3)