# Segmenting and Clustering Neighborhoods

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of handing an
# error page to the parser in the next cell.
response=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source=response.text


## Scraping the data from the website

In [3]:
# Parse the downloaded HTML and pick the first table inside the article body.
data = BeautifulSoup(source, 'lxml')
body_div = data.find('div', class_='mw-parser-output')
table_data = body_div.table

# The column names are the header-cell texts; a single trailing newline, when
# present, is removed from each of them.
columns = [
    header.text[:-1] if header.text.endswith('\n') else header.text
    for header in table_data.find_all('th')
]
print(columns)

['Postcode', 'Borough', 'Neighbourhood']


## Extracting table data from the website

In [66]:
# Read the table one <tr> at a time and build the DataFrame in a single step.
# Using the row elements as boundaries avoids guessing where a row ends from a
# trailing newline in the cell text, and collecting the records first avoids
# growing the DataFrame row by row.
records = []
for row in table_data.find_all('tr'):
    # Trailing newlines are markup residue, not part of the cell value.
    cells = [cell.text.rstrip('\n') for cell in row.find_all('td')]
    # The header row contains no <td> cells; keep only rows that supply exactly
    # one value per column.
    if len(cells) == len(columns):
        records.append(cells)

wiki_data = pd.DataFrame(records, columns=columns)
wiki_data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### The following code replaces "Not assigned" with `np.nan` and drops the rows whose Borough is missing

In [67]:
# Turn the "Not assigned" placeholder into a missing value, discard the rows
# without a Borough, and renumber the remaining rows from 0.
wiki_data = (
    wiki_data
    .replace(to_replace='Not assigned', value=np.nan)
    .dropna(subset=['Borough'], axis=0)
    .reset_index(drop=True)
)
wiki_data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


### The following block applies this rule:

"If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park."

In [68]:
# Rows that have a Borough but no Neighbourhood take their own Borough as the
# Neighbourhood. The assignment is done row by row through a boolean mask, so
# every affected row receives the Borough of that same row; a frame-wide
# replace of NaN would give all of them the Borough of the first match.
wiki_data1 = wiki_data['Neighbourhood'].isnull()
wiki_data.loc[wiki_data1, 'Neighbourhood'] = wiki_data.loc[wiki_data1, 'Borough']
wiki_data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


More than one neighborhood can exist in one postal code area. For example, in the above table, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma. The following will perform this task.

In [131]:
# Combine the rows that share a postal code into one row whose Neighbourhood is
# the comma-separated list of all neighbourhoods of that postal code.
#
# groupby keeps the neighbourhoods in their original row order within each
# group and, with sort=True, returns the groups ordered by Postcode, so the
# result is deterministic. Because Postcode is part of the grouping key, two
# different postal codes can never be collapsed into one row, even if they
# share the same Borough and Neighbourhood text.
# wiki_data itself is left unchanged; the result is a new DataFrame.
wiki_data_copy = (
    wiki_data
    .groupby(['Postcode', 'Borough'], sort=True)['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
wiki_data_copy.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [132]:
wiki_data_copy.shape

(103, 3)