<H1>Segmenting and Clustering Neighborhoods in Toronto. </H1>

<H3> Importing the Libraries. </H3>

In [30]:
# All third-party and standard-library imports for the notebook.
# NOTE(review): several of these names are not used in the cells visible here;
# presumably later sections of the notebook rely on them - confirm before pruning.
import pandas as pd
import numpy as np
import json
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
# NOTE(review): this import path is deprecated in newer pandas releases in favour
# of pandas.json_normalize - confirm against the pandas version in use.
from pandas.io.json import json_normalize
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup

# Reaching this line means every import above succeeded.
print('Libraries Imported.')

Libraries Imported.


<H3> Q1: Scraping Wiki Page by using BeautifulSoup. </H3>

In [8]:
# Fetching the wiki page with requests.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops the cell on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
data = response.text

# Parsing the HTML using BeautifulSoup.
Soup = BeautifulSoup(data,'html.parser')

# Lists that will become the dataframe columns.
# They are re-created on every run, so re-running the cell does not duplicate rows.
PostCodeList = []
BoroughList = []
NeighborhoodList = []

# The first <table> on the page is the one that gets scraped.
postal_table = Soup.find('table')
if postal_table is None:
    raise ValueError('No <table> element found on the page; its layout may have changed.')

# Collecting postal code, borough and neighborhood from every table row.
for row in postal_table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without enough <td> cells (e.g. a header row) are skipped so that
    # indexing cells[2] cannot raise an IndexError.
    if len(cells) >= 3:
        # The raw cell text is kept as-is (including any trailing newline on the
        # first two columns); the later cleaning step removes the newlines.
        PostCodeList.append(cells[0].text)
        BoroughList.append(cells[1].text)
        NeighborhoodList.append(cells[2].text.rstrip('\n'))

<H3> Creating a dataframe. </H3>

In [11]:
# Assembling the three scraped lists into one pandas dataframe,
# one column per list.
scraped_columns = {
    'Postal Code': PostCodeList,
    'Borough': BoroughList,
    'Neighborhood': NeighborhoodList,
}
toronto_df = pd.DataFrame(scraped_columns)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned
1,M2A\n,Not assigned\n,Not assigned
2,M3A\n,North York\n,Parkwoods
3,M4A\n,North York\n,Victoria Village
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"


<H3> Cleaning Dataframe. </H3>

In [12]:
# Removing unnecessary \n characters from every text column of the dataframe.
for column_name in ('Postal Code', 'Borough', 'Neighborhood'):
    toronto_df[column_name] = toronto_df[column_name].str.replace('\n','')

toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [14]:
# Keeping only the rows whose Borough is assigned, then renumbering the index from 0.
has_borough = toronto_df['Borough'] != 'Not assigned'
toronto_drop_df = toronto_df.loc[has_borough].reset_index(drop=True)
toronto_drop_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [15]:
# Collapsing rows that share the same Postal Code and Borough into a single row;
# the remaining column values of each group are joined with commas.
group_keys = ['Postal Code', 'Borough']
toronto_grouped_df = (
    toronto_drop_df
    .groupby(group_keys, as_index=False)
    .agg(lambda names: ','.join(names))
)
toronto_grouped_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [16]:
# A Neighborhood of 'Not assigned' takes the name of its Borough.
# This is done with a vectorised column assignment: rows yielded by iterrows()
# are copies, so writing to them would not modify the dataframe, and `==` only
# compares values, it does not assign.
# Series.where keeps the value where the condition is True and takes the
# replacement (the Borough) where it is False.
neighborhood_assigned = toronto_grouped_df['Neighborhood'] != 'Not assigned'
toronto_grouped_df['Neighborhood'] = toronto_grouped_df['Neighborhood'].where(
    neighborhood_assigned, toronto_grouped_df['Borough']
)

toronto_grouped_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [17]:
# Listing every distinct Postal Code present in the grouped dataset.
postal_codes = toronto_grouped_df['Postal Code']
postal_codes.unique()

array(['M1B', 'M1C', 'M1E', 'M1G', 'M1H', 'M1J', 'M1K', 'M1L', 'M1M',
       'M1N', 'M1P', 'M1R', 'M1S', 'M1T', 'M1V', 'M1W', 'M1X', 'M2H',
       'M2J', 'M2K', 'M2L', 'M2M', 'M2N', 'M2P', 'M2R', 'M3A', 'M3B',
       'M3C', 'M3H', 'M3J', 'M3K', 'M3L', 'M3M', 'M3N', 'M4A', 'M4B',
       'M4C', 'M4E', 'M4G', 'M4H', 'M4J', 'M4K', 'M4L', 'M4M', 'M4N',
       'M4P', 'M4R', 'M4S', 'M4T', 'M4V', 'M4W', 'M4X', 'M4Y', 'M5A',
       'M5B', 'M5C', 'M5E', 'M5G', 'M5H', 'M5J', 'M5K', 'M5L', 'M5M',
       'M5N', 'M5P', 'M5R', 'M5S', 'M5T', 'M5V', 'M5W', 'M5X', 'M6A',
       'M6B', 'M6C', 'M6E', 'M6G', 'M6H', 'M6J', 'M6K', 'M6L', 'M6M',
       'M6N', 'M6P', 'M6R', 'M6S', 'M7A', 'M7R', 'M7Y', 'M8V', 'M8W',
       'M8X', 'M8Y', 'M8Z', 'M9A', 'M9B', 'M9C', 'M9L', 'M9M', 'M9N',
       'M9P', 'M9R', 'M9V', 'M9W'], dtype=object)

In [23]:
# Creating the sample dataset: the rows for a fixed list of postal codes,
# in the order the codes are listed.
column_names = ['Postal Code','Borough','Neighborhood']

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# The matching rows are collected first and concatenated once.
# DataFrame.append was removed in pandas 2.0, and growing a dataframe row by row
# inside a loop copies the whole frame on every iteration.
matching_rows = [
    toronto_grouped_df[toronto_grouped_df["Postal Code"]==postcode]
    for postcode in test_list
]
test_df = pd.concat(matching_rows, ignore_index=True)[column_names]

test_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


In [27]:
# Reporting the (rows, columns) shape of the grouped dataset.
dataset_shape = toronto_grouped_df.shape
print('The shape of the dataset is : {}'.format(dataset_shape))

The shape of the dataset is : (103, 3)


<H3> Q2: Adding geospatial coordinates from a CSV file & merging datasets. </H3>

In [49]:
# Using Geospatial_coordinates.csv to get the respective coordinates for each postal code.
import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

# Placeholder __iter__ implementation that always returns 0.
# It is bound onto the object-storage response body further down (when that body
# has no __iter__ of its own) so that pandas accepts the body as a file-like object.
def __iter__(self): return 0

In [50]:
#@hidden_cell
# The following code accesses a file in IBM Cloud Object Storage.
# The API key is read from the IBM_COS_API_KEY_ID environment variable so that no
# credential is stored in the notebook. Any key that was previously committed in
# this cell should be treated as leaked and revoked.

ibm_api_key_id = os.environ.get('IBM_COS_API_KEY_ID')
if not ibm_api_key_id:
    raise RuntimeError('Set the IBM_COS_API_KEY_ID environment variable before running this cell.')

# The endpoint depends on whether the notebook runs outside the IBM network.
if os.environ.get('RUNTIME_ENV_LOCATION_TYPE') == 'external':
    endpoint_3e903b121232457f9198385d56db729c = 'https://s3.ap-geo.objectstorage.softlayer.net'
else:
    endpoint_3e903b121232457f9198385d56db729c = 'https://s3.ap-geo.objectstorage.service.networklayer.com'

client_3e903b121232457f9198385d56db729c = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_api_key_id,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url=endpoint_3e903b121232457f9198385d56db729c)

# Streaming body of the CSV object holding the coordinates for each postal code.
body = client_3e903b121232457f9198385d56db729c.get_object(Bucket='capstoneproject-donotdelete-pr-8gg6qvl293kdpl',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

In [51]:
# Loading the coordinates CSV from the object-storage body into a dataframe.
geo_coordinate_df = pd.read_csv(filepath_or_buffer=body)
geo_coordinate_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [52]:
# Left-joining the coordinates onto the grouped neighborhoods by Postal Code,
# so every row of toronto_grouped_df is kept.
df_toronto = toronto_grouped_df.merge(
    geo_coordinate_df,
    how='left',
    on='Postal Code',
)

df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [53]:
# Dataframe with the columns specified in the question along with the coordinates,
# restricted to a fixed list of postal codes and kept in the order they are listed.
column_names = ["Postal Code", "Borough", "Neighborhood", "Latitude", "Longitude"]

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# The matching rows are collected first and concatenated once.
# DataFrame.append was removed in pandas 2.0, and growing a dataframe row by row
# inside a loop copies the whole frame on every iteration.
matching_rows = [
    df_toronto[df_toronto["Postal Code"]==postcode]
    for postcode in test_list
]
test_df = pd.concat(matching_rows, ignore_index=True)[column_names]

test_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
