# IBM Data Science Capstone Project Week 3

**Segmenting and Clustering Neighborhoods in Toronto**

# **Part 1** 
**Data Scrapping and Building a Dataframe**


**1. Importing required dependencies**

In [2]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
import json
from pandas.io.json import json_normalize
import numpy as np

**2. Web scrapping and storing the information in a dataframe**

In [3]:
#GET request and parsing
webpage = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(webpage, 'lxml')

#from table to dataframe
table_contents=[]
table = soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

df_toronto = pd.DataFrame(table_contents)
df_toronto['Borough']=df_toronto['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


**3. Grouping Neighborhoods with same Postal Code**

In [4]:
#grouping neighbourhoods with same postal codes

df_toronto = df_toronto.groupby(['PostalCode', 'Borough'], as_index=False).agg(lambda x: ", ".join(x))
df_toronto['Neighborhood'] = df_toronto['Neighborhood'].str.replace('/', ',')
df_toronto.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


**4. Shape of the Dataframe**

In [5]:
#Shape of the dataframe
df_toronto.shape

(103, 3)

# **Part 2**

**Lattitude and Longitude**

**1. Importing required dependencies**

In [6]:
#installing and importing dependencies
!pip install geocoder
import geocoder



**2. Getting Lattitue and Longitude Information** 

*As geocoder package is taking too long to retrive the data, The dataset of postal codes is used for the assignment purpose as mentioned in the coursera course assignment.*'


In [None]:
#Getting Coordinates
postalcodelist = df_toronto['PostalCode'].tolist()
coordinates = []
for postalcode in postalcodelist:
  lat_lng_coords = None
  while(lat_lng_coords is None):
    g = geocoder.google('{}, Toronto, Ontario'.format(postalcode))
    lat_lng_coords = g.latlng
  coordinates = lat_lng_coords.tolist()

**3. Importing Postalcode dataset and preprocessing**

In [8]:
#importing the grospatial coordinates for postalcodes
url_csv = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv"
df_toronto_postcodes = pd.read_csv(url_csv , header = None )

#Formatting the dataset 
df_toronto_postcodes.rename(columns = {0:"PostalCode" , 1:"Latitude", 2:"Longitude"}, inplace = True)
df_toronto_postcodes.drop(0 , axis = 0 , inplace=True)
df_toronto_postcodes = df_toronto_postcodes.reset_index(drop=True)
df_toronto_postcodes.head()


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.8066863,-79.1943534
1,M1C,43.7845351,-79.1604971
2,M1E,43.7635726,-79.1887115
3,M1G,43.7709921,-79.2169174
4,M1H,43.773136,-79.2394761


In [9]:
#Merging df_toronto and df_toronto_postcodes 
df_toronto_ll = pd.merge(df_toronto, df_toronto_postcodes, on='PostalCode')
df_toronto_ll.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.8066863,-79.1943534
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.7845351,-79.1604971
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,43.7709921,-79.2169174
4,M1H,Scarborough,Cedarbrae,43.773136,-79.2394761


**4. Storing the dataset**

In [10]:
df_toronto_ll.to_csv('toronto.csv',index=False)

# **Part 3**

**Segmenting and Clustering Neighborhoods**

In [11]:
import folium
import requests 
import json 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim 

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [12]:
import sys
#print(sys.getrecursionlimit())
sys.setrecursionlimit(9999)

In [13]:
address = 'Toronto , Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [14]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto_ll['Latitude'], df_toronto_ll['Longitude'], df_toronto_ll['Borough'], df_toronto_ll['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

RecursionError: ignored