 # Segmenting and Clustering Neighborhoods in Toronto

In [5]:
#import packages
import pandas as pd
import requests
from bs4 import BeautifulSoup

Now we scrap Toronto postalcodes from Wikipedia

In [9]:
#wikipedia link
url=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [10]:
#set up beautiful soup
bs=BeautifulSoup(url,'lxml')

since the zipcodes are displayed in a table, we pass "table" statement in the BeautifulSoup

In [14]:
#scrap postalcode table
postable=bs.find('table',{'class':'wikitable sortable'})

In [16]:
#convert the ziptable into a pandas dataframe
posdf=pd.read_html(str(postable),header=0)
posdf=pd.DataFrame(posdf[0])

In [26]:
#filter out the borough that are "not assigned"
posdf.drop(posdf[posdf.Borough=='Not assigned'].index, inplace=True)

In [28]:
# deal with repeated neighborhoods
posdf=posdf.groupby("Postcode").agg(lambda x:','.join(set(x)))
posdf.reset_index(inplace = True)

In [57]:
#rename column
posdf.rename(columns={"Postcode":"Postal Code"},inplace=True)

In [58]:
posdf.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
102,M9W,Etobicoke,Northwest
101,M9V,Etobicoke,"Humbergate,Jamestown,Thistletown,Albion Garden..."
100,M9R,Etobicoke,"Martin Grove Gardens,Richview Gardens,Kingsvie..."
99,M9P,Etobicoke,Westmount
98,M9N,York,Weston


In [31]:
print('There are {} rows in the dataframe'.format(posdf.shape[0]))

There are 103 rows in the dataframe


# Use the Geocoder package to get the latitude and longitude

In [32]:
#download
!wget -q -O 'Geospatial_Coordinates.csv' http://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


In [33]:
#read teh spatial data into csv with pandas
geo=pd.read_csv('Geospatial_Coordinates.csv')
geo.columns

Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')

In [38]:
geo.shape

(103, 3)

geo dataframe has the same number of rows with posdf dataframe

In [None]:
#match the posdf dateframe with geo dataframe
#Filtering out the movies by title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovies = pd.merge(inputId, inputMovies)
#Dropping information we won't use from the input dataframe
inputMovies = inputMovies.drop('year', 1)
#Final input dataframe
#If a movie you added in above isn't here, then it might not be in the original 
#dataframe or it might spelled differently, please check capitalisation.
inputMovies



In [59]:
pre_exam=posdf['Postal Code'].isin(geo['Postal Code'])
pre_exam.unique()

array([ True], dtype=bool)

In [68]:
posdf.sort_values(by='Postal Code',ascending=True,axis=0,inplace=True)
geo.sort_values(by='Postal Code',ascending=True,axis=0,inplace=True)

merged=pd.merge(posdf, geo, on=posdf['Postal Code'],right_index=True, left_index=True)

In [69]:
merged.columns

Index(['Postal Code_x', 'Borough', 'Neighbourhood', 'Postal Code_y',
       'Latitude', 'Longitude'],
      dtype='object')

In [70]:
#drop duplicated columns and rename Postal Code column
merged.drop('Postal Code_y',axis=1,inplace=True)
merged.rename(columns={'Postal Code_x':'Postal Code'},inplace=True)

In [71]:
merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union,Highland Creek,Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,West Hill,Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [74]:
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize 
!conda install -c conda-forge folium=0.5.0 --yes 
import folium
print('installation finished')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  50.06 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  33.70 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  36.39 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  45.51 MB/s


In [75]:
address = 'Toronto, ON'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [84]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(merged['Latitude'], merged['Longitude'], merged['Borough'], merged['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='blue',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto