Applied Data Science Capstone-Week 3 (Notebook 3)

In [1]:
!pip install bs4
!pip install geopy
import requests
from bs4 import BeautifulSoup
import pandas as pd
from geopy.geocoders import Nominatim
import folium
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors



The following cell contains the code for: 1) Creating Dataframe via Web Scraping 2) Removing rows having a Borough that is Not assigned 3) Handling rows having a Neighborhood that is Not assigned 4) Grouping rows by PostalCode

In [2]:
urlData = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup=BeautifulSoup(urlData,'html.parser')
soupTableData=soup.find('table').find_all('td')
postalCode=[]
borough=[]
neighborhood=[]
for i in range(0,len(soupTableData),3):
    postalCode.append(soupTableData[i].text.strip())
    borough.append(soupTableData[i+1].text.strip())
    neighborhood.append(soupTableData[i+2].text.strip())
df=pd.DataFrame(data=[postalCode, borough, neighborhood]).transpose()
df.columns=['PostalCode', 'Borough', 'Neighborhood']
df=df[df['Borough']!='Not assigned']
df['Neighborhood'].replace('Not assigned',df['Borough'], inplace=True)
df=df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


The following cell contains the code for adding the Latitude and Longitude coordinates of each PostalCode to the dataframe

In [3]:
df_geo=pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.rename(columns = {'Postal Code':'PostalCode'}, inplace = True)
df_ll=pd.merge(df, df_geo, on='PostalCode')
df_ll

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


The following cell contains the code for visualizing the neighborhoods in Toronto (only the boroughs that contain the word 'Toronto' were considered)

In [4]:
df_toronto=df_ll[df_ll['Borough'].str.contains('Toronto')].reset_index(drop=True)
address='Toronto, Canada'
geolocator=Nominatim(user_agent="toronto_explorer")
location=geolocator.geocode(address)
latitude=location.latitude
longitude=location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))
map_toronto=folium.Map(location=[latitude, longitude], zoom_start=11)
for lat, lng, label in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
map_toronto

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


The following cell contains the code for exploring the neighborhoods in Toronto

In [5]:
CLIENT_ID='VSFLJD2ZWXQECAKVX35E31LYZHJGVPDLLWT50TBBNDDD1TD5' # your Foursquare ID
CLIENT_SECRET='YAI0HN5VTJRNKLWDMDYBTJ1LIFMYHZNLSMYBWJHMG5THJBHO' # your Foursquare Secret
VERSION='20180605' # Foursquare API version
LIMIT=100
radius=500
venues_list=[]
for neighbor, lat, lng in zip(df_toronto['Neighborhood'], df_toronto['Latitude'], df_toronto['Longitude']):
    url='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat, 
        lng, 
        radius, 
        LIMIT)
    results=requests.get(url).json()["response"]['groups'][0]['items']
    venues_list.append([(
        neighbor, 
        lat, 
        lng, 
        v['venue']['name'], 
        v['venue']['location']['lat'], 
        v['venue']['location']['lng'],  
        v['venue']['categories'][0]['name']) for v in results])
toronto_venues=pd.DataFrame([item for venue_list in venues_list for item in venue_list])
toronto_venues.columns=['Neighborhood', 
              'Neighborhood Latitude', 
              'Neighborhood Longitude', 
              'Venue', 
              'Venue Latitude', 
              'Venue Longitude', 
              'Venue Category']
toronto_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,MenEssentials,43.677820,-79.351265,Cosmetics Shop
...,...,...,...,...,...,...,...
1613,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,The Ashbridge Estate,43.664691,-79.321805,Garden
1614,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,TTC Russell Division,43.664908,-79.322560,Light Rail Station
1615,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Jonathan Ashbridge Park,43.664702,-79.319898,Park
1616,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Olliffe On Queen,43.664503,-79.324768,Butcher


The following cell contains the code for listing out the top five most common venues of each neighborhood in Toronto

In [6]:
toronto_onehot=pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood']=toronto_venues['Neighborhood']
fixed_columns=[toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot=toronto_onehot[fixed_columns]
toronto_grouped=toronto_onehot.groupby('Neighborhood').mean().reset_index()
num_top_venues=5
for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp=toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns=['venue','freq']
    temp=temp.iloc[1:]
    temp['freq']=temp['freq'].astype(float)
    temp=temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')


----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1        Cocktail Bar  0.05
2  Seafood Restaurant  0.04
3                Café  0.04
4         Cheese Shop  0.04


----Brockton, Parkdale Village, Exhibition Place----
            venue  freq
0            Café  0.12
1  Breakfast Spot  0.08
2       Nightclub  0.08
3          Bakery  0.08
4     Coffee Shop  0.08


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
                  venue  freq
0    Light Rail Station  0.13
1           Pizza Place  0.07
2            Restaurant  0.07
3  Fast Food Restaurant  0.07
4        Farmers Market  0.07


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2  Airport Terminal  0.12
3       Coffee Shop  0.06
4               Bar  0.06


----Central Bay Street----
                 venue  freq
0      

The following cell contains the code for creating a dataframe that stores the top ten venues of each neighborhood in Toronto

In [7]:
def return_most_common_venues(row, num_top_venues):
    row_categories=row.iloc[1:]
    row_categories_sorted=row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]
num_top_venues=10
indicators=['st', 'nd', 'rd']
columns=['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
neighborhoods_venues_sorted=pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood']=toronto_grouped['Neighborhood']
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:]=return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Café,Restaurant,Cheese Shop,Seafood Restaurant,Beer Bar,Bakery,Bistro,Fish Market
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Bakery,Nightclub,Restaurant,Bar,Intersection,Stadium,Climbing Gym
2,"Business reply mail Processing Centre, South C...",Light Rail Station,Pizza Place,Garden,Park,Skate Park,Farmers Market,Fast Food Restaurant,Burrito Place,Butcher,Restaurant
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Boat or Ferry,Harbor / Marina,Coffee Shop,Plane,Sculpture Garden,Bar,Rental Car Location
4,Central Bay Street,Coffee Shop,Italian Restaurant,Sandwich Place,Café,Japanese Restaurant,Department Store,Burger Joint,Bubble Tea Shop,Salad Place,Ramen Restaurant
5,Christie,Grocery Store,Café,Park,Restaurant,Italian Restaurant,Diner,Baby Store,Athletics & Sports,Candy Store,Nightclub
6,Church and Wellesley,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Restaurant,Gay Bar,Yoga Studio,Hotel,Café,Burger Joint,Bubble Tea Shop
7,"Commerce Court, Victoria Hotel",Coffee Shop,Restaurant,Café,Hotel,Gym,American Restaurant,Japanese Restaurant,Seafood Restaurant,Italian Restaurant,Deli / Bodega
8,Davisville,Dessert Shop,Sandwich Place,Pizza Place,Café,Italian Restaurant,Sushi Restaurant,Coffee Shop,Gym,Pharmacy,Indian Restaurant
9,Davisville North,Gym / Fitness Center,Breakfast Spot,Hotel,Food & Drink Shop,Department Store,Sandwich Place,Park,Dumpling Restaurant,Donut Shop,Eastern European Restaurant


The following cell contains the code for creating a dataframe that stores the cluster as well as the top ten venues of each neighborhood in Toronto

In [8]:
kclusters=5
toronto_grouped_clustering=toronto_grouped.drop('Neighborhood', 1)
kmeans=KMeans(n_init=300, n_clusters=kclusters, random_state=5).fit(toronto_grouped_clustering)
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged=df_toronto
toronto_merged=toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,Trail,Health Food Store,Pub,Doner Restaurant,Dessert Shop,Diner,Discount Store,Distribution Center,Dog Run,Women's Store
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,2,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Bookstore,Restaurant,Furniture / Home Store,Yoga Studio,Japanese Restaurant,Indian Restaurant
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,2,Park,Pizza Place,Fast Food Restaurant,Pet Store,Liquor Store,Fish & Chips Shop,Steakhouse,Italian Restaurant,Brewery,Sandwich Place
3,M4M,East Toronto,Studio District,43.659526,-79.340923,2,Café,Coffee Shop,American Restaurant,Bakery,Brewery,Gastropub,Gym / Fitness Center,Fish Market,Pet Store,Park
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Park,Swim School,Bus Line,Women's Store,Department Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197,2,Gym / Fitness Center,Breakfast Spot,Hotel,Food & Drink Shop,Department Store,Sandwich Place,Park,Dumpling Restaurant,Donut Shop,Eastern European Restaurant
6,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678,2,Clothing Store,Coffee Shop,Gym / Fitness Center,Mexican Restaurant,Salon / Barbershop,Restaurant,Rental Car Location,Pet Store,Park,Miscellaneous Shop
7,M4S,Central Toronto,Davisville,43.704324,-79.38879,2,Dessert Shop,Sandwich Place,Pizza Place,Café,Italian Restaurant,Sushi Restaurant,Coffee Shop,Gym,Pharmacy,Indian Restaurant
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,0,Summer Camp,Lawyer,Park,Tennis Court,Women's Store,Dessert Shop,Diner,Discount Store,Distribution Center,Doner Restaurant
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,2,Coffee Shop,Pub,Light Rail Station,American Restaurant,Sushi Restaurant,Restaurant,Bank,Fried Chicken Joint,Sports Bar,Bagel Shop


The following cell contains the code for visualizing the clusters of neighborhoods in Toronto 

In [9]:
map_clusters=folium.Map(location=[latitude, longitude], zoom_start=11)
x=np.arange(kclusters)
ys=[i + x + (i*x)**2 for i in range(kclusters)]
colors_array=cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow=[colors.rgb2hex(i) for i in colors_array]
markers_colors=[]
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label=folium.Popup(str(poi) + ' (Cluster ' + str(cluster) + ')', parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)       
map_clusters