# Coursera Capstone project

## Author : Oliver Ignetik

Notebook for completing the capstone project

In [1]:
# pandas and numpy
import pandas as pd 
import numpy as np 

#foursquare
import json 
import requests

# packages for clustering 
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium 

## Using pandas to parse HTML table

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(url)

In [3]:
df = dfs[0]
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Check the data types 

In [4]:
df.dtypes

Postcode         object
Borough          object
Neighbourhood    object
dtype: object

# Data wrangling

## Dataframe for boroughs 

In [5]:
# Ensuring Boroughs and Neighbourhoods are not empty 
for index, row in df.iterrows():
    if df.loc[index].Borough == 'Not assigned': 
        df.drop([index], axis = 0,inplace=True)
    else: 
        if df.loc[index].Neighbourhood == 'Not assigned': 
            df.loc[index].Neighbourhood = df.loc[index].Borough

In [6]:
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [7]:
# list for container of dictionaries for each entry in dataframe 
clean_df = []

In [8]:
# get a unique list of postcodes 
postcodes = df.Postcode.unique().tolist()

# loop through list of postcodes 
for postcode in postcodes: 
    # initialize hoods with empty string 
    hoods = ''
    # initialize borough with empty string 
    borough = ''
    # loop through df 
    for index, row in df.iterrows():
        # if there is no entry in borough then assign 
        if df.loc[index].Postcode == postcode and len(borough) == 0: 
            borough=df.loc[index].Borough
            if len(hoods) == 0: 
                hoods = df.loc[index].Neighbourhood
            elif len(hoods) != 0:
                hoods = hoods + ', '+ df.loc[index].Neighbourhood
        # if not then don't assign borough 
        elif df.loc[index].Postcode == postcode and len(borough)!= 0: 
            if len(hoods) == 0: 
                hoods = df.loc[index].Neighbourhood
            elif len(hoods) != 0:
                hoods = hoods + ', '+ df.loc[index].Neighbourhood
    
    #append each entry to dataframe 
    clean_df.append({'Postcode':postcode,'Borough':borough,'Neighbourhood':hoods})

In [9]:
clean_df = pd.DataFrame(clean_df, columns = df.columns)
clean_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


## Geospatial data for each Postcode 

In [10]:
# read in csv file
url = 'https://cocl.us/Geospatial_data'
c = pd.read_csv(url)
c.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# extract list of latitudes and longitudes 
latitudes = []
longitudes = []

# loop through using postcode as mask 
for postcode in postcodes: 
    for index, row in c.iterrows(): 
        if postcode == c.loc[index]['Postal Code']: 
            latitudes.append(c.loc[index].Latitude)
            longitudes.append(c.loc[index].Longitude)
            

In [12]:
# makes two new columns in clean_df
clean_df['Latitude'] = latitudes
clean_df['Longitude'] = longitudes 

# clean dataframe for analysis 
clean_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [178]:
# checking latitude and longitude are correct 
# clean_df.loc[clean_df['Borough'] =='Scarborough']

# Visualising Neighbourhoods 

In [14]:
# finding the address for the city 
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [15]:
# plotting all the neighbourhoods on a map 
# Author : Coursera 
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat,lng,borough,hood in zip(clean_df['Latitude'], clean_df['Longitude'], clean_df['Borough'], clean_df['Neighbourhood']):
    label = '{}'.format(hood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Clustering Neighbourhoods

In [187]:
# foursquare credentials 
import json 
import requests

CLIENT_ID = '12G2NUYXARHRHTBBBO2QGXOKE4PHB0G54DQ5FYI0WVVD1MVX' # your Foursquare ID
CLIENT_SECRET = 'DWMUSFMCBK4EDOH3KLHBIKQO4OYVJDO3TDQPOEKMZG4H00HZ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 12G2NUYXARHRHTBBBO2QGXOKE4PHB0G54DQ5FYI0WVVD1MVX
CLIENT_SECRET:DWMUSFMCBK4EDOH3KLHBIKQO4OYVJDO3TDQPOEKMZG4H00HZ


In [205]:
# function to get nearby venues
# Author: Coursera  

def getNearbyVenues(names, latitudes, longitudes, radius=650, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [206]:
# get venues for each neighbourhood
toronto_venues = getNearbyVenues(names=clean_df['Neighbourhood'],
                                   latitudes=clean_df['Latitude'],
                                   longitudes=clean_df['Longitude']
                                  )

In [207]:
# Author: Coursera
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

# grouping 
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Most popular venues in each neighbourhood

In [208]:
# Find top 10 venues in each neighbourhood 
# Author: Coursera
num_top_venues = 10

# function to find most common venues 
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Café,Coffee Shop,American Restaurant,Steakhouse,Bar,Bakery,Thai Restaurant,Theater,Hotel,Salad Place
1,Agincourt,Skating Rink,Sandwich Place,Lounge,Clothing Store,Breakfast Spot,Badminton Court,Yoga Studio,Dim Sum Restaurant,Diner,Discount Store
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Chinese Restaurant,BBQ Joint,Pizza Place,Fast Food Restaurant,Gym,Park,Bakery,Dog Run,Dessert Shop,Dim Sum Restaurant
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pizza Place,Fried Chicken Joint,Fast Food Restaurant,Beer Store,Discount Store,Japanese Restaurant,Sandwich Place,Coffee Shop,Hardware Store
4,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Pool,Skating Rink,Gas Station,Pub,Sandwich Place,Gym,Pharmacy,Doner Restaurant


## K-means clustering

In [209]:
# K-means clustering algorithm 
# set number of clusters
kclusters = 5

# remove this feature as its not numeric
toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = clean_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

# drop NaN values where foursquare could not retrieve information 
toronto_merged.dropna(inplace =True)
toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2.0,Park,Fast Food Restaurant,Pet Store,Burger Joint,Food & Drink Shop,Yoga Studio,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner
1,M4A,North York,Victoria Village,43.725882,-79.315572,3.0,Playground,Coffee Shop,Portuguese Restaurant,Hockey Arena,Café,Dog Run,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,3.0,Coffee Shop,Pub,Bakery,Park,Theater,Italian Restaurant,Breakfast Spot,Café,Thai Restaurant,Mexican Restaurant
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,3.0,Furniture / Home Store,Clothing Store,Vietnamese Restaurant,Accessories Store,Bar,Sporting Goods Shop,Boutique,Bowling Alley,Rental Car Location,Cheese Shop
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,3.0,Coffee Shop,Sandwich Place,Café,Falafel Restaurant,Italian Restaurant,Burrito Place,Japanese Restaurant,Park,Burger Joint,Bar


## Mapping clusters

In [210]:
# finding the address for the city 
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

In [211]:
# create map and populate with markers that indicate cluster
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(int(cluster)), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters