Import the standard libraries to work with dataframes

In [None]:
import numpy as np
import pandas as pd

Use the read_html function from the pandas library and pass it the URL of the Wikipedia page. This returns a list of dataframes, which is assigned to the variable 'dfs'.

In [None]:
# Wikipedia page listing the Canadian postal codes that start with "M".
postal_codes_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# read_html parses the page and returns a list of DataFrames.
dfs = pd.read_html(postal_codes_url)

Display the first five dfs

In [None]:
# Show at most the first five parsed tables.
dfs[:5]

Assign the first dataframe from dfs to the dataframe: df_toronto

In [None]:
# Keep the first table from the list returned by read_html as the working dataset.
df_toronto = dfs[0]

Display first 8 rows in this dataframe

In [None]:
# Preview the first eight rows of the raw table.
df_toronto.iloc[:8]

Created a copy of this dataframe for data wrangling

In [None]:
# Work on an independent (deep) copy so the raw table stays untouched.
df_toronto_w = df_toronto.copy(deep=True)

Remove every row from the dataframe whose borough is listed as 'Not assigned'.

In [None]:
# Keep only rows whose borough has been assigned. A boolean mask replaces the
# row-by-row iterrows()/drop(inplace=True) loop: it is vectorised, does not
# mutate the frame while it is being iterated, and is safe to re-run.
# The original index labels are preserved (the index is reset further down);
# .copy() makes the result an independent frame rather than a slice.
df_toronto_w = df_toronto_w[df_toronto_w['Borough'] != 'Not assigned'].copy()

Search for any rows where the Neighbourhood is listed as 'Not assigned'. The query returns no records.

In [None]:
# Select the rows whose neighbourhood is still the 'Not assigned' placeholder.
df_toronto_w[df_toronto_w['Neighbourhood'].eq('Not assigned')]

Display the resulting dataframe. This shows the gaps left in the index by the removed rows; the index is reset in the following cell.

In [None]:
# Preview the first five rows after filtering.
df_toronto_w.head(5)

In [None]:
# Renumber the rows 0..n-1 and discard the old index labels. Rebinding the
# name (instead of inplace=True) keeps the cell free of in-place mutation.
df_toronto_w = df_toronto_w.reset_index(drop=True)
df_toronto_w.head()

Using the shape attribute, report the number of rows in the resulting dataframe.

In [None]:
# Number of rows left after removing unassigned boroughs.
row_count = len(df_toronto_w)
print('There are ', row_count, ' rows in this dataframe')

Read the postal-code CSV from the site below to obtain the latitude and longitude coordinates of each postal code above.

In [None]:
# CSV with one latitude/longitude pair per postal code.
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
coords = pd.read_csv(geospatial_csv_url)

Place the coordinates found above into the dataframe previously created using merge.

In [None]:
# Attach the coordinates to each postal code; both frames are joined on the
# same 'Postal Code' column using merge's default join type.
join_key = 'Postal Code'
df_toronto_w = pd.merge(df_toronto_w, coords, left_on=join_key, right_on=join_key)

Use the groupby and count functions to review number of neighbourhoods in each borough.

In [None]:
# Count the non-null entries of every column within each borough.
df_toronto_w.groupby(by='Borough').count()

Based on results above, will be reviewing neighbourhoods within the North York borough. Using geopy, obtain the coordinates of North York.

In [None]:
from geopy.geocoders import Nominatim

In [None]:
address = 'North York Ontario'

# Look the address up with the Nominatim geocoder.
geolocator = Nominatim(user_agent="nyork_explorer")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York borough are {}, {}.'.format(latitude, longitude))

Installing and importing folium to create maps.

In [None]:
# Install the pinned folium release. %pip (rather than !pip) installs into the
# environment of the running kernel. Comment this line out if folium is already installed.
%pip install folium==0.5.0

In [None]:
import folium

In [None]:
# create map of North York centred on the geocoded latitude and longitude values
map_nyork = folium.Map(location=[latitude, longitude], zoom_start=12)

In [None]:
# Display the map as the cell output.
map_nyork

Create dataframe to contain only North York data.

In [None]:
# Rows belonging to the North York borough, renumbered from zero.
in_north_york = df_toronto_w['Borough'] == 'North York'
nyork_data = df_toronto_w.loc[in_north_york].reset_index(drop=True)

Using the new dataframe, add markers to map focused on North York

In [None]:
# Add one blue circle marker per North York neighbourhood; each marker
# carries a popup holding the neighbourhood name.
marker_rows = zip(nyork_data['Latitude'], nyork_data['Longitude'], nyork_data['Neighbourhood'])
for marker_lat, marker_lng, hood_name in marker_rows:
    popup = folium.Popup(hood_name, parse_html=True)
    marker = folium.CircleMarker(
        [marker_lat, marker_lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_nyork)

map_nyork

Enter Foursquare API credentials

In [None]:
import os
import warnings

# Credentials are read from the environment so that secrets are never stored
# in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel.
# NOTE: any key pair that was previously committed in this cell should be
# treated as compromised and revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare API requests will be rejected.'
    )
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [None]:
# Name of the neighbourhood stored under index label 0.
nyork_data.at[0, 'Neighbourhood']

In [None]:
# Pull the name and coordinates of the neighbourhood at index label 0.
neighbourhood_name, neighbourhood_latitude, neighbourhood_longitude = (
    nyork_data.at[0, column] for column in ('Neighbourhood', 'Latitude', 'Longitude')
)

summary_template = 'Latitude and longitude values of {} are {}, {}.'
print(summary_template.format(neighbourhood_name, neighbourhood_latitude, neighbourhood_longitude))

In [None]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # search radius sent to the API (presumably metres -- confirm against the Foursquare docs)

# Fill the explore-endpoint URL template with the credentials, the API
# version, the first neighbourhood's coordinates, the radius and the limit.
explore_url_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
url = explore_url_template.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighbourhood_latitude,
    neighbourhood_longitude,
    radius,
    LIMIT,
)

Import libraries below to work with json files from Foursquare

In [None]:
import json # library to handle JSON files


import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

Function getNearbyVenues as shown in lab.

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare reports around each given location.

    One request is sent to the ``venues/explore`` endpoint per
    (name, latitude, longitude) triple, using the module-level CLIENT_ID,
    CLIENT_SECRET, VERSION and LIMIT settings.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences describing the locations to query. They are
        zipped together, so iteration stops at the shortest one.
    radius : int, optional
        Search radius forwarded to the API (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty, but still has these columns, when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # A timeout stops a stalled connection from hanging the notebook, and
        # raise_for_status surfaces API errors instead of a later KeyError.
        response = requests.get(explore_url, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # Keep only the relevant information for each nearby venue.
        for item in items:
            venue = item['venue']
            # Some venues carry no category; record None rather than failing.
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the columns to the constructor keeps the frame well-formed even
    # when no venue was collected.
    return pd.DataFrame(venue_rows, columns=columns)

Using the above function, get venues from the North York specific dataframe.

In [None]:
# Fetch the venues around every North York neighbourhood.
venue_query = dict(
    names=nyork_data['Neighbourhood'],
    latitudes=nyork_data['Latitude'],
    longitudes=nyork_data['Longitude'],
)
nyork_venues = getNearbyVenues(**venue_query)

Group the venues by Neighbourhood

In [None]:
# Count the non-null entries of every column for each neighbourhood.
nyork_venues.groupby(by='Neighbourhood').count()

Analyze each neighbourhood

In [None]:
# One indicator column per venue category; the empty prefix and separator
# leave the bare category names as column names.
category_indicators = pd.get_dummies(nyork_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighbourhood of each venue.
category_indicators['Neighbourhood'] = nyork_venues['Neighbourhood']

# Rotate the final column to the front, keeping the others in order.
*leading_columns, trailing_column = category_indicators.columns
fixed_columns = [trailing_column] + leading_columns
nyork_onehot = category_indicators[fixed_columns]

nyork_onehot.head()

####  Group rows by neighbourhood by taking the mean of the frequency of occurrence of each category of venues

In [None]:
# Mean of each indicator column per neighbourhood.
category_means = nyork_onehot.groupby('Neighbourhood').mean()
# Turn the neighbourhood index back into an ordinary column.
nyork_grouped = category_means.reset_index()
nyork_grouped

Find the top venue categories in each neighbourhood

In [None]:
num_top_venues = 5

# For every neighbourhood, print its most frequent venue categories.
for hood in nyork_grouped['Neighbourhood']:
    print("----"+hood+"----")

    # Transpose the neighbourhood's single row into (venue, freq) pairs.
    hood_profile = nyork_grouped[nyork_grouped['Neighbourhood'] == hood].T.reset_index()
    hood_profile.columns = ['venue','freq']

    # Row 0 holds the neighbourhood name itself, so skip it. Using .assign
    # on the slice (instead of assigning a column into it) avoids pandas'
    # SettingWithCopy pattern.
    top_categories = (
        hood_profile.iloc[1:]
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
        .head(num_top_venues)
    )
    print(top_categories)
    print('\n')

Function from lab to find most common venue types

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first element of ``row`` is skipped; the remaining values are sorted
    in descending order and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
# Ranks 1-3 take their suffix from `indicators`; every later rank gets 'th'.
# An explicit bounds check replaces the bare `except:` that previously
# doubled as control flow and would have hidden unrelated errors.
columns = ['Neighbourhood']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = nyork_grouped['Neighbourhood']

# Fill each row with that neighbourhood's top venue categories.
for ind in range(nyork_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(nyork_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Import KMeans to perform the clustering.

In [None]:
from sklearn.cluster import KMeans

In [None]:
# set number of clusters
kclusters = 5

# Drop the label column so only the numeric category frequencies remain.
# The keyword form is required: the positional axis argument of drop()
# is deprecated and no longer accepted by newer pandas releases.
nyork_grouped_clustering = nyork_grouped.drop(columns='Neighbourhood')

# run k-means clustering
# n_init is set explicitly to the long-standing default of 10 so that results
# do not change across scikit-learn versions.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(nyork_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
# add clustering labels
# DataFrame.insert raises if the column already exists, so remove any column
# left over from a previous run of this cell to keep the cell re-runnable.
if 'Cluster Labels' in neighbourhoods_venues_sorted.columns:
    neighbourhoods_venues_sorted = neighbourhoods_venues_sorted.drop(columns='Cluster Labels')
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Join the cluster label and top venues onto nyork_data (which holds the
# latitude/longitude of each neighbourhood). join() returns a new frame, so
# nyork_data itself is not modified. Neighbourhoods missing from
# neighbourhoods_venues_sorted get NaN in the joined columns.
nyork_merged = nyork_data.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

nyork_merged.head() # check the last columns!

A number of fields in the above dataframe contain NaN values, so the 'dropna' function is used to remove those rows and ensure the map markers can be placed in the following cells. The necessary modules from matplotlib are also imported to assist with colouring the map. Finally, the map of North York with the computed clusters is displayed.

In [None]:
# Discard every row that contains at least one missing value.
nyork_merged = nyork_merged.dropna(axis=0, how='any')

In [None]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(nyork_merged['Latitude'], nyork_merged['Longitude'], nyork_merged['Neighbourhood'], nyork_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly. The
    # earlier `cluster-1` only worked because -1 wraps round to the last colour.
    cluster_colour = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters