## Section 01

#### Scrape data from Wikipedia

In [187]:
# import Libraries

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

import pandas as pd

import numpy as np

In [188]:
# link to the Wikipedia webpage where Toronto Neighborhood data is available

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# using urllib to request, open and read data (Note: Method used in the course with requests and json failed with this assignment)

wiki_html = urllib.request.urlopen(url).read()

# using BeautifulSoup scrap the webpage data

wiki_soup = BeautifulSoup(wiki_html, 'html.parser')

# using find_all method for 'table' tag, resultant list item 0 privdes the Toronto table data

Toronto_tbl_html=wiki_soup.find_all('table')[0]
Toronto_tbl_html

# using the find_all method withing Toronto table to list all 'tr' tags to find rows

table_rows=Toronto_tbl_html.find_all('tr')

# print first 10 items to analyse data

print(table_rows[0:3])

[<tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>, <tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>, <tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>]


In [189]:
# creating Array using each rows as follows

table_data=list()
for rows in table_rows:
    table_data.append(rows.find_all('td'))
    
# print first 5 items to understand data

print(table_data[0:5])

[[], [<td>M1A
</td>, <td>Not assigned
</td>, <td>Not assigned
</td>], [<td>M2A
</td>, <td>Not assigned
</td>, <td>Not assigned
</td>], [<td>M3A
</td>, <td>North York
</td>, <td>Parkwoods
</td>], [<td>M4A
</td>, <td>North York
</td>, <td>Victoria Village
</td>]]


In [190]:
# cleaning up the html tags appearing with the table data using following loop and creating new_table with clean data

new_table=list()
for row in table_data:
    new_row=list()
    for item in row:
        #print(item.text.strip()) --> verifying the outcome
        new_row.append(item.text.strip())
    new_table.append(new_row)
    
# print first five lists to verify the table

new_table[0:5]

[[],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village']]

In [191]:
# convert to Dataframe using pandas method

df=pd.DataFrame(new_table, columns=None)
df.head()

Unnamed: 0,0,1,2
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [192]:
# extract the column names of the table 
column_list=list()
for item in Toronto_tbl_html.find_all('th'):
    column_list.append(item.text.strip())
column_list

# remove redundant index, reset and rename columns
df=df.loc[1:,:].reset_index(drop=True)
df.columns=column_list
print('shape of the Toronto table before processing is {}'.format(df.shape))
df.head()

shape of the Toronto table before processing is (180, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [193]:
# process the data frame as per the criteria 

Toronto_df=df[(df.Borough!='Not assigned')].reset_index(drop=True)
Toronto_df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [194]:
print('shape of the Toronto dataframe after removing Not assigned rows is {}'.format(Toronto_df.shape))

shape of the Toronto dataframe after removing Not assigned rows is (103, 3)


In [195]:
print('Toronto dataframe contains {} unique Boroughs and {} unique Nieghborhoods'.format(len(Toronto_df.Borough.unique()),len(Toronto_df.Neighborhood.unique())))

Toronto dataframe contains 10 unique Boroughs and 99 unique Nieghborhoods


**Hence check which Neighborhoods are repeated in the Dataframe as follows**

In [196]:
Toronto_grouped = Toronto_df.groupby(['Borough','Neighborhood']).count()
Toronto_grouped[Toronto_grouped['Postal Code']>1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Postal Code
Borough,Neighborhood,Unnamed: 2_level_1
North York,Don Mills,2
North York,Downsview,4


In [197]:
# now lets print the above Neighborhoods to find the relevant details from the Toronto_df

Toronto_repeated = Toronto_df[Toronto_df.Neighborhood=='Don Mills'].append(Toronto_df[Toronto_df.Neighborhood=='Downsview'])
Toronto_repeated

Unnamed: 0,Postal Code,Borough,Neighborhood
7,M3B,North York,Don Mills
13,M3C,North York,Don Mills
40,M3K,North York,Downsview
46,M3L,North York,Downsview
53,M3M,North York,Downsview
60,M3N,North York,Downsview


So combining the above postal codes we get

In [198]:
Toronto_df_grouped = Toronto_df.groupby(['Borough','Neighborhood'],as_index=False).sum()
Toronto_df_grouped

Unnamed: 0,Borough,Neighborhood,Postal Code
0,Central Toronto,Davisville,M4S
1,Central Toronto,Davisville North,M4P
2,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",M5P
3,Central Toronto,Lawrence Park,M4N
4,Central Toronto,"Moore Park, Summerhill East",M4T
...,...,...,...
94,York,Caledonia-Fairbanks,M6E
95,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",M6M
96,York,Humewood-Cedarvale,M6C
97,York,"Runnymede, The Junction North",M6N


In [199]:
Toronto_df_processed = Toronto_df_grouped[['Postal Code','Borough','Neighborhood']]
Toronto_df_processed.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M4S,Central Toronto,Davisville
1,M4P,Central Toronto,Davisville North
2,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park"
3,M4N,Central Toronto,Lawrence Park
4,M4T,Central Toronto,"Moore Park, Summerhill East"


However, it is noteworthy that later in Section 02, the CSV file provided by Coursera contains 103 indices and hence, i will disregard **'Toronto_df_processed'** and use **'Toronto_df'** for the subsequent assignment

____

## Section 02

**Use Geocoder and find the Latitudes and Longitudes corresponding to the Neighborhoods in the above Dataframe**

In [200]:
# import Geocoders Library to find coordinates

from geopy.geocoders import Nominatim

In [201]:
# use while loop to extract each Neighborhood address from the Toronto_df

i = 0
none_list=list()
while i < len(Toronto_df.Neighborhood):
    address=Toronto_df.Neighborhood[i] 
    
        #------Geocoder------
    try:
        geolocator = Nominatim(user_agent="Toronto_explorer")
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
        #print('{}. The geograpical coordinates of {} are {}, {}.'.format(i,address,latitude, longitude))
        i = i+1

    except:
        #print('{}. None for {}.'.format(i,address))
        none_list.append(address)
        i = i+1
        continue
print('Couldnt obtain coordinates for {} of Neighborhoods'.format(len(none_list)))

Couldnt obtain coordinates for 37 of Neighborhoods


**After several attempts to render coordinates via Geocoder, number of Neighnorhoods still missing their coordinates.
Hence, I will be opting to use the CSV file provided by Coursera.**

In [202]:
# extracting CSV file with coordinates to a Dataframe as follows

url_coord = 'https://cocl.us/Geospatial_data'

df_coord=pd.read_csv(url_coord)

# set Postal Code as index

df_coord=df_coord.set_index('Postal Code', drop=True) 
df_coord.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [203]:
# Create two lists to extract Lat and Long corresponding to each postal code

Latitude=list()
Longitude=list()
for p_code in Toronto_df['Postal Code']:
    #print(p_code)
    try:
        Latitude.append(df_coord.loc[p_code,'Latitude'])
        Longitude.append(df_coord.loc[p_code,'Longitude'])
    except:
        continue
print('{} Latitudes and {} Longitudes captured'.format(len(Latitude),len(Longitude)))

103 Latitudes and 103 Longitudes captured


In [204]:
Toronto_df['Latitude'],Toronto_df['Longitude'] = Latitude,Longitude

Toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Saving the Dataframe containing Toronto data as a checkpoint hence, the above process is not requried to be run for subsequent processes --> Just did for fun

Toronto_df.to_csv(r'C:\Users\Prasanna Suresh\Desktop\py4e\DataScience_reloaded\Toronto.csv')

____

## Section 03

### Analysing Neighborhoods and Mapping

In [205]:
# Lets find coordinate for Toronto to create a Map so that we can superimpose all the Neighborhoods to visualise

address = 'Toronto, ON'

geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [206]:
# Lets findout the number of Boroughs and Neighborhoods in Toronto

print('The Toronto_df has {} boroughs and {} neighborhoods.'.format(
        len(Toronto_df['Borough'].unique()),Toronto_df.shape[0]))

The Toronto_df has 10 boroughs and 103 neighborhoods.


In [207]:
import folium

In [208]:
# create map of Toronto using latitude and longitude values

map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['Borough'], Toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

In [209]:
# Lets find how many Boroughs in the Dataframe

Toronto_Boroughs = Toronto_df.Borough.unique()
Toronto_Boroughs

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

### Let's analyse the Neighborhood in the first Borough in the above list as follows:

**Step-1 : Creating a new Data Frame for the particular Borough**

In [210]:
df_NorthYork=Toronto_df[Toronto_df['Borough']==Toronto_Boroughs[0]]
df_NorthYork = df_NorthYork.reset_index(drop=True)
df_NorthYork.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
3,M3B,North York,Don Mills,43.745906,-79.352188
4,M6B,North York,Glencairn,43.709577,-79.445073


In [211]:
print('Number of Neighborhood in North York is {}'.format(df_NorthYork.shape[0]))

Number of Neighborhood in North York is 24


**Step-2: Finding the coordinates for the Borough in order to create a Folium map and analyse Neighborhoods**

In [212]:
# Lets find coordinate for North York to create a Map so that we can superimpose Neighborhoods in North York

address = 'North York, Toronto'

geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address,latitude, longitude))

The geograpical coordinate of North York, Toronto are 43.7543263, -79.44911696639593.


**Step-3: Create the Map relevant to the Borough and its Neighboorhoods**

In [213]:
# create map using latitude and longitude values

map_NorthYork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_NorthYork['Latitude'], df_NorthYork['Longitude'], df_NorthYork['Borough'], df_NorthYork['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NorthYork)  
    
map_NorthYork

**Step-4: Using Four Square API in order to analyse each Neighborhood venue**

In [214]:
#Setting up Frousquare credentials and other relevant search parameters

CLIENT_ID = '5ROK32HYDJCZ5LPXBBIJWULIPBQQKRRB2FXCPLZB0MC3P0IH' # your Foursquare ID
CLIENT_SECRET = 'KNXGCE5K11BWJCHRBHQHA4E1YVZA4EIPODIQQW3DX352HOH2' # your Foursquare Secret
VERSION = '20200630' # Foursquare API version
radius = 500
limit = 100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 5ROK32HYDJCZ5LPXBBIJWULIPBQQKRRB2FXCPLZB0MC3P0IH
CLIENT_SECRET:KNXGCE5K11BWJCHRBHQHA4E1YVZA4EIPODIQQW3DX352HOH2


**Let us find out the Foursquare API call request content to obtain the relevant data for our Analysis**

GET https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}

**Step-5: Lets check the first Neighborhood in North York, Toronto**

In [215]:
Neighborhood_name = df_NorthYork.loc[0,'Neighborhood']
latitude = df_NorthYork.loc[0,'Latitude']
longitude = df_NorthYork.loc[0,'Longitude']
print('Latitude and Longitude of {} is {},{}'.format(Neighborhood_name,latitude,longitude))

Latitude and Longitude of Parkwoods is 43.7532586,-79.3296565


In [216]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
    CLIENT_ID,CLIENT_SECRET,latitude,longitude,VERSION,radius,limit)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=5ROK32HYDJCZ5LPXBBIJWULIPBQQKRRB2FXCPLZB0MC3P0IH&client_secret=KNXGCE5K11BWJCHRBHQHA4E1YVZA4EIPODIQQW3DX352HOH2&ll=43.7532586,-79.3296565&v=20200630&radius=500&limit=100'

In [217]:
import requests
results = requests.get(url).json()

In [218]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [219]:
import json
from pandas import json_normalize

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Variety Store,Food & Drink Shop,43.751974,-79.333114


In [220]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

2 venues were returned by Foursquare.


### Creating a function to carryout Step 5 above and Lets analyse all Neighborhood in North York Boroug

In [221]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [222]:
NorthYork_venues=getNearbyVenues(df_NorthYork.Neighborhood,df_NorthYork.Latitude,df_NorthYork.Longitude)

Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
Downsview
York Mills, Silver Hills
Downsview
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview
Bedford Park, Lawrence Manor East
Humberlea, Emery
Willowdale, Willowdale East
Downsview
York Mills West
Willowdale, Willowdale West


In [223]:
print(NorthYork_venues.shape)
NorthYork_venues.head()

(235, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [224]:
print('we found {} of venues related to {} of Neighborhoods in North York'.
      format(NorthYork_venues.shape[0],len(NorthYork_venues.Neighborhood.unique())))

we found 235 of venues related to 19 of Neighborhoods in North York


Again did for fun saving the work after the search of venues using Foursquare   
NorthYork_venues.to_csv(r'C:\Users\Prasanna Suresh\Desktop\py4e\DataScience_reloaded\NorthYork_venues.csv')

In [225]:
# Counting the number of venues returned for each Nieghborhood

NorthYork_venues[['Neighborhood','Venue']].groupby('Neighborhood').count().rename(columns={'Venue':'Count'})

Unnamed: 0_level_0,Count
Neighborhood,Unnamed: 1_level_1
"Bathurst Manor, Wilson Heights, Downsview North",19
Bayview Village,4
"Bedford Park, Lawrence Manor East",25
Don Mills,25
Downsview,16
"Fairview, Henry Farm, Oriole",63
Glencairn,4
Hillcrest Village,4
Humber Summit,4
"Humberlea, Emery",1


In [226]:
print('There are {} uniques categories.'.format(len(NorthYork_venues['Venue Category'].unique())))

There are 102 uniques categories.


**Create a Data Frame with encoding based on 'Venue Categories' for each Neighborhood**

In [227]:
# one hot encoding
NorthYork_onehot = pd.get_dummies(NorthYork_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
NorthYork_onehot['Neighborhood'] = NorthYork_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [NorthYork_onehot.columns[-1]] + list(NorthYork_onehot.columns[:-1])
NorthYork_onehot = NorthYork_onehot[fixed_columns]

# grouping the onehot dataframe and deriving the mean

NorthYork_grouped = NorthYork_onehot.groupby('Neighborhood').mean().reset_index()
NorthYork_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,...,Supermarket,Supplement Shop,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Vietnamese Restaurant,Women's Store
0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105263,...,0.052632,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.04,0.0,0.04,0.0,0.04,0.0,0.0,0.0
3,Don Mills,0.0,0.0,0.0,0.04,0.0,0.08,0.0,0.0,0.0,...,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Downsview,0.0,0.0625,0.0,0.0,0.0,0.0,0.0625,0.0,0.0625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Finding out the 5 most frequent Venues for each Neighborhood in North York

In [228]:
num_top = 10

for i in range(NorthYork_grouped.shape[0]):
    print(NorthYork_grouped.iloc[i,0])
    print(NorthYork_grouped.iloc[i,1:].sort_values(ascending=False)[0:num_top])
    print('')

Bathurst Manor, Wilson Heights, Downsview North
Bank               0.105263
Coffee Shop        0.105263
Diner             0.0526316
Ice Cream Shop    0.0526316
Pharmacy          0.0526316
Deli / Bodega     0.0526316
Restaurant        0.0526316
Bridal Shop       0.0526316
Sandwich Place    0.0526316
Shopping Mall     0.0526316
Name: 0, dtype: object

Bayview Village
Chinese Restaurant            0.25
Japanese Restaurant           0.25
Café                          0.25
Bank                          0.25
Clothing Store                   0
Coffee Shop                      0
Comfort Food Restaurant          0
Construction & Landscaping       0
Convenience Store                0
Cosmetics Shop                   0
Name: 1, dtype: object

Bedford Park, Lawrence Manor East
Restaurant                 0.08
Sandwich Place             0.08
Italian Restaurant         0.08
Coffee Shop                0.08
Café                       0.04
Butcher                    0.04
Comfort Food Restaurant    0.04


**Now define function using the above step and creatind new data frame to capture the sorted venues per each Nieghborhood**

In [229]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = NorthYork_grouped['Neighborhood']

for ind in np.arange(NorthYork_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(NorthYork_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Diner,Ice Cream Shop,Pharmacy,Deli / Bodega,Restaurant,Bridal Shop,Sandwich Place,Shopping Mall
1,Bayview Village,Chinese Restaurant,Japanese Restaurant,Café,Bank,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
2,"Bedford Park, Lawrence Manor East",Restaurant,Sandwich Place,Italian Restaurant,Coffee Shop,Café,Butcher,Comfort Food Restaurant,Indian Restaurant,Breakfast Spot,Pharmacy
3,Don Mills,Gym,Beer Store,Japanese Restaurant,Asian Restaurant,Restaurant,Coffee Shop,Sandwich Place,Chinese Restaurant,Caribbean Restaurant,Café
4,Downsview,Park,Grocery Store,Electronics Store,Baseball Field,Discount Store,Bus Stop,Hotel,Shopping Mall,Snack Place,Gym / Fitness Center


Carrying out Kmeans clustering (no of clusters = 5) 

In [230]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

NorthYork_grouped_clustering = NorthYork_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(NorthYork_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [231]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

NorthYork_merged = df_NorthYork

# merge df_NorthYork with neighborhoods_venues_sorted to add latitude/longitude for each neighborhood
NorthYork_merged = NorthYork_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')


In [232]:
NorthYork_merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,3.0,Food & Drink Shop,Park,Women's Store,Department Store,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Intersection,French Restaurant,Coffee Shop,Pizza Place,Hockey Arena,Portuguese Restaurant,Department Store,Clothing Store,Comfort Food Restaurant,Construction & Landscaping
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0.0,Clothing Store,Accessories Store,Sporting Goods Shop,Event Space,Coffee Shop,Miscellaneous Shop,Furniture / Home Store,Boutique,Vietnamese Restaurant,Bank
3,M3B,North York,Don Mills,43.745906,-79.352188,0.0,Gym,Beer Store,Japanese Restaurant,Asian Restaurant,Restaurant,Coffee Shop,Sandwich Place,Chinese Restaurant,Caribbean Restaurant,Café
4,M6B,North York,Glencairn,43.709577,-79.445073,0.0,Park,Asian Restaurant,Pub,Japanese Restaurant,Department Store,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping
5,M3C,North York,Don Mills,43.7259,-79.340923,0.0,Gym,Beer Store,Japanese Restaurant,Asian Restaurant,Restaurant,Coffee Shop,Sandwich Place,Chinese Restaurant,Caribbean Restaurant,Café
6,M2H,North York,Hillcrest Village,43.803762,-79.363452,0.0,Golf Course,Mediterranean Restaurant,Pool,Dog Run,Department Store,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
7,M3H,North York,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259,0.0,Bank,Coffee Shop,Diner,Ice Cream Shop,Pharmacy,Deli / Bodega,Restaurant,Bridal Shop,Sandwich Place,Shopping Mall
8,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,0.0,Clothing Store,Coffee Shop,Fast Food Restaurant,Restaurant,Food Court,Cosmetics Shop,Convenience Store,Bank,Bakery,Japanese Restaurant
9,M3J,North York,"Northwood Park, York University",43.76798,-79.487262,0.0,Caribbean Restaurant,Bar,Massage Studio,Coffee Shop,Furniture / Home Store,Arts & Crafts Store,Diner,Comfort Food Restaurant,Construction & Landscaping,Airport


**Finally, Map the Neighborhoods related to North York, Toronto using folium**

In [233]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [234]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(NorthYork_merged['Latitude'], NorthYork_merged['Longitude'], NorthYork_merged['Neighborhood'], NorthYork_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    
    # Had to introduce following try and except since 
    try:
        folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
    except:
        folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[0],
        fill=False,
        #fill_color=rainbow[0],
        #fill_opacity=0.7
        ).add_to(map_clusters)

                               
map_clusters