### Note to Reviewers: This notebook contains all three sections mentioned in the assignment notes. Please scroll to the respective sections. Thank you!

## SECTION 1 : Creating a DataFrame from the table on the Wikipedia page.

In [190]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from bs4 import BeautifulSoup
import requests
%matplotlib inline

In [67]:
# Download the Wikipedia page that lists the postal codes and keep its raw HTML.
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() fails right here on an HTTP error instead of letting an
# empty `wiki_html` flow into the parsing cells below.
request = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
request.raise_for_status()
wiki_html = request.text

In [68]:
# Parse the downloaded page and pick the first table that carries the
# 'wikitable' class; that table is where the data is read from.
soup = BeautifulSoup(wiki_html, 'html.parser')
table = soup.find('table', class_='wikitable')

# Column labels come from the <th> cells and data values from the <td> cells.
# Both are stored as flat lists of whitespace-trimmed text.
header_cells = table.find_all('th')
data_cells = table.find_all('td')
head = [cell.get_text().strip() for cell in header_cells]
rows = [cell.get_text().strip() for cell in data_cells]


In [69]:
# Build the dataframe in one step from the flat list of scraped cell texts.
# `rows` holds the <td> values in reading order, so every len(head) consecutive
# entries form one table row. An incomplete trailing group is ignored.
# (Growing the frame with DataFrame.append inside a loop is quadratic, and
# DataFrame.append no longer exists in pandas >= 2.0.)
n_cols = len(head)
records = [
    rows[start:start + n_cols]
    for start in range(0, len(rows) - n_cols + 1, n_cols)
]
canada_df = pd.DataFrame(records, columns=head)
canada_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [70]:
# Cleaning step 1: keep only the rows whose borough has actually been assigned,
# then renumber the index from zero.
has_borough = canada_df['Borough'].ne('Not assigned')
canada_df = canada_df.loc[has_borough].reset_index(drop=True)
canada_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [71]:
# Cleaning step 2: combine rows that share a postal code.
# `count` holds the number of rows per postal code. If any code occurs more
# than once, its neighbourhood names are joined into one comma-separated
# string so each postal code ends up on a single row. When every code is
# already unique the dataframe is left untouched.
count = canada_df['Postal Code'].value_counts()
if (count > 1).any():
    canada_df = (
        canada_df
        .groupby(['Postal Code', 'Borough'], sort=False, as_index=False)
        .agg({'Neighborhood': ', '.join})
    )

In [72]:
# Cleaning step 3: if a row has a borough but no neighbourhood, use the borough
# name as the neighbourhood.
# A neighbourhood counts as missing when it is NaN, blank, or the placeholder
# text 'Not assigned' (compared case-insensitively; the borough column uses a
# lower-case 'a', so the exact spelling 'Not Assigned' would never match).
neighborhood_text = canada_df['Neighborhood'].fillna('').astype(str).str.strip()
missing_neighborhood = neighborhood_text.eq('') | neighborhood_text.str.lower().eq('not assigned')
canada_df.loc[missing_neighborhood, 'Neighborhood'] = canada_df.loc[missing_neighborhood, 'Borough']
canada_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [73]:
# Show the (rows, columns) size of the cleaned dataframe as a quick sanity check
canada_df.shape

(103, 3)

## SECTION 2 : Using location csv file to get latitude and longitude of each row in df

In [74]:
# Load the coordinates CSV from the working directory and display the column
# dtypes to check how each column was parsed before merging.
coords_df = pd.read_csv('Geospatial_Coordinates.csv')
coords_df.dtypes

Postal Code     object
Latitude       float64
Longitude      float64
dtype: object

In [75]:
# Left-join the coordinates onto the postal-code table (result sorted by the
# join key) and give the coordinate columns neighbourhood-specific names.
coordinate_column_names = {
    'Latitude': 'Neighborhood Latitude',
    'Longitude': 'Neighborhood Longitude',
}
canada_df = (
    canada_df
    .merge(coords_df, on='Postal Code', how='left', sort=True)
    .rename(columns=coordinate_column_names)
)
canada_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


## SECTION 3 : Exploring and clustering neighbourhoods

In [135]:
# Foursquare "explore" endpoint configuration.
# Credentials are taken from the environment so real keys never need to be
# saved inside the notebook. The literals are only fallbacks used when the
# variables are not set.
client_id = os.environ.get('FOURSQUARE_CLIENT_ID', 'REDACTED')
client_secret = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'REDACTED')
version = '20200223'  # sent as the `v` query parameter
radius = 500  # sent as the `radius` query parameter (NOTE(review): presumably metres - confirm against the API docs)
base_url = 'https://api.foursquare.com/v2/venues/explore'
# 'll' holds only a placeholder "lat,lng" here; the request loop supplies the
# coordinates of each neighbourhood before every call.
payload = {'client_id':client_id, 'client_secret': client_secret, 'v': version, 'll': '19.2960,72.8487', 'radius': radius}
# Empty frame that will receive one row per (neighbourhood, venue category).
places_df = pd.DataFrame()

#### Using the Foursquare API to get the categories of places near each neighbourhood, and creating a new df of neighbourhoods and place categories

In [136]:
# Query the venues endpoint once per neighbourhood and collect one record per
# returned venue: the neighbourhood name and the venue's first category.
# Records go into a plain list and the dataframe is built once at the end.
# This avoids the quadratic DataFrame.append pattern (removed in pandas >= 2.0)
# and makes the cell safe to re-run without duplicating rows.
venue_records = []
for neighborhood, lat, lng in zip(
        canada_df['Neighborhood'],
        canada_df['Neighborhood Latitude'],
        canada_df['Neighborhood Longitude']):
    # Per-request copy of the shared parameters with this neighbourhood's coordinates.
    params = dict(payload, ll='{},{}'.format(lat, lng))
    request = requests.get(base_url, params=params, timeout=30)
    request.raise_for_status()  # surface HTTP errors instead of a confusing KeyError below
    resp = request.json()
    groups = resp.get('response', {}).get('groups', [])
    items = groups[0].get('items', []) if groups else []
    for item in items:
        categories = item['venue'].get('categories') or []
        if not categories:
            # A venue without any category cannot contribute a 'Place Category' value.
            continue
        venue_records.append({
            'Neighborhood': neighborhood,
            'Place Category': categories[0]['name'],
        })
places_df = pd.DataFrame(venue_records, columns=['Neighborhood', 'Place Category'])
places_df.head()

Unnamed: 0,Neighborhood,Place Category
0,"Malvern, Rouge",Fast Food Restaurant
1,"Malvern, Rouge",Print Shop
2,"Rouge Hill, Port Union, Highland Creek",Bar
3,"Rouge Hill, Port Union, Highland Creek",Moving Target
4,"Guildwood, Morningside, West Hill",Bank


#### Using get_dummies to one-hot encode the place categories, then grouping by neighbourhood to get the frequency of each category in each neighbourhood

In [155]:
# One indicator column per place category, with the neighbourhood name placed
# in front as the first column.
category_indicators = pd.get_dummies(places_df['Place Category'])
category_indicators.insert(loc=0, column='Neighborhood', value=places_df['Neighborhood'])
places_one_hot_df = category_indicators
places_one_hot_df.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [159]:
# Average the indicator columns per neighbourhood, so each value becomes the
# share of that neighbourhood's venues that fall into the category.
places_one_hot_df = places_one_hot_df.groupby('Neighborhood', as_index=False).mean()
places_one_hot_df

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.047619,0.000000,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,"Wexford, Maryvale",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
89,Willowdale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.028571,0.0,0.0,0.0,0.0,0.0
90,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
91,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0


In [160]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    Parameters
    ----------
    row : pandas.Series
        A single row whose first entry is skipped (in this notebook it holds
        the neighbourhood name); the remaining entries are the values to rank,
        keyed by their index labels.
    num_top_venues : int
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the ranked entries in descending order of value,
        truncated to at most ``num_top_venues`` items.
    """
    category_scores = row.iloc[1:]
    ranked_labels = category_scores.sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]

In [161]:
num_top_venues = 10

# Ordinal suffixes for ranks 1, 2 and 3; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # Explicit bounds check instead of a bare `except:` used as control flow,
    # which would also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Build one record per neighbourhood (its name followed by its top categories)
# and create the dataframe in a single step rather than filling it row by row.
top_venue_rows = [
    [row['Neighborhood'], *return_most_common_venues(row, num_top_venues)]
    for _, row in places_one_hot_df.iterrows()
]
places_sorted = pd.DataFrame(top_venue_rows, columns=columns)

places_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Latin American Restaurant,Lounge,Breakfast Spot,Skating Rink,Farm,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
1,"Alderwood, Long Branch",Pizza Place,Gym,Pharmacy,Coffee Shop,Pub,Sandwich Place,Pool,Discount Store,Dessert Shop,Dim Sum Restaurant
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Shopping Mall,Fried Chicken Joint,Ice Cream Shop,Supermarket,Sushi Restaurant,Diner,Middle Eastern Restaurant,Chinese Restaurant
3,Bayview Village,Café,Japanese Restaurant,Chinese Restaurant,Bank,Yoga Studio,Farm,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Sandwich Place,Italian Restaurant,Sushi Restaurant,Pharmacy,Pizza Place,Pub,Café,Butcher,Restaurant


#### Clustering using K Means

In [167]:
# Bring in the K-Means estimator used by the clustering cells below, and show
# the first rows of the top-venues table again for reference.
from sklearn.cluster import KMeans
places_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Latin American Restaurant,Lounge,Breakfast Spot,Skating Rink,Farm,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
1,"Alderwood, Long Branch",Pizza Place,Gym,Pharmacy,Coffee Shop,Pub,Sandwich Place,Pool,Discount Store,Dessert Shop,Dim Sum Restaurant
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Shopping Mall,Fried Chicken Joint,Ice Cream Shop,Supermarket,Sushi Restaurant,Diner,Middle Eastern Restaurant,Chinese Restaurant
3,Bayview Village,Café,Japanese Restaurant,Chinese Restaurant,Bank,Yoga Studio,Farm,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Sandwich Place,Italian Restaurant,Sushi Restaurant,Pharmacy,Pizza Place,Pub,Café,Butcher,Restaurant


In [169]:
no_of_clusters = 6
# Cluster on the per-category frequency columns only; the neighbourhood name is
# a label, not a feature. `columns=` is used because passing the axis
# positionally (`drop('Neighborhood', 1)`) is rejected by pandas >= 2.0.
places_sorted_clustering = places_one_hot_df.drop(columns='Neighborhood')
# n_init is pinned explicitly so the number of centroid initialisations does
# not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=no_of_clusters, random_state=0, n_init=10).fit(places_sorted_clustering)

In [170]:
# Cluster label assigned to each row of the frame the model was fitted on
kmeans.labels_

array([2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 5, 2, 2, 2, 2,
       0, 2, 0, 2, 2, 1, 0, 2, 0, 2, 2, 2, 3, 2, 5, 2, 1, 0, 2, 2, 2, 0,
       4, 3, 2, 2, 5, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 1, 0,
       2, 2, 2, 2, 0])

In [171]:
# Work on a copy so that `places_sorted` itself stays unchanged
final_df = places_sorted.copy()

In [173]:
# Start again from a fresh copy of the top-venues table before adding the
# labels, so re-running this cell does not fail with "cannot insert Cluster
# Labels, already exists". The labels go in as the first column.
final_df = places_sorted.copy()
final_df.insert(0, 'Cluster Labels', kmeans.labels_)

In [183]:
# Attach the columns of `canada_df` to every clustered neighbourhood by
# matching on the neighbourhood name; the left join keeps all rows of final_df.
final_df = pd.merge(final_df, canada_df, how='left', on='Neighborhood')
final_df

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Postal Code,Borough,Neighborhood Latitude,Neighborhood Longitude
0,2,Agincourt,Latin American Restaurant,Lounge,Breakfast Spot,Skating Rink,Farm,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,M1S,Scarborough,43.794200,-79.262029
1,1,"Alderwood, Long Branch",Pizza Place,Gym,Pharmacy,Coffee Shop,Pub,Sandwich Place,Pool,Discount Store,Dessert Shop,Dim Sum Restaurant,M8W,Etobicoke,43.602414,-79.543484
2,2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Shopping Mall,Fried Chicken Joint,Ice Cream Shop,Supermarket,Sushi Restaurant,Diner,Middle Eastern Restaurant,Chinese Restaurant,M3H,North York,43.754328,-79.442259
3,2,Bayview Village,Café,Japanese Restaurant,Chinese Restaurant,Bank,Yoga Studio,Farm,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,M2K,North York,43.786947,-79.385975
4,2,"Bedford Park, Lawrence Manor East",Coffee Shop,Sandwich Place,Italian Restaurant,Sushi Restaurant,Pharmacy,Pizza Place,Pub,Café,Butcher,Restaurant,M5M,North York,43.733283,-79.419750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,2,Willowdale,Coffee Shop,Ramen Restaurant,Grocery Store,Sandwich Place,Pizza Place,Café,Indonesian Restaurant,Steakhouse,Fast Food Restaurant,Ice Cream Shop,M2N,North York,43.770120,-79.408493
94,2,Willowdale,Coffee Shop,Ramen Restaurant,Grocery Store,Sandwich Place,Pizza Place,Café,Indonesian Restaurant,Steakhouse,Fast Food Restaurant,Ice Cream Shop,M2R,North York,43.782736,-79.442259
95,2,Woburn,Coffee Shop,Indian Restaurant,Korean Restaurant,Yoga Studio,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Drugstore,M1G,Scarborough,43.770992,-79.216917
96,2,Woodbine Heights,Pharmacy,Athletics & Sports,Cosmetics Shop,Curling Ice,Diner,Skating Rink,Park,Beer Store,Spa,Garden Center,M4C,East York,43.695344,-79.318389


In [204]:
# Centre point and zoom level of the map.
# NOTE(review): despite the "canada_" prefix these coordinates look like
# central Toronto - confirm.
canada_lat = 43.6532
canada_long = -79.3832
canada_map = folium.Map(location=[canada_lat, canada_long], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, no_of_clusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(final_df['Neighborhood Latitude'], final_df['Neighborhood Longitude'], final_df['Neighborhood'], final_df['Cluster Labels']):
    # Labels run from 0 to no_of_clusters - 1, so they index the palette
    # directly (no reliance on negative-index wraparound for cluster 0).
    marker_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(canada_map)

canada_map