## Part 1.

In [22]:
#Import all the necessary libraries
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests # library to handle requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe
from sklearn.cluster import KMeans

### Q1: Scraping data from the Wikipedia page
First, we use the BeautifulSoup library to read the page html. Looking at the html, we find that the table is under the class 'wikitable sortable'. Using the find method, we pass this to a variable called table.

In [23]:
wiki_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#get the html and convert to parser
#the timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(wiki_link, timeout = 30)
response.raise_for_status() #stop here on an HTTP error instead of parsing an error page
web_html = response.text
soup = BeautifulSoup(web_html, 'html.parser')

#the table is under the class wikitable sortable
table = soup.find('table', {'class':'wikitable sortable'})

#soup.find returns None when nothing matches; fail here with a clear message
#rather than with an AttributeError in the cell that parses the rows
if table is None:
    raise ValueError("No table with class 'wikitable sortable' found at " + wiki_link)

In [24]:
#create blank lists to hold the data
postal_code = []
borough = []
neighborhood = []

rows = table.find_all('tr') #find rows with the TR tag

for r in rows:
    cell = r.find_all('td') #find cells with the TD tag
    if len(cell) < 3: #rows without three TD cells (e.g. the header row) carry no data
        continue

    #Work with the stripped cell text from here on. Comparing the bs4 Tag objects
    #themselves against strings never matches, so the fallback below could never fire.
    pcode = cell[0].text.strip() #first element is the postal code
    brgh = cell[1].text.strip() #second element is the borough name
    nbhd = cell[2].text.strip() #third element is the neighborhood name

    #Assigned borough but no neighborhood: use the borough name as the neighborhood.
    #NOTE(review): 'Not assigned' is treated as "no neighborhood" too, on the assumption
    #that the page uses the same marker for neighborhoods as for boroughs - confirm.
    if brgh != 'Not assigned' and nbhd in ('', 'Not assigned'):
        nbhd = brgh

    postal_code.append(pcode)
    borough.append(brgh)
    neighborhood.append(nbhd)

#create a dataframe with the lists
df_pbn = pd.DataFrame({'PostalCode':postal_code, 'Borough':borough, 'Neighborhood':neighborhood})

#keep only the rows whose Borough is assigned (the original row index is preserved)
#and replace the slash separator between neighborhood names with a comma
is_assigned = df_pbn['Borough'] != 'Not assigned'
df_pbn = df_pbn[is_assigned].assign(
    Neighborhood = lambda d: d['Neighborhood'].str.replace(' /', ',', regex = False))

df_pbn

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road , Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing CentrE
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [25]:
#summary statistics of the cleaned table (count, unique, top and freq per column)
df_pbn.describe()

Unnamed: 0,PostalCode,Borough,Neighborhood
count,103,103,103
unique,103,10,98
top,M2P,North York,Downsview
freq,1,24,4


## Part 2
### Add latitude and longitude data
We first read the CSV from the link provided. The resulting dataframe has Postal Code, Latitude and Longitude data. We can construct the final table by doing a left join of the previous dataframe with this one on the postal code keys.

In [26]:
#location of the CSV with one (Postal Code, Latitude, Longitude) row per postal code
link = 'http://cocl.us/Geospatial_data'
#pandas downloads and parses the CSV directly from the URL
df_pll = pd.read_csv(link)
#preview the first rows to check the column names used by the join below
df_pll.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [None]:
#Left-join the coordinates onto the borough/neighborhood table by postal code.
#The key is named differently in each frame, so the right-hand copy ('Postal Code')
#is redundant after the join and is dropped.
df_pbnll = (
    df_pbn
    .merge(df_pll, how = 'left', left_on = 'PostalCode', right_on = 'Postal Code')
    .drop(columns = 'Postal Code')
)
df_pbnll.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Part 3
### Clustering the neighborhoods

Using the client ID and client secret values, we write a function that iterates over all (latitude, longitude) values in the resultant array from the previous question. For each neighborhood, venues and their latitude and longitude values are read from the JSON and stored to the variable venue_list

In [None]:
#Foursquare credentials are read from the environment so that they are never stored
#in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
#starting the kernel; a missing variable raises a KeyError naming it.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # sent as the 'limit' query parameter of each request
radius = 500 # sent as the 'radius' query parameter of each request

names = df_pbnll['Neighborhood']
latitudes = df_pbnll['Latitude']
longitudes = df_pbnll['Longitude']

#endpoint of the venue exploration API; the query string is built by requests
url = 'https://api.foursquare.com/v2/venues/explore'

venue_list = []

for name, lat, long in zip(names, latitudes, longitudes):

    #create the api request; passing params lets requests do the URL encoding
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, long),
        'radius': radius,
        'limit': LIMIT,
    }

    #make the get request
    response = requests.get(url, params = params, timeout = 30)
    response.raise_for_status() #surface HTTP errors instead of a KeyError further down

    #a response without any group yields no venues for this neighborhood
    groups = response.json()['response'].get('groups', [])
    results = groups[0]['items'] if groups else []

    for v in results:
        venue = v['venue']
        #a venue may come back without categories; record None rather than crash
        categories = venue.get('categories') or []
        category = categories[0]['name'] if categories else None
        venue_list.append([name,
                           lat,
                           long,
                           venue['name'],
                           venue['location']['lat'],
                           venue['location']['lng'],
                           category])

In [None]:
#one row per venue: the neighborhood it was found for, followed by the venue's own details
columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue Name', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
df_venues = pd.DataFrame(venue_list)
df_venues.columns = columns

#one-hot encode the Venue Category column, then discard the per-venue columns
#that are not used as features
cat_columns = ['Venue Category']
columns_rem = ['Venue Name', 'Venue Latitude', 'Venue Longitude']
df_venues_cat = pd.get_dummies(df_venues, columns = cat_columns).drop(columns = columns_rem)

#average the dummy columns within each neighborhood
group_keys = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude']
df_venues_grouped = df_venues_cat.groupby(group_keys).mean().reset_index()
df_venues_grouped.head()

We now run the k-means clustering method on this grouped dataset. A total of 4 clusters are declared. The labels obtained from k-means are added to the grouped dataset as the column 'Cluster'.

In [None]:
#creating clusters
#Features are the per-neighborhood category averages. The identifying columns are
#dropped, and so is any 'Cluster' column left behind by a previous run of this cell;
#otherwise re-running the cell would feed the old labels back in as a feature.
x = df_venues_grouped.drop(
    columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Cluster'],
    errors = 'ignore')
num_clusters = 4
#a fixed random_state makes the cluster assignment reproducible from run to run
k_means = KMeans(n_clusters = num_clusters, init = 'k-means++', n_init = 300, random_state = 0)
k_means.fit(x)
labels = k_means.labels_
#store 1-based cluster ids next to each neighborhood
df_venues_grouped['Cluster'] = labels+1
df_venues_grouped['Cluster'].value_counts()

In [None]:
#pick the columns needed for plotting and give them short names
results = df_venues_grouped[
    ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Cluster']
].rename(columns = {
    'Neighborhood': 'N',
    'Neighborhood Latitude': 'NLa',
    'Neighborhood Longitude': 'NLo',
    'Cluster': 'C',
})

#one marker colour per cluster; named cluster_colors so that the
#matplotlib.colors module imported as `colors` is not overwritten
cluster_colors = ['red', 'blue', 'green', 'black']

#centre of the map
latitude = 43.6532
longitude = -79.3832
map_toronto = folium.Map(location=[latitude, longitude], zoom_start = 10)

for nbh, lat, long, cls in zip(results['N'], results['NLa'], results['NLo'], results['C']):
    label = str(nbh) + ' Cluster ' + str(cls)
    label = folium.Popup(label, parse_html = True)
    #cluster ids are 1-based; the modulo reuses colours instead of raising
    #IndexError when there are more clusters than colours
    marker_color = cluster_colors[(cls - 1) % len(cluster_colors)]
    folium.CircleMarker(
        [lat, long],
        radius = 5,
        popup = label,
        color = marker_color,
        fill = True,
        fill_color = marker_color,
        fill_opacity = 0.6).add_to(map_toronto)

map_toronto