# Question 1

In [3]:
from bs4 import BeautifulSoup as bsp
import requests
import csv

source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text #this goes to a website
soup = bsp(source, "lxml")

In [4]:
import pandas as pd

columns = ['Postal Code', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns = columns)

table = soup.find('table', class_='wikitable')

for row in table.find_all('tr')[1:]:
    pc = row.find_all('td')[0].text.replace("\n", "")
    bor = row.find_all('td')[1].text.replace("\n", "")
    neigh = row.find_all('td')[2].text.replace("\n", "").replace(" /", ",")
    df = df.append({'Postal Code':pc, 'Borough':bor, 'Neighborhood':neigh}, ignore_index=True)

df_assigned = df[df['Borough']!='Not assigned'].reset_index() # Cut out unassigned post codes
df_assigned.drop('index', axis = 1, inplace = True)

if df_assigned['Postal Code'].nunique() == df_assigned['Postal Code'].count():
    print('All Postal Codes in data are Unique')
else:
    print('There are duplicate Postal Codes!')

print("Shape of Dataframe is: " + str(df_assigned.shape))

print()
df_assigned.head(12)

All Postal Codes in data are Unique
Shape of Dataframe is: (103, 3)



Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


# Question 2

Note: attempted to use geocoder, but per the instructions it did not return lat/long.

In [6]:
# import geocoder # import geocoder

# def latlong(postal_code):
#     lat_lng_coords = None
#     i=0
#     # loop until you get the coordinates
#     while(lat_lng_coords is None):
#         i = i + 1
#         print(i)
#         g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#         lat_lng_coords = g.latlng

#     latitude = lat_lng_coords[0]
#     longitude = lat_lng_coords[1]
    
#     return (latitude, longitude)

# # print(latlong('M5G'))

In [7]:
import pandas as pd

df_latlong = pd.read_csv("https://cocl.us/Geospatial_data")

df_assigned = df_assigned.merge(df_latlong, on='Postal Code')

df_assigned.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


# Question 3

In [8]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

## Set Up Map

In [9]:
toronto_lat = 43.72
toronto_long = -79.40

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[toronto_lat, toronto_long], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_assigned['Latitude'], df_assigned['Longitude'], df_assigned['Borough'], df_assigned['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

# k_means = KMeans(init="k-means++", n_clusters=4, n_init=12)
# k_means.fit(X)
map_toronto

## Explore Neighborhoods

In [10]:
CLIENT_ID = input('What is your Foursquare Client ID')
CLIENT_SECRET = input('What is your Foursquare Client Secret')
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # Limit of results returned from API

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

What is your Foursquare Client ID
What is your Foursquare Client Secret


Your credentails:
CLIENT_ID: Y1Z2K1T4JITHVIX5DSBZ2C3ULVFWW405BDLE2JTBE1H0HSGL
CLIENT_SECRET:VVSA5CRGQ0QLFRHDJHGC1JIMH1CRWPZ0G1MFGNVZ0HJQ4JSM


In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
#         print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [12]:
toronto_venues = getNearbyVenues(names=df_assigned['Neighborhood'],
                                   latitudes=df_assigned['Latitude'],
                                   longitudes=df_assigned['Longitude']
                                  )
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [13]:
# Some venues are called "Neighborhoods", drop these as we are defining our own
drop_neighborhoods = toronto_venues[toronto_venues['Venue Category']=='Neighborhood'].index
toronto_venues.drop(drop_neighborhoods , inplace=True)

toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'] )

toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## K-Clustering

In [15]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Base clustering on top 10 most popular venues in each neighborhood

In [19]:
import numpy as np

num_top_venues = 10

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(1,num_top_venues+1):
    columns.append(ind)

# create a new dataframe
neighborhoods_venues_popular = pd.DataFrame(columns=columns)
neighborhoods_venues_popular['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_popular.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_popular_melt = pd.melt(neighborhoods_venues_popular, id_vars=['Neighborhood'], var_name='Popularity', value_name='Venue Category')
neighborhoods_venues_popular_onehot = pd.get_dummies(neighborhoods_venues_popular_melt[['Venue Category']], prefix="", prefix_sep="")
neighborhoods_venues_popular_onehot.insert(0, 'Neighborhood', neighborhoods_venues_popular['Neighborhood'] )
neighborhoods_venues_popular_grouped = neighborhoods_venues_popular_onehot.groupby('Neighborhood').mean().reset_index()

neighborhoods_venues_popular_grouped

In [21]:
# set number of clusters
kclusters = 10

toronto_grouped_clustering = neighborhoods_venues_popular_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(init = 'k-means++',n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

toronto_grouped_clustering.head()

Unnamed: 0,Accessories Store,Airport,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,Asian Restaurant,Athletics & Sports,Auto Workshop,...,Swim School,Thai Restaurant,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
from sklearn.cluster import KMeans
# add clustering labels
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)
# toronto_grouped[['Cluster Labels']] = toronto_grouped[['Cluster Labels']].astype("int64")

toronto_merged = pd.DataFrame()
toronto_merged = df_assigned

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(toronto_grouped.set_index('Neighborhood'), on='Neighborhood')
toronto_merged[['Cluster Labels']] = toronto_merged[['Cluster Labels']].fillna(0).astype("int64")

toronto_merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M3A,North York,Parkwoods,43.753259,-79.329656,4,0.000000,0.000000,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,0.000000,0.000000,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,1,0.000000,0.000000,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.022222
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,9,0.153846,0.000000,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.076923,0.0,0.0,0.000000,0.0,0.000000
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1,0.000000,0.000000,0.0,0.0,...,0.0,0.03125,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.031250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road , Old Mill North",43.653654,-79.506944,0,0.000000,0.000000,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160,1,0.000000,0.012658,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.025316
100,M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558,0,0.000000,0.000000,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.062500
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,0,0.000000,0.000000,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000


In [23]:
# create map
map_clusters = folium.Map(location=[toronto_lat, toronto_long], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters