# **Part 1**

In [193]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [82]:
# Wikipedia page "List of postal codes of Canada: M"; its HTML tables are parsed with pandas below
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [83]:
# Parse the first HTML table on the page. `header=0` tells pandas to use the
# table's first row as the column names on every pandas version. Promoting
# row 0 by hand afterwards only works when read_html has NOT already detected
# the header; when it has, that approach silently drops a real data row and
# replaces the column names with that row's values.
df = pd.read_html(wiki_url, header=0)[0]

In [84]:
# Turn the literal placeholder 'Not assigned' into NaN so the rows can be
# dropped as missing data. Reassign instead of using inplace=True: `df` may be
# a slice of the parsed table, and in-place edits on a slice raise pandas'
# SettingWithCopyWarning (and prevent method chaining).
df = df.replace('Not assigned', np.nan)

In [99]:
# Keep only the rows in which every column has a value (pandas' default dropna behaviour, spelled out)
df = df.dropna(axis=0, how='any')

In [100]:
# Sanity check: (rows, columns) of the postal-code table after the cleaning steps above
df.shape

(103, 3)

# **Part 2**

In [88]:
# Load the coordinates table from a CSV located relative to the working directory
lat_long = pd.read_csv('Geospatial_Coordinates.csv')

# (rows, columns) of the coordinates table
lat_long.shape

(103, 3)

In [101]:
# Inner-join the postal-code table with the coordinates table on their shared key column
df2 = df.merge(lat_long, how='inner', on='Postal Code')

df2

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


# **Part 3**

In [128]:
! pip install -c conda-forge geopy

from geopy.geocoders import Nominatim
import folium
from sklearn.cluster import KMeans

[31mCould not open requirements file: [Errno 2] No such file or directory: 'conda-forge'[0m
[33mYou are using pip version 10.0.1, however version 20.3b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [119]:
# Number of rows in the merged table per borough
df2['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Mississauga          1
Name: Borough, dtype: int64

# Getting the location of Toronto, ON

In [120]:
# Geocode the city once; `latitude` / `longitude` are used to centre the maps.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto_exp")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match. Fail with a clear
# message here instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude

In [125]:
# Base map centred on the geocoded coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Styling shared by every marker on this overview map
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per row of df2; the popup reads "<neighbourhood>, <borough>"
marker_rows = zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighbourhood'])
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=label, **marker_style)
    marker.add_to(map_toronto)

map_toronto

# Get venues in Toronto

In [133]:
def getVenues(names, lats, longs, radius=500, client_id=None, client_secret=None,
              version='20180605', limit=100, timeout=10):
    """Collect venues around each location from the Foursquare ``venues/explore`` API.

    One HTTP GET is issued per ``(name, lat, lng)`` triple, and every venue
    returned becomes one row of the result.

    Parameters
    ----------
    names, lats, longs : iterables
        Parallel sequences: a label for each location and its coordinates.
        They are consumed with ``zip``, so the shortest one sets the length.
    radius : int, default 500
        Forwarded unchanged as the API's ``radius`` query parameter.
    client_id, client_secret : str, optional
        Foursquare credentials. When omitted they are read from the
        ``FOURSQUARE_CLIENT_ID`` / ``FOURSQUARE_CLIENT_SECRET`` environment
        variables, so no secret has to live in the notebook.
    version : str, default '20180605'
        Forwarded as the API's ``v`` query parameter.
    limit : int, default 100
        Forwarded as the API's ``limit`` query parameter.
    timeout : float, default 10
        Per-request timeout in seconds, passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Neighbourhood', 'Neighbourhood Latitude',
        'Neighbourhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude', 'Venue Category'. The frame has these columns even
        when no venue was found. 'Venue Category' is None for a venue that
        carries no category.

    Raises
    ------
    RuntimeError
        If no credentials were passed and the environment variables are unset.
    requests.HTTPError
        If the API answers with an error status.
    """
    import os

    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    # SECURITY: credentials used to be hard-coded in this function. Anything
    # that was committed that way should be considered leaked and be rotated.
    client_id = client_id or os.environ.get('FOURSQUARE_CLIENT_ID')
    client_secret = client_secret or os.environ.get('FOURSQUARE_CLIENT_SECRET')
    if not client_id or not client_secret:
        raise RuntimeError(
            'Foursquare credentials are missing: pass client_id/client_secret or set '
            'FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET in the environment.')

    rows = []
    for name, lat, lng in zip(names, lats, longs):
        print(name)

        # Let requests build and escape the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': client_id,
                'client_secret': client_secret,
                'v': version,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=timeout)
        # Surface HTTP errors (bad credentials, quota, ...) instead of a KeyError below
        response.raise_for_status()

        # A location without any venue may come back without groups/items
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # Keep only the relevant fields of each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing `columns` here (rather than assigning afterwards) keeps the
    # schema intact when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [141]:
# Look up the venues around every row of df2 (getVenues prints each name as it goes),
# then report the (rows, columns) of the collected table
toronto_venue = getVenues(
    names=df2['Neighbourhood'],
    lats=df2['Latitude'],
    longs=df2['Longitude'],
)
toronto_venue.shape

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

(2167, 7)

**Take a look to make sure column values were placed correctly**

In [171]:
# Preview the first five venue rows to eyeball the column alignment
toronto_venue.head(5)

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


**Taking a look at the count of venues per neighbourhood then creating a dummy dataframe for each venue**

In [151]:
# Number of venue rows collected per neighbourhood name
toronto_venue['Neighbourhood'].value_counts()

Toronto Dominion Centre, Design Exchange                                                                                                  100
First Canadian Place, Underground city                                                                                                    100
Harbourfront East, Union Station, Toronto Islands                                                                                         100
Richmond, Adelaide, King                                                                                                                  100
Garden District, Ryerson                                                                                                                  100
Commerce Court, Victoria Hotel                                                                                                            100
Stn A PO Boxes                                                                                                                             96
St. Ja

In [156]:
# One-hot encode the venue categories: one indicator column per category, no name prefix
toronto_dummy = pd.get_dummies(toronto_venue[['Venue Category']], prefix="", prefix_sep="")

# Attach each venue's neighbourhood, then rotate that last column to the front
toronto_dummy['Neighbourhood'] = toronto_venue['Neighbourhood']
fixed_columns = list(toronto_dummy.columns[-1:]) + list(toronto_dummy.columns[:-1])
toronto_dummy = toronto_dummy[fixed_columns]

# Mean of every indicator per neighbourhood, with the neighbourhood kept as a regular column
toronto_dummy_group = toronto_dummy.groupby('Neighbourhood', as_index=False).mean()

In [167]:
# (rows, columns) of the per-neighbourhood table built by the groupby above
toronto_dummy_group.shape

(96, 279)

# Creating the K-means clustering model

In [214]:
# Feature matrix: every column of the grouped table except the neighbourhood name
toronto_no_neigh = toronto_dummy_group.drop(labels=['Neighbourhood'], axis='columns')

# Number of clusters requested from K-means
clusters = 10

# Fixed random_state keeps the clustering reproducible between runs
cluster = KMeans(random_state=5, n_clusters=clusters)

# Note: `predictions` holds what fit() returns; its `labels_` attribute is read in the next cell
predictions = cluster.fit(toronto_no_neigh)

In [215]:
# Pair every neighbourhood name with the cluster label stored on the fitted model
neighbors = toronto_dummy_group[['Neighbourhood']].assign(Labels=predictions.labels_)

# Attach the labels to the rows of df2 (default inner join on the neighbourhood name)
map_set = df2.merge(neighbors, on='Neighbourhood')

In [216]:
# Fresh base map centred on the geocoded coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One distinct hex colour per cluster, evenly spaced over the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one marker per row of map_set, coloured by its cluster label
for lat, lng, neighbourhood, cluster_label in zip(map_set['Latitude'], map_set['Longitude'],
                                                  map_set['Neighbourhood'], map_set['Labels']):
    label = folium.Popup('{}'.format(neighbourhood), parse_html=True)
    # Labels are used as palette indices directly: label k -> rainbow[k]
    # (the earlier `label - 1` indexing sent label 0 to index -1).
    cluster_color = rainbow[cluster_label]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        # Fill with the cluster colour too; a fixed fill colour made every
        # marker look the same apart from its thin outline.
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

**Clustering by venues and amenities within a city is clearly not a very good way of finding boroughs. Geographical location is most likely the best method to use for classification in this regard. At a larger scale, however, this would be a great clustering method for determining the difference between cities, towns, and other areas.**