## 1. Scrape the Wikipedia page listing Toronto postal codes

In [1]:
import os

#!conda install -c conda-forge folium=0.5.0 --yes
import folium
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize

In [2]:
# Download the Wikipedia article that lists the Toronto ("M") postal codes.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed later as if it were the article.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
results = response.text
# Preview only the beginning of the document; the full HTML is far too long
# to be useful as cell output.
results[:500]

'\n<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of postal codes of Canada: M - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XptUTApAMMQAASM08X8AAABC","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":951325562,"wgRevisionId":951325562,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related l

In [3]:
# Parse the downloaded HTML using the lxml parser.
soup = BeautifulSoup(results, features="lxml")

# Pick out the <div class="mw-parser-output"> element and pretty-print it
# so its structure can be inspected.
class_ = soup.find('div', attrs={'class': 'mw-parser-output'})
print(class_.prettify())

<div class="mw-parser-output">
 <div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">
  Wikipedia list article
 </div>
 <p>
  This is a list of
  <a href="/wiki/Postal_codes_in_Canada" title="Postal codes in Canada">
   postal codes in Canada
  </a>
  where the first letter is M. Postal codes beginning with M are located within the city of
  <a href="/wiki/Toronto" title="Toronto">
   Toronto
  </a>
  in the province of
  <a href="/wiki/Ontario" title="Ontario">
   Ontario
  </a>
  . Only the first three characters are listed, corresponding to the Forward Sortation Area.
 </p>
 <p>
  <a href="/wiki/Canada_Post" title="Canada Post">
   Canada Post
  </a>
  provides a free postal code look-up tool on its website,
  <sup class="reference" id="cite_ref-1">
   <a href="#cite_note-1">
    [1]
   </a>
  </sup>
  via its
  <a href="/wiki/Mobile_app" title="Mobile app">
   applications
  </a>
  for such
  <a class="mw-redirect" href="/wiki/Smartphones" title="

In [4]:
# Locate the sortable wikitable inside the parsed <div> and keep its <tbody>,
# which holds the header row followed by the postal-code rows.
table = (
    class_
    .find('table', attrs={'class': 'wikitable sortable'})
    .find('tbody')
)
print(table)

<tbody><tr>
<th>Postal code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park / Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor / Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park / Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern / Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3B
</td>
<td>North York
</td>
<td>Don Mills
</td></tr>
<tr>
<td>M4B
</td>
<td>East York
</td>
<td>Parkview Hill / W

</td></tr></tbody>


In [5]:
# Column layout of the postal-code table that the next cell fills in.
col = [
    'Postal Code',
    'Borough',
    'Neighborhood',
]
# Start from an empty frame that already carries those column names.
df = pd.DataFrame(data=None, columns=col)
print(df)

Empty DataFrame
Columns: [Postal Code, Borough, Neighborhood]
Index: []


In [6]:
# Turn every data row of the scraped table into one record, skipping rows
# whose borough is "Not assigned".
#
# The records are collected in a list and the frame is built once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# growing a frame row by row copies it on every iteration.  Rebuilding `df`
# from scratch also makes this cell safe to re-run (the previous loop appended
# the same rows again on each execution).
records = []
for tr in table.find_all('tr')[1:]:  # [1:] skips the <th> header row
    cells = [td.get_text().strip() for td in tr.find_all('td')]
    if len(cells) < 3:
        # Short/malformed row: skip it instead of silently reusing the values
        # left over from the previous row.
        continue
    postal_code, borough, neighborhood = cells[:3]
    if borough == 'Not assigned':
        continue
    records.append({'Postal Code': postal_code,
                    'Borough': borough,
                    # The page separates multiple neighborhoods with " /";
                    # store them comma-separated instead.
                    'Neighborhood': neighborhood.replace(' /', ',')})

df = pd.DataFrame(records, columns=['Postal Code', 'Borough', 'Neighborhood'])

In [7]:
# Report the (rows, columns) size, then preview the first five records.
print(df.shape)
df.head(n=5)

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## 2. Get the latitude and the longitude coordinates of each neighborhood.

In [8]:
# Load the remote CSV that maps each postal code to a latitude/longitude pair,
# then show its size and first five rows.
geo_df = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')
print(geo_df.shape)
geo_df.head(n=5)

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Attach coordinates to every scraped row by joining the two frames on their
# shared 'Postal Code' column (inner join, the pandas default).
df_sum = df.merge(geo_df, on='Postal Code')
df_sum.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## 3. Explore and cluster the neighborhoods in Toronto.

3.1 I'll look at all the boroughs and neighborhoods of the city of Toronto.

In [10]:
# Overview map of every row in df_sum.  The map is centred on fixed
# coordinates (they equal the M3A / Parkwoods row of df_sum).
latitude_tor = 43.753259
longitude_tor = -79.329656
map_tor = folium.Map(location=[latitude_tor, longitude_tor], zoom_start=10)

# One blue circle per row; the popup text is "<neighborhood(s)>, <borough>".
marker_rows = zip(df_sum['Latitude'], df_sum['Longitude'],
                  df_sum['Borough'], df_sum['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # parse_html is an option of Popup; it is not a CircleMarker/path
        # option, so it is passed here only (the marker used to receive a
        # stray parse_html=False as well).
        popup=folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_tor)

map_tor

In [11]:
# Summarise the merged table: number of distinct boroughs and number of rows,
# followed by the distinct borough names themselves.
boroughs = df_sum['Borough'].unique()
summary = 'The dataframe has {} boroughs and {} neighborhoods.'.format(
    len(boroughs),
    df_sum.shape[0],
)
print(summary)
print(boroughs)

The dataframe has 10 boroughs and 103 neighborhoods.
['North York' 'Downtown Toronto' 'Etobicoke' 'Scarborough' 'East York'
 'York' 'East Toronto' 'West Toronto' 'Central Toronto' 'Mississauga']


3.2 I'll look at the Central Toronto borough of the city of Toronto.

In [12]:
# Keep only the rows whose borough is Central Toronto and renumber them from 0.
is_central = df_sum['Borough'].eq('Central Toronto')
cent_tor_data = df_sum.loc[is_central].reset_index(drop=True)
cent_tor_data.head(n=10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M5N,Central Toronto,Roselawn,43.711695,-79.416936
2,M4P,Central Toronto,Davisville North,43.712751,-79.390197
3,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307
4,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
5,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
6,M4S,Central Toronto,Davisville,43.704324,-79.38879
7,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
8,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


In [13]:
# Map of the Central Toronto rows only.  The map is centred on fixed
# coordinates (they equal the M4R / North Toronto West row of cent_tor_data).
latitude_tor = 43.715383
longitude_tor = -79.405678
map_cent_tor = folium.Map(location=[latitude_tor, longitude_tor], zoom_start=12)

# One red circle per row; the popup text is "<neighborhood(s)>, <borough>".
marker_rows = zip(cent_tor_data['Latitude'], cent_tor_data['Longitude'],
                  cent_tor_data['Borough'], cent_tor_data['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # parse_html is an option of Popup; it is not a CircleMarker/path
        # option, so it is passed here only (the marker used to receive a
        # stray parse_html=False as well).
        popup=folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True),
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_cent_tor)

map_cent_tor

3.3 I'll look at the venues in Central Toronto, with no more than 5 venues per neighborhood.

In [14]:
# Foursquare API credentials.  They are read from the environment so that real
# keys never have to be typed into (and saved with) the notebook.  A variable
# that is not set falls back to '', the placeholder this cell held before.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = os.environ.get('FOURSQUARE_API_VERSION', '')  # sent as the API's `v` parameter

In [15]:
def getNearbyVenues(names, latitudes, longitudes, radius=700, LIMIT=5):
    """Query the Foursquare "explore" endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to query; they are walked in
        parallel with ``zip``.
    radius : number, default 700
        Forwarded to the API as its ``radius`` parameter.
    LIMIT : int, default 5
        Forwarded to the API as its ``limit`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood', 'Latitude',
        'Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'.  The frame has these columns even when no venue was
        found (previously an empty result raised ValueError while the column
        names were being assigned).

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Uses the module-level CLIENT_ID, CLIENT_SECRET and VERSION values and
    prints each location name as it is queried.
    """
    columns = ['Neighborhood',
               'Latitude',
               'Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and URL-encode the query string instead of
        # formatting it by hand; the timeout prevents a stalled call from
        # hanging the notebook.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Fail with a clear HTTP error rather than a KeyError on the payload.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # Keep only the relevant fields of each nearby venue.
        for item in items:
            venue = item['venue']
            # A venue may come back without any category; record None then
            # instead of failing on categories[0].
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((name,
                         lat,
                         lng,
                         venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         category))

    return pd.DataFrame(rows, columns=columns)

In [16]:
# Collect nearby venues for every Central Toronto row, using the function's
# default radius and per-location limit.
cent_tor_venues = getNearbyVenues(
    cent_tor_data['Neighborhood'],
    cent_tor_data['Latitude'],
    cent_tor_data['Longitude'],
)

Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
North Toronto West
The Annex, North Midtown, Yorkville
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park


In [17]:
# Print the collected venue table, then its (rows, columns) size on the next line.
print(cent_tor_venues, cent_tor_venues.shape, sep='\n')

                                         Neighborhood   Latitude  Longitude  \
0                                       Lawrence Park  43.728020 -79.388790   
1                                       Lawrence Park  43.728020 -79.388790   
2                                       Lawrence Park  43.728020 -79.388790   
3                                       Lawrence Park  43.728020 -79.388790   
4                                       Lawrence Park  43.728020 -79.388790   
5                                            Roselawn  43.711695 -79.416936   
6                                            Roselawn  43.711695 -79.416936   
7                                            Roselawn  43.711695 -79.416936   
8                                            Roselawn  43.711695 -79.416936   
9                                            Roselawn  43.711695 -79.416936   
10                                   Davisville North  43.712751 -79.390197   
11                                   Davisville Nort

In [18]:
# Map of the venues collected for Central Toronto, centred on the same fixed
# coordinates as the Central Toronto neighborhood map.
latitude_tor = 43.715383
longitude_tor = -79.405678
map_cent_tor_venues = folium.Map(location=[latitude_tor, longitude_tor], zoom_start=13)

# One green circle per venue; the popup text is "<venue name>, <venue category>".
venue_rows = zip(cent_tor_venues['Venue Latitude'], cent_tor_venues['Venue Longitude'],
                 cent_tor_venues['Venue'], cent_tor_venues['Venue Category'])
for lat, lng, venue, venue_cat in venue_rows:
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # parse_html is an option of Popup; it is not a CircleMarker/path
        # option, so it is passed here only (the marker used to receive a
        # stray parse_html=False as well).
        popup=folium.Popup('{}, {}'.format(venue, venue_cat), parse_html=True),
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_cent_tor_venues)

map_cent_tor_venues