# PART 1

Importing required Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.io.html import read_html

Using Pandas to scrape table from wikipidea Page

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
table=read_html(url, attrs={'class':'wikitable'})

In [3]:
table[0].head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


In [4]:
table[0].shape

(287, 3)

In [5]:
table[0].Postcode.value_counts()

M8Y    8
M9V    8
M5V    7
M8Z    5
M4V    5
      ..
M2Y    1
M6X    1
M6Z    1
M1W    1
M4G    1
Name: Postcode, Length: 180, dtype: int64

In [6]:
df=table[0]

In [7]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


Grouping Postcode codes and corresponding Neighbourhoods

In [8]:
postcode_grp=df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(list)

In [9]:
postcode_grp_df=pd.DataFrame(postcode_grp)

In [10]:
postcode_grp_df.reset_index(inplace=True)

In [11]:
postcode_grp_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,[Not assigned]
1,M1B,Scarborough,"[Rouge, Malvern]"
2,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
3,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
4,M1G,Scarborough,[Woburn]
...,...,...,...
175,M9V,Etobicoke,"[Albion Gardens, Beaumond Heights, Humbergate,..."
176,M9W,Etobicoke,[Northwest]
177,M9X,Not assigned,[Not assigned]
178,M9Y,Not assigned,[Not assigned]


In [12]:
postcode_grp_df['Neighbourhood']=postcode_grp_df['Neighbourhood'].str.join(',')

In [13]:
postcode_grp_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M1B,Scarborough,"Rouge,Malvern"
2,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
3,M1E,Scarborough,"Guildwood,Morningside,West Hill"
4,M1G,Scarborough,Woburn
...,...,...,...
175,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."
176,M9W,Etobicoke,Northwest
177,M9X,Not assigned,Not assigned
178,M9Y,Not assigned,Not assigned


Converting 'Not assigned' to NaN and droping Boroughs with NaNs

In [14]:
postcode_grp_df.replace('Not assigned',np.nan,inplace=True)
# postcode_grp_df.info()

In [16]:
postcode_grp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
Postcode         180 non-null object
Borough          103 non-null object
Neighbourhood    102 non-null object
dtypes: object(3)
memory usage: 4.3+ KB


In [17]:
postcode_grp_df.dropna(axis=0,subset=['Borough'], inplace=True)

Filling NaN Neighbourhoods with Adjacent Borough

In [18]:
postcode_grp_df[postcode_grp_df.isnull().any(axis=1)]

Unnamed: 0,Postcode,Borough,Neighbourhood
160,M9A,Queen's Park,


In [19]:
 #postcode_grp_df['Neighbourhood'].fillna(df['Borough'],inplace=True)

In [20]:
postcode_grp_df['Neighbourhood'][160]

nan

In [23]:
postcode_grp_df['Neighbourhood']=postcode_grp_df['Neighbourhood'].fillna(value=postcode_grp_df['Borough'])

In [24]:
postcode_grp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 1 to 176
Data columns (total 3 columns):
Postcode         103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
dtypes: object(3)
memory usage: 8.2+ KB


In [25]:
postcode_grp_df.loc[160,:]

Postcode                  M9A
Borough          Queen's Park
Neighbourhood    Queen's Park
Name: 160, dtype: object

# PART 2

Importing the Geospatial Co-ordinates csv

In [31]:
geo=pd.read_csv('Geospatial_Coordinates.csv')

In [39]:
geo.head()
geo.rename(columns={'Postal Code':'Postcode'},inplace=True)

In [40]:
geo

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


Joining the Postcode Dataframe to the co-ordinates dataframe 

In [41]:
table=pd.merge(postcode_grp_df,geo,on='Postcode')

In [42]:
table

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437


# PART 3

In [64]:
from geopy.geocoders import Nominatim
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
from pandas.io.json import json_normalize


Exploring Toronto's Neighbourhoods

In [44]:
address = 'Toronto'

geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [45]:
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

In [48]:
for lat, lng, Borough, Neighbourhood in zip(table['Latitude'], table['Longitude'], table['Borough'], table['Neighbourhood']):
    label = '{}, {}'.format(Neighbourhood, Borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)
map_Toronto

In [49]:
CLIENT_ID = 'SGMRMZJESM15R5IRZ5Y5RACNUNFIWE2VIEC25Z414CMFSKG0' # your Foursquare ID
CLIENT_SECRET = 'T5BO0ATDRUAZSN3JOH5LURRCISAPYXSQR2IRJ1DDBYTWLQ4I' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: SGMRMZJESM15R5IRZ5Y5RACNUNFIWE2VIEC25Z414CMFSKG0
CLIENT_SECRET:T5BO0ATDRUAZSN3JOH5LURRCISAPYXSQR2IRJ1DDBYTWLQ4I


In [50]:
table.loc[0, 'Neighbourhood']

'Rouge,Malvern'

In [54]:
neighbourhood_latitude = table.loc[0, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = table.loc[0, 'Longitude'] # neighborhood longitude value

neighbourhood_name = table.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Rouge,Malvern are 43.806686299999996, -79.19435340000001.


In [58]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 1000 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=SGMRMZJESM15R5IRZ5Y5RACNUNFIWE2VIEC25Z414CMFSKG0&client_secret=T5BO0ATDRUAZSN3JOH5LURRCISAPYXSQR2IRJ1DDBYTWLQ4I&v=20180605&ll=43.806686299999996,-79.19435340000001&radius=1000&limit=100'

In [59]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e562eb1542890001bab9eb4'},
 'response': {'headerLocation': 'Malvern',
  'headerFullLocation': 'Malvern, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 19,
  'suggestedBounds': {'ne': {'lat': 43.81568630900001,
    'lng': -79.18190576146081},
   'sw': {'lat': 43.797686290999984, 'lng': -79.20680103853921}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '579a91b3498e9bd833afa78a',
       'name': "Wendy's",
       'location': {'address': '8129 Sheppard Avenue',
        'lat': 43.8020084,
        'lng': -79.1980797,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.8020084,
          'lng': -79.1980797}],
        'distance': 600,
        'postalCode': 'M1B 6A3',
        'cc': 'CA',
        '

In [61]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [65]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Wendy's,Fast Food Restaurant,43.802008,-79.19808
1,Wendy's,Fast Food Restaurant,43.807448,-79.199056
2,Harvey's,Restaurant,43.80002,-79.198307
3,Staples Morningside,Paper / Office Supplies Store,43.800285,-79.196607
4,Caribbean Wave,Caribbean Restaurant,43.798558,-79.195777


In [66]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

19 venues were returned by Foursquare.


In [67]:
print('There are {} uniques categories.'.format(len(nearby_venues['categories'].unique())))

There are 16 uniques categories.


Explore other Neighborhoods in Toronto

In [69]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)