## Scraping Wikipedia page for the intended table

In [84]:
%%capture
!pip install lxml

In [85]:
import requests 
import lxml.html as lh
import pandas as pd

In [86]:
import numpy as np

In [87]:
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [88]:
page=requests.get(url)

In [89]:
doc = lh.fromstring(page.content)

In [90]:
tr_elements = doc.xpath('//tr')

In [91]:
col=[]
i=0
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print('{i}:{name}'.format(i=i,name=name[:-1]))
    col.append((name[:-1],[]))

1:Postal Code
2:Borough
3:Neighborhood


In [92]:
for j in range(1,len(tr_elements)):
    T=tr_elements[j]
    
    if len(T)!=3:
        break
    
    i=0
    
    for t in T.iterchildren():
        data=t.text_content() 
        if i>=0:
            try:
                data=data[:-1]
            except:
                pass
        col[i][1].append(data)
        i+=1

## Converting the scraped table into dataframe

In [93]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

In [94]:
df.tail()

Unnamed: 0,Postal Code,Borough,Neighborhood
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."
179,M9Z,Not assigned,
180,,Canadian postal codes,


In [95]:
df.drop(index=180, axis=0, inplace=True)

In [96]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [97]:
df.tail()

Unnamed: 0,Postal Code,Borough,Neighborhood
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."
179,M9Z,Not assigned,


## Excluding rows whose *Borough* is *Not assigned* 

In [98]:
df=df[df['Borough']!='Not assigned'].reset_index(drop=True)

In [99]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [100]:
(rows, columns)=df.shape

In [101]:
print('Dimensions of the data frame: {rows} rows, {columns} columns'.format(rows=rows,columns=columns))

Dimensions of the data frame: 103 rows, 3 columns


# Latitude-longitude Dataframe of Neighborhoods

In [102]:
%%capture
!pip install geocoder

In [103]:
import geocoder # import geocoder

## Importing lat-long csv fime

In [104]:
lat_long_coords=pd.read_csv("https://cocl.us/Geospatial_data")

## Merging the Wikipedia-scraped dataframe with imported lat-long dataframe

In [105]:
df_combined=pd.merge(df,lat_long_coords, on='Postal Code')

In [106]:
df_combined.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Creating a Map of Toronto and Neighborhoods Superimposed on Top of it

In [107]:
%%capture
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [108]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [109]:
import folium

In [110]:
# create map of New York using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, postalCode, borough, neighborhood in zip(df_combined['Latitude'],df_combined['Longitude'], df_combined['Postal Code'], df_combined['Borough'], df_combined['Neighborhood']):
    label = '{}, {},{}'.format(postalCode, borough,neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

## Foursquare API Credential

In [111]:
CLIENT_ID = 'YSHJZKEA0LMOAKB4RGUGSPNJDIP3PWKC232SCFBKFEIBWDAD' # your Foursquare ID
CLIENT_SECRET = 'OBZHMIJVOVU0BUQ5X1NIS44V5XIPPFZF34WUEXEGCIZ1Z5BY' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: YSHJZKEA0LMOAKB4RGUGSPNJDIP3PWKC232SCFBKFEIBWDAD
CLIENT_SECRET:OBZHMIJVOVU0BUQ5X1NIS44V5XIPPFZF34WUEXEGCIZ1Z5BY


In [112]:
Venue_Exploring_URL=pd.DataFrame(columns=['Postal Code', 'Venue Exploring url'])

In [113]:
Venue_Exploring_URL.drop(Venue_Exploring_URL.index, inplace=True)

In [114]:
## Considering the top 100 venues
LIMIT = 100 # number of venues returned by Foursquare API

radius = 500 # radiums in meters
#-->

for postalCode, lat, lng in zip(df_combined['Postal Code'], df_combined['Latitude'], df_combined['Longitude']):
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(\
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    lat, 
    lng, 
    radius, 
    LIMIT)
    
    Venue_Exploring_URL = Venue_Exploring_URL.append({'Postal Code': postalCode,
                                          'Venue Exploring url': url}, ignore_index=True)
        
Venue_Exploring_URL.head() # display URL

Unnamed: 0,Postal Code,Venue Exploring url
0,M3A,https://api.foursquare.com/v2/venues/explore?&...
1,M4A,https://api.foursquare.com/v2/venues/explore?&...
2,M5A,https://api.foursquare.com/v2/venues/explore?&...
3,M6A,https://api.foursquare.com/v2/venues/explore?&...
4,M7A,https://api.foursquare.com/v2/venues/explore?&...


In [115]:
def getNearbyVenues(postalCodes, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for postalCode, lat, lng in zip(postalCodes, latitudes, longitudes):
       # print(postalCodes)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            postalCode, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code', 
                  'Postal Code Latitude', 
                  'Postal Code Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [116]:
# Obtaining venues around each postal code
postalCode_venues = getNearbyVenues(postalCodes=df_combined['Postal Code'],
                                   latitudes=df_combined['Latitude'],
                                   longitudes=df_combined['Longitude']
                                  )


In [117]:
postalCode_venues.head()

Unnamed: 0,Postal Code,Postal Code Latitude,Postal Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,43.753259,-79.329656,Sun Life,43.75476,-79.332783,Construction & Landscaping
2,M3A,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,M4A,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,M4A,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [118]:
postalCode_venues['Venue Category'].nunique()

268

In [119]:
postalCode_venues.groupby('Postal Code').count()

Unnamed: 0_level_0,Postal Code Latitude,Postal Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1B,1,1,1,1,1,1
M1C,2,2,2,2,2,2
M1E,7,7,7,7,7,7
M1G,3,3,3,3,3,3
M1H,9,9,9,9,9,9
...,...,...,...,...,...,...
M9N,2,2,2,2,2,2
M9P,7,7,7,7,7,7
M9R,4,4,4,4,4,4
M9V,9,9,9,9,9,9


## Analyzing each Postal Code

In [120]:
# one hot encoding
postalCode_onehot = pd.get_dummies(postalCode_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
postalCode_onehot['Postal Code'] = postalCode_venues['Postal Code'] 

# move neighborhood column to the first column
fixed_columns = [postalCode_onehot.columns[-1]] + list(postalCode_onehot.columns[:-1])
postalCode_onehot = postalCode_onehot[fixed_columns]

postalCode_onehot.head()

Unnamed: 0,Postal Code,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [121]:
postalCode_grouped=postalCode_onehot.groupby('Postal Code').mean().reset_index()

In [122]:
postalCode_grouped.head()

Unnamed: 0,Postal Code,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [124]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
postalCode_venues_sorted = pd.DataFrame(columns=columns)
postalCode_venues_sorted['Postal Code'] = postalCode_grouped['Postal Code']

for ind in np.arange(postalCode_grouped.shape[0]):
    postalCode_venues_sorted.iloc[ind, 1:] = return_most_common_venues(postalCode_grouped.iloc[ind, :], num_top_venues)

postalCode_venues_sorted.head()

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Dessert Shop,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop
1,M1C,Moving Target,Bar,Yoga Studio,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore,Dim Sum Restaurant
2,M1E,Mexican Restaurant,Rental Car Location,Breakfast Spot,Electronics Store,Medical Center,Bank,Intersection,Yoga Studio,Distribution Center,Dog Run
3,M1G,Coffee Shop,Korean Restaurant,Eastern European Restaurant,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Yoga Studio
4,M1H,Hakka Restaurant,Bakery,Lounge,Caribbean Restaurant,Athletics & Sports,Fried Chicken Joint,Thai Restaurant,Gas Station,Bank,Donut Shop


## Postal Code Clustering

In [125]:
from sklearn.cluster import KMeans

In [126]:
# set number of clusters
kclusters = 5

postalCode_grouped_clustering = postalCode_grouped.drop('Postal Code', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(postalCode_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [127]:
# Including cluster lables
postalCode_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [128]:
postalCode_merged=pd.merge(df_combined,postalCode_venues_sorted, on='Postal Code')

In [129]:
postalCode_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,4,Food & Drink Shop,Park,Construction & Landscaping,Yoga Studio,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,Hockey Arena,Portuguese Restaurant,Pizza Place,Coffee Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Bakery,Pub,Park,Breakfast Spot,Café,Theater,Spa,Shoe Store,Restaurant
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,Clothing Store,Vietnamese Restaurant,Miscellaneous Shop,Arts & Crafts Store,Coffee Shop,Accessories Store,Event Space,Boutique,Furniture / Home Store,Electronics Store
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Sushi Restaurant,Yoga Studio,Bar,Beer Bar,Smoothie Shop,Sandwich Place,Burrito Place,Café,College Auditorium


### Visualizing clusters in Toronto

In [130]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [131]:
latitude, longitude

(43.6534817, -79.3839347)

In [132]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(postalCode_merged['Latitude'], postalCode_merged['Longitude'], postalCode_merged['Neighborhood'], postalCode_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters