# Extract the neiborhood data of Toronto from wikipedia

### Last week, we learnt download dataset from FourSqure API. This  time, let's scrap the information from Wikipedia and make them grouped

First, let's import the libary we need

In [6]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urllib
print('libraries imported successfully')

libraries imported successfully


Now, we are going to use beautiful soup to scrap the data from url

In [7]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = urllib.request.urlopen(url)
# make soup
soup = BeautifulSoup(page,'html.parser')
tables = soup.select('table')
table = tables[0].findAll('tr')

In [8]:
postcodes = [] #creat list to contain the data
broughs = []
neighborhoods = []
for row in tables[0].findAll('tr'): # find each row
    if row.findAll('td'):           # the data start with td
        postcodes.append(row.findAll('td')[0].contents[0]) # the first column is postcode.
        broughs.append(row.findAll('td')[1].contents[0].string.replace('\n',''))# the second column is broughs. delete the '\n' if ther is
        neighborhoods.append(row.findAll('td')[2].contents[0].string.replace('\n',''))# the third column is neighborhood.delete the '\n' if there is
# convert the data into dataframe
df = pd.DataFrame({'Postcode':postcodes, 'Broughs':broughs, 'Neighborhoods':neighborhoods})

#### let's clean the data further

In [9]:
df = df[df.Broughs!='Not assigned'] # let's delete those brough not assigned
nh_mask = df.Neighborhoods =='Not assigned' # where neighbor hood was not assigned
df["Neighborhoods"][nh_mask] = df['Broughs'][nh_mask]
df.shape

(211, 3)

In [10]:
df.head()

Unnamed: 0,Postcode,Broughs,Neighborhoods
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


We can find the row 2 and row 3 has the same Postocode and same brough with different neigborhood. We are suposed to merge them. So, we are going to use groupby()

In [11]:
# to define a function applied to groupby, here I acknowlwdge the bbs https://codeday.me/bug/20171205/104918.html
#It told me how to merge string if we jsut want to join them together
def f(x):
     return pd.DataFrame(dict(Postcode = np.unique(x['Postcode']),  #We extract the unique postcodes
                        Broughs = np.unique(x['Broughs']),          #We extract the unique broughs, since many are the same with the same postcodes
                        Neighborhoods = "%s" % ', '.join(x['Neighborhoods']))) # merge the neighborhood

In [13]:
df_group = df.groupby('Postcode').apply(f)

In [14]:
df_group.reset_index(drop = True, inplace = True) # let's drop the index

In [15]:
df_group.shape

(103, 3)

Now, we know there is 103 postcodes if we delete the unasigned broughs and neighborhoods

Let's print the first 5 postcodes and neighborhoods

In [16]:
df_group.head()

Unnamed: 0,Postcode,Broughs,Neighborhoods
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## the second task is to find the latitude and longtitude 

First, we would like to use google api

In [17]:
!pip install geocoder



In [18]:
# let's import the libraries
import geocoder
latitude = []
longitude = []
for postal_code in df_group[["Postcode"]].values.tolist():
    lat_lng_coords = None
    while (lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code[0]))
        lat_lng_coord = g.latlng
    latitude.append(lat_lng_coord[0])
    longitude.append(lat_lng_coord[1])

KeyboardInterrupt: 

Ops! It seems the geocoder is not stable. Let's explore other way. Fine, we once used other API name as Noinatim, maybe it works

In [None]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
latitude = []
longitude = []
for postal_code in df_group[["Postcode"]].values.tolist():
    address = '{}, Toronto, Ontario'.format(postal_code[0])
    location = None
    while (location is None):
        geolocator = Nominatim(user_agent="To_explorer")
        location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print(latitude, longitude)

OK, we failed again. Anyway, we have backup. We can load the data from https://cocl.us/Geospatial_data

In [19]:
df_to = pd.read_csv('https://cocl.us/Geospatial_data')
df_to.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Great. Let's merge these two dataframes

In [20]:
df_toronto = pd.concat([df_group, df_to], axis = 1, join = 'inner')
df_toronto.head()

Unnamed: 0,Postcode,Broughs,Neighborhoods,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [21]:
# we can find two postal codes and we drop the second one
df_toronto.drop(columns = ["Postal Code"], axis = 1, inplace = True)

Let's have a look again

In [22]:
df_toronto.head(10)

Unnamed: 0,Postcode,Broughs,Neighborhoods,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [24]:
df_toronto.shape

(103, 5)

congrats! we made it.In summary, what we did in this section is combing the two existence dataframe.
The concat method has several parameters, the first is the object to merge, second axis, we concate along the row if axis == 1 else along the column, join means intersaction(inner) or union(outter)

In [23]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
address = 'M1B, Toronto, Ontario'
geolocator = Nominatim(user_agent="To_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.653963 -79.387207


We completed the cooridate data of toronto neighborhoods

## let's cluster the neighbor hood in Toronto 

In [26]:
# first let's import the map library
import folium

The first, we can just mark all the neiborhoods of toronto based on the broughs. 

In [45]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Broughs'], df_toronto['Neighborhoods']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [49]:
import requests

Second, let's categorize th Boroughs

In [46]:
CLIENT_ID = 'GPWKUVIRX0CM4QGME505R1CR21WVG21WN1VOI0WZSQPM0QIB' # your Foursquare ID
CLIENT_SECRET = 'IARW1LCM1GACKW1KJ5ZCBSDSIIIDD2M3DHFWJ0GZCVCJKXME' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
radius = 500
LIMIT = 100

In [50]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Borough', 
                  'Borough Latitude', 
                  'Borough Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [106]:
# we extract one dataframe consist of name and latitude
df_sel = df_toronto[['Broughs', 'Latitude','Longitude']]
df_sel = df_sel.groupby("Broughs").mean() # we take the mean of the longitude and latitude of these neighborhood as new ones
df_sel.head()

Unnamed: 0_level_0,Latitude,Longitude
Broughs,Unnamed: 1_level_1,Unnamed: 2_level_1
Central Toronto,43.70198,-79.398954
Downtown Toronto,43.654169,-79.383665
East Toronto,43.669436,-79.324654
East York,43.700303,-79.335851
Etobicoke,43.660043,-79.542074


In [107]:
df_sel.reset_index(inplace = True)

In [63]:
Toronto_venues = getNearbyVenues(names = df_sel['Broughs'],
                                   latitudes = df_sel['Latitude'],
                                   longitudes = df_sel['Longitude']
                                  )

Central Toronto
Downtown Toronto
East Toronto
East York
Etobicoke
Mississauga
North York
Queen's Park
Scarborough
West Toronto
York


In [None]:
toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Borough'] = Toronto_venues['Borough'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

In [71]:
Toronto_grouped = toronto_onehot.groupby("Borough").mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Borough,Accessories Store,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Bakery,...,Tennis Court,Thai Restaurant,Theater,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037037,0.0,0.0
1,Downtown Toronto,0.01,0.03,0.0,0.01,0.0,0.01,0.0,0.0,0.02,...,0.0,0.01,0.02,0.01,0.0,0.02,0.0,0.0,0.0,0.0
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### let's see the top 10 facility in each Borough

first, a function to return the most common venues

In [72]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [101]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Borough']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
Borough_venues_sorted = pd.DataFrame(columns=columns)
Borough_venues_sorted['Borough'] = Toronto_grouped['Borough']

for ind in np.arange(Toronto_grouped.shape[0]):
    Borough_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

Borough_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,Italian Restaurant,Sushi Restaurant,Coffee Shop,Restaurant,Pizza Place,Bank,Middle Eastern Restaurant,Mexican Restaurant,Indonesian Restaurant,Indian Restaurant
1,Downtown Toronto,Coffee Shop,Clothing Store,Cosmetics Shop,American Restaurant,Tea Room,Bubble Tea Shop,Chinese Restaurant,Restaurant,Seafood Restaurant,Bakery
2,East Toronto,Indian Restaurant,Grocery Store,Café,Bistro,Women's Store,Indian Chinese Restaurant,Indie Theater,Egyptian Restaurant,Pakistani Restaurant,Park
3,East York,Park,Public Art,Trail,Yoga Studio,Falafel Restaurant,Creperie,Cuban Restaurant,Dance Studio,Deli / Bodega,Department Store
4,Etobicoke,Flower Shop,Park,Clothing Store,Tennis Court,Fast Food Restaurant,Cuban Restaurant,Dance Studio,Deli / Bodega,Department Store,Dessert Shop


## Cluster Boroughs

In [102]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 
Borough_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Merge the data together. We found there is a typo in df_sel, so we correct it her

In [108]:
df_sel.rename(columns = {'Broughs':'Borough'},inplace = True)
df_sel.head()

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.70198,-79.398954
1,Downtown Toronto,43.654169,-79.383665
2,East Toronto,43.669436,-79.324654
3,East York,43.700303,-79.335851
4,Etobicoke,43.660043,-79.542074


In [112]:
Toronto_merged = df_sel.join(Borough_venues_sorted.set_index('Borough'), on='Borough')

In [113]:
Toronto_merged.head()

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,43.70198,-79.398954,0,Italian Restaurant,Sushi Restaurant,Coffee Shop,Restaurant,Pizza Place,Bank,Middle Eastern Restaurant,Mexican Restaurant,Indonesian Restaurant,Indian Restaurant
1,Downtown Toronto,43.654169,-79.383665,0,Coffee Shop,Clothing Store,Cosmetics Shop,American Restaurant,Tea Room,Bubble Tea Shop,Chinese Restaurant,Restaurant,Seafood Restaurant,Bakery
2,East Toronto,43.669436,-79.324654,0,Indian Restaurant,Grocery Store,Café,Bistro,Women's Store,Indian Chinese Restaurant,Indie Theater,Egyptian Restaurant,Pakistani Restaurant,Park
3,East York,43.700303,-79.335851,3,Park,Public Art,Trail,Yoga Studio,Falafel Restaurant,Creperie,Cuban Restaurant,Dance Studio,Deli / Bodega,Department Store
4,Etobicoke,43.660043,-79.542074,4,Flower Shop,Park,Clothing Store,Tennis Court,Fast Food Restaurant,Cuban Restaurant,Dance Studio,Deli / Bodega,Department Store,Dessert Shop


Let's visualize it on map

In [123]:
import matplotlib.pyplot as plt
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = plt.cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Borough'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Thanks for reading this lab. This lab was written by Qiaozhou Xiong(E150022@en.ntu.edu.sg) for the learning of data science