#           IBM DATA SCIENCE CAPSTONE PROJECT

In [3]:
import os
import warnings
from io import StringIO

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

#!conda install -c conda-forge folium=0.5.0 --yes

import folium
print('Libraries succesfully imported')

Libraries succesfully imported


### Scraping the web for Canada postal data

In [5]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()

# Only the first <table> element of the page is parsed.
soup = BeautifulSoup(response.content, 'lxml')
table = soup.find_all('table')[0]

# read_html returns a list of DataFrames (one per table found). The markup is wrapped
# in StringIO because passing a literal HTML string is deprecated in recent pandas.
df_0 = pd.read_html(StringIO(str(table)))
df_1 = df_0[0]

df_1.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [9]:
# Drop the postal codes that have no borough assigned. `.copy()` yields an independent
# frame, so the assignment below cannot touch df_1 or trigger SettingWithCopyWarning.
df_2 = df_1[df_1['Borough'] != 'Not assigned'].copy()

# A neighbourhood that is missing (NaN) or marked 'Not assigned' falls back to the name
# of its borough. This is a vectorised .loc assignment on purpose: the rows yielded by
# iterrows() are copies, so changing them would never write back to the frame.
needs_borough_name = df_2['Neighborhood'].isna() | (df_2['Neighborhood'] == 'Not assigned')
df_2.loc[needs_borough_name, 'Neighborhood'] = df_2.loc[needs_borough_name, 'Borough']

In [16]:
# Collapse rows that share a (postal code, borough) pair into one row, joining the
# remaining text columns with commas.
by_code_and_borough = df_2.groupby(['Postal code', 'Borough'], as_index=False)
df_2g = by_code_and_borough.agg(lambda values: ','.join(values))

# Plain Python list of the postal codes, in the row order of df_2g
postal_codes = df_2g['Postal code'].tolist()

df_2g.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Getting the coordinates of each postal code

In [22]:
# Load the latitude/longitude lookup table. The code below relies on it having
# 'Postal Code', 'Latitude' and 'Longitude' columns.
file = 'https://cocl.us/Geospatial_data'
coordinates_df = pd.read_csv(file)

# Look up the coordinates of every postal code of df_2g with a single left merge.
# The merge keeps the order of `postal_codes` and leaves NaN where a code has no
# coordinates, so (assuming each code appears at most once in coordinates_df) row i
# of the result always describes postal_codes[i].
# DataFrame.append, which used to be called once per code here, was removed in
# pandas 2.0 and rebuilt the whole frame on every iteration.
coord_df = pd.DataFrame({'Postal Code': postal_codes}).merge(
    coordinates_df, on='Postal Code', how='left')

cord_df = coord_df[['Latitude', 'Longitude', 'Postal Code']]

cord_df.head()

Unnamed: 0,Latitude,Longitude,Postal Code
0,43.806686,-79.194353,M1B
1,43.784535,-79.160497,M1C
2,43.763573,-79.188711,M1E
3,43.770992,-79.216917,M1G
4,43.773136,-79.239476,M1H


In [27]:
lat_lon_df = cord_df[['Latitude', 'Longitude']]

# Attach the coordinates to df_2g by matching on the postal code instead of relying on
# both frames having identical row order: a code that is absent from cord_df then gets
# NaN rather than shifting every later row onto the wrong coordinates.
# Re-running this cell simply recomputes the same two columns.
coords_by_code = cord_df.drop_duplicates(subset='Postal Code').set_index('Postal Code')
for coord_col in ['Latitude', 'Longitude']:
    df_2g[coord_col] = df_2g['Postal code'].map(coords_by_code[coord_col]).astype(float)
df_2g.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### We are interested in the city of Toronto, so we filter the data further

In [34]:
# Keep only the boroughs whose name mentions Toronto (case-insensitive match).
borough_names = list(df_2g['Borough'].unique())
toronto_borough = [borough for borough in borough_names if 'toronto' in borough.lower()]

tor_df = df_2g[df_2g['Borough'].isin(toronto_borough)].reset_index(drop=True)

# Preview both ends of the frame. pd.concat is used because DataFrame.append was
# removed in pandas 2.0.
pd.concat([tor_df.head(), tor_df.tail()])

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188
2,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
34,M6P,West Toronto,High Park / The Junction South,43.661608,-79.464763
35,M6R,West Toronto,Parkdale / Roncesvalles,43.64896,-79.456325
36,M6S,West Toronto,Runnymede / Swansea,43.651571,-79.48445
37,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
38,M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558


In [36]:
# Centre of the Toronto postal codes: the mean of their latitudes and longitudes.
tor_lat = tor_df.loc[:, 'Latitude'].mean()
tor_lon = tor_df.loc[:, 'Longitude'].mean()

message = 'Toronto has a latitude:{0} and longitude:{1}'
print(message.format(tor_lat, tor_lon))

Toronto has a latitude:43.66713498717948 and longitude:-79.38987324871795


### Let us now visualize the data above using Folium

In [58]:
# Hand-entered reference coordinates. They are not used by the map below, which is
# centred on the computed means (tor_lat, tor_lon). Toronto lies west of the prime
# meridian, so the longitude has to be negative, in line with tor_lon.
latitude = 43.653963
longitude = -79.387207

toronto_map = folium.Map(location=[tor_lat, tor_lon], zoom_start=12)

# add one circle marker per postal-code row, with a "<neighborhood>, <borough>" popup
for lat, lng, borough, neighborhood in zip(tor_df['Latitude'], tor_df['Longitude'], tor_df['Borough'], tor_df['Neighborhood']):
    label_text = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes folium treat the label as text, so characters such as
    # '/' or '&' in neighbourhood names are rendered literally.
    popup = folium.Popup(label_text, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(toronto_map)

toronto_map

In [45]:
# Get the number of neighbourhood rows in each Borough
neighborhoods_per_borough = tor_df.groupby('Borough')['Neighborhood'].count()
print(neighborhoods_per_borough)

Borough
Central Toronto      9
Downtown Toronto    19
East Toronto         5
West Toronto         6
Name: Neighborhood, dtype: int64


### Extracting Data From the Foursquare API

In [46]:
# Foursquare API credentials are read from the environment so that secrets never live
# in the notebook. Export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
# NOTE(review): the key pair that used to be hard-coded in this cell has been published
# and should be revoked/rotated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # Foursquare secret
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn('Foursquare credentials are not set; define FOURSQUARE_CLIENT_ID and '
                  'FOURSQUARE_CLIENT_SECRET before calling the API.')
VERSION = '20180605'  # Foursquare API version
LIMIT = 200  # Limit of venues returned by the API
RADIUS = 500  # Search radius (presumably in metres - confirm against the API docs)

In [47]:
# getting nearby venues
def get_nearby_venues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location.

    The module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT are sent with
    every request, so they must be defined before this function is called.

    Parameters
    ----------
    names : iterable
        Label of each location; copied into the 'Neighborhood' column.
    latitudes, longitudes : iterable
        Coordinates of each location, consumed in lockstep with ``names``.
    radius : int, default 500
        Value of the ``radius`` request parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue returned by the API, with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame has
        these seven columns even when no location or no venue was processed.
        'Venue Category' is None for a venue that lists no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError, IndexError
        If the response body does not have the expected
        ``response -> groups[0] -> items`` structure.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting it by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Surface bad credentials / quota errors as an explicit HTTP error.
        response.raise_for_status()
        results = response.json()['response']['groups'][0]['items']

        # keep only the relevant information of each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing `columns=` keeps the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [49]:
# Collect the venues around every Toronto postal-code row (default search radius).
toronto_venues = get_nearby_venues(
    tor_df['Neighborhood'],
    tor_df['Latitude'],
    tor_df['Longitude'],
)

The Beaches
The Danforth West / Riverdale
India Bazaar / The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park / Summerhill East
Summerhill West / Rathnelly / South Hill / Forest Hill SE / Deer Park
Rosedale
St. James Town / Cabbagetown
Church and Wellesley
Regent Park / Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond / Adelaide / King
Harbourfront East / Union Station / Toronto Islands
Toronto Dominion Centre / Design Exchange
Commerce Court / Victoria Hotel
Roselawn
Forest Hill North & West
The Annex / North Midtown / Yorkville
University of Toronto / Harbord
Kensington Market / Chinatown / Grange Park
CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst  Quay / South Niagara / Island airport
Stn A PO Boxes
First Canadian Place / Underground city
Christie
Dufferin / Dovercourt Village
Little Portugal / Trinity
Brockton / Parkdale Village / Exhibition Place
High Park /

In [56]:
# (number of venue rows, number of columns) of the venue table
toronto_venues.shape

(1682, 7)

In [53]:
# Number of non-null values in each column, per neighbourhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,55,55,55,55,55,55
Brockton / Parkdale Village / Exhibition Place,22,22,22,22,22,22
Business reply mail Processing CentrE,16,16,16,16,16,16
CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport,16,16,16,16,16,16
Central Bay Street,78,78,78,78,78,78
Christie,19,19,19,19,19,19
Church and Wellesley,79,79,79,79,79,79
Commerce Court / Victoria Hotel,100,100,100,100,100,100
Davisville,36,36,36,36,36,36
Davisville North,8,8,8,8,8,8


In [54]:
# First 100 distinct venue categories, in order of first appearance
toronto_venues['Venue Category'].unique()[:100]

array(['Trail', 'Health Food Store', 'Pub', 'Neighborhood', 'Coffee Shop',
       'Greek Restaurant', 'Cosmetics Shop', 'Italian Restaurant',
       'Ice Cream Shop', 'Yoga Studio', 'Brewery',
       'Fruit & Vegetable Store', 'Pizza Place', 'Bookstore',
       'Restaurant', 'Dessert Shop', 'Juice Bar', 'Bubble Tea Shop',
       'Spa', 'Diner', 'Grocery Store', 'Furniture / Home Store', 'Café',
       'Bakery', 'Caribbean Restaurant', 'Indian Restaurant',
       'Frozen Yogurt Shop', 'Lounge', 'Liquor Store', 'Gym',
       'Fish & Chips Shop', 'Fast Food Restaurant', 'Sushi Restaurant',
       'Park', 'Pet Store', 'Steakhouse', 'Burrito Place',
       'Movie Theater', 'Sandwich Place', 'Intersection',
       'Food & Drink Shop', 'Fish Market', 'Gay Bar', 'Cheese Shop',
       'Middle Eastern Restaurant', 'Comfort Food Restaurant',
       'Thai Restaurant', 'Seafood Restaurant', 'American Restaurant',
       'Stationery Store', 'Coworking Space', 'Wine Bar', 'Bar',
       'Gym / Fitness

In [59]:
# Check whether at least one venue is categorised as a Japanese restaurant
"Japanese Restaurant" in set(toronto_venues['Venue Category'])

True

In [60]:
# One indicator column per venue category, named exactly like the category
category_frame = toronto_venues[['Venue Category']]
onehot_df = pd.get_dummies(category_frame, prefix="", prefix_sep="")

# Append the neighbourhood of each venue as 'Neighborhoods' ...
onehot_df['Neighborhoods'] = toronto_venues['Neighborhood']

# ... then rotate that last column to the front
leading = onehot_df.columns[-1:].tolist()
trailing = onehot_df.columns[:-1].tolist()
onehot_df = onehot_df[leading + trailing]

print(onehot_df.shape)
onehot_df.head()

(1682, 236)


Unnamed: 0,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# Mean of every indicator column per neighbourhood, i.e. the share of each venue
# category among the venues of that neighbourhood
grouped_df = onehot_df.groupby("Neighborhoods", as_index=False).mean()

print(grouped_df.shape)
grouped_df

(39, 236)


Unnamed: 0,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0
1,Brockton / Parkdale Village / Exhibition Place,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business reply mail Processing CentrE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625
3,CN Tower / King and Spadina / Railway Lands / ...,0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,0.0,...,0.0,0.0,0.0,0.0,0.012821,0.0,0.0,0.012821,0.0,0.012821
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.012658,0.0,0.0,0.0,0.0,0.0,0.0,0.012658,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025316
7,Commerce Court / Victoria Hotel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,0.0,...,0.0,0.027778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# Number of neighbourhoods with at least one Japanese restaurant
int((grouped_df["Japanese Restaurant"] > 0).sum())

16

In [66]:
# Keep only the neighbourhood name and its share of Japanese restaurants
jap_columns = ["Neighborhoods", "Japanese Restaurant"]
jap_df = grouped_df.loc[:, jap_columns]
jap_df.head(10)

Unnamed: 0,Neighborhoods,Japanese Restaurant
0,Berczy Park,0.018182
1,Brockton / Parkdale Village / Exhibition Place,0.0
2,Business reply mail Processing CentrE,0.0
3,CN Tower / King and Spadina / Railway Lands / ...,0.0
4,Central Bay Street,0.038462
5,Christie,0.0
6,Church and Wellesley,0.063291
7,Commerce Court / Victoria Hotel,0.03
8,Davisville,0.027778
9,Davisville North,0.0


### Applying the clustering algorithm

In [67]:
from sklearn.cluster import KMeans
k = 3  # number of clusters

# Cluster on the Japanese-restaurant share only. `columns=` replaces the positional
# axis argument of drop(), which pandas 2.0 no longer accepts.
clustering_df = jap_df.drop(columns=["Neighborhoods"])

# run k-means clustering; n_init is pinned because its default differs between
# scikit-learn releases, and random_state keeps the labels reproducible.
# Only the fitted labels are needed, so fit() is used rather than fit_transform().
kmeans = KMeans(n_clusters=k, n_init=10, random_state=1)
kmeans.fit(clustering_df)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20]

array([0, 1, 1, 1, 0, 1, 2, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0],
      dtype=int32)

In [69]:
# adding cluster labels to the list
merged_df = jap_df.copy()

# add clustering labels
merged_df["Cluster Labels"] = kmeans.labels_

In [70]:
# Rename the key column so that it matches the 'Neighborhood' column of toronto_venues
merged_df = merged_df.rename(columns={"Neighborhoods": "Neighborhood"})
merged_df.head(5)

Unnamed: 0,Neighborhood,Japanese Restaurant,Cluster Labels
0,Berczy Park,0.018182,0
1,Brockton / Parkdale Village / Exhibition Place,0.0,1
2,Business reply mail Processing CentrE,0.0,1
3,CN Tower / King and Spadina / Railway Lands / ...,0.0,1
4,Central Bay Street,0.038462,0


In [71]:
# Attach every venue to the row of its neighbourhood: each neighbourhood row is
# repeated once per matching venue and keeps its original index label.
venues_by_neighborhood = toronto_venues.set_index("Neighborhood")
merged_df = merged_df.join(venues_by_neighborhood, on="Neighborhood")

print(merged_df.shape)
merged_df.head()

(1682, 9)


Unnamed: 0,Neighborhood,Japanese Restaurant,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Berczy Park,0.018182,0,43.644771,-79.373306,LCBO,43.642944,-79.37244,Liquor Store
0,Berczy Park,0.018182,0,43.644771,-79.373306,The Keg Steakhouse + Bar - Esplanade,43.646712,-79.374768,Restaurant
0,Berczy Park,0.018182,0,43.644771,-79.373306,Fresh On Front,43.647815,-79.374453,Vegetarian / Vegan Restaurant
0,Berczy Park,0.018182,0,43.644771,-79.373306,Meridian Hall,43.646292,-79.376022,Concert Hall
0,Berczy Park,0.018182,0,43.644771,-79.373306,Starbucks,43.644285,-79.369771,Coffee Shop


In [72]:
# Order the rows by cluster label
merged_df = merged_df.sort_values("Cluster Labels")
merged_df.head()

Unnamed: 0,Neighborhood,Japanese Restaurant,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Berczy Park,0.018182,0,43.644771,-79.373306,LCBO,43.642944,-79.37244,Liquor Store
13,"Garden District, Ryerson",0.03,0,43.657162,-79.378937,Chatime 日出茶太,43.655542,-79.384684,Bubble Tea Shop
13,"Garden District, Ryerson",0.03,0,43.657162,-79.378937,KAKA,43.657457,-79.384192,Japanese Restaurant
13,"Garden District, Ryerson",0.03,0,43.657162,-79.378937,Uncle Tetsu's Cheesecake (Uncle Tetsu's Japane...,43.656063,-79.383695,Dessert Shop
13,"Garden District, Ryerson",0.03,0,43.657162,-79.378937,MAC Cosmetics,43.654055,-79.380714,Cosmetics Shop


### Visualizing the data

In [73]:
map_clusters = folium.Map(location=[tor_lat, tor_lon], zoom_start=12)

# colour used for each cluster label
markers_colors = {
    0: 'red',
    1: 'blue',
    2: 'green',
    3: 'yellow',
    4: 'cyan',
    5: 'black',
}

# merged_df was joined with toronto_venues and therefore repeats every neighbourhood
# once per venue. Draw each distinct location once instead of stacking identical
# markers on top of each other.
neighborhood_points = merged_df.drop_duplicates(
    subset=['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude'])

# add markers to the map
for lat, lon, cluster in zip(neighborhood_points['Neighborhood Latitude'],
                             neighborhood_points['Neighborhood Longitude'],
                             neighborhood_points['Cluster Labels']):
    # folium.CircleMarker with fill=True, matching the first map of this notebook
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        color=markers_colors[cluster],
        fill=True,
        fill_color=markers_colors[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Clusters

In [74]:
# Cluster 1: Japanese restaurants in the neighbourhoods labelled 0
in_cluster = merged_df['Cluster Labels'] == 0
is_japanese = merged_df['Venue Category'] == 'Japanese Restaurant'
merged_df[in_cluster & is_japanese]

Unnamed: 0,Neighborhood,Japanese Restaurant,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
13,"Garden District, Ryerson",0.03,0,43.657162,-79.378937,KAKA,43.657457,-79.384192,Japanese Restaurant
13,"Garden District, Ryerson",0.03,0,43.657162,-79.378937,Katsuya,43.65986,-79.378788,Japanese Restaurant
31,Stn A PO Boxes,0.031579,0,43.646435,-79.374846,Ki Modern Japanese + Bar,43.647167,-79.379608,Japanese Restaurant
31,Stn A PO Boxes,0.031579,0,43.646435,-79.374846,NAMI,43.650853,-79.375887,Japanese Restaurant
11,First Canadian Place / Underground city,0.03,0,43.648429,-79.38228,Fune Japanese Restaurant,43.648514,-79.386457,Japanese Restaurant
11,First Canadian Place / Underground city,0.03,0,43.648429,-79.38228,Chotto Matte,43.646473,-79.378782,Japanese Restaurant
11,First Canadian Place / Underground city,0.03,0,43.648429,-79.38228,Ki Modern Japanese + Bar,43.647167,-79.379608,Japanese Restaurant
13,"Garden District, Ryerson",0.03,0,43.657162,-79.378937,Kinka Izakaya Original,43.660596,-79.378891,Japanese Restaurant
19,Little Portugal / Trinity,0.02,0,43.647927,-79.41975,Bazara,43.648535,-79.420521,Japanese Restaurant
29,St. James Town,0.03,0,43.651494,-79.375418,NAMI,43.650853,-79.375887,Japanese Restaurant


In [75]:
# Cluster 2: Japanese restaurants in the neighbourhoods labelled 1
in_cluster = merged_df['Cluster Labels'] == 1
is_japanese = merged_df['Venue Category'] == 'Japanese Restaurant'
merged_df[in_cluster & is_japanese]

Unnamed: 0,Neighborhood,Japanese Restaurant,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
14,Harbourfront East / Union Station / Toronto Is...,0.01,1,43.640816,-79.381752,Miku,43.641374,-79.377531,Japanese Restaurant
25,Richmond / Adelaide / King,0.01,1,43.650571,-79.384568,Fune Japanese Restaurant,43.648514,-79.386457,Japanese Restaurant
17,Kensington Market / Chinatown / Grange Park,0.013158,1,43.653206,-79.400049,Gushi,43.652258,-79.404884,Japanese Restaurant


In [76]:
# Cluster 3: Japanese restaurants in the neighbourhoods labelled 2
in_cluster = merged_df['Cluster Labels'] == 2
is_japanese = merged_df['Venue Category'] == 'Japanese Restaurant'
merged_df[in_cluster & is_japanese]

Unnamed: 0,Neighborhood,Japanese Restaurant,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
38,University of Toronto / Harbord,0.057143,2,43.662696,-79.400049,Yasu,43.662837,-79.403217,Japanese Restaurant
38,University of Toronto / Harbord,0.057143,2,43.662696,-79.400049,Gyubee,43.667088,-79.400571,Japanese Restaurant
6,Church and Wellesley,0.063291,2,43.66586,-79.38316,Kokoni Izakaya,43.664181,-79.380258,Japanese Restaurant
6,Church and Wellesley,0.063291,2,43.66586,-79.38316,Onnki Donburi,43.669757,-79.384574,Japanese Restaurant
6,Church and Wellesley,0.063291,2,43.66586,-79.38316,Okonomi House お好みハウス,43.668448,-79.386884,Japanese Restaurant
6,Church and Wellesley,0.063291,2,43.66586,-79.38316,Tokyo Kitchen,43.668783,-79.385153,Japanese Restaurant
6,Church and Wellesley,0.063291,2,43.66586,-79.38316,Kawa Sushi,43.663894,-79.38021,Japanese Restaurant
