# Capstone Project 

*Where to move in Toronto, Ontario*

## Table of contents
* [Getting the data](#section1)
* [Get close venues for each neighborhood](#section2)

<a id='section1'></a>
## Getting the data

In [1]:
import json
import os
from os.path import isfile, join

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup # this module helps in web scrapping.
import folium
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [2]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fail fast on network problems or HTTP errors instead of silently parsing an
# error page; the timeout keeps the cell from hanging indefinitely.
response = requests.get(URL, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html5lib")

#find all html tables in the web page
tables = soup.find_all('table') # in html table is represented by the tag <table>
print(f"There are {len(tables)} tables on this website.")

# target table is table 0
targetTable = tables[0]

There are 3 tables on this website.


In [3]:
postal_codes_dict = {}

# Walk every <td> of the page. A usable cell exposes the postal code at
# <p><b>...</b></p> and the "Borough(Neighborhood / Neighborhood ...)" text in a <span>.
for table_cell in soup.find_all('td'):
    try:
        postal_code = table_cell.p.b.text # get the postal code
        neighborhoods_data = table_cell.span.text # get the rest of the data in the cell
    except AttributeError:
        # the cell lacks the <p><b> or <span> children, so it is not a postal-code cell
        continue

    # if the cell is not assigned then ignore it
    if neighborhoods_data == 'Not assigned':
        continue

    # text before the first '(' is the borough, text between the first and
    # second '(' holds the '/'-separated neighborhoods
    parts = neighborhoods_data.split('(')
    borough = parts[0]

    if len(parts) > 1:
        # drop the closing parenthesis, then normalise "a / b" to "a, b"
        neighborhoods = parts[1].replace(')', ' ')
        neighborhoods_clean = ', '.join(name.strip() for name in neighborhoods.split('/'))
    else:
        # no neighborhood list in the cell: fall back to the borough name
        borough = borough.strip('\n')
        neighborhoods_clean = borough

    # add borough and neighborhood to dictionary (a repeated postal code overwrites the earlier entry)
    postal_codes_dict[postal_code] = {
        'borough': borough,
        'neighborhoods': neighborhoods_clean,
    }

# Build the dataframe in one step; growing it row by row is quadratic and
# relies on DataFrame.append, which pandas 2.0 removed.
columns = ['PostalCode', 'Borough', 'Neighborhood']
data = pd.DataFrame(
    [
        {"PostalCode": code, "Borough": entry['borough'], "Neighborhood": entry['neighborhoods']}
        for code, entry in postal_codes_dict.items()
    ],
    columns=columns,
)

data.shape

(103, 3)

In [4]:
# Preview the first rows of the scraped postal-code table.
data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


## Visualize data on map

### Add Latitude and Longitude

In [5]:
# Load the postal code -> coordinates lookup table from a local CSV file
# (the columns used later are 'Postal Code', 'Latitude' and 'Longitude').
geodata = pd.read_csv("Geospatial_Coordinates.csv")
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
# Index the coordinates by postal code once instead of re-filtering geodata
# twice for every row. Keeping the first entry per code mirrors the former
# `.values[0]` lookup.
coords_by_code = (
    geodata.drop_duplicates(subset='Postal Code')
    .set_index('Postal Code')[['Latitude', 'Longitude']]
)

# Report all unknown postal codes at once rather than failing with a bare IndexError.
missing_codes = sorted(set(data['PostalCode']) - set(coords_by_code.index))
if missing_codes:
    raise KeyError(f"No coordinates found for postal codes: {missing_codes}")

data['Latitude'] = data['PostalCode'].map(coords_by_code['Latitude'])
data['Longitude'] = data['PostalCode'].map(coords_by_code['Longitude'])

In [7]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError(f"Could not geocode address {address!r}")
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [8]:
class style:
    """ANSI escape sequences: ``BOLD`` switches bold text on, ``END`` resets formatting."""
    BOLD, END = '\033[1m', '\033[0m'

### Create map

In [9]:
# create a map centred on the geocoded Toronto latitude and longitude
map_data = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of `data`, with the neighborhood(s) and borough in the popup
marker_columns = data[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_columns.itertuples(index=False, name=None):
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=f"<i>{neighborhood}, <b>{borough}</b></i>",
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_data)

map_data

<a id='section2'></a>
# Get close venues for each neighborhood

In this project I will only look at the general categories (e.g. restaurants), not at subcategories such as Asian restaurants.

### Get list of all possible categories

In [10]:
URL = "https://developer.foursquare.com/docs/build-with-foursquare/categories/"
response = requests.get(URL, timeout=30)
# Stop on an HTTP error: parsing an error page would only surface later as an
# obscure failure when the category list is looked up in the parsed document.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html5lib")

In [11]:
# container element of the documentation page that holds the category list
content = soup.find(class_="documentTemplate__Content-sc-5mpekp-0 bbBBoE")

# every child of the container's first <ul> carries its category name in <div><h3>;
# collect the names in alphabetical order
categories = sorted(item.div.h3.text for item in content.ul.children)
categories

['Arts & Entertainment',
 'College & University',
 'Event',
 'Food',
 'Nightlife Spot',
 'Outdoors & Recreation',
 'Professional & Other Places',
 'Residence',
 'Shop & Service',
 'Travel & Transport']

### Create dictionary to lookup category IDs


In [12]:
# Create a dictionary class
class my_dictionary(dict):
    """``dict`` subclass that adds an explicit :meth:`add` helper."""

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as :class:`dict`."""
        # Delegate to dict. The previous ``self = dict()`` only rebound a local
        # name, initialised nothing and blocked the normal constructor arguments.
        super().__init__(*args, **kwargs)

    # Function to add key:value
    def add(self, key, value):
        """Store ``value`` under ``key``, replacing any existing entry."""
        self[key] = value

In [13]:
# map every category name (<h3> text) to its Foursquare category ID (<p> text)
categoriesDict = my_dictionary()

for entry in content.ul.children:
    details = entry.div
    categoriesDict.add(details.h3.text, details.p.text)

categoriesDict

{'Arts & Entertainment': '4d4b7104d754a06370d81259',
 'College & University': '4d4b7105d754a06372d81259',
 'Event': '4d4b7105d754a06373d81259',
 'Food': '4d4b7105d754a06374d81259',
 'Nightlife Spot': '4d4b7105d754a06376d81259',
 'Outdoors & Recreation': '4d4b7105d754a06377d81259',
 'Professional & Other Places': '4d4b7105d754a06375d81259',
 'Residence': '4e67e38e036454776db1fb3a',
 'Shop & Service': '4d4b7105d754a06378d81259',
 'Travel & Transport': '4d4b7105d754a06379d81259'}

## Cluster data

In [14]:
def load_file(foldername, filename):
    """Read ``./data/<foldername>/<filename>.txt`` and return its parsed JSON content."""
    path = f'./data/{foldername}/{filename}.txt'
    with open(path) as handle:
        return json.load(handle)

In [15]:
# Credentials are read from local files rather than hard-coded. strip() removes
# surrounding whitespace such as a trailing newline, which would otherwise end up
# inside the request URL. Never print these values in a shared notebook.
with open("./Credentials/client_ID.txt") as file:
    CLIENT_ID = file.read().strip() # your Foursquare ID
with open("./Credentials/client_secret.txt") as file:
    CLIENT_SECRET = file.read().strip() # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [16]:
def scrape_and_store_data(categories, names, latitudes, longitudes, radius=500):
    """Query Foursquare for every (neighborhood, category) pair and store the raw result.

    The ``response`` part of each successful reply is written as JSON to
    ``./data/<name>/<category>.txt``; a failed request stores an empty list
    instead. Because the required requests exceed the allowed number per day,
    progress is persisted in ``./data/scrape_index.txt`` and pairs handled in
    earlier runs are skipped.

    Parameters
    ----------
    categories : sequence of str
        Category names; each must be a key of the module-level ``categoriesDict``.
    names, latitudes, longitudes : iterables
        Parallel iterables giving the folder name and the coordinates of each location.
    radius : int, default 500
        Value of the ``radius`` query parameter of the search request.

    Returns
    -------
    None
    """
    # resume marker written at the end of the previous run
    with open("./data/scrape_index.txt") as file:
        lastSavePoint = int(file.read())

    counter = 0
    scrape_index = 0 #the additional scrapes that are made in this session.
    location_count = 0
    for name, lat, lng in zip(names, latitudes, longitudes):
        location_count += 1

        # exist_ok: names are not guaranteed to be unique and the folder may be
        # left over from an earlier run; the former check against a directory
        # listing taken once before the loop made os.mkdir fail on a repeated name
        os.makedirs(os.path.join("./data", f"{name}"), exist_ok=True)

        for category in categories:
            #skip data that has already been scraped
            # NOTE(review): '<=' skips lastSavePoint + 1 pairs, which is only right
            # if the stored value is the index of the last handled pair (initial
            # value -1) rather than a count - confirm against how the file is seeded.
            if counter <= lastSavePoint:
                counter += 1
                continue

            # create the API request URL
            url = 'https://api.foursquare.com/v2/venues/search?categoryId={}&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
                categoriesDict[category],
                CLIENT_ID,
                CLIENT_SECRET,
                VERSION,
                lat,
                lng,
                radius,
                LIMIT)

            # Issue ONE request and reuse the parsed reply; fetching it a second
            # time for the payload doubled the consumption of the request quota.
            reply = requests.get(url).json()
            status_code = reply["meta"]["code"]

            #check if scrape was successfull
            if status_code == 200:
                #raise scrape index to avoid scraping data twice
                scrape_index += 1
                scraped_data = reply["response"]
            else:
                print("Unseccessful request. Code: ", status_code)
                scraped_data = []

            # "w" creates the file or overwrites an existing one
            with open(f"./data/{name}/{category}.txt", "w") as tmp_file:
                json.dump(scraped_data, tmp_file)

    # derive the total from the arguments instead of the notebook-level `data` frame
    required_scrapes = location_count * len(categories)
    scrapes_done = scrape_index
    print(scrapes_done, "/", required_scrapes, " performed.")

    #save updated counter
    with open("./data/scrape_index.txt", "w") as file:
        file.write(str(lastSavePoint + scrape_index))

In [17]:
# Run the scrape for every neighborhood. Requests answered with a non-200
# status code are reported below and stored as empty placeholders.
scrape_and_store_data(categories,
                      names=data['Neighborhood'],
                      latitudes=data['Latitude'],
                      longitudes=data['Longitude'])

Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessful request. Code:  429
Unseccessf

In [18]:
def load_scraped_data(data, categories):
    """Count the stored venues per category for every row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``Neighborhood``, ``Latitude`` and ``Longitude``.
    categories : sequence of str
        Category names; one stored file per (neighborhood, category) is read via ``load_file``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude`` followed by one
        venue-count column per category.
    """
    id_columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude']

    # Collect plain records and build the frame once: appending row by row is
    # quadratic and uses DataFrame.append, which pandas 2.0 removed.
    records = []
    for _, rowdata in data.iterrows():
        record = {
            "Neighborhood": rowdata["Neighborhood"],
            "Neighborhood Latitude": rowdata["Latitude"],
            "Neighborhood Longitude": rowdata["Longitude"],
        }

        for category in categories:
            #load scraped data
            scraped_data = load_file(rowdata["Neighborhood"], category)

            # an empty container marks a failed request; otherwise the venues are listed under "venues"
            results = scraped_data["venues"] if len(scraped_data) > 0 else []

            #count quantity of items of each category
            record[category] = len(results)

        records.append(record)

    return pd.DataFrame(records, columns=id_columns + list(categories))

In [19]:
# Build the per-neighborhood venue-count table from the stored scrape results.
venue_data = load_scraped_data(data, categories)
venue_data

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Arts & Entertainment,College & University,Event,Food,Nightlife Spot,Outdoors & Recreation,Professional & Other Places,Residence,Shop & Service,Travel & Transport
0,Parkwoods,43.753259,-79.329656,0,1,0,1,0,4,9,4,4,4
1,Victoria Village,43.725882,-79.315572,2,1,0,5,0,4,19,7,18,4
2,"Regent Park, Harbourfront",43.654260,-79.360636,49,8,3,49,29,44,46,38,41,33
3,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,4,1,0,13,2,6,27,0,34,1
4,Ontario Provincial Government,43.662301,-79.389494,45,48,0,50,42,43,49,49,46,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,0,0,0,0,0,0,0,0,0,0
99,Church and Wellesley,43.665860,-79.383160,0,0,0,0,0,0,0,0,0,0
100,Enclave of M4L,43.662744,-79.321558,0,0,0,0,0,0,0,0,0,0
101,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,0,0,0,0,0,0,0,0,0,0


# Cluster neighborhoods

In [20]:
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import pickle

## Cluster data without respect to categories

In [21]:
def perform_clustering_all_categories(venue_data, n_clusters=None):
    """Cluster the rows of ``venue_data`` with k-means on all category columns.

    Parameters
    ----------
    venue_data : pandas.DataFrame
        Must contain ``Neighborhood``, ``Neighborhood Latitude`` and
        ``Neighborhood Longitude``; every other column is used as a feature.
    n_clusters : int, optional
        Number of clusters. When omitted, the notebook-level ``kclusters``
        variable is used, which keeps existing one-argument calls working.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three identifying columns first, then
        ``Cluster Labels``, then the feature columns. The input is not modified.
    """
    id_columns = ["Neighborhood", "Neighborhood Latitude", "Neighborhood Longitude"]

    if n_clusters is None:
        n_clusters = kclusters

    # keyword form: the positional `axis` argument of drop() was removed in pandas 2.0
    clustered = venue_data.drop(columns=id_columns)

    # run k-means clustering on the feature columns only
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(clustered)

    #insert labels and separately stored datas
    for position, column in enumerate(id_columns):
        clustered.insert(position, column, venue_data[column])
    clustered.insert(len(id_columns), 'Cluster Labels', kmeans.labels_)

    return clustered

In [22]:
# perform_clustering_all_categories reads the notebook-level `kclusters` value,
# so it has to be set before the call.
kclusters = 5 #kcluster = number of clusters
clustered_data = perform_clustering_all_categories(venue_data)
clustered_data.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Cluster Labels,Arts & Entertainment,College & University,Event,Food,Nightlife Spot,Outdoors & Recreation,Professional & Other Places,Residence,Shop & Service,Travel & Transport
0,Parkwoods,43.753259,-79.329656,0,0,1,0,1,0,4,9,4,4,4
1,Victoria Village,43.725882,-79.315572,4,2,1,0,5,0,4,19,7,18,4
2,"Regent Park, Harbourfront",43.65426,-79.360636,3,49,8,3,49,29,44,46,38,41,33
3,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,4,4,1,0,13,2,6,27,0,34,1
4,Ontario Provincial Government,43.662301,-79.389494,1,45,48,0,50,42,43,49,49,46,33


### Visualizing different clusters

In [23]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced colour of the rainbow
# colormap per cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(clustered_data["Neighborhood Latitude"], clustered_data["Neighborhood Longitude"], clustered_data["Neighborhood"], clustered_data['Cluster Labels']):
    # Cluster labels start at 0, so they index the colour list directly. The
    # former `cluster-1` sent label 0 to the last colour via negative indexing.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=f"<i>{str(poi)}, <b> Cluster {str(cluster)}</b></i>",
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Gaining insights on clusters

In [24]:
def get_top_n_categories(df, n):
    """Keep the identifying columns plus the ``n`` category columns with the largest value in row ``-1``.

    ``df`` must contain a row labelled ``-1`` (the caller stores the column
    means there) and the columns ``Neighborhood``, ``Neighborhood Latitude``,
    ``Neighborhood Longitude`` and ``Cluster Labels``. Categories are added to
    the result in descending order of their value in that row.
    """
    id_columns = ["Neighborhood", "Neighborhood Latitude", "Neighborhood Longitude"]
    output = df[id_columns].copy()
    remaining = df.drop(id_columns + ["Cluster Labels"], axis=1)

    for _ in range(n):
        # row -1 of the not-yet-selected categories
        means = remaining.loc[-1]
        top_value = max(means)
        top_category = means.index[means == top_value][0]

        # move the winning column from the candidates to the output
        output[top_category] = remaining[top_category]
        remaining = remaining.drop(top_category, axis=1)

    return output

In [25]:
clusters_top_categories = my_dictionary()

for clusterlabel in range(kclusters):
    cluster_data = clustered_data[clustered_data["Cluster Labels"] == clusterlabel].copy()

    # Average only the numeric columns: DataFrame.mean() raises a TypeError on
    # the string column "Neighborhood" in pandas >= 2.0. to_numeric also covers
    # count columns that happen to be stored with object dtype.
    numeric_columns = cluster_data.drop(columns=["Neighborhood"]).apply(pd.to_numeric)

    # get_top_n_categories looks the means up under the index label -1;
    # "Neighborhood" has no mean and is left as NaN in that row
    cluster_data.loc[-1] = numeric_columns.mean(axis=0)

    clusters_top_categories.add(f"Cluster {clusterlabel}", get_top_n_categories(cluster_data, 5))

# Store data

In [67]:
# Snapshot of the names currently present in ./data.
files = os.listdir("./data")

In [27]:
#store complete data
# to_pickle() overwrites an existing file, so the file does not have to be
# removed first; this also avoids relying on a directory listing taken in
# another cell, which may be stale or missing on a fresh kernel.
clustered_data.to_pickle("./data/clustered_data_df.txt")

In [28]:
#store top category data
# Mode "wb" truncates an existing file, so no prior removal is needed. The
# with-block closes the handle, which the former inline open() call never did.
with open("./data/clustered_data_dict.p", "wb") as pickle_file:
    pickle.dump(clusters_top_categories, pickle_file)

## Cluster data with respect to categories

In [29]:
def perform_category_specific_clustering(venue_data, category, kclusters=5):
    """Cluster the rows of ``venue_data`` by the values of a single category column.

    K-means with ``kclusters`` clusters (``random_state=0``) is fitted on
    ``venue_data[category]`` alone. The input frame is left untouched; a copy
    is returned with the labels inserted at column position 3 under the name
    ``'Cluster Labels <category>'``.
    """
    result = venue_data.copy()

    # the single feature column is reshaped into a 2-D array with one column
    single_feature = np.array(result[category]).reshape(-1, 1)

    model = KMeans(n_clusters=kclusters, random_state=0)
    model.fit(single_feature)

    result.insert(3, f'Cluster Labels {category}', model.labels_)
    return result

In [30]:
def get_span(a):
    """Return the spread of ``a``: its maximum minus its minimum."""
    highest = a.max()
    lowest = a.min()
    return highest - lowest

In [31]:
#Due to the fact that it is hard to choose a decent value for kcluster,
# i want to plot the number of clusters over the span in each cluster to find out the best k.

def get_optimal_kcluster(tmp, category, minCluster, maxCluster):
    """Compute the value span inside every cluster for a range of cluster counts.

    Parameters
    ----------
    tmp : pandas.DataFrame
        Venue data passed on to ``perform_category_specific_clustering``.
    category : str
        Name of the category column to cluster on.
    minCluster, maxCluster : int
        Cluster counts to try, as in ``range(minCluster, maxCluster)``; the
        upper bound itself is not tried.

    Returns
    -------
    my_dictionary
        Maps each tried cluster count to a ``my_dictionary`` that maps the
        cluster label to ``max - min`` of the category values in that cluster.
    """
    spansDict = my_dictionary()
    label_column = f"Cluster Labels {category}"

    for i in range(minCluster, maxCluster):
        # the clustering helper of this notebook is
        # perform_category_specific_clustering; the former call to
        # `perform_clustering` referenced a name that is not defined
        result = perform_category_specific_clustering(tmp, category, i)

        tmpDict = my_dictionary()
        for cluster_Label in result[label_column].unique():
            category_data = result[result[label_column] == cluster_Label][category]
            tmpDict.add(cluster_Label, get_span(category_data))

        spansDict.add(i, tmpDict)

    return spansDict