# Applied Data Science Capstone Project

This notebook is used for the capstone project of the Applied Data Science course on Coursera.

## Segmenting and Clustering Neighborhoods in Toronto

### Install and import libraries for this project

In [8]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# None removes the display limit, so every column / row of a displayed frame is shown
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

import requests # library to handle HTTP requests

# NOTE(review): newer pandas releases expose this as pd.json_normalize - confirm the installed version
from pandas.io.json import json_normalize # transform a JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means for the clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!pip install geocoder
import geocoder # use geocoder.google

#!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

import statistics as stat # used for the median of neighborhood coordinates


print('Libraries imported.')

Libraries imported.


### Load table with postal codes from the Wikipedia page

In [84]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to the HTML table parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

# read_html returns one DataFrame per <table> element found in the page.
df_list = pd.read_html(html)
pc_toronto = df_list[0]    # postal codes are in the first table of the page
pc_toronto.columns = ['PostalCode','Borough','Neighborhood']
pc_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [85]:
# (rows, columns) of the table as loaded, before any cleaning
pc_toronto.shape

(180, 3)

### Clean dataframe
Loop through the dataframe and clean it according to the task:
- Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
- More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated by a comma, as shown in row 11 in the above table.
- If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.

In [86]:
# Clean the raw postal-code table:
#   * rows whose borough is 'Not assigned' are dropped,
#   * a 'Not assigned' neighborhood takes the name of its borough,
#   * rows sharing a postal code are merged into one row whose
#     neighborhoods are joined with ', '.

# A stable sort keeps rows that share a postal code in their original page
# order, so the merged neighborhood string is deterministic.
pc_sorted = pc_toronto.sort_values(by='PostalCode', ascending=True, kind='mergesort')

# Collect the cleaned rows as plain dicts and build the frame once at the end;
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas 2.0.
records = []
prev_pc = None
for pc, br, nb in zip(pc_sorted['PostalCode'], pc_sorted['Borough'], pc_sorted['Neighborhood']):
    if br == 'Not assigned':
        # skip this item (prev_pc is deliberately left untouched)
        continue
    if nb == 'Not assigned':
        nb = br  # borough name as neighborhood name for unassigned neighborhoods
    if records and pc == prev_pc:
        # add neighborhood to the postal code that was added last
        records[-1]['Neighborhood'] += ', ' + nb
    else:
        # add new postal code
        records.append({'PostalCode': pc, 'Borough': br, 'Neighborhood': nb})
    prev_pc = pc

# Same column order as the input table: PostalCode, Borough, Neighborhood.
pc_cleaned = pd.DataFrame(records, columns=pc_toronto.columns)
pc_toronto = pc_cleaned
pc_toronto.shape

(103, 3)

### Get the coordinates of the neighborhoods

- Clean neighborhood names with special characters
- Iterate through each postal code
    - (Searching directly by postal code was too unreliable)
    - Iterate through each neighborhood of the postal code
    - Take the median of the neighborhood locations
    - If no location was found, search again for the location of the borough


In [87]:
# Rewrite neighborhood names that contain special characters.
# replace() only swaps cells whose entire value equals the search string.
pc_toronto = pc_toronto.replace(to_replace='Caledonia-Fairbanks', value='Caledonia, Fairbanks')
# Positional write: row 85, column 2 (the 'Neighborhood' column).
# NOTE(review): the row is addressed by position only - confirm it is still the
# intended entry whenever the source table changes.
pc_toronto.iat[85, 2] = 'Queens Park, Ontario Provincial Government'

In [91]:
# Look up one (latitude, longitude) pair per postal code with Nominatim.
# Results are collected in lists and written to the frame in one step after
# the loop: assigning to the `row` yielded by iterrows() is not guaranteed to
# reach the DataFrame, because the row can be a copy.
geolocator = Nominatim(user_agent='toronto')
print('Search locations running')

latitudes = []
longitudes = []

# Iterate through each postal code
for index, row in pc_toronto.iterrows():
    lt_ls = []
    lo_ls = []

    # Iterate through each neighborhood of the postal code
    for nb in row['Neighborhood'].split(','):
        # strip() removes the blank left behind by splitting on ','
        address = nb.strip() + ', Toronto'
        location = geolocator.geocode(address)
        # geocode() returns None when nothing matches the query
        if location is not None:
            lt_ls.append(location.latitude)
            lo_ls.append(location.longitude)

    if lt_ls:
        # Take median value of the neighborhood locations
        lat = stat.median(lt_ls)
        lng = stat.median(lo_ls)
    else:
        # If no location was found, search again for the location of the borough
        address = row['Borough'] + ', Toronto'
        location = geolocator.geocode(address)
        if location is not None:
            lat = location.latitude
            lng = location.longitude
        else:
            print('No location found for index {}: {}'.format(index, row['PostalCode']))
            # (0, 0) marks "not found"; such rows show up far away from Toronto on the map
            lat = 0
            lng = 0

    latitudes.append(lat)
    longitudes.append(lng)

# One value per row, in row order, so the lists line up with the frame.
pc_toronto['Latitude'] = latitudes
pc_toronto['Longitude'] = longitudes

print('Search locations done')

Search locations running
Search locations done


In [134]:
# Keep an independent snapshot of the geocoded table. A plain assignment would
# only create a second name for the same DataFrame, so later changes to
# pc_toronto would also show up in the "backup".
backupquery = pc_toronto.copy()

### Plot map to verify the locations of the neighborhoods

In [135]:
# set color scheme for the boroughs: one evenly spaced rainbow color per borough
boroughs = pc_toronto['Borough'].unique()
colors_array = cm.rainbow(np.linspace(0, 1, len(boroughs)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# Direct borough -> hex color lookup; avoids converting a one-element index
# array to int, which NumPy has deprecated.
borough_colors = dict(zip(boroughs, rainbow))

# create map of Toronto using latitude and longitude values
location = geolocator.geocode('Toronto')
map_toronto = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)

# add one circle marker per postal code, colored by its borough
for lat, lng, label, br in zip(pc_toronto['Latitude'], pc_toronto['Longitude'], pc_toronto['Neighborhood'], pc_toronto['Borough']):
    label = label + '('+ br +')'
    label = folium.Popup(label, parse_html=True)
    cl = borough_colors[br]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=cl,
        fill=True,
        fill_color=cl,
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

# last expression of the cell: renders the map
map_toronto