# Applied Data Science Capstone

## Week 3 Peer-Graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

### Import Libraries

In [1]:
import numpy as np # import numpy to handle data in a vectorized manner

import pandas as pd # import pandas for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# import plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means
from sklearn.cluster import KMeans

# import Beautiful Soup 
from bs4 import BeautifulSoup

import json # import json to handle JSON files

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # from an address into latitude and longitude values

import requests # import library to handle requests
from pandas.io.json import json_normalize # from JSON file into a pandas dataframe

import xml

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # import folium for rendering

print('The libraries loaded successfully.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1

### Scrapping Wikipedia page

In [2]:
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
bs = BeautifulSoup(url,'lxml')

### Creating the dataframe with three columns: Postalcode, Borough, and Neighbourhood

In [3]:
tbl_post = bs.find('table')
cols= tbl_post.find_all('td')

postcode = []
borough = []
neighbourhood = []

for i in range(0, len(cols), 3):
    postcode.append(cols[i].text.strip())
    borough.append(cols[i+1].text.strip())
    neighbourhood.append(cols[i+2].text.strip())
        
df_post = pd.DataFrame(data=[postcode, borough, neighbourhood]).transpose()
df_post.columns = ['Postcode', 'Borough', 'Neighbourhood']
df_post.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [4]:
### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [5]:
df_post['Borough'].replace('Not assigned', np.nan, inplace=True)
df_post.dropna(subset=['Borough'], inplace=True)

df_post.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [7]:
df_postna = df_post.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df_postna.columns = ['Postcode', 'Borough', 'Neighbourhood']
df_postna

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West
9,M1N,Scarborough,Birch Cliff / Cliffside West


In [8]:
df_postna['Neighbourhood'].replace('Not assigned', "Queen's Park", inplace=True)

df_postna

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West
9,M1N,Scarborough,Birch Cliff / Cliffside West


In [9]:
df_postna.shape

(103, 3)

In [10]:
df_gp = pd.read_csv('http://cocl.us/Geospatial_data')
df_gp.columns = ['Postcode', 'Latitude', 'Longitude']

In [11]:
df_merge = pd.merge(df_postna, df_gp, on=['Postcode'], how='inner')

df_result = df_merge[['Borough', 'Neighbourhood', 'Postcode', 'Latitude', 'Longitude']].copy()

df_result.head()

Unnamed: 0,Borough,Neighbourhood,Postcode,Latitude,Longitude
0,Scarborough,Malvern / Rouge,M1B,43.806686,-79.194353
1,Scarborough,Rouge Hill / Port Union / Highland Creek,M1C,43.784535,-79.160497
2,Scarborough,Guildwood / Morningside / West Hill,M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [12]:
address = 'Toronto, Canada'

geo_locator = Nominatim()
location = geo_locator.geocode(address)
latitude_loc = location.latitude
longitude_loc = location.longitude
print('The geograpical coordinate of Toronto City: {}, {}.'.format(latitude_loc, longitude_loc))

  app.launch_new_instance()


The geograpical coordinate of Toronto City: 43.6534817, -79.3839347.


In [14]:
# Create the map of Toronto City by using latitude and longitude values
toronto_map = folium.Map(location=[latitude_loc, longitude_loc], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_result['Latitude'], df_result['Longitude'], df_result['Borough'], df_result['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

## Exploring the Neighbourhoods of Toronto 
#### Defineing Foursquare Credentials and Version

In [15]:
CLIENT_ID = 'not going to share that' # Foursquare ID
CLIENT_SECRET = 'or this' # Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('The credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

The credentails:
CLIENT_ID: not going to share that
CLIENT_SECRET:or this


### Just Selecting the Neighbourhoods of Toronto 

In [16]:
df_t4 = df_result[df_result['Borough'].str.contains('Toronto')]

to_data = df_t4.reset_index(drop=True)
to_data            

Unnamed: 0,Borough,Neighbourhood,Postcode,Latitude,Longitude
0,East Toronto,The Beaches,M4E,43.676357,-79.293031
1,East Toronto,The Danforth West / Riverdale,M4K,43.679557,-79.352188
2,East Toronto,India Bazaar / The Beaches West,M4L,43.668999,-79.315572
3,East Toronto,Studio District,M4M,43.659526,-79.340923
4,Central Toronto,Lawrence Park,M4N,43.72802,-79.38879
5,Central Toronto,Davisville North,M4P,43.712751,-79.390197
6,Central Toronto,North Toronto West,M4R,43.715383,-79.405678
7,Central Toronto,Davisville,M4S,43.704324,-79.38879
8,Central Toronto,Moore Park / Summerhill East,M4T,43.689574,-79.38316
9,Central Toronto,Summerhill West / Rathnelly / South Hill / For...,M4V,43.686412,-79.400049


### Just for the map of Toronto's Neighbourhoods

In [19]:
# Creating the map of Toronto by using latitude and longitude values
nhood_map = folium.Map(location=[latitude_loc, longitude_loc], zoom_start=10)

# adding markers to map
for lat, lng, borough, neighborhood in zip(to_data['Latitude'], to_data['Longitude'], to_data['Borough'], to_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(nhood_map)  
    
nhood_map

### For the first neighbourhood

In [20]:
to_data.loc[0, 'Neighbourhood']

'The Beaches'

### For the neighbourhood long and lat values

In [21]:
neighbourhood_latitude = to_data.loc[0, 'Latitude'] # neighbourhood latitude value
neighbourhood_longitude = to_data.loc[0, 'Longitude'] # neighbourhood longitude value

neighbourhood_name = to_data.loc[0, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


### For the top 100 venus within a radius from the centroid of 500 meters

In [32]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=not going to share that&client_secret=or this&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

In [28]:
results = requests.get(url).json()
results

{'meta': {'code': 400,
  'errorType': 'invalid_auth',
  'errorDetail': 'Missing access credentials. See https://developer.foursquare.com/docs/api/configuration/authentication for details.',
  'requestId': '5e9f611802a1720020126ee9'},
 'response': {}}

In [29]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [25]:
### Cleaning json and structuring into a pandas dataframe

In [30]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filtering columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filtering the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# cleaning columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

KeyError: 'groups'

### How many venus were returned from Foursquare?

In [27]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

NameError: name 'nearby_venues' is not defined