# Segmenting and Clustering Neighborhoods in Toronto Part 3

##                   PART 1

#### Installing important package

In [1]:
# install beautifulsoup4
!conda install -c conda-forge beautifulsoup4 --yes

# Install lxml parser
!conda install -c conda-forge lxml --yes

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



#### Importing Liberary

In [2]:
import warnings
warnings.filterwarnings('ignore')
from bs4 import BeautifulSoup
import requests
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

##### 2. Scrape the Toronto post codes Wikipedia page with Beautiful Soup
 * Read the Toronto post codes wikipedia page with Beautiful Soup

In [3]:

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')

# Get the table with the postal codes [class = 'wikitable sortable']
table = soup.find('table', class_='wikitable sortable')

#### converting into Dataframe

In [4]:
postal_codes_initial_df = pd.read_html(str(table))[0]


In [5]:
#3 replacing name
postal_codes_initial_df.rename(columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'}, inplace=True)
print(postal_codes_initial_df.head(10), '\n')
print('Initial Toronto postal codes dataframe shape: ', postal_codes_initial_df.shape, '\n')

  Postal code           Borough                                  Neighborhood
0         M1A      Not assigned                                           NaN
1         M2A      Not assigned                                           NaN
2         M3A        North York                                     Parkwoods
3         M4A        North York                              Victoria Village
4         M5A  Downtown Toronto                    Regent Park / Harbourfront
5         M6A        North York             Lawrence Manor / Lawrence Heights
6         M7A  Downtown Toronto  Queen's Park / Ontario Provincial Government
7         M8A      Not assigned                                           NaN
8         M9A         Etobicoke                              Islington Avenue
9         M1B       Scarborough                               Malvern / Rouge 

Initial Toronto postal codes dataframe shape:  (180, 3) 



#### 3. Process the initial Toronto post codes Data Frame
* Drop the rows with the 'Borough' value 'Not Assigned'

In [6]:

postal_codes_initial_df.drop(postal_codes_initial_df[postal_codes_initial_df['Borough'] == 'Not assigned'].index,
                             axis=0, inplace=True)

* Replace the 'Not assigned' value in 'Neighborhood' with the corresponding value of 'Borough'

In [7]:

postal_codes_initial_df.loc[postal_codes_initial_df['Neighborhood'] == 'Not assigned', 'Neighborhood'] = postal_codes_initial_df.loc[postal_codes_initial_df['Neighborhood'] == 'Not assigned']['Borough']
postal_codes_initial_df.head(10) # Notice the Queen's Park Neighborhood

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge
11,M3B,North York,Don Mills
12,M4B,East York,Parkview Hill / Woodbine Gardens
13,M5B,Downtown Toronto,"Garden District, Ryerson"


* Create a new Data Frame to combine the neighborhoods of the same postal code

In [8]:

postal_codes_grouped_df = pd.DataFrame(postal_codes_initial_df.groupby('Postal code')['Neighborhood'].apply(lambda tags: ', '.join(tags))).reset_index()

In [9]:
postal_codes_grouped_df.head(15)

Unnamed: 0,Postal code,Neighborhood
0,M1B,Malvern / Rouge
1,M1C,Rouge Hill / Port Union / Highland Creek
2,M1E,Guildwood / Morningside / West Hill
3,M1G,Woburn
4,M1H,Cedarbrae
5,M1J,Scarborough Village
6,M1K,Kennedy Park / Ionview / East Birchmount Park
7,M1L,Golden Mile / Clairlea / Oakridge
8,M1M,Cliffside / Cliffcrest / Scarborough Village West
9,M1N,Birch Cliff / Cliffside West


* Now merge the initial and combined neighborhoods data frames to the final data frame

In [10]:

toronto_postal_codes_df = pd.merge(postal_codes_initial_df, postal_codes_grouped_df, how='inner', 
                                   on=['Postal code', 'Postal code'])

# Drop the single neighborhood column
toronto_postal_codes_df.drop('Neighborhood_x', axis=1, inplace=True)

# Now drop the created duplicate rows
toronto_postal_codes_df.drop_duplicates(inplace=True)

# Rename the created Neighborhood_y to Neighborhood
toronto_postal_codes_df.rename(columns={'Neighborhood_y': 'Neighborhood'}, inplace=True)

# Reset the data frame index
toronto_postal_codes_df.reset_index(drop=True, inplace=True)

* Check an example from the instruction where one postal code has many neighborhoods

In [11]:

# Set the pandas dataframe display.max_colwidth to -1 so that the full data frame columns are shown
pd.set_option('display.max_colwidth', -1)
toronto_postal_codes_df[toronto_postal_codes_df['Postal code'] == 'M5V']

Unnamed: 0,Postal code,Borough,Neighborhood
87,M5V,Downtown Toronto,CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport


In [12]:
toronto_postal_codes_df.head(20)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern / Rouge
7,M3B,North York,Don Mills
8,M4B,East York,Parkview Hill / Woodbine Gardens
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Finally print the shape of the final data frame

In [13]:
print('The final dataframe has {} rows.'.format(toronto_postal_codes_df.shape[0]))

The final dataframe has 103 rows.


## PART 2

##### 1. Read the geospatial coordinates file
* I tried to use the geocoder package but it was not working properly. So I downloaded 'Geospatial_Coordinates.csv' (manually on my PC - I don't have wget installed)

* Read the geospatial coordinates file into a data frame

In [14]:
toronto_latlng_df = pd.read_csv('Geospatial_Coordinates.csv')

# Check the created dataframe
toronto_latlng_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


* Merge the two data neighborhood and geospatial coordinates data frame

In [15]:

# Merge data frames
toronto_postal_latlng_df = pd.merge(toronto_postal_codes_df, toronto_latlng_df, how='inner', left_on='Postal code', right_on='Postal Code')
toronto_postal_latlng_df.drop('Postal code', axis=1, inplace=True)
toronto_postal_latlng_df.head(20)

Unnamed: 0,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,North York,Parkwoods,M3A,43.753259,-79.329656
1,North York,Victoria Village,M4A,43.725882,-79.315572
2,Downtown Toronto,Regent Park / Harbourfront,M5A,43.65426,-79.360636
3,North York,Lawrence Manor / Lawrence Heights,M6A,43.718518,-79.464763
4,Downtown Toronto,Queen's Park / Ontario Provincial Government,M7A,43.662301,-79.389494
5,Etobicoke,Islington Avenue,M9A,43.667856,-79.532242
6,Scarborough,Malvern / Rouge,M1B,43.806686,-79.194353
7,North York,Don Mills,M3B,43.745906,-79.352188
8,East York,Parkview Hill / Woodbine Gardens,M4B,43.706397,-79.309937
9,Downtown Toronto,"Garden District, Ryerson",M5B,43.657162,-79.378937


## PART 3

* In this part of the assignment we will segment the boroughs of Toronto and explore them based ON POSTAL CODES INSTEAD OF NEIGHBORHOODS, since we only have geospatial data corresponding to postal codes. As we saw in the previous parts of the assignment, one postal code may correspond to many neighborhoods. The data is contained in the data frame that was previously created: toronto_postal_latlng_df

### 1. Install and import the necessary libraries
* Install necessary libraries [geopy, folium]

In [16]:

!conda install -c conda-forge geopy --yes

#!conda install -c conda-forge folium=0.5.0
!conda install -c conda-forge folium --yes

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [17]:
import numpy as np # library to handle data in a vectorized manner

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

import folium # map rendering library

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Set pandas options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


### 2. Create a map of Toronto with boroughs superimposed on top.
* Use geopy library to get the latitude and longitude values of Toronto.
* In order to define an instance of the geocoder, we need to define a user_agent. We will name our agent toronto_explorer

In [18]:

address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


* Now, let's create the map of Toronto and its boroughs

In [48]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, postal_code, neighborhoods in zip(toronto_postal_latlng_df['Latitude'],
                                                         toronto_postal_latlng_df['Longitude'],
                                                         toronto_postal_latlng_df['Borough'],
                                                         toronto_postal_latlng_df['Postal Code'],
                                                         toronto_postal_latlng_df['Neighborhood']):
    label = '{}, {}, {}'.format(borough, postal_code, neighborhoods)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False
    ).add_to(map_toronto)
    
map_toronto

### 3. Explore Neighborhoods (Postal Codes) in Toronto

In [37]:
# Foursquare credentials and version
CLIENT_ID = 'DHBE22VPGL02ZITFEVU31JVYCV0SZJ3EGFHIUMRTKRTE4VIJ' # your Foursquare ID
CLIENT_SECRET = 'MCUC5VDGQMKFR40MMXYOWRDRGYUCPCA4MHIKRMGYW2DX0VGZ' # your Foursquare Secret
VERSION = '20190425' # Foursquare API version

radius = 500
LIMIT = 100

In [46]:
def getNearbyVenues(postal_codes, neighborhoods, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for postal_code, neighborhood, lat, lng in zip(postal_codes, neighborhoods, latitudes, longitudes):
#         print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            postal_code,
            neighborhood,
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode',
                  'Neighborhood',
                  'Postal Code Latitude', 
                  'Postal Code Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

* Run the above function on each postal code and create a new dataframe called toronto_venues

In [47]:
toronto_venues = getNearbyVenues(postal_codes=toronto_postal_latlng_df['Postal Code'],
                                 neighborhoods=toronto_postal_latlng_df['Neighborhood'],
                                 latitudes=toronto_postal_latlng_df['Latitude'],
                                 longitudes=toronto_postal_latlng_df['Longitude'])

KeyError: 'groups'

In [None]:
print(toronto_venues.shape)
toronto_venues.head(10)