# Wk3 : Neighborhood segmentation and clustering

#### Importing libraries

In [1]:
# Numerical / tabular stack
import numpy as np 
import pandas as pd

import json

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# json_normalize lives in the top-level pandas namespace since pandas 1.0, and the
# old pandas.io.json location no longer exports it in pandas 2.x. Try the current
# location first and fall back to the legacy one for older installs.
try:
    from pandas import json_normalize # transform JSON file into a pandas dataframe
except ImportError:
    from pandas.io.json import json_normalize # legacy location (pandas < 1.0)

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium # map rendering library

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab

print('Libraries imported.')

Libraries imported.


## 1. Clean up of the data

#### Building a dataframe from the Toronto postal-code table on Wikipedia

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M"
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns every HTML table found on the page as a list of dataframes;
# only the first one is kept
wiki_tables = pd.read_html(wiki_url)
df_wiki = wiki_tables[0]

#### Dropping the rows with "Not assigned" in Borough

In [72]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0
has_borough = df_wiki['Borough'] != 'Not assigned'
df_wiki = df_wiki[has_borough].reset_index(drop=True)
df_wiki

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


#### Number of rows in the dataframe after the clean-up

In [69]:
# (rows, columns) of df_wiki at this point in the notebook
df_wiki.shape

(103, 3)

## 2. Acquisition of geographical coordinates for each Postal Code

#### 1. Read the csv file of geographical coordinates of each postal code

In [74]:
# Load the per-postal-code coordinates from the local CSV and renumber the
# index from 0 in the same step
df_Geocsv = pd.read_csv('Geospatial_Coordinates.csv').reset_index(drop=True)
df_Geocsv

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


#### 2. Merging wiki data and geographical coordinates data

#### Preparation of dataframe template

In [119]:
# Column layout of the dataframe assembled in the following cells
col = [
    'Postal Code',
    'Borough',
    'Neighborhood',
    'Latitude',
    'Longitude',
]

# Postal codes to look up, in the order the rows should appear
d_pc = [
    'M5G', 'M2H', 'M4B', 'M1J',
    'M4G', 'M4M', 'M1R', 'M9V',
    'M9L', 'M5V', 'M1B', 'M5A',
]

In [120]:
# Empty template with one row per requested postal code. The row count is taken
# from d_pc so the template cannot drift out of sync when the code list is edited.
df = pd.DataFrame(index=range(len(d_pc)), columns=col)

In [121]:
# Write each requested postal code into the row with the same position
for row_label, postal_code in enumerate(d_pc):
    df.at[row_label, 'Postal Code'] = postal_code

#### Merging correct cells from wiki dataframe

In [122]:
# Fill Borough / Neighborhood for every requested postal code from the Wikipedia table.
# A keyed lookup replaces the nested row-by-row scan, and whole-column assignment
# replaces the chained `df[col][i] = ...` writes, which pandas does not guarantee
# will reach `df` (and which are a no-op under Copy-on-Write).
# keep='last' mirrors the old loop, where a later duplicate row overwrote an earlier one.
wiki_by_code = df_wiki.drop_duplicates(subset='Postal Code', keep='last').set_index('Postal Code')
requested_codes = pd.Series(d_pc, index=df.index)
for wiki_col in ['Borough', 'Neighborhood']:
    # Postal codes that are missing from the Wikipedia table come out as NaN
    df[wiki_col] = requested_codes.map(wiki_by_code[wiki_col])

#### Merging correct cells from geographical dataframe

In [112]:
# Fill Latitude / Longitude for every requested postal code from the coordinates CSV.
# This must run after the template dataframe has been created: re-creating the
# template afterwards resets both columns to NaN.
# A keyed lookup replaces the nested row-by-row scan, and whole-column assignment
# replaces the chained `df[col][i] = ...` writes, which pandas does not guarantee
# will reach `df` (and which are a no-op under Copy-on-Write).
# keep='last' mirrors the old loop, where a later duplicate row overwrote an earlier one.
geo_by_code = df_Geocsv.drop_duplicates(subset='Postal Code', keep='last').set_index('Postal Code')
requested_codes = pd.Series(d_pc, index=df.index)
for geo_col in ['Latitude', 'Longitude']:
    # Postal codes that are missing from the CSV come out as NaN
    df[geo_col] = requested_codes.map(geo_by_code[geo_col])

#### Prepared dataframe including Postal Code, Borough, Neighborhood and geographical coordinates

In [132]:
# Show the assembled table. Latitude/Longitude stay empty (NaN) unless the
# coordinate-merge cell is run after the template dataframe has been created.
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,,
1,M2H,North York,Hillcrest Village,,
2,M4B,East York,"Parkview Hill, Woodbine Gardens",,
3,M1J,Scarborough,Scarborough Village,,
4,M4G,East York,Leaside,,
5,M4M,East Toronto,Studio District,,
6,M1R,Scarborough,"Wexford, Maryvale",,
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",,
8,M9L,North York,Humber Summit,,
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",,


## 3. Exploring around Toronto

#### 1. Building a dataframe of the rows whose "Borough" contains "Toronto"

In [134]:
# List the distinct borough names present in df_wiki
df_wiki['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [138]:
# Select the rows whose Borough mentions "Toronto" and add empty-string
# placeholder columns for the coordinates, which the next cell fills in
is_toronto = df_wiki['Borough'].str.contains('Toronto')
df_trnt = (
    df_wiki[is_toronto]
    .reset_index(drop=True)
    .assign(Latitude="", Longitude="")
)
#df_trnt

In [140]:
# Attach Latitude / Longitude to every Toronto row by postal code.
# A keyed lookup replaces the nested row-by-row scan, and whole-column assignment
# replaces the chained `df_trnt[col][i] = ...` writes, which pandas does not
# guarantee will reach `df_trnt` (and which are a no-op under Copy-on-Write).
# keep='last' mirrors the old loop, where a later duplicate row overwrote an earlier one.
geo_by_code = df_Geocsv.drop_duplicates(subset='Postal Code', keep='last').set_index('Postal Code')
for geo_col in ['Latitude', 'Longitude']:
    # Postal codes that are missing from the CSV come out as NaN
    df_trnt[geo_col] = df_trnt['Postal Code'].map(geo_by_code[geo_col])

df_trnt

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6543,-79.3606
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6623,-79.3895
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3789
3,M5C,Downtown Toronto,St. James Town,43.6515,-79.3754
4,M4E,East Toronto,The Beaches,43.6764,-79.293
5,M5E,Downtown Toronto,Berczy Park,43.6448,-79.3733
6,M5G,Downtown Toronto,Central Bay Street,43.658,-79.3874
7,M6G,Downtown Toronto,Christie,43.6695,-79.4226
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6506,-79.3846
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669,-79.4423


#### Longitude and latitude of Toronto

In [149]:
# Map centre used for Toronto, as (latitude, longitude) in decimal degrees
lat_trnt, lon_trnt = 43.651070, -79.347015

#### Create a map of Toronto with neighborhoods superimposed on top

In [152]:
# Base map centred on the Toronto coordinates defined earlier
map_trnt = folium.Map(location=[lat_trnt, lon_trnt], zoom_start=12)

# Add one circle marker per row, with a "neighborhood, borough" popup
marker_cols = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in df_trnt[marker_cols].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#2233cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_trnt)

map_trnt