## 1. Loading the dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list of DataFrames parsed from the page's tables;
# the first one is used as the working frame.
dfs = pd.read_html(io=url)
df = dfs[0]
df



Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Remove rows whose Borough is 'Not assigned'

In [3]:
# Keep only the rows with an assigned Borough and renumber them from 0.
df = df.loc[df['Borough'].ne('Not assigned')].reset_index(drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Let's see how many 'Not assigned' Neighbourhoods there are

In [4]:
# Rows whose Neighbourhood is still the placeholder value.
# The table spells the placeholder 'Not assigned' (capital N), so an exact
# comparison with 'not assigned' could never match. Compare case-insensitively
# (and ignore surrounding whitespace) so the check is meaningful.
df[df['Neighbourhood'].str.strip().str.lower() == 'not assigned']

Unnamed: 0,Postal Code,Borough,Neighbourhood


#### There are no 'Not assigned' values in the Neighbourhood column

In [5]:
# Report the size of the cleaned frame.
n_rows, n_cols = df.shape
print(f'Dataframe has {n_rows} rows and {n_cols} columns')

Dataframe has 103 rows and 3 columns


#### There are currently 103 rows in the dataframe

## 2. Getting latitude and longitude coordinates of postal codes

#### I will use a CSV file that maps each postal code to its latitude and longitude

In [6]:
# The code was removed by Watson Studio for sharing.
# NOTE(review): the merge cell below reads `df_data_1`, which no visible cell
# defines - presumably this hidden cell loaded the coordinates CSV into it.
# The displayed output has the columns Postal Code, Latitude and Longitude.
# On a fresh kernel the notebook cannot run unless this cell is restored.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
# Attach coordinates to every postal code present in both tables.
df_n = pd.merge(df, df_data_1, on='Postal Code', how='inner')
df_n

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


## 3. Explore and cluster the neighborhoods in Toronto

#### Let's work only with Toronto's Boroughs

In [8]:
# Keep only the boroughs whose name contains "toronto" (case-insensitive).
#
# A boolean mask is used instead of collecting enumerate() positions and
# passing them to label-based .loc: positions only coincide with labels on a
# fresh 0..n-1 index, so the positional version raises KeyError when this cell
# is re-run on the already-filtered frame. na=False treats a missing Borough
# as "not Toronto" instead of failing.
toronto_mask = df_n['Borough'].str.lower().str.contains('toronto', regex=False, na=False)
df_n = df_n.loc[toronto_mask]

# Index labels of the rows that were kept (same name as before, for later use).
toronto_index_list = df_n.index.tolist()
df_n

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [9]:
# Show the (rows, columns) dimensions of the Toronto-only frame.
df_n.shape

(39, 5)

### Segmenting the dataframe by Borough so that I can use the result to analyze Neighbourhoods

In [11]:
# One-hot encode the Borough column (one indicator column per borough).
tor_onehot = pd.get_dummies(df_n[['Borough']])

# Put the Neighbourhood name in front of the indicator columns.
tor_onehot.insert(0, 'Neighbourhood', df_n['Neighbourhood'])
tor_onehot.head()

Unnamed: 0,Neighbourhood,Borough_Central Toronto,Borough_Downtown Toronto,Borough_East Toronto,Borough_West Toronto
2,"Regent Park, Harbourfront",0,1,0,0
4,"Queen's Park, Ontario Provincial Government",0,1,0,0
9,"Garden District, Ryerson",0,1,0,0
15,St. James Town,0,1,0,0
19,The Beaches,0,0,1,0


### Clustering

In [12]:
from sklearn.cluster import KMeans

# Number of clusters to fit.
k_clusters = 4

# Cluster on the one-hot borough indicators only; the Neighbourhood name is a
# label, not a feature. `columns=` replaces the positional axis argument
# (`drop('Neighbourhood', 1)`), which newer pandas versions reject.
toronto_clustering = tor_onehot.drop(columns='Neighbourhood')

# random_state is fixed so the cluster labels are reproducible between runs.
kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(toronto_clustering)

# One integer cluster label per row of toronto_clustering.
labels = kmeans.labels_

#### I will use a new DataFrame to store the final result

In [13]:
# Empty result table with the four output columns; rows are added afterwards.
final_columns = ['Neighbourhood', 'Label', 'Latitude', 'Longitude']
final_df = pd.DataFrame(columns=final_columns)

In [14]:
# Fill the result table column by column. Neighbourhood goes first so that
# the frame takes on the source rows' index before the label array is added.
column_values = {
    'Neighbourhood': tor_onehot['Neighbourhood'],
    'Label': labels,
    'Latitude': df_n['Latitude'],
    'Longitude': df_n['Longitude'],
}
for column_name, values in column_values.items():
    final_df[column_name] = values

# Renumber the rows 0..n-1 now that all columns are aligned.
final_df = final_df.reset_index(drop=True)
final_df

Unnamed: 0,Neighbourhood,Label,Latitude,Longitude
0,"Regent Park, Harbourfront",0,43.65426,-79.360636
1,"Queen's Park, Ontario Provincial Government",0,43.662301,-79.389494
2,"Garden District, Ryerson",0,43.657162,-79.378937
3,St. James Town,0,43.651494,-79.375418
4,The Beaches,3,43.676357,-79.293031
5,Berczy Park,0,43.644771,-79.373306
6,Central Bay Street,0,43.657952,-79.387383
7,Christie,0,43.669542,-79.422564
8,"Richmond, Adelaide, King",0,43.650571,-79.384568
9,"Dufferin, Dovercourt Village",2,43.669005,-79.442259


## 4. Visualize 

In [15]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
print('Libraries are ready to use!')

Libraries are ready to use!


#### I looked up the coordinates of Toronto on the web and hard-code them here

In [16]:
# Map centre: latitude / longitude of Toronto.
toronto_latitude, toronto_longitude = 43.651070, -79.347015

# Base map centred on Toronto; the markers are added in the next cell.
map_tor = folium.Map(
    location=[toronto_latitude, toronto_longitude],
    zoom_start=10,
)

In [17]:
# One hex colour per cluster, evenly spaced along matplotlib's rainbow colormap.
# The previous `x` / `ys` arrays were only used for `len(ys)`, which equals
# k_clusters, so the colours are unchanged; the unused `markers_colors` list
# is dropped.
colors_array = cm.rainbow(np.linspace(0, 1, k_clusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per neighbourhood, coloured by its cluster label.
for lat, lon, poi, cluster in zip(
    final_df['Latitude'],
    final_df['Longitude'],
    final_df['Neighbourhood'],
    final_df['Label'],
):
    # Popup text shown on click: "<neighbourhood> Cluster <label>".
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7,
    ).add_to(map_tor)

# Display the finished map as the cell output.
map_tor