# Applied Data Science Capstone - Week 3 Assignment


#### By: Osama Mohsen
#### February 15, 2021

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.cm as cm
import matplotlib.colors as colors

# Show up to 1000 rows and 100 columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 1000, 'display.max_columns', 100)
# Setting 'mode.chained_assignment' to None turns off pandas' chained-assignment
# warnings for the rest of the notebook.
pd.set_option('mode.chained_assignment', None)

import folium
import geocoder
from sklearn.cluster import KMeans
# Confirmation message printed once every import above has succeeded.
print("Importing libraries is complete.")

Importing libraries is complete.


# Part 1

https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# pd.read_html returns one dataframe per HTML table found on the page;
# the first table is the one used in this notebook
tables = pd.read_html(url)
df = tables[0]

n_rows, n_cols = df.shape
print("The shape of the dataframe is {} rows by {} columns".format(n_rows, n_cols))
display(df.head(10))

The shape of the dataframe is 180 rows by 3 columns


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [3]:
# Where a borough exists but the neighbourhood is 'Not assigned',
# the neighbourhood takes the borough's name.
# A boolean mask does this in one vectorised assignment instead of a Python-level row loop.
needs_fill = (df['Borough'] != 'Not assigned') & (df['Neighbourhood'] == 'Not assigned')
df.loc[needs_fill, 'Neighbourhood'] = df.loc[needs_fill, 'Borough']

# Drop rows whose 'Borough' is 'Not assigned', then sort by 'Postal Code'
# and rebuild a clean 0..n-1 index.
df = (
    df.loc[df['Borough'] != 'Not assigned']
      .sort_values(by='Postal Code')
      .reset_index(drop=True)
)
display(df.head(10))

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [4]:
# Report the dimensions of the dataframe after the cleaning step
n_rows, n_cols = df.shape
print("After cleaning, the shape of the dataframe is {} rows by {} columns".format(n_rows, n_cols))

After cleaning, the shape of the dataframe is 103 rows by 3 columns


# Part 2

In [5]:
# CSV file holding the geographical coordinates of each postal code in Toronto
geo_coor_csv = 'http://cocl.us/Geospatial_data'

# Load the coordinates into their own dataframe
df_geo = pd.read_csv(filepath_or_buffer=geo_coor_csv)

In [6]:
# Keep the coordinates dataframe ordered by 'Postal Code' (as before) so it is easy to inspect
df_geo = df_geo.sort_values(by='Postal Code').reset_index(drop=True)

# Attach coordinates to the borough dataframe by matching on 'Postal Code'.
# A keyed merge is used instead of a positional join: a positional join pairs rows
# purely by index, so a missing, extra or differently ordered postal code in either
# source would silently attach the wrong coordinates.
#   how='left'            -> keep every row (and the row order) of the borough dataframe
#   validate='many_to_one' -> fail loudly if a postal code appears more than once in df_geo
df_final = df.merge(
    df_geo[['Postal Code', 'Latitude', 'Longitude']],
    on='Postal Code',
    how='left',
    validate='many_to_one',
)
df_final.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Part 3

In [7]:
# Keep only the boroughs whose name contains the word 'Toronto'
is_toronto = df_final['Borough'].str.contains('Toronto')

# Select those rows and renumber the index from zero in a single expression
df_toronto = df_final.loc[is_toronto].reset_index(drop=True)
df_toronto.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


---

### Exploring Data

Because a single postal code area can contain more than one neighborhood, each 'Neighbourhood' entry is split on ',' in order to find the total count of neighborhoods. To do this, I loop over each row, split the entry by ',', and add the number of resulting pieces to a counter.

In [8]:
# number of distinct boroughs
n_borough = df_toronto['Borough'].nunique()

# number of distinct postal codes
n_postal = df_toronto['Postal Code'].nunique()

# number of neighborhoods: each 'Neighbourhood' entry is a comma-separated list,
# so split every entry on ',' and add up the number of pieces.
# (Series.iteritems() was removed in pandas 2.0; the vectorised string methods
# give the same total without an explicit loop.)
n_neighbor = int(df_toronto['Neighbourhood'].str.split(',').str.len().sum())

print('The dataframe has {} boroughs, {} postal codes, and {} neighborhoods.'.format(n_borough, n_postal, n_neighbor))

The dataframe has 4 boroughs, 39 postal codes, and 78 neighborhoods.


### Create a map of Toronto with postal codes superimposed on top

In [9]:
# Latitude and longitude of Toronto, used as the centre of the map
latitdue = 43.6532
longitude = -79.3832

# Base map centred on the coordinates above
map_toronto = folium.Map(location=[latitdue, longitude], zoom_start=12)

# Appearance shared by every postal-code marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='orange',
    fill_opacity=0.8,
)

# One circle per postal code; the postal code itself is shown as the tooltip
marker_columns = df_toronto[['Latitude', 'Longitude', 'Postal Code']]
for lat, lng, postal in marker_columns.itertuples(index=False, name=None):
    folium.CircleMarker([lat, lng], tooltip=postal, **marker_style).add_to(map_toronto)
map_toronto

### Clustering of Neighborhoods by postal code

In [10]:
# Boroughs in order of first appearance, and one integer code (1, 2, 3, ...) per borough.
# The codes are derived from the data rather than hard-coded, so the cell keeps working
# when the number of boroughs is not exactly four.
uniqueBorough = df_toronto['Borough'].unique().tolist()
integerBoroguh = list(range(1, len(uniqueBorough) + 1))
borough_codes = dict(zip(uniqueBorough, integerBoroguh))

# dropping columns with string content
df_cluster = df_toronto.drop(columns=['Postal Code', 'Neighbourhood'])

# replacing 'Borough' values with integers.
# Assigning the mapped column back avoids an in-place replace on a column selection,
# which does not propagate to the dataframe under pandas Copy-on-Write.
df_cluster['Borough'] = df_cluster['Borough'].map(borough_codes)

# calculating distance to the city centre and add to the dataframe.
# This is a straight-line distance computed directly on latitude/longitude values,
# so it is expressed in degrees rather than in a ground distance unit.
df_cluster['Distance to Centre'] = np.sqrt(
    (df_toronto['Latitude'] - latitdue) ** 2 + (df_toronto['Longitude'] - longitude) ** 2
)
df_cluster.head()

Unnamed: 0,Borough,Latitude,Longitude,Distance to Centre
0,1,43.676357,-79.293031,0.093095
1,1,43.679557,-79.352188,0.040699
2,1,43.668999,-79.315572,0.069449
3,1,43.659526,-79.340923,0.042748
4,2,43.72802,-79.38879,0.075029


In [11]:
# setting the number of clusters
k = 3

# Features used for clustering. 'Cluster Labels' is excluded when present so that
# re-running this cell after the labels have been added to df_cluster does not feed
# the previous labels back into the model as a feature.
cluster_features = df_cluster.drop(columns=['Cluster Labels'], errors='ignore')

# NOTE(review): the features are not scaled; the integer 'Borough' codes span a much
# wider range than the coordinate columns and therefore likely dominate the
# clustering. Consider standardising the features; confirm this is intended.

# run k-means clustering.
# n_init is given explicitly because its default differs between scikit-learn
# versions; random_state keeps the result reproducible.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=66).fit(cluster_features)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 1])

In [12]:
# add clustering labels to the dataframe as the first column.
# DataFrame.insert raises ValueError if the column already exists, so on a re-run
# of this cell the existing column is overwritten instead of inserted again.
if 'Cluster Labels' in df_cluster.columns:
    df_cluster['Cluster Labels'] = kmeans.labels_
else:
    df_cluster.insert(0, 'Cluster Labels', kmeans.labels_)

In [13]:
# latitude and longitude of Toronto (same values as for the postal-code map)
latitdue = 43.6532
longitude = -79.3832

# create a map of Toronto using the latitude and longitude values above
map_cluster = folium.Map(location=[latitdue, longitude], zoom_start=12)

# set color scheme for the clusters: k evenly spaced colours from the rainbow colormap,
# converted to hex strings
colors_array = cm.rainbow(np.linspace(.1, 0.9, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to map: one circle per row, filled with its cluster's colour
for lat, lng, cluster in zip(df_cluster['Latitude'], df_cluster['Longitude'], df_cluster['Cluster Labels']):
    folium.CircleMarker([lat, lng],
                       radius=5,
                       tooltip= 'Cluster ' + str(cluster),
                       color='blue',
                       fill=True,
                       # k-means labels run from 0 to k-1, so the label indexes the palette
                       # directly (indexing with cluster-1 only worked for label 0 through
                       # negative-index wrap-around).
                       fill_color=rainbow[cluster],
                       fill_opacity=0.8).add_to(map_cluster)
map_cluster