<h1>Coursera Capstone Project</h1>
<h2>First part : Create Toronto Dataframe</h2>

In [147]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup #for document handling
import requests as requests #for web scrapping
import re #for regular expressions

<h3>Scraping the Wikipedia page</h3>

In [154]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. Network failures and HTTP error statuses are reported and
# then re-raised: silently continuing would leave `page` undefined (or holding
# an error page) and only fail later with a misleading error while parsing.
try:
    page = requests.get(url, timeout=30)
    page.raise_for_status()
except requests.RequestException as err:
    print("An error occured.")
    raise RuntimeError("Could not download {}".format(url)) from err

# Parse the HTML so the postal-code table can be extracted in the next cell
soup = BeautifulSoup(page.text, 'html.parser')

<h3>Getting the table in the document and creating the dataframe</h3>

In [170]:
# Locate the first <table> of the page and collect all of its cells
html_table = soup.find("table")
if html_table is None:
    raise ValueError("No <table> element found in the downloaded page")

table_content = html_table.find_all('td')

# One [postal code, borough, neighborhood] record per <td>
rows = []
for td in table_content:
    # The values of a cell live in the <p> element that follows the <td> tag
    content = td.find_next('p')

    # Postal code: text of the bold element, first line only
    postal_code = content.find('b').getText().split('\n')[0]

    # The first link is the borough; any further links are neighborhoods.
    # A cell without links is an unassigned postal code: both values stay empty.
    links = content.find_all('a')
    if links:
        borough = links[0].getText().split('\n')[0]
        neighborhood = ','.join(a.getText() for a in links[1:])
    else:
        borough = ""
        neighborhood = ""

    rows.append([postal_code, borough, neighborhood])

# Build the dataframe directly from the records; the number of rows follows
# the page content instead of being hard-coded.
table = pd.DataFrame(rows, columns=['Postal Code', 'Borough', 'Neighborhood'])

# display the dataframe
table



Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park,Harbourfront"
...,...,...,...
175,M5Z,,
176,M6Z,,
177,M7Z,,
178,M8Z,Etobicoke,"Mimico,The Queensway"


<h3>Cleaning the dataframe</h3>

In [171]:
# Keep only the postal codes that have a borough. The explicit copy makes the
# result an independent dataframe, so the column assignment below does not
# raise pandas' SettingWithCopyWarning.
table = table.loc[table['Borough'] != ''].copy()

# When no neighborhood is listed, fall back to the borough name
table['Neighborhood'] = table['Neighborhood'].where(table['Neighborhood'] != '', table['Borough'])
table

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park,Harbourfront"
5,M6A,North York,"Lawrence Manor,Lawrence Heights"
6,M7A,Queen's Park,Queen's Park
...,...,...,...
160,M8X,Etobicoke,"The Kingsway,Old Mill"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,Business reply mail,Business reply mail
169,M8Y,Etobicoke,"Old Mill,Sunnylea,Humber Bay,Mimico,The Queensway"


<h3>Printing the size of the dataframe</h3>

In [172]:
# Size of the cleaned dataframe as a (rows, columns) tuple
table.shape

(101, 3)

<h2>Second part : getting geographical data</h2>

<p>We are going to use pgeocode, a library that returns geographical data (including latitude and longitude) for a given postal code.</p>

In [173]:
# Try pgeocode on a single postal code before querying the whole table
import pgeocode

# Canadian postal-code lookup, reused by the following cells
nomi = pgeocode.Nominatim('ca')

sample_postal_code = "M3A"
nomi.query_postal_code(sample_postal_code)

postal_code                                                     M3A
country_code                                                     CA
place_name        North York (York Heights / Victoria Village / ...
state_name                                                  Ontario
state_code                                                       ON
county_name                                             North York 
county_code                                                     NaN
community_name                                                  NaN
community_code                                                  NaN
latitude                                                    43.7545
longitude                                                    -79.33
accuracy                                                        1.0
Name: 0, dtype: object

<h3>Getting coordinates values</h3>

In [175]:
adresses = table["Postal Code"]

# Query each postal code once, then split the answers into two parallel lists
lookups = [nomi.query_postal_code(code) for code in adresses]
latitude = [lookup.latitude for lookup in lookups]
longitude = [lookup.longitude for lookup in lookups]

print(latitude)
print(longitude)

    

[43.7545, 43.7276, 43.6555, 43.7223, 43.6641, 43.6662, 43.8113, 43.745, 43.7063, 43.6572, 43.7081, 43.6505, 43.7878, 43.7334, 43.6913, 43.6513, 43.6915, 43.6437, 43.7678, 43.6784, 43.6456, 43.6889, 43.7712, 43.7124, 43.6564, 43.6683, 43.7686, 43.8015, 43.7535, 43.7059, 43.6496, 43.6655, 43.7464, 43.7801, 43.7694, 43.6872, 43.623, 43.648, 43.7298, 43.7797, 43.739, 43.6803, 43.6469, 43.6383, 43.7122, 43.7547, 43.7334, 43.6693, 43.6492, 43.7137, 43.7598, 43.7247, 43.7915, 43.7319, 43.7335, 43.6934, 43.7366, 43.6952, 43.7673, 43.7568, 43.7301, 43.6748, 43.7068, 43.7612, 43.75, 43.7135, 43.6966, 43.6605, 43.6949, 43.7507, 43.7786, 43.7143, 43.6736, 43.6469, nan, 43.6898, 43.7946, 43.702, 43.6629, 43.6512, 43.7812, 43.6899, 43.6541, 43.8177, 43.6861, 43.6404, 43.6075, 43.7432, 43.8016, 43.6827, 43.6437, 43.6021, 43.7144, 43.834, 43.6684, 43.6492, 43.6518, 43.6656, 43.7804, 43.6325, 43.6256]
[-79.33, -79.3148, -79.3626, -79.4504, -79.3889, -79.5282, -79.193, -79.359, -79.3094, -79.3783, -79.4

<h3>Adding coordinates to dataframe</h3>

<p>We now have two lists holding the latitude and longitude of each postal code; let's add them to our dataframe.</p>

In [176]:
# Add the coordinate columns. `assign` returns a new dataframe instead of
# writing into `table` in place, which avoids pandas' SettingWithCopyWarning
# when `table` is the result of an earlier row filter.
# `latitude` and `longitude` must hold one value per row of `table`.
table = table.assign(Latitude=latitude, Longitude=longitude)
table

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.7545,-79.3300
3,M4A,North York,Victoria Village,43.7276,-79.3148
4,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.6555,-79.3626
5,M6A,North York,"Lawrence Manor,Lawrence Heights",43.7223,-79.4504
6,M7A,Queen's Park,Queen's Park,43.6641,-79.3889
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway,Old Mill",43.6518,-79.5076
165,M4Y,Downtown Toronto,Church and Wellesley,43.6656,-79.3830
168,M7Y,Business reply mail,Business reply mail,43.7804,-79.2505
169,M8Y,Etobicoke,"Old Mill,Sunnylea,Humber Bay,Mimico,The Queensway",43.6325,-79.4939


<p>Some postal codes have no coordinates (NaN values). Let's drop the rows with missing coordinates.</p>

In [177]:
# Remove every row that lacks a latitude or a longitude, in a single pass and
# without mutating the dataframe in place.
table = table.dropna(subset=["Latitude", "Longitude"])

<h2>Third part: Clustering</h2>

<p>Let's apply the same strategy as for the New York dataframe.</p>

<h3>Getting the coordinates of Toronto</h3>

In [178]:
from geopy.geocoders import Nominatim

address = 'Toronto'

geolocator = Nominatim(user_agent="toronto")
location = geolocator.geocode(address)

# geocode() returns None when the address cannot be resolved; stop with a
# clear message rather than an AttributeError on the lines below.
if location is None:
    raise ValueError("Could not geocode address: {!r}".format(address))

# From here on `latitude` and `longitude` are the scalar coordinates of the
# city (used to centre the maps), no longer the per-postal-code lists.
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


<h3>Mapping the neighborhood values</h3>

In [179]:
import folium

# Base map centred on the Toronto coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Appearance shared by every marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per postal code, with a "neighborhood, borough" popup
marker_rows = zip(table['Latitude'], table['Longitude'], table['Borough'], table['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    popup = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    folium.CircleMarker([lat, lng], popup=popup, **marker_style).add_to(map_clusters)

map_clusters

<h3>k-means clustering</h3>

In [180]:
# k-means implementation from scikit-learn
from sklearn.cluster import KMeans

# number of clusters to look for
kclusters = 5

# the clustering features are the two coordinate columns
table_coor = table.loc[:, ["Latitude", "Longitude"]]

# fixed random_state so that the labels are the same on every run
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(table_coor)

# labels assigned to the first ten rows
kmeans.labels_[:10]

array([1, 1, 0, 4, 0, 2, 3, 1, 1, 0], dtype=int32)

<h3>Visualizing the clusters</h3>

In [181]:
# Add the clustering labels as the first column.
# DataFrame.insert raises ValueError when the column already exists, so a
# previous 'Cluster Labels' column is dropped first; this keeps the cell
# re-runnable.
table = table.drop(columns='Cluster Labels', errors='ignore')
table.insert(0, 'Cluster Labels', kmeans.labels_)
table

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,1,M3A,North York,Parkwoods,43.7545,-79.3300
3,1,M4A,North York,Victoria Village,43.7276,-79.3148
4,0,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.6555,-79.3626
5,4,M6A,North York,"Lawrence Manor,Lawrence Heights",43.7223,-79.4504
6,0,M7A,Queen's Park,Queen's Park,43.6641,-79.3889
...,...,...,...,...,...,...
160,2,M8X,Etobicoke,"The Kingsway,Old Mill",43.6518,-79.5076
165,0,M4Y,Downtown Toronto,Church and Wellesley,43.6656,-79.3830
168,3,M7Y,Business reply mail,Business reply mail,43.7804,-79.2505
169,2,M8Y,Etobicoke,"Old Mill,Sunnylea,Humber Bay,Mimico,The Queensway",43.6325,-79.4939


<p>The cluster labels are now the first column of our dataframe.</p>

In [183]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(table['Latitude'], table['Longitude'], table['Borough'], table['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters - 1, so they index the palette directly
    # (the former `cluster-1` made label 0 wrap around to the last color).
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

<p>We have our clustered Neighborhoods on the map ! </p>