# Coursera Clustering Capstone

### Notebook for scraping Canadian postal codes, joining latitude/longitude coordinates to them, and clustering the resulting neighborhoods.

# Scraping Website - BeautifulSoup

#### Imports

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

#### BS Code to Ingest HTML

In [2]:
# Wikipedia list of Canadian postal codes beginning with "M".
page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the notebook from hanging forever on a stalled connection.
postal_all = requests.get(page, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it
# contained the postal-code table.
postal_all.raise_for_status()

# Parse the raw response bytes with the standard-library HTML parser.
postal_soup = bs(postal_all.content, 'html.parser')

#### BS code to extract table in HTML

In [3]:
# First element whose class attribute is "wikitable sortable".
postal_table = postal_soup.find(class_="wikitable sortable")
if postal_table is None:
    # find() returns None when nothing matches; stop here with a clear
    # message rather than letting a later cell choke on the string "None".
    raise ValueError('No "wikitable sortable" element found on {}'.format(page))

#### Converting HTML to Dataframe

In [4]:
# read_html returns a list of DataFrames, one per table it finds.  The markup
# is wrapped in StringIO because passing a literal HTML string directly is
# deprecated in recent pandas releases.
postal_codes = pd.read_html(StringIO(str(postal_table)))

In [5]:
# Use the first DataFrame parsed from the HTML as the working table.
postal_codes_df = postal_codes[0]
# Bare expression so the notebook renders the frame.
postal_codes_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


#### Drop rows with unassigned Postal Codes for the Borough and reindex

In [6]:
# Keep only the rows whose Borough is assigned, then renumber the index
# from zero so it is contiguous again.
postal_codes_df = (
    postal_codes_df
    .loc[postal_codes_df['Borough'].ne('Not assigned')]
    .reset_index(drop=True)
)
postal_codes_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


# Joining Latitudes and Longitudes to Postal Codes

In [7]:
# Load the coordinate lookup; it is joined on its 'Postal Code' column below.
lat_longs = pd.read_csv('coords.csv')
# Bare expression so the notebook renders the frame.
lat_longs

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [8]:
# Attach Latitude/Longitude to every postal code via the shared
# 'Postal Code' column (default inner join).
# NOTE: this rebinds `postal_codes`, which previously held the list of
# tables returned by read_html.
postal_codes = pd.merge(postal_codes_df, lat_longs, on='Postal Code')

In [16]:
# Print column dtypes and non-null counts to sanity-check the merge.
postal_codes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Postal Code   103 non-null    object 
 1   Borough       103 non-null    object 
 2   Neighborhood  103 non-null    object 
 3   Latitude      103 non-null    float64
 4   Longitude     103 non-null    float64
dtypes: float64(2), object(3)
memory usage: 4.8+ KB


# Mapping Neighborhoods

#### Creating basic map of neighborhoods 

In [65]:
import folium
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

# Geocode the city so the map can be centred on it.
locator = Nominatim(user_agent="myGeoencoder")

location = locator.geocode('Toronto, Ontario')
if location is None:
    # geocode() returns None when the query has no match; without this check
    # the attribute access below would raise an opaque AttributeError.
    raise RuntimeError("Could not geocode 'Toronto, Ontario'")
latitude = location.latitude
longitude = location.longitude

# The keyword is `zoom_start`; any other spelling is not recognised by
# folium and leaves the map at its default zoom.
map1 = folium.Map(location=[latitude, longitude], zoom_start=12)

# One circle marker per postal code, with a "<neighborhood>, <borough>" popup.
for lat, lng, borough, neighborhood in zip(postal_codes['Latitude'], postal_codes['Longitude'], postal_codes['Borough'], postal_codes['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html belongs on the Popup (it escapes the label text); it is not
    # a CircleMarker option.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map1)


# Bare expression so the notebook renders the map.
map1

In [40]:
# Add one indicator column per digit 1-9: Name_<digit> is 1 when the postal
# code contains that digit and 0 otherwise.  These columns are the features
# used for clustering.

for digit in map(str, range(1, 10)):
    postal_codes['Name_' + digit] = postal_codes['Postal Code'].str.contains(digit).astype(int)

#### Creating Clusters based on Numbers in Postal Code

In [41]:
# Feature matrix for clustering: everything except the descriptive and
# coordinate columns.
postal_numbers = postal_codes.drop(
    columns=['Postal Code', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
)

In [44]:
# Bare expression so the notebook renders the feature matrix.
# NOTE(review): the recorded output includes a Name_Toronto column that no
# visible cell creates -- presumably left over from an earlier run; confirm.
postal_numbers

Unnamed: 0,Name_Toronto,Name_1,Name_2,Name_3,Name_4,Name_5,Name_6,Name_7,Name_8,Name_9
0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0
4,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
98,0,0,0,0,0,0,0,0,1,0
99,1,0,0,0,1,0,0,0,0,0
100,1,0,0,0,0,0,0,1,0,0
101,0,0,0,0,0,0,0,0,1,0


In [47]:
# Number of clusters to fit
k = 10

# Run K-means.  n_init is pinned explicitly because scikit-learn changed its
# default (10 -> 'auto') in version 1.4, which would otherwise alter the
# resulting labels depending on the installed version; random_state fixes
# the seed so reruns are reproducible.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(postal_numbers)

In [57]:
# Cluster label assigned to each row of postal_numbers, in row order.
kmeans.labels_

array([6, 3, 1, 5, 8, 0, 2, 6, 3, 1, 5, 0, 2, 6, 3, 1, 5, 0, 2, 3, 1, 5,
       2, 3, 1, 9, 2, 4, 6, 3, 1, 9, 2, 4, 6, 3, 1, 9, 2, 4, 6, 3, 1, 9,
       2, 4, 6, 3, 1, 5, 0, 2, 4, 6, 3, 1, 5, 0, 2, 4, 6, 3, 1, 5, 0, 2,
       4, 3, 1, 9, 0, 2, 4, 3, 1, 9, 8, 0, 2, 3, 1, 9, 2, 3, 1, 2, 3, 1,
       7, 0, 2, 3, 1, 7, 0, 2, 3, 1, 7, 3, 8, 7, 7], dtype=int32)

#### Joining the Cluster Groups from KMeans to the neighborhood data

In [63]:
# Store each row's K-means label alongside its postal code; the labels are
# in the same row order as the frame, so they are assigned positionally.
postal_codes['Cluster'] = kmeans.labels_

#### Mapping clusters of neighborhoods based on the integers contained in the Postal Codes

In [71]:
# The keyword is `zoom_start`, and the value must be a real tile zoom level
# (folium's default maximum is 18); 11 frames the whole city.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: k evenly spaced samples of the rainbow
# colormap, converted to hex strings that folium accepts
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]


# One circle marker per postal code, coloured by its cluster label.
for lat, lng, borough, cluster in zip(postal_codes['Latitude'], postal_codes['Longitude'], postal_codes['Borough'], postal_codes['Cluster']):
    label = folium.Popup(str(borough) + ' Cluster' + str(cluster), parse_html=True)
    # Labels run 0..k-1, so they index the palette directly; int() turns the
    # NumPy integer into a plain Python index.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.2,
    ).add_to(map_clusters)


# Bare expression so the notebook renders the map.
map_clusters