<b>This notebook scrapes data on Toronto neighborhoods from a Wikipedia page, converts it into a pandas dataframe, and clusters the neighborhoods of Toronto on a map.</b>

<i> (1) Import libraries</i>

In [1]:
# import libraries
 
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium 
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
print('Libraries loaded.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.0 MB

The following NEW packages will be 

<i> (2) Scrape the data</i>

In [2]:
# Scrape the postal-code table from the wiki page.

WIKI_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging forever on a stalled connection.
res = requests.get(WIKI_URL, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were data.
res.raise_for_status()

soup = BeautifulSoup(res.content, 'lxml')

# soup.find returns the first <table> in the document; the cleaning step
# below assumes that this is the postal-code table.
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found at " + WIKI_URL)

<i> (3) Clean the data</i>

In [3]:
# Clean the table: collect (postal code, borough, neighborhood) from every
# row that has all three cells, no 'Not assigned' placeholder, and a hyperlink
# in both the borough cell and the neighborhood cell.

NOT_ASSIGNED = 'Not assigned'

Postcode = []
Borough = []
Neighborhood = []

for row in table.find_all('tr'):
    cells = row.find_all('td')

    # Rows with fewer than three <td> cells (e.g. a header row) carry no
    # complete record.
    if len(cells) < 3:
        continue

    postcode_cell, borough_cell, neighborhood_cell = cells[:3]

    # Strip every field so surrounding whitespace in the cell text can neither
    # defeat the placeholder comparison nor leak into the dataframe.
    postcode_text = str(postcode_cell.text).strip()
    borough_text = str(borough_cell.text).strip()
    neighborhood_text = str(neighborhood_cell.text).strip()

    # Skip the whole row (not just the current cell) when any field is the
    # placeholder.
    if NOT_ASSIGNED in (postcode_text, borough_text, neighborhood_text):
        continue

    # The link lookup is done per row, so a link found in an earlier row can
    # never vouch for this one. Rows lacking a link in either cell are dropped.
    if borough_cell.find('a') is None or neighborhood_cell.find('a') is None:
        continue

    Postcode.append(postcode_text)
    Borough.append(borough_text)
    Neighborhood.append(neighborhood_text)

raw_boroughs_table = {'PostalCode': Postcode, 'Borough': Borough, 'Neighborhood': Neighborhood}

<i> (4) Wrap the data to a dataframe</i>

In [4]:
# Build the dataframe from the column lists, save a CSV snapshot next to the
# notebook, and preview the first rows.

df = pd.DataFrame(raw_boroughs_table)
df.to_csv('toronto_part1.csv')
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


<i> (5) Group neighborhoods that share a postal code</i>

In [5]:
# Group neighborhoods that share a postal code into a single row whose
# 'Neighborhood' value is the comma-separated list of the originals.
#
# - sort=False keeps postal codes in order of first appearance.
# - Rows with the same code are merged even when they are not adjacent.
# - 'first' keeps the borough of the first row seen for each postal code.
# - The result is a new frame with a fresh 0..n-1 index and the columns
#   PostalCode, Borough, Neighborhood.

df = (
    df.groupby('PostalCode', sort=False)
      .agg({'Borough': 'first', 'Neighborhood': ', '.join})
      .reset_index()
)

<i> (6) Describe the output dataframe</i>

In [6]:
# Preview the first rows of the dataframe after the grouping step.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [7]:
# (number of rows, number of columns) of the dataframe.
df.shape

(77, 3)

<i>(7) Add geo coordinates </i>

In [7]:
# Attach a latitude and longitude to every postal code in df, using the
# coordinates published in the geospatial CSV.

csv_path = 'http://cocl.us/Geospatial_data'
geo_df = pd.read_csv(csv_path)

# Index the coordinates by postal code for direct lookup. drop_duplicates
# keeps the first row per code, matching a "take the first match" lookup.
geo_lookup = geo_df.drop_duplicates('Postal Code').set_index('Postal Code')

df['Latitude'] = df['PostalCode'].map(geo_lookup['Latitude'])
df['Longitude'] = df['PostalCode'].map(geo_lookup['Longitude'])

# Codes without coordinates come back as NaN from map(); report them by name
# rather than letting a later step fail on the missing values.
unmatched = df.loc[df[['Latitude', 'Longitude']].isna().any(axis=1), 'PostalCode'].tolist()
if unmatched:
    raise LookupError('No coordinates found for postal codes: {}'.format(unmatched))

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


<i>(8) Visualize the data on a map</i>

In [32]:
# First, limit the dataframe to boroughs whose name contains "Toronto"
# (plain substring match, not a regular expression).
df2 = df[df['Borough'].str.contains('Toronto', regex=False)]

# Base map centred on fixed coordinates chosen for Toronto.
Toronto_map = folium.Map(location=[43.68, -79.4], zoom_start=12)

# One circle marker per row, with a "neighborhood, borough" popup.
for lat, lng, borough, neighborhood in zip(
        df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighborhood']):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    # parse_html is an option of Popup (set above), not of CircleMarker, so it
    # is not passed to the marker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(Toronto_map)

# Last expression of the cell: renders the map inline.
Toronto_map


<i> (9)  Cluster the neighborhoods</i>

In [40]:
k = 4  # number of clusters

# Restrict to boroughs whose name contains "Toronto". The explicit copy means
# adding the label column below can never write through to df.
df2 = df[df['Borough'].str.contains('Toronto', regex=False)].copy()

# Drop the text columns so that only the remaining (coordinate) columns are
# used as clustering features.
df_clusters = df2.drop(columns=['PostalCode', 'Borough', 'Neighborhood'])

# random_state fixes the initialisation. n_init=10 is spelled out so the
# number of restarts does not depend on the installed scikit-learn's default.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(df_clusters)
df2.insert(0, 'Cluster Labels', kmeans.labels_)

# create map
Toronto_map_clustered = folium.Map(location=[43.68, -79.4], zoom_start=12)

# set color scheme for the clusters: k evenly spaced colours from the rainbow
# colormap, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, cluster in zip(df2['Latitude'], df2['Longitude'], df2['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to k-1, so they index the colour list directly;
    # no offset is needed.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(Toronto_map_clustered)

Toronto_map_clustered