[View in Colaboratory](https://colab.research.google.com/github/Sidsharik/Coursera_Capstone/blob/master/Web_scraping_.ipynb)

Importing the required libraries.

In [0]:
import numpy as np
import pandas as pd
from urllib.request import urlopen # library used to query a website 
from bs4 import BeautifulSoup # library to used to parse the data returned from the website
#!pip install geocoder
#import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors

Scraping the Wikipedia page for the required table data.

In [3]:
# URL fo scraping

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Query the website and return the html to the vaiable 'page'

page = urlopen(url)

# Parse the html in the page variable, and store it in a Beautiful Soup format

soup = BeautifulSoup(page)

#  function “prettify” to look at nested structure of HTML page

print(soup.prettify())




<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":861324217,"wgRevisionId":861324217,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wg

Once the Webpage is Scraped, find the required table and extract it from the Webpage. The tables in the webpage are present in `<table> </table>` tag. 

In [4]:
# Finding the table from the above table. 'find_all' finds all the <table> </table>

all_tables = soup.find_all('table')

# Now to identify the right table, use attribute “class” of table and use it to filter the right table.
# check the class name by right click on the required table of web page 

table_required = soup.find('table',class_='wikitable sortable')

table_required


<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

Transforming the data into a pandas dataframe.


In [0]:
# Empty lists

PostalCode=[]
Borough=[]
Neighborhood=[]

# Adding the table data to the empty lists

for row in table_required.findAll("tr"):
    cells = row.findAll("td")
 
    if len(cells) == 3:
        
        PostalCode.append(cells[0].find(text=True))
        Borough.append(cells[1].find(text=True))
        Neighborhood.append(cells[2].find(text=True))
        

Creating pandas DataFrame and appending the list's to the DataFrame

In [0]:
headers = ['PostalCode','Borough','Neighborhood']

# Appending the list's to the DataFrame

df_can = pd.DataFrame(columns = headers)
df_can['PostalCode'] = PostalCode
df_can['Borough'] = Borough
df_can['Neighborhood'] = Neighborhood


Converting "Not assigned" values -> NaN

In [0]:

# Converting "Not assigned" values -> NaN

df_can.replace('Not assigned',np.nan,inplace=True)


Dropping the NaN values from the Borough column

In [0]:
#Dropping the NaN values from the Borough column

df_can.dropna(subset=['Borough'],axis=0,inplace = True)


Grouping the data and applying ',' in between the grouped data

In [0]:
# Grouping the data and applying ',' in between the grouped data.

df_can = df_can.groupby(['PostalCode','Borough'])['Neighborhood'].apply(','.join).reset_index()


Cleaning the data, by removing '\n' from the Dataset

In [0]:
# Cleaning the data, by removing '\n' from the Dataset

df_can.replace(r'\n', '', regex = True, inplace = True)


'Not assigned' value in the Neighborhood column is replaced with the corresponding Borough value.

In [0]:

# 'Not assigned' value in the Neighborhood column is replaced with the corresponding Borough value.

df_can['Neighborhood'].replace('Not assigned',df_can['Borough'],inplace=True)

Dimension of the matrix.

In [12]:
df_can.shape
df_can.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [39]:
# uploading the geospatial coordinates csv files to the google colab jupyter notebook

from google.colab import files

uploaded = files.upload()





Saving Geospatial_Coordinates.csv to Geospatial_Coordinates (1).csv


In [18]:
import io

# Reading the CSV file

df_lat_lng = pd.read_csv(io.StringIO(uploaded['Geospatial_Coordinates.csv'].decode('utf-8')))

#df_data_1.rename(columns={'Postal Code':'PostalCode'})

df_lat_lng.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [19]:
df_can_lat_lng = df_can.join(df_lat_lng.set_index('Postal Code'),on='PostalCode')

# The above code can be executed only once and then the dataframe is modified.


df_can_lat_lng


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


Importing k-means from sklearn and folium library


In [0]:
from sklearn.cluster import KMeans
#!pip install folium
import folium
from folium import plugins

Creating a map of Toronto with Nighborhoods super-imposed on top.

In [51]:
#coordinates of Toronto are 43.6532 , -79.3832

Latitude,Longitude = 43.6532,-79.3832
map_toronto = folium.Map(location=[Latitude,Longitude],zoom_start=10)

# add markers to the map.

for lat,lng,borough,neighborhood in zip(df_can_lat_lng['Latitude'],df_can_lat_lng['Longitude'],df_can_lat_lng['Borough'],df_can_lat_lng['Neighborhood']):
    label = '{}, {}'.format(neighborhood,borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)
    
map_toronto




Clustering the neighborhoods.

In [52]:
# set number of clusters
kclusters = 6

# Clustering the neighborhood on basis of the features - latitude, longitude, and cluster labels

toronto_neighborhood_cluster = df_can_lat_lng.drop(['PostalCode','Borough','Neighborhood'],axis=1)

toronto_neighborhood_cluster.head()

Unnamed: 0,Latitude,Longitude,Cluster labels
0,43.806686,-79.194353,3
1,43.784535,-79.160497,3
2,43.763573,-79.188711,3
3,43.770992,-79.216917,3
4,43.773136,-79.239476,3


cluster labels generated for each row in the dataframe

In [49]:
# run k-means clustering

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_neighborhood_cluster)

# check cluster labels generated for each row in the dataframe

kmeans.labels_[0:10]

array([3, 3, 3, 3, 3, 3, 3, 2, 3, 2], dtype=int32)

In [50]:
df_can_lat_lng_cluster=df_can_lat_lng

# add clustering labels

df_can_lat_lng_cluster['Cluster labels'] = kmeans.labels_


df_can_lat_lng_cluster.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster labels
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353,3
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,3
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,3
3,M1G,Scarborough,Woburn,43.770992,-79.216917,3
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,3


Clustering the neighborhoods in Toronto by generating maps to visualize the neighborhoods and how they cluster together.

In [30]:
# create map
map_clusters = folium.Map(location=[Latitude, Longitude], zoom_start=11)

# set color scheme for the clusters.
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_can_lat_lng_cluster['Latitude'], df_can_lat_lng_cluster['Longitude'], df_can_lat_lng_cluster['Neighborhood'], df_can_lat_lng_cluster['Cluster labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

