### Webscraping

#### Importing the Dependencies:

In [1]:
import requests #For Request Handling
from bs4 import BeautifulSoup

import pandas as pd # data analsysis library:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

from pandas.io.json import json_normalize

import geopy
from geopy.geocoders import Nominatim

import numpy as np
import json # for JSON files

from sklearn.cluster import KMeans

import folium

import matplotlib.cm as cm
import matplotlib.colors as colors

#### Download the HTML of the Wikipedia page as text:

In [2]:
# Download the Wikipedia page that lists the postal codes starting with "M".
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of letting
# an error page be parsed as if it were the real article.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
link_data = response.text

#### Parse the page HTML into a BeautifulSoup object:

In [3]:
# Parse the downloaded HTML text using the 'html.parser' backend.
soup = BeautifulSoup(link_data, features='html.parser')

#### Lists to store data:

In [4]:
# One empty list per table column; each name gets its own separate list object.
List_postalCode, List_borough, List_neighborhood = [], [], []

#### Capture the table from the Website:

In [5]:
# Locate the first <table> element of the page, then collect all of its
# <tr> rows. Despite its name, `table` holds the list of rows.
first_table = soup.find('table')
table = first_table.find_all('tr')

#### Capture data in each row of the table:

In [6]:
# Walk over every row and pull out its <td> cells.
# NOTE(review): nothing is stored or displayed here — `cells` is overwritten on
# each iteration, so after the loop it only holds the cells of the last row.
for row in table:
    cells = row.find_all('td')

#### Adding table data to their respective lists:

In [7]:
# Extract the <td> cells of every row; rows without any <td> cell are skipped.
rows_of_cells = [row.find_all('td') for row in table]
rows_of_cells = [cells for cells in rows_of_cells if len(cells) > 0]

# Rebuild the three column lists from scratch rather than appending to the
# existing ones: appending made this cell non-idempotent, so running it a
# second time duplicated every row in the DataFrame built from these lists.
# The first three cells of a row are read as postal code, borough and
# neighborhood, with trailing newlines removed from each cell's text.
List_postalCode = [cells[0].text.rstrip('\n') for cells in rows_of_cells]
List_borough = [cells[1].text.rstrip('\n') for cells in rows_of_cells]
List_neighborhood = [cells[2].text.rstrip('\n') for cells in rows_of_cells]

#### Create the Pandas DF from the table data:

In [8]:
# Assemble the scraped lists into a DataFrame, one column per list.
toronto_columns = {
    "PostalCode": List_postalCode,
    "Borough": List_borough,
    "Neighborhood": List_neighborhood,
}
df_toronto = pd.DataFrame(toronto_columns)
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Remove rows whose Borough is 'Not assigned':

In [9]:
# Keep only the rows whose Borough is something other than "Not assigned",
# then renumber the index from 0 so there are no gaps.
has_borough = df_toronto["Borough"].ne("Not assigned")
df_toronto = df_toronto.loc[has_borough].reset_index(drop=True)
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Grouping Neighborhoods that share the same postal code and borough:

In [10]:
# Collapse rows that share a postal code and borough into a single row whose
# remaining column values are joined into one comma-separated string.
postal_groups = df_toronto.groupby(["PostalCode", "Borough"], as_index=False)
grouped_df_toronto = postal_groups.agg(", ".join)
grouped_df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [11]:
# A row that has a borough but whose Neighborhood is "Not assigned" takes the
# borough's name as its neighborhood.
# The assignment goes through .loc with a boolean mask: the rows yielded by
# DataFrame.iterrows() may be copies, so writing to them is not guaranteed to
# change the DataFrame itself.
unassigned_mask = grouped_df_toronto["Neighborhood"] == "Not assigned"
grouped_df_toronto.loc[unassigned_mask, "Neighborhood"] = grouped_df_toronto.loc[
    unassigned_mask, "Borough"
]

grouped_df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
