## 1. Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import json
from geopy.geocoders import Nominatim
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

In [2]:
# Lift pandas' display truncation so frames render with every column and row.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, None)

## 2. Scraping Data from Wikipedia and Building a DataFrame

In [3]:
# Download the raw HTML of the Wikipedia page listing Canadian postal codes
# that start with "M".
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here instead of letting an error
# page flow into the parser.
# NOTE: despite its name, `df` holds the page's HTML text, not a DataFrame; the
# name is kept because the parsing cell reads it.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
df = response.text

In [4]:
# Parse the downloaded HTML text (held in `df`) using the 'html.parser' backend.
bs = BeautifulSoup(markup=df, features='html.parser')

In [5]:
# One accumulator per table column; the scraping loop appends to these in step.
postalcodelist, boroughlist, neighborhoodlist = [], [], []

In [6]:
# Walk every row of the first <table> on the page and collect its first three
# data cells: postal code, borough, neighborhood.
table_rows = bs.find('table').find_all('tr')
for table_row in table_rows:
    data_cells = table_row.find_all('td')
    # Rows without any <td> cells contribute nothing and are skipped.
    if not data_cells:
        continue
    # Postal code and borough are stored exactly as scraped (including any
    # trailing newline); only the neighborhood text has trailing newlines removed.
    postalcodelist.append(data_cells[0].text)
    boroughlist.append(data_cells[1].text)
    neighborhoodlist.append(data_cells[2].text.rstrip('\n'))

In [7]:
# Assemble the three scraped lists into a single table, one column per list.
scraped_columns = {
    "PostalCode": postalcodelist,
    "Borough": boroughlist,
    "Neighborhood": neighborhoodlist,
}
df1 = pd.DataFrame(data=scraped_columns)

# Preview the first rows of the raw scrape.
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,
1,M2A\n,Not assigned\n,
2,M3A\n,North York\n,Parkwoods
3,M4A\n,North York\n,Victoria Village
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront


## 3. Data Preprocessing: dropping rows whose borough is "Not assigned"

In [8]:
# Keep only rows whose borough is assigned. The scraped borough text still ends
# with a newline, so the sentinel value includes it.
borough_is_assigned = df1["Borough"].ne('Not assigned\n')
df1_dropna = df1.loc[borough_is_assigned].reset_index(drop=True)
df1_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods
1,M4A\n,North York\n,Victoria Village
2,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront
3,M6A\n,North York\n,Lawrence Manor / Lawrence Heights
4,M7A\n,Downtown Toronto\n,Queen's Park / Ontario Provincial Government


## 4. Grouping neighborhoods that share the same postal code and borough

In [9]:
# Collapse rows sharing a postal code and borough into one row, joining their
# neighborhood names into a single comma-separated string.
group_keys = ["PostalCode", "Borough"]
df1_grouped = (
    df1_dropna
    .groupby(group_keys, as_index=False)
    .agg(", ".join)
)
df1_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B\n,Scarborough\n,Malvern / Rouge
1,M1C\n,Scarborough\n,Rouge Hill / Port Union / Highland Creek
2,M1E\n,Scarborough\n,Guildwood / Morningside / West Hill
3,M1G\n,Scarborough\n,Woburn
4,M1H\n,Scarborough\n,Cedarbrae


## 5. Using the borough name as the neighborhood where the neighborhood is "Not assigned"

In [10]:
# Where a postal code has no neighborhood name, fall back to its borough name.
#
# The previous row-wise loop never changed anything:
#   * iterrows() yields a copy of each row, so assigning to `row[...]` does not
#     write back into the DataFrame;
#   * it compared against 'Not assigned\n', but the neighborhood text had its
#     trailing newline removed when it was scraped, so the test could not match.
# A boolean mask with .loc performs the update on the frame itself.
neighborhood_missing = df1_grouped['Neighborhood'].str.strip() == 'Not assigned'

# The borough text still carries its scraped trailing newline; drop it so the
# filled-in value has the same form as the other neighborhood names.
df1_grouped.loc[neighborhood_missing, 'Neighborhood'] = (
    df1_grouped.loc[neighborhood_missing, 'Borough'].str.rstrip('\n')
)

df1_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B\n,Scarborough\n,Malvern / Rouge
1,M1C\n,Scarborough\n,Rouge Hill / Port Union / Highland Creek
2,M1E\n,Scarborough\n,Guildwood / Morningside / West Hill
3,M1G\n,Scarborough\n,Woburn
4,M1H\n,Scarborough\n,Cedarbrae


## 6. Building the necessary DataFrame

In [11]:
column_names = ['PostalCode','Borough','Neighborhood']

# Postal codes to extract, in the order they should appear in the result.
# They are written with the trailing "\n" found in the scraped PostalCode column.
test_list = ["M5G\n", "M2H\n", "M4B\n", "M1J\n", "M4G\n", "M4M\n", "M1R\n", "M9V\n", "M9L\n", "M5V\n", "M1B\n", "M5A\n"]

# Compare on whitespace-stripped values so the lookup works whether or not the
# PostalCode column still carries its trailing newline.
postal_codes = df1_grouped['PostalCode'].str.strip()

# Collect the matching rows for each code, then concatenate once. This replaces
# growing the frame with DataFrame.append inside the loop, which was removed in
# pandas 2.0 and copies the whole frame on every iteration.
matching_rows = [df1_grouped[postal_codes == postcode.strip()] for postcode in test_list]
test_df = pd.concat(matching_rows, ignore_index=True)[column_names]

# Clean the text columns by assigning the results back explicitly; calling
# replace(..., inplace=True) on a selected column is not guaranteed to modify
# the parent frame. Whitespace runs collapse to a single space as before, and
# the final strip() removes the leftover space where the trailing newline was.
test_df['PostalCode'] = test_df['PostalCode'].str.replace(r'\s+|\\n', ' ', regex=True).str.strip()
test_df['Borough'] = test_df['Borough'].str.replace(r'\s+|\\n', ' ', regex=True).str.strip()
# Neighborhood names are separated by "/" in the source table; use "," instead.
test_df['Neighborhood'] = test_df['Neighborhood'].str.replace("/", ",", regex=False)
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill , Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford , Maryvale"
7,M9V,Etobicoke,"South Steeles , Silverstone , Humbergate , Jam..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower , King and Spadina , Railway Lands , ..."


## 7. Checking the shape of df1_grouped

In [12]:
# Report the grouped frame's dimensions as (number of rows, number of columns).
n_rows, n_cols = df1_grouped.shape
(n_rows, n_cols)

(103, 3)