## A. Importing Libraries

In [2]:
#import all libraries

#Data Processing
import pandas as pd

import numpy as np

#Coordinates
from geopy.geocoders import Nominatim
import requests

#Webscraping
from bs4 import BeautifulSoup
import json
from pandas.io.json import json_normalize

# Visualizations
import matplotlib
import matplotlib.cm as cm
import matplotlib.colors as colors
import seaborn as sns
import folium

# Clustering/Machine Learning
from sklearn.cluster import KMeans

## 1. Scrape Data

In [3]:
# Download the Wikipedia page "List of postal codes of Canada: M" as HTML text.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(wiki_url, timeout=30)  # bound the wait so the cell cannot hang indefinitely
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page as if it were the table
data = response.text

In [4]:
# Parse the downloaded HTML into a searchable tree using the 'html.parser' backend.
soup = BeautifulSoup(markup=data, features='html.parser')

## 2. Create Dataframe

In [5]:
# Column buffers for the scraped table: one independent, initially empty list per column.
postalcodes, boroughs, neighborhoods = [], [], []

In [6]:
# Walk every row of the first <table> on the page and collect its first three
# cells as postal code, borough and neighborhood.
# The buffers are rebuilt here so that re-running this cell does not append
# duplicate rows to lists filled by a previous run.
postalcodes, boroughs, neighborhoods = [], [], []

table = soup.find('table')
if table is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise ValueError("no <table> element found in the downloaded page")

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without <td> cells (e.g. a header row) and rows with fewer than
    # three cells cannot fill all three columns; skip them so the lists
    # always stay the same length.
    if len(cells) < 3:
        continue
    # Drop the trailing newline that ends each cell's text.
    postal_code, borough, neighborhood = (cell.text.rstrip('\n') for cell in cells[:3])
    postalcodes.append(postal_code)
    boroughs.append(borough)
    neighborhoods.append(neighborhood)

In [7]:
# Assemble the three column buffers into a DataFrame, one row per scraped table row.
scraped_columns = {
    "PostalCode": postalcodes,
    "Borough": boroughs,
    "Neighborhood": neighborhoods,
}
df = pd.DataFrame(data=scraped_columns)

# Preview the first rows.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## 3. Cleaning Data Frame

In [8]:
# Keep only the rows whose borough is assigned, then renumber the index from 0.
has_borough = df["Borough"].ne("Not assigned")
df_dropna = df.loc[has_borough].reset_index(drop=True)
df_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
# Collapse rows that share a postal code and borough into a single row,
# joining their neighborhood names with ", ".
group_keys = ["PostalCode", "Borough"]
df_grouped = (
    df_dropna
    .groupby(group_keys, as_index=False)
    .agg(lambda names: ", ".join(names))
)
df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
# Sanity check: show any grouped rows whose neighborhood is still "Not assigned".
unassigned_mask = df_grouped["Neighborhood"].eq("Not assigned")
df_grouped.loc[unassigned_mask]

Unnamed: 0,PostalCode,Borough,Neighborhood


In [11]:
# Sanity check: count how many grouped rows carry each neighborhood label.
neighborhood_labels = df_grouped["Neighborhood"]
neighborhood_labels.value_counts()

Downsview                                                4
Don Mills                                                2
Dorset Park, Wexford Heights, Scarborough Town Centre    1
The Annex, North Midtown, Yorkville                      1
Humber Summit                                            1
                                                        ..
Brockton, Parkdale Village, Exhibition Place             1
Golden Mile, Clairlea, Oakridge                          1
Christie                                                 1
Berczy Park                                              1
North Toronto West,  Lawrence Park                       1
Name: Neighborhood, Length: 99, dtype: int64

In [14]:
# Replicate the example dataframe: the rows for a fixed list of postal codes,
# in exactly the order the codes are listed.
column_names = ["PostalCode", "Borough", "Neighborhood"]
list1 = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# An inner merge keeps the order of the left-hand keys and drops any code that
# is absent from df_grouped, which is what the previous row-by-row
# DataFrame.append loop produced. DataFrame.append was removed in pandas 2.0,
# and appending inside a loop re-copies the whole frame on every iteration.
requested_codes = pd.DataFrame({"PostalCode": list1})
df2 = (
    requested_codes
    .merge(df_grouped, on="PostalCode", how="inner")
    .loc[:, column_names]  # pin the column order used by the example table
)

df2

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


In [15]:
# Report the size of the grouped frame as (number of rows, number of columns).
df_grouped.shape

(103, 3)