## Imports

In [22]:
import pandas as pd
import numpy as np
import json

from geopy.geocoders import Nominatim # to conver address to lat and long values

import requests # handle requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize # JSON to pandas dataframe

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

import folium # for maps

# PART 1

## Scraping data from Wikipedia

In [23]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to the HTML parser.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
data = response.text

In [24]:
# Parse the downloaded HTML using the 'html.parser' backend.
soup = BeautifulSoup(data, 'html.parser')

# Parallel accumulators: one entry per table row for each of the three columns.
postalCodes, boroughs, neighborhoods = [], [], []

In [25]:
# Extract postal code, borough and neighborhood from every row of the first
# table on the page.

# Start from empty lists so that re-running this cell does not append the same
# rows a second time.
postalCodes, boroughs, neighborhoods = [], [], []

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    # The header row has no <td> cells and some rows may be empty; a data row
    # must provide all three columns, otherwise cells[2] raises IndexError.
    if len(cells) >= 3:
        postalCodes.append(cells[0].text.rstrip('\n'))
        boroughs.append(cells[1].text.rstrip('\n'))
        neighborhoods.append(cells[2].text.rstrip('\n'))

In [26]:
# Assemble the three scraped lists into one dataframe, one column per list,
# and preview the first rows.
scraped_columns = dict(
    PostalCode=postalCodes,
    Borough=boroughs,
    Neighborhood=neighborhoods,
)
toronto_df = pd.DataFrame(scraped_columns)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# Note:
<p>As of 28 June 2020, the neighborhoods have already been grouped together by borough on the Wikipedia page; the code to do the grouping has been included anyway.</p>

## Drop rows with Borough == "Not assigned"

In [27]:
# Keep only the rows whose Borough is not the 'Not assigned' placeholder,
# renumber the index from zero, then report the new shape and preview the rows.
has_borough = toronto_df['Borough'] != 'Not assigned'
toronto_df = toronto_df.loc[has_borough].reset_index(drop=True)
print(toronto_df.shape)
toronto_df.head()

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Group neighborhoods that share the same postal code and borough

In [28]:
# Collapse rows that share the same (PostalCode, Borough) pair into a single
# row; the values of the remaining column are joined with ", ".
by_code_and_borough = toronto_df.groupby(["PostalCode", "Borough"], as_index=False)
toronto_df = by_code_and_borough.agg(lambda names: ", ".join(names))
print(toronto_df.shape)
toronto_df.head()

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## For unassigned neighborhoods, set Neighborhood to the Borough value

In [29]:
# Replace the 'Not assigned' placeholder in Neighborhood with the row's Borough.
# The assignment goes through .loc on the dataframe itself: rows yielded by
# iterrows() are copies, so writing to them would leave toronto_df unchanged.
unassigned = toronto_df['Neighborhood'] == 'Not assigned'
toronto_df.loc[unassigned, 'Neighborhood'] = toronto_df.loc[unassigned, 'Borough']
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Verify according to the dataframe image in the question.

In [30]:
# Build a small verification frame containing the rows for a fixed list of
# postal codes, in the order the codes are listed.
cols = ['PostalCode', 'Borough', 'Neighborhood']

test_list = ['M5G', 'M2H', 'M4B', 'M1J', 'M4G', 'M4M', 'M1R', 'M9V', 'M9L', 'M5V', 'M1B', 'M5A']

# Gather the matching rows first and concatenate once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop re-copies the growing
# frame on every iteration. Codes with no match contribute an empty slice.
matches = [toronto_df[toronto_df['PostalCode'] == postalcode] for postalcode in test_list]
test_df = pd.concat(matches, ignore_index=True).reindex(columns=cols)
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


## Finally, print shape of the dataframe

In [31]:
# Display the (rows, columns) shape of the cleaned dataframe.
toronto_df.shape

(103, 3)

# Part 2

## load location coordinates from coursera

In [34]:
# Read the postal-code coordinate table directly from its URL and preview it.
geo_csv_url = 'https://cocl.us/Geospatial_data'
coordinates = pd.read_csv(geo_csv_url)
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [35]:
# Rename 'Postal Code' to 'PostalCode' so the key column matches toronto_df.
# The result is reassigned rather than mutated with inplace=True, which keeps
# the step chainable and makes the data lineage explicit.
coordinates = coordinates.rename(columns={'Postal Code': 'PostalCode'})
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Merge coordinates and toronto_df

In [36]:
# Left-join the coordinates onto the neighborhood table by postal code, so
# every row of toronto_df is kept even if it has no matching coordinates.
final_df = pd.merge(toronto_df, coordinates, how="left", on="PostalCode")
final_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Creating output as shown on submission page

In [37]:
# Build the verification frame for a fixed list of postal codes, this time
# from the merged dataframe so the coordinates are included.
cols = ['PostalCode', 'Borough', 'Neighborhood']

test_list = ['M5G', 'M2H', 'M4B', 'M1J', 'M4G', 'M4M', 'M1R', 'M9V', 'M9L', 'M5V', 'M1B', 'M5A']

# Gather the matching rows first and concatenate once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop re-copies the growing
# frame on every iteration. Codes with no match contribute an empty slice.
matches = [final_df[final_df['PostalCode'] == postalcode] for postalcode in test_list]
test_df = pd.concat(matches, ignore_index=True)

# Identifying columns first, followed by the merged-in coordinates.
test_df = test_df.reindex(columns=cols + ['Latitude', 'Longitude'])
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


# Part 3

## using geopy to get LAT & LONG of Toronto

In [38]:
# Geocode the city name to obtain the coordinates used to centre the map.
address = 'Toronto'
geolocator = Nominatim(user_agent = 'lol-app') # for the purpose of assignment; give any name to user_agent
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError("Could not geocode address: {!r}".format(address))

latitude = location.latitude
longitude = location.longitude
print("Toronto coordinates : {}, {}".format(latitude, longitude))

Toronto coordinates : 43.6534817, -79.3839347


## Create a map of Toronto

In [49]:
# Base map centred on the geocoded city coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10.3)

# Add one circle marker per row of final_df; its popup reads
# "<neighborhood>, <borough>".
marker_rows = zip(
    final_df['Latitude'],
    final_df['Longitude'],
    final_df['Borough'],
    final_df['Neighborhood'],
)
for lat, long, borough, neighborhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)

map_toronto