In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import folium
import requests
import urllib.request
#!conda install python3-bs4 --yes
!conda install beautifulsoup4 --yes
!conda install lxml --yes
from bs4 import BeautifulSoup
!conda install -c conda-forge geopy --yes
from geopy import geocoders
from geopy.geocoders import Nominatim
print("libraries imported")

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - beautifulsoup4


The following packages will be SUPERSEDED by a higher-priority channel:

  ca-certificates    conda-forge::ca-certificates-2020.4.5~ --> pkgs/main::ca-certificates-2020.1.1-0
  certifi            conda-forge::certifi-2020.4.5.1-py36h~ --> pkgs/main::certifi-2020.4.5.1-py36_0
  openssl            conda-forge::openssl-1.1.1g-h516909a_0 --> pkgs/main::openssl-1.1.1g-h7b6447c_0


Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    

In [2]:
wiki='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df=pd.read_html(wiki, header=0)[0]

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [3]:
uns_borough = df[df['Borough']!="Not assigned"]
uns_borough.head(50)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


#count the number of unique postal codes

In [4]:
print(len(df['Postal Code'].unique()))

180


Here we can see that all the postal codes are unique, so the corresponding neighborhoods are assigned to them

In [5]:
#check for neighborhood if not assigned value is there
for notAssign in uns_borough['Neighborhood']:
    if notAssign == "Not assigned":
        print(uns_borough[notAssign])

As we have checked that no "Not assigned" value is there in neighborhood column just for logic now we will now fill the Not assigned neighborhood with the corresponding borough

In [6]:
for bor, neigh in zip(uns_borough['Borough'],uns_borough['Neighborhood']):
    if neigh=='Not assigned':
        uns_borough[neigh] = uns_borough[bor]
        
uns_borough.tail()        

Unnamed: 0,Postal Code,Borough,Neighborhood
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing Centre
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


Submission 1: Shape of the DataFrame

In [7]:
print("Shape of the DataFrame is:",uns_borough.shape)

Shape of the DataFrame is: (103, 3)


In [8]:
df_geo = pd.read_csv('Geospatial_Coordinates.csv')
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Get the shape of the excel sheet

In [9]:
df_geo.shape

(103, 3)

We will check if both the dataframes(uns_borough and df_geo) have same sequence of postal codes

In [10]:
#For that let's first sort the dataframes based on Postal Code column
df1 = uns_borough.sort_values('Postal Code')
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
9,M1B,Scarborough,"Malvern, Rouge"
18,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
27,M1E,Scarborough,"Guildwood, Morningside, West Hill"
36,M1G,Scarborough,Woburn
45,M1H,Scarborough,Cedarbrae


In [11]:
df2 = df_geo.sort_values('Postal Code')
df2.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Now we will append the values of Latitude and Longitude in First dataframe

In [12]:
for pc1, pc2 in zip(df1['Postal Code'], df2['Postal Code']):
    if pc1 == pc2:
        df1['Latitude'] = df2['Latitude']
        df1['Longitude'] = df2['Longitude']
df1.head()        

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
9,M1B,Scarborough,"Malvern, Rouge",43.692657,-79.264848
18,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.778517,-79.346556
27,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7259,-79.340923
36,M1G,Scarborough,Woburn,43.695344,-79.318389
45,M1H,Scarborough,Cedarbrae,43.712751,-79.390197


In [20]:
loc_data = df1.reset_index(drop=True) 
loc_data.isnull().sum()

Postal Code      0
Borough          0
Neighborhood     0
Latitude        35
Longitude       35
dtype: int64

Now we will create tha map

In [21]:
#check for the null values and if null values are present, then we can remove the rows
#we can't replace the null values with null as in that case we will get multiple locations with same values
loc_data = loc_data.dropna(axis=0)
loc_data.shape

(68, 5)

In [22]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Lets create a map, with location points as Neighborhood and Borough

In [24]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat,lon,borough,neighor in zip(loc_data['Latitude'],loc_data['Longitude'],loc_data['Borough'],loc_data['Neighborhood']):
    label = '{}, {}'.format(neighor, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

We can observe that the area near west toronto has more neighborhoods.