### 1. Importing necessary libraries.

In [1]:
import requests 
import pandas as pd 
import numpy as np 
import random 

# !conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# library for transforming a JSON file into a pandas dataframe
from pandas.io.json import json_normalize

# !conda install -c conda-forge folium=0.5.0 --yes
import folium
from bs4 import BeautifulSoup

print('Folium installed')
print('Libraries imported.')

Folium installed
Libraries imported.


### Scraping the table from Wikipedia! :)

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error page
# be parsed by the following cells.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
# NOTE: despite its name, `url` holds the HTML text of the page, not the address.
url = response.text

In [3]:
# Parse the downloaded HTML text (held in `url`) with BeautifulSoup's lxml parser.
soup = BeautifulSoup(url, 'lxml')

In [4]:
# Locate the table with class 'wikitable sortable'; find() returns the first match.
table = soup.find('table', class_= 'wikitable sortable')

In [5]:
# Display the first <th> inside the table body as a quick check of the selected table.
table.find('tbody').th

<th>Postal Code
</th>

In [6]:
# Read the text of every header cell and trim the newline characters around it.
header_cells = table.find('tbody').find_all('th')
table_header = [cell.text.strip('\n') for cell in header_cells]
table_header

['Postal Code', 'Borough', 'Neighborhood']

In [7]:
# Collect every <tr> element of the table (this includes the header row).
table_rows = table.find_all('tr')

In [8]:
# Extract the text of every <td> cell, one list per table row.
# The header row contains only <th> cells, so it yields an empty list; it is
# kept on purpose because a later cell removes that placeholder first row.
# The former blanket `except Exception` was removed: it could only hide a real
# parsing problem by silently appending None to the data.
final = [[cell.text for cell in tr.find_all('td')] for tr in table_rows]

### Creating a pandas dataframe from the extracted table!!!

In [9]:
# Build the raw dataframe: one row per scraped <tr>, labelled with the header texts.
df = pd.DataFrame(data=final, columns=table_header)
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,,,
1,M1A\n,Not assigned\n,Not assigned\n
2,M2A\n,Not assigned\n,Not assigned\n
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n
...,...,...,...
176,M5Z\n,Not assigned\n,Not assigned\n
177,M6Z\n,Not assigned\n,Not assigned\n
178,M7Z\n,Not assigned\n,Not assigned\n
179,M8Z\n,Etobicoke\n,"Mimico NW, The Queensway West, South of Bloor,..."


### 4. For Neighborhood="Not assigned", make the value the same as Borough!!

In [10]:
# Where the neighbourhood is "Not assigned", fall back to the borough value.
# Two things differ from a plain iterrows() loop:
#   * the comparison strips whitespace, because the scraped cells still carry
#     a trailing newline at this point and would never equal "Not assigned";
#   * the update goes through .loc, because rows yielded by iterrows() are
#     copies and assigning to them never reaches `df`.
unassigned = df["Neighborhood"].str.strip() == "Not assigned"
df.loc[unassigned, "Neighborhood"] = df.loc[unassigned, "Borough"]

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,,,
1,M1A\n,Not assigned\n,Not assigned\n
2,M2A\n,Not assigned\n,Not assigned\n
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n


### 5. Cleaning the data frame, i.e. ignoring the 'Not assigned' values!

In [11]:
# Remove the empty placeholder row produced by the table's header <tr>.
# dropna(how='all') replaces the positional in-place drop so that re-running
# this cell does not discard one more real row each time.
df = df.dropna(how='all').copy()
for column in table_header:
    # .str.strip skips missing values, so the cell can also be re-run after
    # 'Not assigned' has already been replaced with NaN.
    df[column] = df[column].str.strip('\n').replace('Not assigned', np.nan)

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
1,M1A,,
2,M2A,,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [12]:
# Count the missing values in each column.
df.isnull().sum()

Postal Code      0
Borough         77
Neighborhood    77
dtype: int64

In [13]:
# Show the rows that contain at least one missing value.
df[df.isna().any(axis=1)]

Unnamed: 0,Postal Code,Borough,Neighborhood
1,M1A,,
2,M2A,,
8,M8A,,
11,M2B,,
16,M7B,,
...,...,...,...
175,M4Z,,
176,M5Z,,
177,M6Z,,
178,M7Z,,


In [14]:
# Keep only rows where Borough and Neighborhood are not both missing.
location_columns = ['Borough', 'Neighborhood']
final_df = df.dropna(subset=location_columns, how='all')
final_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [15]:
# Count the remaining missing values per column after the drop.
final_df.isnull().sum()

Postal Code     0
Borough         0
Neighborhood    0
dtype: int64

In [16]:
# (rows, columns) of the filtered dataframe.
final_df.shape

(103, 3)

### 6. Combine rows with duplicate postal codes!

In [17]:
# List the distinct borough names present in the data.
final_df.Borough.unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [18]:
# Show any rows whose postal code already appeared in an earlier row.
repeated_postal_code = final_df.duplicated(subset=['Postal Code'])
final_df[repeated_postal_code]

Unnamed: 0,Postal Code,Borough,Neighborhood


In [19]:
def _join_names(names):
    """Concatenate the values of one group into a single comma-separated string."""
    return ", ".join(names)


# Collapse rows sharing the same postal code and borough into one row whose
# remaining column holds the comma-separated values of the group.
postal_groups = final_df.groupby(["Postal Code", "Borough"], as_index=False)
new_df = postal_groups.agg(_join_names)
new_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### 7. Changing column names according to the question!!

In [20]:
# Rename the 'Postal Code' column to 'PostalCode' and rebind the result.
column_renames = {'Postal Code': 'PostalCode'}
new_df = new_df.rename(columns=column_renames)
new_df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
102,M9W,Etobicoke,"Northwest, West Humber - Clairville"


### Printing the shape of the cleaned dataframe!!!

In [21]:
# (rows, columns) of the cleaned, grouped dataframe.
new_df.shape

(103, 3)

### 8. Getting Latitude and Longitude of each Neighborhood!!

Note: I tried using the geocoder API to get the coordinates, but each call took a long time and returned None for most of the neighborhoods. Therefore, I downloaded the CSV file provided in the course and created the dataframe from it.

In [22]:
# import geocoder 

# lat_lng_coords = None
# postal_code = 'M5G'
# # loop until you get the coordinates
# while(lat_lng_coords is None):
#   g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#   lat_lng_coords = g.latlng

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

In [23]:
# Load the coordinates CSV; the path is relative to the notebook's working directory.
geo_data = pd.read_csv('Geospatial_Coordinates.csv')
geo_data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [24]:
# Check element-wise that the postal codes of both frames match row by row.
(new_df['PostalCode'] == geo_data['Postal Code'])

0      True
1      True
2      True
3      True
4      True
       ... 
98     True
99     True
100    True
101    True
102    True
Length: 103, dtype: bool

In [25]:
# Attach latitude/longitude to each postal code by joining on the code itself.
# A keyed merge replaces the positional concat, which would silently assign the
# wrong coordinates if the two frames were ever ordered or indexed differently.
# validate='many_to_one' makes pandas raise if the coordinates file lists a
# postal code more than once.
coordinates = geo_data[['Postal Code', 'Latitude', 'Longitude']]
toronto = (
    new_df
    .merge(coordinates, how='left', left_on='PostalCode', right_on='Postal Code',
           validate='many_to_one')
    .drop(columns='Postal Code')  # duplicate of the 'PostalCode' key column
)
toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [26]:
# Count missing values per column in the combined dataframe.
toronto.isnull().sum()

PostalCode      0
Borough         0
Neighborhood    0
Latitude        0
Longitude       0
dtype: int64

In [27]:
# (rows, columns) of the combined dataframe.
toronto.shape

(103, 5)