### Installing the required libraries

In [None]:
#pip install beautifulsoup4
#pip install lxml
#pip install requests

### Importing the required libraries
`re` is used to clean the string data: with the `.split()` method alone, it proved challenging and time-consuming to remove the '\r' and '\n' characters from the scraped data (when appending to the .csv file).

In [9]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
import re

### Setting the required variables
Also, creating the soup object from which the table data is to be parsed

In [13]:
# Download the Wikipedia page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops on an HTTP error instead of
# silently parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

# Parse the HTML and narrow the search down to the page body
soup = BeautifulSoup(source, 'lxml')
article = soup.find('body')

# Locate the postal-code table; fail with a clear message if it is not on the page
wikitable = article.find('table', class_='wikitable sortable')
if wikitable is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

# Use the <tbody> when the markup has one, otherwise fall back to the table itself
# (find_all('tr') works on either)
table = wikitable.tbody if wikitable.tbody is not None else wikitable

### Creating a .csv file, iterating through the soup table object and parsing data, appending the data, and closing the .csv file

In [14]:
# Creates the .csv file and writes the scraped table into it.
# The `with` block closes the file even if parsing fails part-way through.
# newline='' is what the csv module expects (avoids blank lines between rows on
# Windows), and an explicit encoding keeps the file readable by pd.read_csv's
# UTF-8 default on every platform.
with open('toronto_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['PostalCode', 'Borough', 'Neighborhood'])

    # Iterates through the table to extract, clean, and append each data row
    for row in table.find_all('tr'):
        # Remove carriage returns / newlines from the text of every <td> cell
        row_values = [re.sub(r'(\r+|\n)', '', cell.text) for cell in row.find_all('td')]

        # Rows without any <td> cells (presumably the header row) would otherwise
        # be written as empty lines, so they are skipped
        if row_values:
            csv_writer.writerow(row_values)

### Taking a look at the raw data prior to processing / cleaning

In [15]:
# Load the CSV written by the scraping step into a DataFrame and preview the first rows
raw_dataset = pd.read_csv('toronto_data.csv')
raw_dataset.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Filtering out the raw data rows where Borough is not assigned, as instructed

In [16]:
# Index labels of every row whose Borough equals 'Not assigned'
indexNames = raw_dataset.loc[raw_dataset['Borough'] == 'Not assigned'].index

# Remove those rows from the DataFrame in place
raw_dataset.drop(index=indexNames, inplace=True)

# Preview the filtered data
raw_dataset.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Looks good!

### And, setting not assigned neighborhoods to the borough value
...upon inspection it was found that no neighborhoods were set to "not assigned", a few had NaN values, however, all of these were attributed to observations where the Borough was not assigned. As such, they were filtered out, and this task is done.

### Resetting index and printing shape of df

In [17]:
# Renumber the rows from 0 in place; drop=True discards the old index instead of
# keeping it as a new column
raw_dataset.reset_index(drop=True, inplace=True)
# Report the (rows, columns) shape of the DataFrame
print(f'The raw data set is of dimensions {raw_dataset.shape}')

The raw data set is of dimensions (103, 3)


In [18]:
# Check that the index is good: preview the first rows together with their index labels
raw_dataset.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Importing the geocoder

In [19]:
#pip install geocoder
import geocoder

In [None]:
# Look up the coordinates of a single postal code with the geocoder package.

# NOTE(review): the original value 'MG5' does not follow the letter-digit-letter
# pattern of the postal codes used elsewhere in this notebook (e.g. 'M5A');
# 'M5G' is assumed to be the intended code -- confirm.
postal_code = 'M5G'

# Upper bound on lookups so the cell can never spin forever when the geocoding
# service keeps returning nothing
max_attempts = 5

# initialize your variable to None
lat_lng_coords = None

# retry until coordinates come back or the attempts are used up
for attempt in range(max_attempts):
    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng
    # a truthiness check covers both None and an empty list
    if lat_lng_coords:
        break

if lat_lng_coords:
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
else:
    # Report the failure and leave the coordinates unset rather than hanging or crashing
    print(f'No coordinates returned for {postal_code} after {max_attempts} attempts')
    latitude = None
    longitude = None

### I found that the while loop above ran indefinitely (the geocoder never returned coordinates).
Because of this, I used the provided CSV file of geospatial coordinates instead.

In [21]:
# Load the provided postal-code coordinates file and preview the first rows
geospatial_data = pd.read_csv('Geospatial_Coordinates.csv')
geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Setting up the final df

In [24]:
#Creating two more columns for Latitude and Longitude, and initializing values as None (for post-processing inspection purposes)
raw_dataset['Latitude'] = None
raw_dataset['Longitude'] = None

raw_dataset.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
3,M6A,North York,"Lawrence Manor, Lawrence Heights",,
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,


### Using enumerate to return a tuple of both index and str value, the code below iterates through and sets the appropriate values to the latitude and longitude!

In [55]:
# Fill Latitude / Longitude by looking each postal code up in geospatial_data.
#
# Why not raw_dataset.iloc[i][3] = value?  That is chained assignment: .iloc[i]
# returns a temporary row Series, so the write is not guaranteed to reach
# raw_dataset (and never does under pandas Copy-on-Write). Mapping whole columns
# is reliable and avoids the row-by-row loop.

# Lookup table indexed by postal code; drop_duplicates keeps the first entry per
# code, matching the original "first match" behaviour
coords_by_code = (
    geospatial_data
    .drop_duplicates(subset='Postal Code')
    .set_index('Postal Code')
)

# setting the latitude
raw_dataset['Latitude'] = raw_dataset['PostalCode'].map(coords_by_code['Latitude'])

# setting the longitude
raw_dataset['Longitude'] = raw_dataset['PostalCode'].map(coords_by_code['Longitude'])

# Fail loudly if any postal code has no entry in the coordinates file
unmatched_codes = raw_dataset.loc[raw_dataset['Latitude'].isna(), 'PostalCode'].tolist()
assert not unmatched_codes, f'No coordinates found for postal codes: {unmatched_codes}'

### Quick inspection below (looks good!)

In [61]:
# Preview the first rows now that Latitude and Longitude have been filled in
raw_dataset.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7533,-79.3297
1,M4A,North York,Victoria Village,43.7259,-79.3156
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6543,-79.3606
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7185,-79.4648
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6623,-79.3895
