# Submission for Part-2 
## Adding Location Data to the DataFrame

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

### requests library lets us get the html part of a website, .text retrives all its text content

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

### BeautifulSoup function helps working with website scrapping easier, it has methods like pretiffy() that makes the content clearer

In [3]:
soup=BeautifulSoup(source, 'lxml')

## The contents needed are present in the soup under table

In [4]:
table = soup.find('table').text

### The values of the table are all separated by '\n\n' in the string 'table'

In [5]:
values = table.split('\n\n')

### Remove the headings of the table from the list

In [6]:
col=values[1:4]
col

['Postal Code', 'Borough', 'Neighborhood']

In [7]:
values=values[4:]

## The values are to be split into various columns.

In [8]:
length = len(values)
length

540

### There are total 540 values meaning that there are 180 values for each column.

### a,b and c are the indices of values of each column respectively.

In [9]:
col1=[]
col2=[]
col3=[]
a=0
b=1
c=2
while a<180:
    col1.append(values[a])
    col2.append(values[b])
    col3.append(values[c])
    a=a+3
    b=b+3
    c=c+3

In [10]:
col1[:5]

['\nM1A', '\nM2A', '\nM3A', '\nM4A', '\nM5A']

In [11]:
col2[:5]

['Not assigned',
 'Not assigned',
 'North York',
 'North York',
 'Downtown Toronto']

In [12]:
col3[:5]

['', '', 'Parkwoods', 'Victoria Village', 'Regent Park, Harbourfront']

### All the values have a \n in front of the value.

In [13]:
col11=[]
for a in col1:
    spl = a.split('\n')
    col11.append(spl[1])

col11[:5]

['M1A', 'M2A', 'M3A', 'M4A', 'M5A']

In [14]:
col1=col11.copy()

## Create a Data Frame

In [15]:
df=pd.DataFrame(col1)
df.head()

Unnamed: 0,0
0,M1A
1,M2A
2,M3A
3,M4A
4,M5A


In [16]:
df[1]=col2
df[2]=col3


In [17]:
df.columns=col

In [18]:
df=df[df["Borough"]!='Not assigned']
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [19]:
df=df.reset_index()

In [20]:
df.shape

(38, 4)

In [21]:
df.columns

Index(['index', 'Postal Code', 'Borough', 'Neighborhood'], dtype='object')

In [22]:
df=df[col]

In [23]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [24]:
df['Neighborhood'].notnull().count()

38

# Part-2
## Adding Location to the DataFrame

In [25]:
location_data = pd.read_csv('Geospatial_Coordinates.csv')
location_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [26]:
x=df['Postal Code'].values
x[:5]

array(['M3A', 'M4A', 'M5A', 'M6A', 'M7A'], dtype=object)

In [27]:
#Define a empty list.
y=[]

#Loop through the keys present in df.
for a in x:
    q=location_data[location_data['Postal Code'].str.contains(a)]
    y.append(q['Latitude'].values)

y[:5]

[array([43.7532586]),
 array([43.7258823]),
 array([43.6542599]),
 array([43.718518]),
 array([43.6623015])]

In [28]:
df['Latitude']=y
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude
0,M3A,North York,Parkwoods,[43.7532586]
1,M4A,North York,Victoria Village,[43.725882299999995]
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",[43.6542599]
3,M6A,North York,"Lawrence Manor, Lawrence Heights",[43.718517999999996]
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",[43.6623015]


In [29]:
df['Latitude']=df['Latitude'].astype('float')

In [30]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude
0,M3A,North York,Parkwoods,43.753259
1,M4A,North York,Victoria Village,43.725882
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301


## We repeat the process for Longitude

In [31]:
#Define a empty list.
z=[]

#Loop through the keys present in df.
for a in x:
    q=location_data[location_data['Postal Code'].str.contains(a)]
    z.append(q['Longitude'].values)

df['Longitude']=z
df['Longitude']=df['Longitude'].astype('float')

In [32]:
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
