# Scraping a website using pandas

### Import Pandas Library

In [None]:
import os

import pandas as pd

### Use pd.read_html() to get all the tables on the website 

In [26]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# read_html fetches the page and returns a list of DataFrames, one per parsed table.
data = pd.read_html(url)

### `pd.read_html()` returns a list of DataFrames, one for each table found on the page. Choose the required table from the list.

In [30]:
# The postal-code table is the first DataFrame in the list returned by read_html.
list_toronto = data[0]
# Preview the first rows to confirm the expected columns are present.
list_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Ignore rows whose Borough value is "Not assigned"

In [38]:
# Keep only the rows whose Borough holds something other than the
# 'Not assigned' placeholder.
has_borough = list_toronto['Borough'].ne('Not assigned')
list_toronto = list_toronto.loc[has_borough]
list_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Check the shape of the dataframe

In [37]:
# (rows, columns) of the table after dropping the 'Not assigned' boroughs.
list_toronto.shape

(103, 3)

### Install Geocoder

In [40]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 1.7MB/s ta 0:00:011
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


### Define a function to get the coordinates of a postal code

In [42]:
import geocoder


def get_location(postal_code, max_attempts=5):
    """Return the latitude/longitude for a Toronto postal code.

    Sends the query ``'<postal_code>, Toronto, Ontario'`` to
    ``geocoder.google`` and retries while the response has no coordinates.

    Parameters
    ----------
    postal_code
        Value interpolated into the query; the notebook passes postal-code
        strings such as ``'M5A'``.
    max_attempts : int, optional
        Upper bound on the number of requests made for this postal code.
        At least one request is always made.

    Returns
    -------
    The ``latlng`` attribute of the first response that has one, or ``None``
    when every attempt came back without coordinates.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)

    # The retry has to be bounded: a geocoder that keeps answering without
    # coordinates would otherwise keep this loop running forever.
    for _ in range(max(1, max_attempts)):
        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords

    # Every attempt failed; let the caller decide how to handle the gap.
    return None
    


### Getting Coordinates using Geocoder

In [None]:
# Geocode every distinct postal code once and key the result by the code.
unique_codes = list_toronto['Postal Code'].unique()
post_code = list(unique_codes)
latlog = {code: get_location(code) for code in post_code}

### The geocoding loop above took an extremely long time, so a CSV file of coordinates is used instead.

In [48]:
# Location of the coordinates CSV. Set the GEO_CSV_PATH environment variable
# to the path where you stored the file; when it is unset, the original
# author's local path is used.
geo_csv_path = os.environ.get(
    "GEO_CSV_PATH", "/Users/SujayaJU/Downloads/Geospatial_Coordinates.csv"
)
location_data = pd.read_csv(geo_csv_path)
location_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Using pd.merge() to join two dataframes based on Postal Code

In [55]:
# Inner join on the shared 'Postal Code' column: only postal codes present in
# both frames are kept, each gaining the columns of the coordinates table.
list_toronto_latlog = list_toronto.merge(
    location_data,
    how='inner',
    on='Postal Code',
)
list_toronto_latlog.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
