# Scraping a Wikipedia page with BeautifulSoup and pandas

#### 1. Install prerequisites: lxml, html5lib and beautifulsoup4

In [3]:
pip install BeautifulSoup4

Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/b5/7bb03a696f2c9b7af792a8f51b82974e51c268f15e925fc834876a4efa0b/beautifulsoup4-4.9.0-py3-none-any.whl (109kB)
[K     |████████████████████████████████| 112kB 6.7MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/05/cf/ea245e52f55823f19992447b008bcbb7f78efc5960d77f6c34b5b45b36dd/soupsieve-2.0-py2.py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.0 soupsieve-2.0
Note: you may need to restart the kernel to use updated packages.


In [5]:

pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 6.2MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
from bs4 import BeautifulSoup
import pandas as pd
import requests 

In [9]:
# Address of the Wikipedia article to scrape.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() turns an HTTP error response into
# an exception instead of letting an error page be parsed as the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
wiki_page = response.text

# Parse the downloaded HTML using the html5lib parser.
soup = BeautifulSoup(wiki_page, "html5lib")


#### Extracting the table from the webpage

In [10]:

# Column accumulators: one entry per data row of the table.
postcode = []
borough = []
neighborhood = []

# Locate the table by its CSS classes.
rtable = soup.find('table', {'class': 'wikitable sortable'})
if rtable is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

for row in rtable.find_all('tr'):
    cells = row.find_all('td')
    # Header rows contain <th> rather than <td>, so they yield no cells and
    # are skipped; only complete three-column data rows are kept.
    if len(cells) == 3:
        # get_text() concatenates every text node inside the cell, so a cell
        # that mixes links and plain text is captured in full (taking only the
        # first text node would truncate it), and an empty cell gives ''
        # instead of None. Trailing newlines are kept here; they are removed
        # in the cleaning step further down.
        postcode.append(cells[0].get_text())
        borough.append(cells[1].get_text())
        neighborhood.append(cells[2].get_text())




#### Transforming our table into a pandas DataFrame

In [11]:
# Assemble the three scraped lists into one DataFrame, one column per list.
df = pd.DataFrame({
    'Postalcode': postcode,
    'Borough': borough,
    'Neighborhood': neighborhood,
})

# Preview the first five rows.
df.head(5)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [12]:
# Remove newline characters from every cell of the frame
# (regex replacement applied across all columns).
df = df.replace(to_replace='\n', value='', regex=True)

#### Remove "Not assigned" rows

In [13]:
# Discard rows that contain missing values.
df = df.dropna()

# Keep only rows where none of the three columns holds the placeholder text.
empty = 'Not assigned'
is_assigned = df[['Postalcode', 'Borough', 'Neighborhood']].ne(empty).all(axis=1)
df = df[is_assigned]


In [14]:
# Combine neighborhoods that share the same postal code into one row.
# Each column gets its own aggregation: neighborhoods are joined with commas,
# while boroughs are de-duplicated first so that a postal code spanning
# several rows does not end up with its borough repeated ("X,X,X").
df2 = df.groupby('Postalcode').agg({
    'Borough': lambda boroughs: ','.join(boroughs.unique()),
    'Neighborhood': lambda names: ','.join(names),
})


In [15]:

# Dimensions of the grouped frame as (rows, columns).
df2.shape

(103, 2)

In [16]:
# Move 'Postalcode' from the index back into a regular column, then preview.
# The guard keeps this cell safe to re-run: calling reset_index() a second
# time would otherwise add a spurious 'index' column holding the row numbers.
if 'Postalcode' not in df2.columns:
    df2 = df2.reset_index()
df2.head()


Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Question 2
####  Adding latitude and longitude to the data

In [7]:
# Load the geospatial CSV from its remote location into a DataFrame.
geo_csv_url = 'https://cocl.us/Geospatial_data'
geo_df = pd.read_csv(geo_csv_url)

In [8]:
# Preview the first rows of the geospatial table.
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
# Give the postal-code column the same name in both frames so it can serve
# as the join key.
geo_df = geo_df.rename(columns={'Postal Code': 'Postalcode'})

# Join the coordinates onto the grouped data by postal code.
merge_df = geo_df.merge(df2, on='Postalcode')


In [20]:
# Keep only the columns of interest, in presentation order.
column_order = ['Postalcode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
geo_data = merge_df[column_order]


In [21]:
# Preview the first rows of the merged, reordered frame.
geo_data.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [22]:
# Dimensions of the final frame as (rows, columns).
geo_data.shape

(103, 5)