## Code for Data Scraping

In [1]:
#Importing Libraries
from bs4 import BeautifulSoup  #Package for scraping data
import pandas as pd
import requests  #To get the desired web page

## Data Scraping

# Download the Wikipedia page that lists Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of silently
# parsing an error page.
response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
response.raise_for_status()
website_url = response.text  # NOTE: despite its name, this holds the page's HTML text
wiki = BeautifulSoup(website_url, 'lxml')

# find() returns None when no element matches; fail here with a clear message
# rather than with an AttributeError in a later step.
my_table = wiki.find('table', {'class': 'wikitable sortable'})
if my_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

## Converting into a DataFrame

#In the following section the conversion to data frame is divided into 2 parts:
 #- Title
 #- Body

### Creating the Title List

# Column titles are stored in the table's <th> elements.
header_cells = my_table.find_all('th')

# Raw HTML of every header cell, kept as a list of strings as before.
TitleLink = [str(cell) for cell in header_cells]

# get_text(strip=True) returns only the visible text of each header with
# surrounding whitespace (including a trailing '\n') removed, so it does not
# depend on the exact length of the opening/closing tags the way fixed-offset
# slicing of the serialized tag does.
Title = [cell.get_text(strip=True) for cell in header_cells]

### Creating the Body List

# Data values are stored in the table's <td> elements, in document order.
TableLink = my_table.find_all('td')

# get_text(strip=True) keeps only the visible text of a cell: it drops the
# <td> wrapper, any nested <a> link markup and the trailing '\n', and it
# decodes HTML entities. This replaces fixed-offset slicing of the serialized
# tag, which only worked for cells shaped exactly like the ones observed.
cell_text = [cell.get_text(strip=True) for cell in TableLink]

# Every table row contributes three consecutive cells; an incomplete row would
# otherwise shift values into the wrong column.
if len(cell_text) % 3 != 0:
    raise ValueError(
        "Expected the number of <td> cells to be a multiple of 3, found %d" % len(cell_text)
    )

row1 = cell_text[0::3]  # List containing 'Postcode'
row2 = cell_text[1::3]  # List containing 'Borough'
row3 = cell_text[2::3]  # List containing 'Neighbourhood'

# Preview the first five parsed rows.
list(zip(row1, row2, row3))[:5]

## Data Manipulation



# Assemble the three scraped columns into a DataFrame and label them with the
# scraped header titles.
data = pd.DataFrame({'1': row1, '2': row2, '3': row3})
data.columns = Title

# Drop every row whose Borough is 'Not assigned'. Chaining reset_index() gives
# a fresh frame instead of mutating a filtered slice in place.
data = data[data['Borough'] != 'Not assigned'].reset_index(drop=True)

# Where the Neighbourhood is 'Not assigned', fall back to the Borough name.
# A boolean mask with .loc covers every row (including the first two) and
# writes to `data` itself; chained indexing such as data.iloc[i]['col'] = ...
# may assign to a temporary copy and leave `data` unchanged.
unassigned = data['Neighbourhood'] == 'Not assigned'
data.loc[unassigned, 'Neighbourhood'] = data.loc[unassigned, 'Borough']

# Combine rows that share a Postcode/Borough into a single row whose
# Neighbourhood is the comma-separated list of all their neighbourhoods.
data = (
    data.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Using geocoding to get the Latitude and Longitude for the different postal codes

Since the code above did not respond after several attempts, we will use a CSV file for the latitude and longitude instead.

## Reading and Merging the dataset with Latitude and Longitude

In [2]:
import os

In [3]:
# Switch the working directory to the folder the CSV is read from.
# Both backslashes are escaped: the previous '\C' was an invalid escape
# sequence that only worked by accident and triggers a warning on newer
# Python versions. The resulting path is unchanged.
# NOTE(review): this is a machine-specific absolute path; the notebook will
# not run elsewhere without editing it.
os.chdir('D:\\Capstone\\Clustering')

In [5]:
# Load the postal-code coordinates from the CSV in the current working
# directory and preview the first rows.
coordinates_csv = 'Geospatial_Coordinates.csv'
latlngdata = pd.read_csv(coordinates_csv)
latlngdata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Rename the key column so it matches `data` and the two frames can be merged.
# Renaming by name (instead of overwriting all column labels positionally)
# cannot mislabel Latitude/Longitude if the CSV's column order ever changes.
latlngdata = latlngdata.rename(columns={'Postal Code': 'Postcode'})
latlngdata.columns

Index(['Postcode', 'Latitude', 'Longitude'], dtype='object')

In [12]:
# Attach the coordinates to each postcode row; only postcodes present in both
# frames are kept (inner join on 'Postcode').
final = data.merge(latlngdata, how='inner', on='Postcode')
final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [13]:
# Display the (rows, columns) dimensions of the merged frame.
final.shape

(103, 5)