### Importing Libraries

In [2]:
!conda install beautifulsoup4

from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.9.0       |           py36_0         165 KB

The following packages will be UPDATED:

    beautifulsoup4: 4.7.1-py36_1 --> 4.9.0-py36_0


Downloading and Extracting Packages
beautifulsoup4-4.9.0 | 165 KB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


### Setting maxcolwidth to 800

In [3]:
pd.set_option('max_colwidth', 800)

### Assigning the source web page to the variable and initializing Beautiful Soup to the soup variable 

In [4]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text 
soup = BeautifulSoup(source, 'lxml')

### Initializing csv_writer object and writing the names of the columns in it as the first row

In [5]:
csv_file = open('toronto_details.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Postcode', 'Borough', 'Neighbourhood'])

32

### Scraping the web page and extracting the data table

In [6]:
table = soup.find('table', class_ = 'wikitable sortable')
rows = table.find_all('tr')

postcodes = []
boroughs = []
neighbourhoods = []

for row in rows:    
    columns = row.find_all('td')
    try :
        if columns[1].text != 'Not assigned' :
            
            Postcode = columns[0].text
            postcodes.append(Postcode)
            
            Borough = columns[1].text
            boroughs.append(Borough)
            
            Neighbourhood = columns[2].text.split('\n')[0]
            
            if Neighbourhood == 'Not assigned':
                Neighbourhood = Borough            
                
            neighbourhoods.append(Neighbourhood)
             
    except Exception as e :
        pass 
    
postcode_explored = []
for index_i, postcode_i in enumerate(postcodes) :   
    if postcode_i not in postcode_explored :
        nbds = neighbourhoods[index_i]
        for index_f, postcode_f in enumerate(postcodes) :
            if postcode_i == postcode_f and index_i != index_f:
                nbds = nbds + ', ' + neighbourhoods[index_f]
        csv_writer.writerow([postcode_i, boroughs[index_i], nbds])
        postcode_explored.append(postcode_i)
     

### Closing the csv file

In [7]:
csv_file.close()

### Creating the pandas dataframe and displaying it

In [8]:
df = pd.read_csv('toronto_details.csv')
df1 = df[df.Borough != 'Not assigned\n']
df1.set_index('Postcode', inplace = True)
df1.reset_index(inplace = True)
df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A\n,North York\n,Parkwoods
1,M4A\n,North York\n,Victoria Village
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government"


### Sorting the dataframe in the ascending order wrt Postcode and thereby matching the coordinates with 

In [15]:
df1 = df1.sort_values('Postcode')
df1.set_index('Postcode', inplace = True)
df1.reset_index(inplace = True)
df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B\n,Scarborough\n,"Malvern, Rouge"
1,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek"
2,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill"
3,M1G\n,Scarborough\n,Woburn
4,M1H\n,Scarborough\n,Cedarbrae


### Displaying the shape of the dataframe

In [16]:
df1.shape

(103, 3)

### Dataframe highlighting Geographical Coordinates of each Postal Code

In [17]:
df2 = pd.read_csv('http://cocl.us/Geospatial_data')
df2.drop('Postal Code', axis = 1, inplace = True)
df2.head()

Unnamed: 0,Latitude,Longitude
0,43.806686,-79.194353
1,43.784535,-79.160497
2,43.763573,-79.188711
3,43.770992,-79.216917
4,43.773136,-79.239476


In [21]:
df3 = pd.concat([df1, df2], axis = 1)
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B\n,Scarborough\n,"Malvern, Rouge",43.806686,-79.194353
1,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G\n,Scarborough\n,Woburn,43.770992,-79.216917
4,M1H\n,Scarborough\n,Cedarbrae,43.773136,-79.239476
