In [78]:
import pandas as pd
import numpy as np

## Part 1: Building the dataframe

In [79]:
# Parse the HTML tables on the Wikipedia page; the postal-code table is the first one.
postal_codes_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_tables = pd.read_html(postal_codes_url)
df = wiki_tables[0]
# Optional: keep a local copy with df.to_csv('beautifulsoup_pandas.csv', header=0, index=False)
print(df.head())

  Postcode           Borough     Neighbourhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront


### Cleaning the data

#### Remove rows with "Not assigned" values in Borough

In [81]:
# Drop, in place, every row whose Borough is the placeholder 'Not assigned'.
# The surviving rows keep their original index labels (no reset).
unassigned_borough = df['Borough'] == 'Not assigned'
df.drop(index=df[unassigned_borough].index, inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


#### Combine rows that share the same Postcode and Borough, joining their Neighbourhood values into a single comma-separated row

In [82]:
# One output row per (Postcode, Borough) pair; the values of the remaining
# column(s) within each group are concatenated with ", ".
group_keys = ['Postcode', 'Borough']
df_g = (
    df
    .groupby(group_keys, as_index=False)
    .agg(', '.join)
)
# Spot check: df_g.loc[df_g['Postcode'] == 'M5V']
df_g.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Replace "Not assigned" Neighbourhood values with the corresponding Borough

In [83]:
# Rows whose Neighbourhood is the placeholder take their Borough name instead.
# The mask is built once and reused on both sides of the assignment.
no_neighbourhood = df_g['Neighbourhood'] == 'Not assigned'
df_g.loc[no_neighbourhood, 'Neighbourhood'] = df_g.loc[no_neighbourhood, 'Borough']
# Spot check: df_g.loc[df_g['Postcode'] == 'M7A']

#### Print the shape

In [84]:
# Display the (rows, columns) dimensions of the grouped frame.
df_g.shape

(103, 3)

## Part 2: Obtain Longitude and Latitude

#### Get the longitude and latitude

In [87]:
!wget -q -O 'long_lat.csv' http://cocl.us/Geospatial_data
print('Data downloaded!')

long_lat = pd.read_csv('long_lat.csv')
#df.to_csv('beautifulsoup_pandas.csv',header=0,index=False)

# rename postal code column to Postcode

long_lat.rename(columns={'Postal Code':'Postcode'}, inplace=True)

print (long_lat.head())

Data downloaded!
  Postcode   Latitude  Longitude
0      M1B  43.806686 -79.194353
1      M1C  43.784535 -79.160497
2      M1E  43.763573 -79.188711
3      M1G  43.770992 -79.216917
4      M1H  43.773136 -79.239476


#### Add the longitude and latitude columns to the grouped dataframe

In [93]:
# Attach coordinates to each postcode row; the inner join keeps only
# postcodes that appear in both frames.
df_final = df_g.merge(long_lat, on='Postcode', how='inner')
df_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3: Clustering