## Part 1 & 2: Installing libraries for web scraping and further analysis

In [1]:
pip install lxml html5lib beautifulsoup4




In [2]:
import pandas as pd

In [3]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Parse the HTML tables found on the page; the first column (the postal code)
# is used as the index of each resulting DataFrame.
dfs = pd.read_html(io=url, index_col=0)

# Number of tables that were parsed from the page.
print(len(dfs))

3


In [4]:
# The first parsed table is the borough / neighbourhood table used for the analysis.
df = dfs[0]
df.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Part 3: Creating the dataframe

In [5]:
# Keep only the postal codes whose borough is not 'Not assigned'.
df = df.loc[df['Borough'].ne('Not assigned')]
df.head(10)

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"
M3B,North York,Don Mills
M4B,East York,"Parkview Hill, Woodbine Gardens"
M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
# Move the 'Postal Code' index back into a regular column so it can serve as
# the merge key later on. The result is reassigned instead of using
# inplace=True, and the guard keeps the cell idempotent: without it, running
# the cell a second time would reset the fresh RangeIndex and insert a
# spurious 'index' column.
if 'Postal Code' not in df.columns:
    df = df.reset_index()

In [7]:
df.shape  # (rows, columns) of the dataframe after removing unassigned boroughs

(103, 3)

## Part 4: Link to project and latitude-longitude coordinates

In [8]:
# Preview the frame now that 'Postal Code' is a regular column.
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
# Load the latitude / longitude of each postal code from a CSV file located in
# the notebook's working directory.
df_cord = pd.read_csv('Geospatial_Coordinates.csv')
df_cord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# (rows, columns) of the coordinates table, for comparison with df before merging.
df_cord.shape

(103, 3)

In [11]:
# Attach the coordinates to each postal code; only postal codes present in
# both frames are kept (inner join on 'Postal Code').
df_lat_long = df.merge(df_cord, how='inner', on='Postal Code')
df_lat_long.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [12]:
# Preview of the merged frame (repeats the preview shown by the merge cell).
df_lat_long.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [13]:
# (rows, columns) of the merged frame; the row count shows whether any postal
# code was dropped by the merge.
df_lat_long.shape

(103, 5)

## Part 5: Clustering neighbourhoods and mapping them

In [31]:
# List the distinct borough names to see which ones belong to Toronto proper.
df_lat_long['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [44]:
# Select the boroughs whose name contains the word 'Toronto'.
df_toronto = df_lat_long[df_lat_long['Borough'].str.contains('Toronto')]

In [57]:
# Spot-check the borough filter on a few rows. A fixed random_state makes the
# sample reproducible, so a fresh "Restart & Run All" shows the same rows.
df_toronto.sample(5, random_state=0)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
69,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763
79,M4S,Central Toronto,Davisville,43.704324,-79.38879
97,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.38228
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188


In [105]:
# (rows, columns) of the Toronto-only subset.
df_toronto.shape

(39, 5)

In [62]:
# Importing the clustering and mapping libraries
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

In [103]:
# Base map of Toronto, centred on coordinates looked up online.
map_toronto = folium.Map(
    location=[43.651070, -79.347015],
    width=550,
    height=450,
    zoom_start=11,
    min_zoom=10,
    max_zoom=14,
)

# Register several alternative tile layers on the map.
for tile_name in ('Stamen Terrain', 'Stamen Toner', 'Stamen Water Color', 'cartodbdark_matter'):
    folium.TileLayer(tile_name).add_to(map_toronto)

# The layer control lets the viewer pick which tile layer to display.
folium.LayerControl().add_to(map_toronto)
map_toronto

In [104]:
# Coordinates of every Toronto postal code, as a list of [latitude, longitude] pairs.
locations = df_toronto.loc[:, ['Latitude', 'Longitude']]
locationlist = locations.to_numpy().tolist()
len(locationlist)


39

In [110]:
# Plot one marker per Toronto postal code, labelled with its own neighbourhood(s).
# NOTE(review): the name `map` shadows Python's built-in map(); it is kept so
# that any other cell referring to `map` keeps working.
map = folium.Map(location=[43.651070, -79.347015], zoom_start=12)

# Walk the three columns in lockstep so every marker receives the popup text of
# its own row. Previously the entire 'Neighbourhood' Series was passed as the
# popup of every marker, so no marker showed its own neighbourhood name.
for lat, lng, neighbourhood in zip(
    df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood']
):
    folium.Marker(
        [lat, lng],
        # parse_html=True escapes characters such as apostrophes in the names.
        popup=folium.Popup(neighbourhood, parse_html=True),
    ).add_to(map)
map

In [111]:
# Number of clusters to look for.
kclusters = 5

# Input to the clustering: the [latitude, longitude] pair of each postal code.
toronto_clustering = locationlist

# Build the k-means estimator with a fixed seed, then fit it on the coordinates.
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(toronto_clustering)

# Cluster labels assigned to the first ten rows.
kmeans.labels_[:10]

array([0, 0, 0, 0, 1, 0, 0, 3, 0, 2])