## Web scrap data from a html table

In [0]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
import folium

In [233]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# request the url
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
req = requests.get(url)

In [0]:
# Get the table data
soup = BeautifulSoup(req.text, "html.parser")

table = soup.find('table', {'class': 'wikitable sortable'})

In [0]:
# Get all rows from the table
rows = table.findAll("tr")

In [0]:
# Create an array with the rows
data = []
for row in rows:
  data.append(row.text.strip().split('\n'))
data = np.array(data)

In [0]:
# Create a DataFrame with the data
df = pd.DataFrame(data)
df.columns = df.iloc[0]
df = df.drop(df.index[0])

#### Clean the data > 'Not assigned' values

In [239]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [0]:
df = df[df['Borough']!='Not assigned']

In [0]:
df['Neighbourhood'] = df['Borough'].where(df['Neighbourhood']=='Not assigned', df['Neighbourhood'])

In [0]:
new_df = pd.DataFrame(df.groupby('Postcode')['Neighbourhood'].apply(', '.join))
new_df.reset_index(inplace=True)
df.drop(labels=['Neighbourhood'], axis=1, inplace=True)
df.drop_duplicates(subset='Postcode', inplace=True)

In [0]:
df = df.merge(new_df, on='Postcode', sort=True)

In [244]:
df.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [245]:
df.shape

(103, 3)

## Get latitude and longitude

In [0]:
# Read coordinates csv fron each postal code
geo_coor = pd.read_csv('/content/drive/My Drive/GITHUB REPO/segmenting-and-clustering-NBHD-in-Toronto/Geospatial_Coordinates.csv')

In [247]:
geo_coor.rename(columns={'Postal Code': 'Postcode'}, inplace=True)
geo_coor.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [0]:
# Join two dataframes by Postcode
data = df.merge(geo_coor, on='Postcode')

In [0]:
# Create dataframe with Borough that contain 'Toronto'
toronto_data = data[data['Borough'].str.contains('Toronto')]

In [250]:
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [251]:
# Number of Boroughs > 'value of clusters'
len(toronto_data['Borough'].unique())

4

### Cluster the neighborhoods in Toronto

In [252]:
km_cluster = KMeans(n_clusters=4, random_state=42)
# km_cluster.fit(np.stack((toronto_data['Longitude'], toronto_data['Latitude']), axis=1))
km_cluster.fit(toronto_data[['Latitude', 'Longitude']])

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=42, tol=0.0001, verbose=0)

In [253]:
toronto_data['Cluster'] = km_cluster.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [0]:
toronto_data.head()

In [0]:
toronto_map = folium.Map(location=[43.65, -79.4], zoom_start=13)

cmap = ['#f0ed37', '#1e3ad9', '#11b302', '#f22272']

for latitude, longitude, cluster in zip(toronto_data['Latitude'],
                                        toronto_data['Longitude'],
                                        toronto_data['Cluster']):
  folium.CircleMarker([latitude, longitude],
                      color='#47b3ff',
                      fill=True,
                      fill_color=cmap[cluster],
                      fill_opacity=0.6,
                      radius=7).add_to(toronto_map)
