# INSTALL THE LIBRARIES

In [None]:
!pip install overpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install folium

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install scikit-learn-extra

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn-extra
  Downloading scikit_learn_extra-0.2.0-cp37-cp37m-manylinux2010_x86_64.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.3 MB/s 
Installing collected packages: scikit-learn-extra
Successfully installed scikit-learn-extra-0.2.0


# Import the libraries

In [None]:
import pandas as pd
import folium
import overpy
from sklearn.cluster import *
from sklearn import metrics
import numpy as np

# Overpass API query to fetch amenities in Vancouver




In [None]:
# Overpass QL request: select every node, way and relation that carries an
# "amenity" tag inside the administrative area named "Vancouver", returned as
# JSON. The text below is sent verbatim.
overpass_query = """[out:json];
    area[name="Vancouver"][boundary=administrative]->.searchArea;
    (node["amenity"](area.searchArea);
     way["amenity"](area.searchArea);
     relation["amenity"](area.searchArea);
    );
    out center;
    """

# Run the query through an overpy client; `result` holds the parsed response.
api = overpy.Overpass()
result = api.query(overpass_query)

# Convert to Dataframe and Remove Unnecessary Amenities

In [None]:
# Build one record per tagged node: the node's tags plus its id and coordinates.
# A fresh dict is created for every record so the tag dicts held by `result`
# are not modified, and untagged nodes are skipped with a plain truthiness
# test (the previous `len(...) is not 0` compared identity, not value).
# NOTE(review): only `result.nodes` is read here; ways and relations returned
# by the query are not included in the frame -- confirm that is intended.
tags = [
    {
        **node.tags,
        'node_id': node.id,
        'lat': float(node.lat),
        'lon': float(node.lon),
    }
    for node in result.nodes
    if node.tags
]
df = pd.DataFrame(tags)
len(df)

In [None]:
# Amenity values that are dropped from the analysis.
remove_amenity = [
    'arts_centre',
    "Ayurvedic Hospital",
    "baby_hatch",
    "bench",
    "bicycle_parking",
    "bicycle_rental",
    "bicycle_repair_station",
    "bureau_de_change",
    "car_rental",
    "car_wash",
    "charging_station",
    "fountain",
    "grave_yard",
    "House",
    "language_school",
    "meditation_centre",
    "motorcycle_parking",
    "orphanage",
    "payment_terminal",
    "photo_booth",
    "post_depot",
    "recycling",
    "shelter",
    "social_centre",
    "social_facility",
    "telephone",
    "training",
    "tuition",
    "vending_machine",
    "veterinary",
    "waste_basket",
    "waste_disposal",
    "waste_transfer_station",
    "water_point",
    "weighbridge",
]

# Filter in a single pass with a membership mask instead of re-filtering the
# whole frame once per unwanted value. Rows whose amenity is missing are kept,
# exactly as with the repeated `!=` comparisons.
df = df[~df.amenity.isin(remove_amenity)]

print(df.shape)
df.head()

(4964, 361)


Unnamed: 0,addr:housenumber,addr:street,amenity,name,opening_hours,operator,operator:wikidata,operator:wikipedia,website,node_id,...,level:ref,maxheight:signed,addr:floor,building,note:address_buildingnumber,tower:type,payment:insurance,payment:venmo,source:maxheight,ref:gbfs
0,1950.0,Argyle Drive,library,VPL Fraserview Branch,"Tu,We 10:00-21:00; Th-Sa 10:00-18:00; Su 13:00...",Vancouver Public Library,Q1376408,en:Vancouver Public Library,https://www.vpl.ca/location/fraser-branch,249407303,...,,,,,,,,,,
1,1396.0,East 41st Avenue,fuel,Shell,,Shell,,,,255435370,...,,,,,,,,,,
2,300.0,Cambie Street,pub,The Cambie,,,,,,286316995,...,,,,,,,,,,
3,,,toilets,,dawn-dusk,,,,,289809375,...,,,,,,,,,,
4,2398.0,East 1st Avenue,fuel,Petro-Canada,Mo-Fr 06:30-23:00; Sa-Su 07:00-23:00,Petro-Canada,,,,291466861,...,,,,,,,,,,


# Convert lat and lon to a NumPy array

In [None]:
# Pull the two coordinate columns out of the frame as a 2-column array
# (column 0 = lat, column 1 = lon) and display it.
lat_lon = df[['lat', 'lon']]
coords = lat_lon.to_numpy()
coords

array([[  49.2196414, -123.0668448],
       [  49.2324294, -123.0774537],
       [  49.2831924, -123.1090499],
       ...,
       [  49.2114385, -123.1024948],
       [  49.2636185, -123.1250807],
       [  49.27911  , -123.119003 ]])

# DBSCAN for outlier removal

In [None]:
# Kilometres per radian of arc on the Earth's surface; used to express a
# distance in km as the angular distance the haversine metric works with.
kms_per_radian = 6371.0088
# Neighbourhood radius: 0.5 km converted to radians.
epsilon = 0.5 / kms_per_radian

# Haversine expects [lat, lon] in radians, hence np.radians(coords).
dbsc = DBSCAN(eps=epsilon, min_samples=10, algorithm='ball_tree', metric='haversine')
y_dbsc = dbsc.fit(np.radians(coords))  # fit() returns the fitted estimator itself
cluster_labels = dbsc.labels_

# DBSCAN marks noise points with the label -1. That label is not a cluster,
# so it must not be counted: real clusters are labelled 0 .. num_clusters - 1.
num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
print(num_clusters)

Silhouette Coefficient: 0.034


In [None]:
# Group the coordinates by label: one array of points for each label in
# 0 .. num_clusters - 1.
clusters = pd.Series([coords[cluster_labels == label] for label in range(num_clusters)])

# Boolean mask over all points: True where DBSCAN reported a core sample.
core_samples = np.zeros(cluster_labels.shape, dtype=bool)
core_samples[dbsc.core_sample_indices_] = True

# Show how many points are core samples (True) versus not (False).
np.unique(core_samples, return_counts=True)

In [None]:
# Coordinate columns of the filtered frame, and the core-sample mask wrapped
# as a Series so it can be reused for row selection.
data = df.loc[:, ['lat', 'lon']]
s = pd.Series(core_samples, name='bools')

# Compare the shape of the rows flagged as core samples with the full shape.
(data.loc[s.to_numpy()].shape, data.shape)

((4489, 2), (4964, 2))

# Dataframe after removal of outliers

In [None]:
# Work on a copy of the amenity frame and keep only the rows that the
# boolean mask `s` marks as True (the DBSCAN core samples).
data_amenities = df.copy()
df_amenities = data_amenities.loc[s.to_numpy()]

print(df_amenities.shape)
df_amenities.head()

(4489, 361)


Unnamed: 0,addr:housenumber,addr:street,amenity,name,opening_hours,operator,operator:wikidata,operator:wikipedia,website,node_id,...,level:ref,maxheight:signed,addr:floor,building,note:address_buildingnumber,tower:type,payment:insurance,payment:venmo,source:maxheight,ref:gbfs
0,1950.0,Argyle Drive,library,VPL Fraserview Branch,"Tu,We 10:00-21:00; Th-Sa 10:00-18:00; Su 13:00...",Vancouver Public Library,Q1376408,en:Vancouver Public Library,https://www.vpl.ca/location/fraser-branch,249407303,...,,,,,,,,,,
2,300.0,Cambie Street,pub,The Cambie,,,,,,286316995,...,,,,,,,,,,
3,,,toilets,,dawn-dusk,,,,,289809375,...,,,,,,,,,,
4,2398.0,East 1st Avenue,fuel,Petro-Canada,Mo-Fr 06:30-23:00; Sa-Su 07:00-23:00,Petro-Canada,,,,291466861,...,,,,,,,,,,
5,,,fuel,Chevron,,Chevron,,,,291466862,...,,,,,,,,,,


In [None]:
# Coordinates of the retained amenities, as a frame (df01) and as a
# 2-column array (coords1: column 0 = lat, column 1 = lon).
df01 = df_amenities.loc[:, ['lat', 'lon']]
coords1 = df01.to_numpy()

# KMeans with random initialization

In [None]:
# K-Means with randomly chosen initial centroids, using as many clusters as
# `num_clusters`. Coordinates are converted to radians once and reused for
# both the fit and the haversine-based silhouette score.
coords1_rad = np.radians(coords1)

kmeans = KMeans(n_clusters=num_clusters, init='random', random_state=42)
y_kmeans = kmeans.fit_predict(coords1_rad)
kmean_labels = kmeans.labels_

# Number of distinct labels actually assigned.
k_clusters = len(np.unique(kmean_labels))
print(k_clusters)

silhouette = metrics.silhouette_score(coords1_rad, kmean_labels, metric='haversine')
print("Silhouette Coefficient: %0.3f" % silhouette)


263
Silhouette Coefficient: 0.435


# KMeans++

In [None]:

# K-Means with k-means++ initialisation, using as many clusters as
# `num_clusters`. Coordinates are converted to radians once and reused for
# both the fit and the haversine-based silhouette score.
coords1_rad = np.radians(coords1)

kmeans = KMeans(n_clusters=num_clusters, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(coords1_rad)
kmean_labels = kmeans.labels_

# Number of distinct labels actually assigned.
k_clusters = len(np.unique(kmean_labels))
print(k_clusters)

silhouette = metrics.silhouette_score(coords1_rad, kmean_labels, metric='haversine')
print("Silhouette Coefficient: %0.3f" % silhouette)


27
Silhouette Coefficient: 0.422



# K-Medoids



In [None]:
from sklearn_extra.cluster import KMedoids

# K-Medoids with `num_clusters` clusters, fitted on the coordinates in
# radians; the same radian coordinates feed the haversine silhouette score.
coords1_rad = np.radians(coords1)

kmedoids = KMedoids(n_clusters=num_clusters, random_state=0)
kmedoids = kmedoids.fit(coords1_rad)
kmed_labels = kmedoids.labels_

# Number of distinct labels actually assigned.
kmed_clusters = len(np.unique(kmed_labels))
print(kmed_clusters)

silhouette = metrics.silhouette_score(coords1_rad, kmed_labels, metric='haversine')
print("Silhouette Coefficient: %0.3f" % silhouette)

27
Silhouette Coefficient: 0.318


# OPTICS

In [None]:
# OPTICS with the haversine metric and the 'xi' cluster-extraction method,
# fitted on the coordinates in radians.
coords1_rad = np.radians(coords1)

opt = OPTICS(min_samples=10, metric="haversine", cluster_method='xi')
y_opt = opt.fit(coords1_rad)  # fit() returns the fitted estimator itself
optcluster_labels = opt.labels_

# Number of distinct labels produced.
print(len(np.unique(optcluster_labels)))

silhouette = metrics.silhouette_score(coords1_rad, optcluster_labels, metric='haversine')
print("Silhouette Coefficient: %0.3f" % silhouette)

130
Silhouette Coefficient: -0.092


# DBSCAN

In [None]:
# DBSCAN on the retained amenities, with the same radius (`epsilon`, in
# radians) and density threshold as the outlier-removal pass.
# Haversine works on [lat, lon] in radians, so the coordinates are converted
# once and the SAME radian array is used for both the fit and the silhouette
# score (scoring the raw degree values would give a meaningless result).
coords1_rad = np.radians(coords1)

dbsc = DBSCAN(eps=epsilon, min_samples=10, algorithm='ball_tree', metric='haversine')
y_dbsc = dbsc.fit(coords1_rad)  # fit() returns the fitted estimator itself
cluster_labels = dbsc.labels_

# Number of distinct labels; this count includes the noise label -1 if any
# point was marked as noise.
print(len(set(cluster_labels)))
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(coords1_rad, cluster_labels, metric='haversine'))