In [2]:
!pip install hdbscan
!pip install contextily
!pip install shapely
!pip install geopandas

Collecting contextily
  Downloading contextily-1.6.2-py3-none-any.whl.metadata (2.9 kB)
Collecting geopy (from contextily)
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting mercantile (from contextily)
  Downloading mercantile-1.2.1-py3-none-any.whl.metadata (4.8 kB)
Collecting rasterio (from contextily)
  Downloading rasterio-1.4.3-cp312-cp312-macosx_14_0_arm64.whl.metadata (9.1 kB)
Collecting geographiclib<3,>=1.52 (from geopy->contextily)
  Downloading geographiclib-2.1-py3-none-any.whl.metadata (1.6 kB)
Collecting affine (from rasterio->contextily)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio->contextily)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio->contextily)
  Downloading click_plugins-1.1.1.2-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading contextily-1.6.2-py3-none-any.whl (17 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
Downloading me

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import hdbscan
from sklearn.cluster import DBSCAN


import geopandas as gpd   # pandas dataframe-like geodataframes for geographical data
import contextily as ctx   # used for obtianing a basemap of Canada
from shapely.geometry import Point


import warnings
warnings.filterwarnings('ignore')

## Download the canada map

In [6]:
import requests
import zipfile
import io
import os

zip_url_file = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/YcUk-ytgrPkmvZAh5bf7zA/Canada.zip'

output_dir = './'

os.makedirs(output_dir, exist_ok = True)

# Download the zip file
response = requests.get(zip_url_file)
response.raise_for_status() # ensure the request was successful

# open the zip file in memory
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    for file_name in zip_ref.namelist():
        if file_name.endswith('.tif'):
            zip_ref.extract(file_name,output_dir)
            print(f"Download and extracted: {file_name}")

Download and extracted: Canada.tif


## Include a plotting Function

In [7]:
def plot_clustered_locations(df,title = "Museums Clustered by Proximity"):
    """
    Plots clustered locations and overlays on a basemap.


    Parameters:
    - df.Dataframe containing 'Latitude', 'Longitude, and 'Clustercolumns
    -title: str, title of the plot
    
    """

    # Load the coordinates into a dataFrame
    gdf = gpd.GeoDataFrame(df,geometry = gpd.points_from_xy(df['Longitude'], df['Latitude']), crs = 'EPSG:4326')

    # Reproject to web mercator to align with basemap
    gdf = gdf.to_crs(epsg = 3857)

    # Create the plot
    fig,ax = plt.subplots(figsize = (15,10))

    # Separate non-noise, or clustered points from noise, or unclustered points
    non_noise = gdf[gdf['Cluster'] != -1]
    noise = gdf[gdf['Cluster'] == -1]
    
    # Plot noise points 
    noise.plot(ax=ax, color='k', markersize=30, ec='r', alpha=1, label='Noise')
    
    # Plot clustered points, colured by 'Cluster' number
    non_noise.plot(ax=ax, column='Cluster', cmap='tab10', markersize=30, ec='k', legend=False, alpha=0.6)
    
    # Add basemap of  Canada
    ctx.add_basemap(ax, source='./Canada.tif', zoom=4)
    
    # Format plot
    plt.title(title, )
    plt.xlabel('Longitude', )
    plt.ylabel('Latitude', )
    ax.set_xticks([])
    ax.set_yticks([])
    plt.tight_layout()
    
    # Show the plot
    plt.show()


In [8]:
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/r-maSj5Yegvw2sJraT15FA/ODCAF-v1-0.csv'
df = pd.read_csv(url, encoding = "ISO-8859-1")

In [9]:
df.head()

Unnamed: 0,Index,Facility_Name,Source_Facility_Type,ODCAF_Facility_Type,Provider,Unit,Street_No,Street_Name,Postal_Code,City,Prov_Terr,Source_Format_Address,CSD_Name,CSDUID,PRUID,Latitude,Longitude
0,1,#Hashtag Gallery,..,gallery,toronto,..,801,dundas st w,M6J 1V2,toronto,on,801 dundas st w,Toronto,3520005,35,43.65169472,-79.40803272
1,2,'Ksan Historical Village & Museum,historic site-building or park,museum,canadian museums association,..,1500,62 hwy,V0J 1Y0,hazelton,bc,1500 hwy 62 hazelton british columbia v0j 1y0 ...,Hazelton,5949022,59,55.2645508,-127.6428124
2,3,'School Days' Museum,community/regional museum,museum,canadian museums association,..,427,queen st,E3B 5R6,fredericton,nb,427 queen st fredericton new brunswick e3b 5r6...,Fredericton,1310032,13,45.963283,-66.6419017
3,4,10 Austin Street,built heritage properties,heritage or historic site,moncton,..,10,austin st,E1C 1Z6,moncton,nb,10 austin st,Moncton,1307022,13,46.09247776,-64.78022946
4,5,10 Gates Dancing Inc.,arts,miscellaneous,ottawa,..,..,..,..,ottawa,on,..,Ottawa,3506008,35,45.40856224,-75.71536766


In [11]:
df.columns

Index(['Index', 'Facility_Name', 'Source_Facility_Type', 'ODCAF_Facility_Type',
       'Provider', 'Unit', 'Street_No', 'Street_Name', 'Postal_Code', 'City',
       'Prov_Terr', 'Source_Format_Address', 'CSD_Name', 'CSDUID', 'PRUID',
       'Latitude', 'Longitude'],
      dtype='object')

In [12]:
df.shape

(7972, 17)

In [13]:
# Finding missing values
df.isnull().sum()

Index                    0
Facility_Name            0
Source_Facility_Type     0
ODCAF_Facility_Type      0
Provider                 0
Unit                     0
Street_No                0
Street_Name              0
Postal_Code              0
City                     0
Prov_Terr                0
Source_Format_Address    0
CSD_Name                 0
CSDUID                   0
PRUID                    0
Latitude                 0
Longitude                0
dtype: int64

In [26]:
df['ODCAF_Facility_Type'].value_counts()

ODCAF_Facility_Type
library or archives                     3013
museum                                  1938
gallery                                  810
heritage or historic site                620
theatre/performance and concert hall     583
festival site                            346
miscellaneous                            343
art or cultural centre                   225
artist                                    94
Name: count, dtype: int64

In [29]:
new_df = df[df['ODCAF_Facility_Type']== 'museum']

In [30]:
new_df.head()

Unnamed: 0,Index,Facility_Name,Source_Facility_Type,ODCAF_Facility_Type,Provider,Unit,Street_No,Street_Name,Postal_Code,City,Prov_Terr,Source_Format_Address,CSD_Name,CSDUID,PRUID,Latitude,Longitude
1,2,'Ksan Historical Village & Museum,historic site-building or park,museum,canadian museums association,..,1500,62 hwy,V0J 1Y0,hazelton,bc,1500 hwy 62 hazelton british columbia v0j 1y0 ...,Hazelton,5949022,59,55.2645508,-127.6428124
2,3,'School Days' Museum,community/regional museum,museum,canadian museums association,..,427,queen st,E3B 5R6,fredericton,nb,427 queen st fredericton new brunswick e3b 5r6...,Fredericton,1310032,13,45.963283,-66.6419017
8,10,12 Service Battalion Museum,military museum or fort,museum,canadian museums association,..,5500,no 4 rd,V6X 3L5,richmond,bc,5500 no. 4 rd the sherman armoury richmond bri...,Richmond,5915015,59,49.1763542,-123.112783
13,15,15th Field Artillery Regiment Museum And Archives,museum/gallery,museum,vancouver,..,2025,11th av w,V6J 2C7,vancouver,bc,2025 w 11th av vancouver bc v6j 2c7,Vancouver,5915022,59,49.261938,-123.151123
15,18,17 Wing Heritage Collection,aeronautics and space museum transportation mu...,museum,canadian museums association,..,..,..,R3J 3Y5,winnipeg,mb,air heritage park air force way winnipeg manit...,Winnipeg,4611040,46,49.88955855,-97.23574396


In [31]:
new_df.shape

(1938, 17)

In [32]:
X = new_df[['Latitude', 'Longitude']].to_numpy()

In [33]:
X.shape

(1938, 2)

In [34]:
type(X[:,1])

numpy.ndarray

## Build a DBSCAN Model

In [45]:
# In this case we know how to scale the coordinates. Using standardization would be an error becaues we aren't using the full range of the lat/lng coordinates.
# Since latitude has a range of +/- 90 degrees and longitude ranges from 0 to 360 degrees, the correct scaling is to double the longitude coordinates (or half the Latitudes)

coords_scaled = df.copy()
coords_scaled['Latitude'] = 2 * coords_scaled['Latitude']

## Apply DBSCAN with Eucledian distance to the 

In [47]:
coords_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7972 entries, 0 to 7971
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Index                  7972 non-null   int64 
 1   Facility_Name          7972 non-null   object
 2   Source_Facility_Type   7972 non-null   object
 3   ODCAF_Facility_Type    7972 non-null   object
 4   Provider               7972 non-null   object
 5   Unit                   7972 non-null   object
 6   Street_No              7972 non-null   object
 7   Street_Name            7972 non-null   object
 8   Postal_Code            7972 non-null   object
 9   City                   7972 non-null   object
 10  Prov_Terr              7972 non-null   object
 11  Source_Format_Address  7972 non-null   object
 12  CSD_Name               7972 non-null   object
 13  CSDUID                 7972 non-null   object
 14  PRUID                  7972 non-null   object
 15  Latitude             

In [42]:
min_samples = 3 # minimum number of samples needed to form a neighborhood
eps = 1.0
metric = 'euclidean'

dbscan = DBSCAN(eps = eps, min_samples = min_samples, metric = metric)
