### Introduction to the DBSCAN algorithm
1. DBSCAN stands for Density-Based Spatial Clustering of Applications with Noise.
2. It groups data points into clusters based on density: points that have enough close neighbours form a cluster, while isolated points are labelled as noise.

In [1]:
#Importing libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [4]:
#Data collection
#Download the ODCAF facilities CSV and load it into a DataFrame.
#The URL is split across two literals only for readability; the resulting string is unchanged.
url = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/"
    "r-maSj5Yegvw2sJraT15FA/ODCAF-v1-0.csv"
)
#The file is read as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
df = pd.read_csv(url, encoding="ISO-8859-1")
#Preview the first ten rows.
df.head(10)

Unnamed: 0,Index,Facility_Name,Source_Facility_Type,ODCAF_Facility_Type,Provider,Unit,Street_No,Street_Name,Postal_Code,City,Prov_Terr,Source_Format_Address,CSD_Name,CSDUID,PRUID,Latitude,Longitude
0,1,#Hashtag Gallery,..,gallery,toronto,..,801,dundas st w,M6J 1V2,toronto,on,801 dundas st w,Toronto,3520005,35,43.65169472,-79.40803272
1,2,'Ksan Historical Village & Museum,historic site-building or park,museum,canadian museums association,..,1500,62 hwy,V0J 1Y0,hazelton,bc,1500 hwy 62 hazelton british columbia v0j 1y0 ...,Hazelton,5949022,59,55.2645508,-127.6428124
2,3,'School Days' Museum,community/regional museum,museum,canadian museums association,..,427,queen st,E3B 5R6,fredericton,nb,427 queen st fredericton new brunswick e3b 5r6...,Fredericton,1310032,13,45.963283,-66.6419017
3,4,10 Austin Street,built heritage properties,heritage or historic site,moncton,..,10,austin st,E1C 1Z6,moncton,nb,10 austin st,Moncton,1307022,13,46.09247776,-64.78022946
4,5,10 Gates Dancing Inc.,arts,miscellaneous,ottawa,..,..,..,..,ottawa,on,..,Ottawa,3506008,35,45.40856224,-75.71536766
5,6,100 Mile House Branch,cariboo regional district library system,library or archives,government of british columbia,..,449,birch avenue s,V0K 2E0,pender island,bc,449 s. birch avenue,One Hundred Mile House,5941005,59,51.64139,-121.29562
6,8,114 Alma St.,built heritage properties,heritage or historic site,moncton,..,114,alma st,..,moncton,nb,114 alma st,Moncton,1307022,13,46.09186844,-64.77860387
7,9,118 Mountain Road,built heritage properties,heritage or historic site,moncton,..,118,mountain rd,E1C 2K7,moncton,nb,118 mountain rd,Moncton,1307022,13,46.09448386,-64.77990581
8,10,12 Service Battalion Museum,military museum or fort,museum,canadian museums association,..,5500,no 4 rd,V6X 3L5,richmond,bc,5500 no. 4 rd the sherman armoury richmond bri...,Richmond,5915015,59,49.1763542,-123.112783
9,11,134 Church Street,built heritage properties,heritage or historic site,moncton,..,134,church st,E1C 4Z6,moncton,nb,134 church st,Moncton,1307022,13,46.09253232,-64.77997633


In [18]:
#Keep only the rows whose ODCAF facility type is 'museum' and count them
is_museum = df['ODCAF_Facility_Type'] == 'museum'
values = df[is_museum]
values['ODCAF_Facility_Type'].value_counts()

ODCAF_Facility_Type
museum    1938
Name: count, dtype: int64

In [19]:
#Cross-check the museum count directly from the facility-type column.
#A vectorised comparison + sum replaces the explicit Python loop over the values.
facility_types = df['ODCAF_Facility_Type']
#Raw values are still exposed under the original name.
values = facility_types.values
#int() keeps `count` a plain Python int, as it was when accumulated in the loop.
count = int(facility_types.eq('museum').sum())

print(count)

1938


### Feature selection for this problem
1. We will use the latitude and longitude of each data point to form the clusters.
2. These features will help us determine the minPts and epsilon values for the DBSCAN approach.

In [28]:
#Feature selection: keep only the geographic coordinates used for clustering
feature_values = df[['Latitude', 'Longitude']]
feature_values.info()

#Count the non-null entries in each column.
#notnull() yields a boolean frame, so sum() counts the True (non-null) cells;
#count() on that boolean frame would report every row regardless of nulls.
#NOTE: this dataset marks missing values with the string '..', which notnull()
#treats as present, so those rows are not revealed by this check.
feature_values.notnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7972 entries, 0 to 7971
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Latitude   7972 non-null   object
 1   Longitude  7972 non-null   object
dtypes: object(2)
memory usage: 124.7+ KB


Latitude     7972
Longitude    7972
dtype: int64

In [44]:
#Re-check the row count, non-null counts and dtypes of the selected coordinate columns
feature_values.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6748 entries, 0 to 7971
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Latitude   6748 non-null   object
 1   Longitude  6748 non-null   object
dtypes: object(2)
memory usage: 158.2+ KB


In [None]:
#Convert the selected coordinate features to float64 values
def _to_float(column):
    """Parse a column of coordinate strings into float64.

    Trailing ',' or '.' characters are stripped first (the same rule for both
    coordinates). Anything that still is not a number, such as the '..'
    missing-value marker, becomes NaN instead of raising.
    """
    cleaned = column.astype(str).str.rstrip('.,')
    return pd.to_numeric(cleaned, errors='coerce').astype('float64')

#Build a new frame with assign() rather than writing into a filtered slice,
#which would trigger pandas' SettingWithCopyWarning.
df = feature_values.assign(
    Latitude=_to_float(feature_values['Latitude']),
    Longitude=_to_float(feature_values['Longitude']),
)

#Remove rows where either coordinate is missing or non-numeric
#(previously only Latitude was checked for the '..' marker).
df = df.dropna(subset=['Latitude', 'Longitude'])

In [75]:
#Scaling the coordinates to make it comparable 
#Latitude: - +/- 90, Longitude: - 360 degree 
#So, to make it comparable we need to scale latitude by factor of 2 
#Only the two coordinate columns are copied: a later cell adds a 'Cluster'
#column to df, and re-running this cell must not feed those labels back
#into DBSCAN as an extra feature.
scaled_coord = df[['Latitude', 'Longitude']].copy()
#Double the latitude so both axes cover the same numeric range.
scaled_coord['Latitude'] = 2 * scaled_coord['Latitude']

In [76]:
#Using DBScan method to build cluster system with coordinates 
from sklearn.cluster import DBSCAN

#DBSCAN hyper-parameters
#eps: maximum distance between two samples for them to count as neighbours
eps = 0.5
#min_samples: neighbourhood size required for a point to be a core point
min_samples = 3
#metric: distance function used between samples
metric = "euclidean"

#Configure the estimator; nothing is fitted yet.
scan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)


In [78]:
#Model Fitting
#Run DBSCAN on the scaled coordinates; fit() returns the estimator itself,
#which the notebook displays as the parameter summary below.
#NOTE(review): the next cell calls fit_predict() on the same data, which fits
#the model again, so this separate fit looks redundant - confirm before removing.
scan.fit(scaled_coord)

0,1,2
,eps,0.5
,min_samples,3
,metric,'euclidean'
,metric_params,
,algorithm,'auto'
,leaf_size,30
,p,
,n_jobs,


In [79]:
#Model Prediction and cluster making 
#Fit DBSCAN on the scaled coordinates and get one cluster label per row.
#A label of -1 marks points DBSCAN considers noise.
cluster_labels = scan.fit_predict(scaled_coord)
df['Cluster'] = cluster_labels

#Number of points assigned to each cluster label
df['Cluster'].value_counts()

Cluster
 0      3853
 5       601
-1       329
 3       317
 13      185
        ... 
 117       3
 119       3
 120       3
 121       3
 122       3
Name: count, Length: 124, dtype: int64