In [11]:
# dir(sklearn.cluster.DBSCAN)
help(DBSCAN)

Help on class DBSCAN in module sklearn.cluster._dbscan:

class DBSCAN(sklearn.base.ClusterMixin, sklearn.base.BaseEstimator)
 |  DBSCAN(eps=0.5, *, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None)
 |  
 |  Perform DBSCAN clustering from vector array or distance matrix.
 |  
 |  DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
 |  Finds core samples of high density and expands clusters from them.
 |  Good for data which contains clusters of similar density.
 |  
 |  Read more in the :ref:`User Guide <dbscan>`.
 |  
 |  Parameters
 |  ----------
 |  eps : float, default=0.5
 |      The maximum distance between two samples for one to be considered
 |      as in the neighborhood of the other. This is not a maximum bound
 |      on the distances of points within a cluster. This is the most
 |      important DBSCAN parameter to choose appropriately for your data set
 |      and distance function.
 |  
 |  min_sam

In [1]:
#Import the libraries
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [22]:
# Import .csv file and convert it to a DataFrame object
df = pd.read_csv("Wholesale customers data.csv");

df.head()


Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


In [4]:
#df.drop(['Channel','Region'],axis=1,inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Fresh             440 non-null    int64
 1   Milk              440 non-null    int64
 2   Grocery           440 non-null    int64
 3   Frozen            440 non-null    int64
 4   Detergents_Paper  440 non-null    int64
 5   Delicassen        440 non-null    int64
dtypes: int64(6)
memory usage: 20.8 KB


In [23]:
array=df.values
array

array([[    2,     3, 12669, ...,   214,  2674,  1338],
       [    2,     3,  7057, ...,  1762,  3293,  1776],
       [    2,     3,  6353, ...,  2405,  3516,  7844],
       ...,
       [    2,     3, 14531, ...,   437, 14841,  1867],
       [    1,     3, 10290, ...,  1038,   168,  2125],
       [    1,     3,  2787, ...,    65,   477,    52]], dtype=int64)

In [24]:
stscaler = StandardScaler()
X = stscaler.fit_transform(array)

In [27]:
dbscan = DBSCAN(eps=1, min_samples=7)
labels=pd.Series(dbscan.fit_predict(X)).value_counts()

In [28]:
labels

 1    187
-1     96
 0     82
 2     50
 3     25
Name: count, dtype: int64

In [29]:
#Noisy samples are given the label -1.
dbscan.labels_

array([ 0,  0, -1,  1,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  1,  0,
        1,  0,  1,  0,  1, -1, -1, -1,  0,  1,  1, -1,  1,  1,  1,  1,  1,
        1,  0,  1,  0,  0, -1, -1,  1,  0,  0,  0, -1,  0, -1,  0, -1,  1,
        1, -1,  0,  1,  1, -1,  0,  1,  1,  0, -1,  0,  0,  1, -1,  1,  0,
        1,  1,  1, -1,  1, -1,  0,  1,  1, -1,  1,  1,  1,  0,  0,  1,  0,
       -1, -1, -1,  1,  1,  1,  1, -1, -1,  0,  1,  0,  1,  1,  1,  0,  0,
        0, -1,  1,  1,  0,  0,  0, -1,  1,  0,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  0,  1, -1,  1,  0,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  0,  0,  1,  0,  0,  0,  1,  1, -1,  0,  0,  0,  1,  1,  1,
        0, -1,  1,  0,  1,  0, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,
        1,  0,  0,  1,  1,  1,  0,  1,  1, -1, -1,  2,  2, -1, -1, -1,  2,
        2, -1,  2, -1,  2, -1,  2, -1,  2,  2, -1,  2, -1,  2, -1,  2,  2,
        2,  2, -1,  2,  2

In [30]:
df['cluster']=dbscan.labels_
df.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,cluster
0,2,3,12669,9656,7561,214,2674,1338,0
1,2,3,7057,9810,9568,1762,3293,1776,0
2,2,3,6353,8808,7684,2405,3516,7844,-1
3,1,3,13265,1196,4221,6404,507,1788,1
4,2,3,22615,5410,7198,3915,1777,5185,0


In [31]:
# Use pandas filtering and get noisy datapoints -1
df[df['cluster']==-1]

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,cluster
2,2,3,6353,8808,7684,2405,3516,7844,-1
22,1,3,31276,1917,4469,9408,2381,4334,-1
23,2,3,26373,36423,22019,5154,4337,16523,-1
24,2,3,22647,9776,13792,2915,4482,5778,-1
28,2,3,4113,20484,25957,1158,8604,5206,-1
...,...,...,...,...,...,...,...,...,...
426,1,3,13134,9347,14316,3141,5079,1894,-1
427,1,3,31012,16687,5429,15082,439,1163,-1
431,1,3,8533,5506,5160,13486,1377,1498,-1
435,1,3,29703,12051,16027,13135,182,2204,-1


In [33]:
from sklearn.metrics import silhouette_score

silhouette_score(X,dbscan.labels_)

0.1911603749691365

In [37]:
data=pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/wine.csv",header=None)


In [43]:
ls=["Alcohol","Malic acid","Ash"," Alcalinity of ash", "Magnesium","Total phenols","Flavanoids","Nonflavanoid phenols","Proanthocyanins",
	"Color intensity","Hue","OD280/OD315 of diluted wines","Proline","target"]

In [45]:
data.columns=ls

In [46]:
data.head()

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline,target
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,1
