# DBSCAN Hyperparameter Tuning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
import plotly.express as px

In [2]:
# Plot styling: apply the seaborn theme first, then switch the grid style.
# Order matters here: set_style must come after set_theme to take effect.
sns.set_theme(color_codes=True)
sns.set_style('whitegrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Switch plotly and cufflinks to offline (in-notebook) rendering.
init_notebook_mode(connected=True)
cf.go_offline()

In [3]:
# Baseline dataset (two blobs, per the file name); read relative to the working directory.
two_blobs = pd.read_csv('cluster-two-blobs.csv')

In [4]:
# Variant of the blob data that includes outlier points (per the file name);
# this is the frame used for the hyperparameter sweeps further down.
two_blobs_outliers = pd.read_csv('cluster-two-blobs-outliers.csv')

In [5]:
# Quick visual check of the raw two-blob data before clustering.
px.scatter(two_blobs, x='X1', y='X2')

In [6]:
# Same view for the dataset that contains the outlier points.
px.scatter(two_blobs_outliers, x='X1', y='X2')

In [7]:
def display_categories(model,data):
    """Fit a clustering model on ``data`` and show its labels as a scatter plot.

    Parameters
    ----------
    model : estimator exposing ``fit_predict`` (e.g. ``DBSCAN``)
        Fitted as a side effect of this call, so attributes such as
        ``labels_`` can be read from it afterwards.
    data : pandas.DataFrame
        Must contain the columns ``'X1'`` and ``'X2'``, which are used as the
        plot axes. The whole frame is passed to ``fit_predict``.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    labels = model.fit_predict(data)

    # Cluster ids are categories, not magnitudes. Passing them as strings makes
    # plotly use one discrete colour per cluster (with a legend entry each)
    # instead of a continuous colour bar over the integer label values.
    cluster_ids = [str(label) for label in labels]

    fig = px.scatter(data, x='X1', y='X2', color=cluster_ids, height=500)
    fig.show()

In [8]:
from sklearn.cluster import DBSCAN

In [9]:
# Baseline: DBSCAN with its library-default hyperparameters.
dbscan_model = DBSCAN()

In [10]:
# Default model on the clean data.
display_categories(model=dbscan_model, data=two_blobs)

In [11]:
# Default model on the data that includes outliers.
display_categories(model=dbscan_model, data=two_blobs_outliers)

In [12]:
# Extreme case: a very small eps, to see how the clustering reacts.
dbscan_model = DBSCAN(eps=0.001)

In [13]:
# Show the clustering produced by the tiny-eps model.
display_categories(model=dbscan_model, data=two_blobs_outliers)

In [14]:
# Opposite extreme: a very large eps.
dbscan_model = DBSCAN(eps=10)
display_categories(model=dbscan_model, data=two_blobs_outliers)

In [15]:
# A middle-ground eps; the following cells inspect this model's labels_.
dbscan_model = DBSCAN(eps=1)
display_categories(model=dbscan_model, data=two_blobs_outliers)

In [16]:
# Total number of outliers found (points labelled -1)
(dbscan_model.labels_ == -1).sum()

3

In [17]:
# Share of points labelled -1, expressed as a percentage of all points.
100 * (dbscan_model.labels_ == -1).sum() / dbscan_model.labels_.size

0.29910269192422734

In [18]:
# Number of labelled points (one label per row that was fitted).
dbscan_model.labels_.shape[0]

1003

In [19]:
# Sweep eps over 200 evenly spaced values and record, for each one,
# how many points end up labelled -1 (outliers) and what share that is.
number_of_outlier = []
outlier_percent = []

for eps in np.linspace(0.001, 10, 200):
    model = DBSCAN(eps=eps).fit(two_blobs_outliers)

    # Count once and reuse for both the raw count and the percentage.
    noise_count = np.sum(model.labels_ == -1)

    number_of_outlier.append(noise_count)
    outlier_percent.append(100 * noise_count / len(model.labels_))
    

In [20]:
# Outlier count as a function of eps (x grid mirrors the sweep loop).
fig = px.scatter(
    x=np.linspace(0.001, 10, 200),
    y=number_of_outlier,
    height=600,
    labels={"x": "Epsilon", "y": "Number of Outliers"},
)
fig.update_traces(mode='lines+markers')
fig.show()

In [21]:
# Refit with the eps value picked from the curve above and inspect the clusters.
dbscan_model = DBSCAN(eps=0.70)
display_categories(model=dbscan_model, data=two_blobs_outliers)

In [22]:
# Outlier percentage as a function of eps (x grid mirrors the sweep loop).
fig = px.scatter(
    x=np.linspace(0.001, 10, 200),
    y=outlier_percent,
    height=600,
    labels={"x": "Epsilon", "y": "Percent of Outliers"},
)
fig.update_traces(mode='lines+markers')
fig.show()

In [23]:
# When the percentage of outliers and the number of outliers stay
# constant across a range of epsilon values,
# look at where that run of constant values begins:
# the starting point, i.e. the first epsilon of that run, is your EPSILON.

## For Minimum Samples

In [24]:
# Sweep min_samples from 1 to 99 (eps stays at its default) and record,
# for each value, the number and percentage of points labelled -1.
number_of_outlier = []
outlier_percent = []

for n in np.arange(1, 100):
    model = DBSCAN(min_samples=n).fit(two_blobs_outliers)

    # Count once and reuse for both the raw count and the percentage.
    noise_count = np.sum(model.labels_ == -1)

    number_of_outlier.append(noise_count)
    outlier_percent.append(100 * noise_count / len(model.labels_))
    
    

In [25]:
# Outlier count as a function of min_samples (x grid mirrors the sweep loop).
fig = px.scatter(
    x=np.arange(1, 100),
    y=number_of_outlier,
    height=600,
    labels={"x": "Minimum Points", "y": "Number of Outliers"},
)
fig.update_traces(mode='lines+markers')
fig.show()

In [26]:
# Number of columns (feature dimensions) in the dataset.
two_blobs_outliers.shape[1]

2

In [27]:
# Final model: the eps chosen above, with min_samples set to twice the
# number of columns in the data.
dbscan_model = DBSCAN(
    eps=0.70,
    min_samples=2 * two_blobs_outliers.shape[1],
)
display_categories(model=dbscan_model, data=two_blobs_outliers)