# Setup

In [9]:
import numpy as np
import pandas as pd
from scipy.io import arff

In [10]:
# Read the ARFF file; loadarff returns a (records, metadata) pair
records, _meta = arff.loadarff('diabetes.arff')

# Build the feature table without the 'class' label column
df = pd.DataFrame(records).drop(columns=['class'])

# Standardise every column: subtract its mean, divide by its standard deviation
df = (df - df.mean()) / df.std()

# Plain NumPy array used by the clustering cells below
data = np.asarray(df)

# DBSCAN

In [11]:
# Node class for individual samples
class Node:
    """One data sample that can be chained into a cluster.

    A cluster is stored as a doubly linked list of ``Node`` objects:
    ``parent`` points to the previous element and ``child`` to the next one.
    """

    def __init__(self, sample, index, status = "Noise"):
        """Create an unlinked node.

        sample -- the feature vector of the sample
        index  -- position of the sample in the main data array
        status -- "Core", "Border" or "Noise" (default "Noise")
        """
        # Store the vector
        self.sample = sample

        # Index in main data
        self.index = index

        # Core / Border / Noise
        self.status = status

        # Previous and next element of the cluster chain (None at the ends)
        self.parent = None
        self.child = None

    def link(self, parent):
        """Append the chain containing ``self`` to the chain containing ``parent``.

        The head of ``self``'s chain is attached after the tail of
        ``parent``'s chain.  If both nodes already belong to the same chain
        nothing is changed; linking them would otherwise join the tail to
        the head and turn the list into a cycle.
        """
        # Start of the chain that contains self (head of linked list)
        head = self
        while head.parent is not None:
            head = head.parent

        # End of the chain that contains parent (end of linked list)
        tail = parent
        while tail.child is not None:
            tail = tail.child

        # Walk back from the tail: reaching `head` means both nodes are
        # already in one chain, so there is nothing to link.
        probe = tail
        while probe.parent is not None:
            probe = probe.parent
        if probe is head:
            return

        # Linking
        tail.child = head
        head.parent = tail

# Cluster class (Linked List)
class Cluster:
    """A cluster, stored as a linked list of nodes that starts at ``head``."""

    def __init__(self, head):
        # First node of the chain; following nodes are reached through .child
        self.head = head

    # Utility function to print clusters
    def print_cluster(self):
        """Print the index and status of every node, from head to tail."""
        print("Node ID\tType")
        node = self.head

        # Loop on the node itself (not on node.child) so that the tail of
        # the list, and a cluster made of a single node, are printed too.
        while node is not None:
            print(node.index,"\t",node.status)
            node = node.child

In [12]:
from sklearn.metrics.pairwise import euclidean_distances

# DBSCAN method
class DBSCAN:
    """DBSCAN clustering that stores each cluster as a linked list of nodes.

    min_pts -- minimum number of neighbours within ``eps`` for a point to be
               a core point.  The point itself is NOT counted (the diagonal
               of the neighbour matrix is cleared in ``fit``).
               NOTE(review): scikit-learn documents ``min_samples`` as
               including the point itself, so ``min_pts = k`` here
               corresponds to ``min_samples = k + 1`` there.
    eps     -- neighbourhood radius; a pair is neighbouring when its
               Euclidean distance is <= eps.
    """

    def __init__(self, min_pts = 5, eps = 2):
        # Placeholder for all data
        self.data = None

        # List of all clusters
        self.clusters = []

        # Visited / Not for each node
        self.visited = None

        # Core points in data
        self.core = []

        # List of all node objects (None until a sample gets a role)
        self.Nodes = None

        # Min points within radius
        self.min_pts = min_pts

        # Eps radius
        self.eps = eps

        # Pairwise distances and boolean neighbour matrix
        self.pair_dist = None
        self.neighbours = None

    # Function to find
    # out all core points
    def core_points(self):
        """Create a "Core" node for every sample with >= min_pts neighbours."""
        # The neighbour counts do not change inside the loop: compute once
        neighbour_counts = np.sum(self.neighbours, axis = 1)

        for index, sample in enumerate(self.data):
            if neighbour_counts[index] >= self.min_pts:
                # Define as core point and append
                self.Nodes[index] = Node(sample, index, "Core")
                self.core.append(self.Nodes[index])

    # Go through all
    # density connected points
    def density_connect(self, index):
        """Grow the cluster of core point ``index`` by depth-first expansion.

        Unassigned neighbours become "Border" nodes of the cluster; unvisited
        core neighbours are linked and expanded in turn.  The traversal uses
        an explicit stack instead of recursion, so large clusters cannot hit
        Python's recursion limit; the visiting order is that of the
        recursive formulation.
        """
        # Consider point as visited
        self.visited[index] = 1

        # Each stack entry: (core point, iterator over its neighbour indices)
        stack = [(index, iter(np.flatnonzero(self.neighbours[index]).tolist()))]

        while stack:
            current, pending = stack[-1]

            for neighbour in pending:
                if self.visited[neighbour]:
                    continue

                node = self.Nodes[neighbour]

                # If border point, just add to cluster
                if node is None:
                    self.Nodes[neighbour] = Node(self.data[neighbour], neighbour, "Border")
                    self.Nodes[neighbour].link(self.Nodes[current])

                # If core point, branch into its neighbours
                elif node.status == "Core":
                    node.link(self.Nodes[current])
                    self.visited[neighbour] = 1
                    stack.append(
                        (neighbour, iter(np.flatnonzero(self.neighbours[neighbour]).tolist()))
                    )
                    # Expand the new core point first; `pending` resumes later
                    break
            else:
                # All neighbours of `current` handled
                stack.pop()

    # Utility function to print
    # clusters and points
    def cluster_print(self):
        """Print the number of clusters followed by the nodes of each one."""
        print("No.of Clusters identified: ", len(self.clusters))
        for i, cluster in enumerate(self.clusters):
            print("\n\nCluster No.", i + 1)
            cluster.print_cluster()

    # Perform DBSCAN Clustering
    def fit(self, data):
        """Cluster ``data`` (one sample per row) and fill ``self.clusters``.

        All results of a previous ``fit`` call are discarded first.
        """
        n_samples = len(data)

        self.data = data
        self.visited = [0] * n_samples
        self.Nodes = [None] * n_samples

        # Reset accumulated results so that fit can be called repeatedly
        self.clusters = []
        self.core = []

        self.pair_dist = euclidean_distances(data, data)
        self.neighbours = self.pair_dist <= self.eps
        # A point is not its own neighbour
        np.fill_diagonal(self.neighbours, False)

        self.core_points()
        for node in self.core:
            # Every core point not yet reached starts a new cluster
            if self.visited[node.index] == 0:
                self.clusters.append(Cluster(node))
                self.density_connect(node.index)

In [13]:
# Cluster the standardised samples with the DBSCAN class defined above
dbscan = DBSCAN(min_pts=5, eps=2)
dbscan.fit(data)

With our own implementation we obtain 3 clusters.

In [14]:
# Print the number of clusters, then the node index and type of each member
dbscan.cluster_print()

No.of Clusters identified:  3


Cluster No. 1
Node ID	Type
0 	 Core
30 	 Core
37 	 Core
34 	 Core
23 	 Core
25 	 Core
24 	 Core
88 	 Core
281 	 Core
48 	 Core
38 	 Core
1 	 Core
3 	 Core
6 	 Core
19 	 Core
59 	 Core
87 	 Core
47 	 Core
27 	 Core
32 	 Core
5 	 Core
10 	 Core
17 	 Core
21 	 Core
26 	 Core
11 	 Core
22 	 Core
154 	 Core
178 	 Core
29 	 Core
41 	 Core
36 	 Core
44 	 Core
2 	 Core
192 	 Core
61 	 Core
64 	 Core
76 	 Core
62 	 Core
113 	 Core
52 	 Core
50 	 Core
51 	 Core
46 	 Core
101 	 Core
79 	 Core
55 	 Core
68 	 Core
80 	 Core
83 	 Core
65 	 Core
69 	 Core
40 	 Core
71 	 Core
31 	 Core
20 	 Core
16 	 Core
57 	 Core
275 	 Core
77 	 Core
74 	 Core
85 	 Core
35 	 Core
63 	 Core
70 	 Core
66 	 Core
128 	 Core
91 	 Core
73 	 Core
95 	 Core
14 	 Core
107 	 Core
110 	 Core
127 	 Core
89 	 Core
90 	 Core
97 	 Core
103 	 Core
96 	 Core
98 	 Core
108 	 Core
109 	 Core
112 	 Core
118 	 Core
117 	 Core
104 	 Core
167 	 Core
116 	 Core
124 	 Core
102 	 Core
106 	 Border
169 	 Core
9

# Verification
We use scikit-learn's `DBSCAN` class to verify the clusters we obtained.

In [15]:
# Import under an alias: a plain `import DBSCAN` would overwrite the DBSCAN
# class defined earlier in this notebook and break re-running those cells.
from sklearn.cluster import DBSCAN as SklearnDBSCAN

# NOTE(review): scikit-learn documents min_samples as counting the point
# itself, while the notebook's own DBSCAN excludes the point from its
# neighbour count, so identical values are not strictly the same setting.
clustering = SklearnDBSCAN(eps=2, min_samples=5).fit(data)

We can see 3 cluster labels here as well (0, 1 and 2, representing 3 clusters); the label -1 marks the samples that scikit-learn treats as noise.

In [16]:
# Cluster label assigned to each sample by scikit-learn
# (its documentation defines -1 as the label for noisy samples)
clustering.labels_

array([ 0,  0,  0,  0, -1,  0,  0,  1, -1, -1,  0,  0,  0, -1,  0,  1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0,  0,  0,  2,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  2,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  1,  0,  0,  2,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,
        0,  1,  0,  0,  0