## Assignment 2
### Foundations of Machine Learning (CS564)

### *DBSCAN Algorithm*

<table style="font-size:15px">
    <thead>
        <td><b>Name of Student</b></td>
        <td><b>Roll No.</b></td>
        <td><b>Date</b></td>
    </thead>
    <tr>
        <td>M. Maheeth Reddy</td>
        <td>1801CS31</td>
        <td>01-Sep-2021</td>
    </tr>
</table>

**NOTE**: I used SciPy library only for reading the dataset (diabetes1.arff) and nothing else.

In [1]:
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff 
import math

In [2]:
# load data from arff file to DataFrame
raw_data = loadarff('diabetes1.arff')
df_data = pd.DataFrame(raw_data[0])

# fill any NaN values with mode
df_data.fillna(df_data.mode().iloc[0], inplace=True)

# drop the last column as mentioned in assignment
df_data.drop(['class'],axis=1,inplace=True)

# normalize the data
df_data = (df_data - df_data.mean()) / (df_data.std())

# convert to numpy
data = df_data.to_numpy()
data

array([[ 0.63953049,  0.84777132,  0.1495433 , ...,  0.20387991,
         0.46818687,  1.42506672],
       [-0.84433482, -1.12266474, -0.16044119, ..., -0.68397621,
        -0.36482303, -0.19054773],
       [ 1.23307662,  1.94245802, -0.26376935, ..., -1.10253696,
         0.6040037 , -0.10551539],
       ...,
       [ 0.34275743,  0.00329872,  0.1495433 , ..., -0.73471085,
        -0.68474712, -0.27558007],
       [-0.84433482,  0.15968254, -0.47042568, ..., -0.24004815,
        -0.37085933,  1.1699697 ],
       [-0.84433482, -0.87245064,  0.04621514, ..., -0.20199718,
        -0.4734765 , -0.87080644]])

In [3]:
# Python class for a Node, to store each item in dataset
class NodeClass:
    def __init__(self, item, index, status="noise"):
        self.item = item
        self.index = index          # Index of data item in dataset
        self.status = status        # Store whether point is core/border/noise
        
        # Cluster is stored as linked list, so parent and child nodes are stored as shown
        self.parent = None
        self.child = None
    
    # extend a cluster
    def extend_cluster(self, parent):
        child = self
        
        # End of cluster (End Of linked list)
        while(parent.child != None):
            parent = parent.child
        
        # Start of cluster (Head Of linked list)
        while(child.parent != None):
            child = child.parent
        
        # extend the cluster by linking
        parent.child = child
        child.parent = parent


In [4]:
# Python class for storing a Cluster
class ClusterClass:
    def __init__(self, head):
        self.head = head
    
    # Utility function to print clusters
    def print_cluster(self):
        count = 0
        node = self.head
        while(node.child != None):
            count = count+1
            node = node.child

        print(f"There are {count} no. of nodes")
        border = []
        core = []

        node = self.head
        while(node.child != None):
            if node.status.lower() == "core":
                core.append(node.index)
            elif node.status.lower() == "border":
                border.append(node.index)
            node = node.child
        
        print(f">>> {len(core)} Core Points and {len(border)} Border Points <<<\n")
        print("Core Points:")
        print(core)
        
        print("\nBorder Points:")
        print(border)
        print("================================\n")

In [5]:
class DBSCAN:
    def __init__(self, data, eps=2, minPts=5):
        self.data = data        # Dataset itself
        self.clusters = []      # List of all clusters
        self.visited = None     # List of the nodes that are visited
        self.core = []          # Core points in data
        self.nodes_list = None  # List of all node objects
        self.eps = eps          # Eps radius
        self.minPts = minPts    # Min points within radius
        self.dist_matrix = None # distance matrix
        self.neighbours = None  # list of neighbours for each data item
    
    # find out all core points
    def core_points(self):
        for index in range(len(self.data)):
            # check if core point and append
            if(np.sum(self.neighbours, axis = 1)[index] >= self.minPts):
                self.nodes_list[index] = NodeClass(self.data[index], index, "core")
                self.core.append(self.nodes_list[index])
    
    # find all density connected points
    def density_connect(self, index):
        self.visited[index] = True     # Current point is marked as visited
        
        # iterate through all neighbour points
        # If neighbour point is a border point, just add it to the cluster
        # else, if it is a Core point, iterate over its neighbours also
        for index_ in range(len(self.neighbours[index])):
            if(not self.visited[index_]):
                if(self.neighbours[index][index_] and self.nodes_list[index_] == None):
                    self.nodes_list[index_] = NodeClass(data[index_], index_, "border")
                    self.nodes_list[index_].extend_cluster(self.nodes_list[index])
                elif(self.neighbours[index][index_] and self.nodes_list[index_].status == "core"):
                    self.nodes_list[index_].extend_cluster(self.nodes_list[index])
                    self.density_connect(index_)
    
    # display clusters and their points
    def show_clusters(self):
        print(f"Count of Clusters = {len(self.clusters)}\n\n")
        for i, cluster in enumerate(self.clusters):
            print(f"Cluster No. {i+1}")
            print(f"-----------------")
            cluster.print_cluster()
    
    # calculate euclidian distance between two points
    def euclid_dist(self, pt1, pt2):
        return math.sqrt(((pt1-pt2)**2).sum())

    # calculate the distance matrix of all points in dataset
    def get_dist_matrix(self, data):
        dist_mat = []
        for i in range(len(data)):
            dist_row = np.array([self.euclid_dist(data[i],data[j]) for j in range(len(data))])
            dist_mat.append(dist_row)
        return np.array(dist_mat)

    # function that implements dbscan algorithm
    def predict(self):
        self.visited = [False for i in range(len(data))]
        self.nodes_list = [None for i in range(len(data))]
        self.dist_matrix = self.get_dist_matrix(data)
        self.neighbours = self.dist_matrix <= self.eps
        self.neighbours[range(len(data)), range(len(data))] = False
        
        self.core_points()
        for node in self.core:
            if(not self.visited[node.index]):
                self.clusters.append(ClusterClass(self.nodes_list[node.index]))
                self.density_connect(node.index)


In [6]:
# final output
dbscan = DBSCAN(data, eps=2, minPts=5)
dbscan.predict()
dbscan.show_clusters()


Count of Clusters = 3


Cluster No. 1
-----------------
There are 696 no. of nodes
>>> 648 Core Points and 48 Border Points <<<

Core Points:
[0, 30, 37, 34, 23, 25, 24, 88, 281, 48, 38, 1, 3, 6, 19, 59, 87, 47, 27, 32, 5, 10, 17, 21, 26, 11, 22, 154, 178, 29, 41, 36, 44, 2, 192, 61, 64, 76, 62, 113, 52, 50, 51, 46, 101, 79, 55, 68, 80, 83, 65, 69, 40, 71, 31, 20, 16, 57, 275, 77, 74, 85, 35, 63, 70, 66, 128, 91, 73, 95, 14, 107, 110, 127, 89, 90, 97, 103, 96, 98, 108, 109, 112, 118, 117, 104, 167, 116, 124, 102, 169, 94, 105, 122, 119, 33, 151, 134, 136, 137, 142, 121, 133, 82, 165, 171, 114, 54, 56, 132, 130, 144, 135, 126, 141, 92, 161, 146, 302, 174, 149, 158, 156, 157, 181, 138, 164, 168, 163, 166, 160, 189, 150, 99, 162, 139, 199, 244, 195, 295, 175, 185, 209, 402, 188, 191, 86, 358, 298, 214, 217, 198, 147, 218, 265, 179, 230, 201, 226, 196, 183, 170, 42, 176, 143, 194, 250, 219, 184, 93, 115, 123, 140, 263, 148, 207, 283, 246, 264, 180, 190, 197, 200, 202, 210, 203, 205, 224, 2