## Assignment 2
### Foundations of Machine Learning (CS564)

### *DBSCAN Algorithm*

<table style="font-size:15px">
    <thead>
        <td><b>Name of Student</b></td>
        <td><b>Roll No.</b></td>
        <td><b>Date</b></td>
    </thead>
    <tr>
        <td>M. Maheeth Reddy</td>
        <td>1801CS31</td>
        <td>01-Sep-2021</td>
    </tr>
</table>

**NOTE**: I used SciPy library only for reading the dataset and nothing else.

In [1]:
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff 

import math

eps = 2
minPts = 5

In [2]:
# load data from arff file to DataFrame
raw_data = loadarff('diabetes1.arff')
df_data = pd.DataFrame(raw_data[0])

# fill NaN values with mode
df_data.fillna(df_data.mode().iloc[0], inplace=True)

# drop the last column as mentioned in assignment
df_data.drop(['class'],axis=1,inplace=True)

# normalize the data
df_data=(df_data-df_data.mean())/(df_data.std())

# show some rows in the data
df_data.sample(5)

In [None]:
df_data = df_data.to_numpy()
df_data

In [3]:
# Defining Utility Functions
def euclidean_distance(x1, x2):
    """ Calculates the l2 distance between two vectors """
    distance = 0
    # Squared distance between each coordinate
    for i in range(len(x1)):
        distance += pow((x1[i] - x2[i]), 2)
    return math.sqrt(distance)

In [4]:
# %%
class DBSCAN():
    """A density based clustering method that expands clusters from 
    samples that have more neighbors within a radius specified by eps
    than the value min_samples.
    Parameters:
    -----------
    eps: float
        The radius within which samples are considered neighbors
    min_samples: int
        The number of neighbors required for the sample to be a core point. 
    """
    def __init__(self, eps=1, min_samples=5):
        self.eps = eps
        self.min_samples = min_samples

    def _get_neighbors(self, sample_i):
        """ Return a list of indexes of neighboring samples
        A sample_2 is considered a neighbor of sample_1 if the distance between
        them is smaller than epsilon """
        neighbors = []
        idxs = np.arange(len(self.X))
        for i, _sample in enumerate(self.X[idxs != sample_i]):
            distance = euclidean_distance(self.X[sample_i], _sample)
            if distance < self.eps:
                neighbors.append(i)
        return np.array(neighbors)

    def _expand_cluster(self, sample_i, neighbors, core_points, border_points):
        """ Recursive method which expands the cluster until we have reached the border
        of the dense area (density determined by eps and min_samples) """
        cluster = [sample_i]
        # Iterate through neighbors
        for neighbor_i in neighbors:
            if not neighbor_i in self.visited_samples:
                self.visited_samples.append(neighbor_i)
                # Fetch the sample's distant neighbors (neighbors of neighbor)
                self.neighbors[neighbor_i] = self._get_neighbors(neighbor_i)
                # Make sure the neighbor's neighbors are more than min_samples
                # (If this is true the neighbor is a core point)
                if len(self.neighbors[neighbor_i]) >= self.min_samples:
                    core_points.append(neighbor_i)
                    # Expand the cluster from the neighbor
                    expanded_cluster, core_points, border_points = self._expand_cluster(
                        neighbor_i, self.neighbors[neighbor_i], core_points, border_points)
                    # Add expanded cluster to this cluster
                    cluster = cluster + expanded_cluster
                else:
                    # These will be Border Points
                    # If the neighbor is not a core point we only add the neighbor point
                    cluster.append(neighbor_i)
                    border_points.append(neighbor_i)
        return cluster, core_points, border_points

    def _get_cluster_labels(self):
        """ Return the samples labels as the index of the cluster in which they are
        contained """
        # Set default value to number of clusters
        # Will make sure all outliers have same cluster label
        labels = np.full(shape=self.X.shape[0], fill_value=len(self.clusters))
        for cluster_i, cluster in enumerate(self.clusters):
            for sample_i in cluster:
                labels[sample_i] = cluster_i
        return labels

    # DBSCAN
    def predict(self, X):
        self.X = X
        self.clusters = []
        self.core_points = []
        self.border_points = []
        self.visited_samples = []
        self.neighbors = {}
        n_samples = np.shape(self.X)[0]
        # Iterate through samples and expand clusters from them
        # if they have more neighbors than self.min_samples
        for sample_i in range(n_samples):
            if sample_i in self.visited_samples:
                continue
            self.neighbors[sample_i] = self._get_neighbors(sample_i)
            if len(self.neighbors[sample_i]) >= self.min_samples:
                # If core point => mark as visited
                self.visited_samples.append(sample_i)
                core_points = [sample_i]
                border_points = []
                # Sample has more neighbors than self.min_samples => expand
                # cluster from sample
                new_cluster, core_points, border_points = self._expand_cluster(
                    sample_i, self.neighbors[sample_i], core_points, border_points)
                # Add cluster to list of clusters
                self.clusters.append(new_cluster)
                self.core_points.append(core_points)
                self.border_points.append(border_points)

        # Get the resulting cluster labels
        cluster_labels = self._get_cluster_labels()
        return cluster_labels, self.core_points, self.border_points

In [5]:
dbscan = DBSCAN(eps=2, min_samples=5)
dbscan_out, final_core_points, final_border_points = dbscan.predict(X)

In [6]:
unique, counts = np.unique(dbscan_out, return_counts=True)
print("Clusters formed with labels are respective counts are:")
print(unique)
print(counts)
print('Cluster Lables', dbscan_out)

Clusters formed with labels are respective counts are:
[0 1 2]
[761   1   6]
Cluster Lables [0 0 0 0 2 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0

In [7]:
print('Core Points for the First Cluster', final_core_points[0])
print('Border Points for the First Cluster', final_border_points[0])

Core Points for the First Cluster [0, 29, 5, 16, 19, 1, 2, 191, 23, 24, 87, 3, 26, 11, 21, 17, 10, 60, 49, 59, 38, 6, 31, 20, 30, 36, 40, 68, 27, 46, 50, 32, 51, 47, 54, 55, 52, 64, 44, 63, 35, 62, 112, 65, 48, 37, 34, 25, 33, 118, 70, 66, 69, 72, 41, 71, 78, 434, 134, 79, 74, 76, 142, 77, 86, 190, 80, 82, 132, 56, 94, 85, 90, 89, 83, 88, 280, 101, 123, 93, 114, 14, 113, 103, 96, 97, 102, 105, 121, 98, 107, 91, 73, 138, 124, 109, 108, 117, 104, 116, 61, 115, 122, 119, 133, 92, 140, 150, 99, 149, 136, 127, 110, 126, 141, 128, 168, 151, 166, 135, 95, 129, 277, 137, 148, 206, 53, 205, 158, 156, 157, 180, 170, 42, 169, 163, 165, 216, 171, 130, 131, 442, 139, 143, 175, 184, 167, 164, 178, 154, 22, 153, 285, 217, 161, 146, 160, 188, 197, 174, 196, 183, 176, 179, 229, 213, 181, 199, 144, 194, 249, 203, 204, 344, 219, 192, 218, 147, 242, 233, 201, 225, 208, 209, 185, 237, 469, 195, 189, 198, 173, 240, 210, 202, 200, 223, 284, 263, 283, 207, 282, 214, 281, 288, 224, 238, 244, 162, 243, 394, 230