# DBSCAN From Scratch

**DBSCAN** (Density-Based Spatial Clustering of Applications with Noise) is a non-parametric, density-based clustering algorithm: given a set of points in some space, it groups points that are closely packed (points with many nearby neighbors) and marks as outliers the points that lie alone in low-density regions.

## Key Concepts:
- **Epsilon ($\epsilon$)**: The maximum distance between two samples for one to be considered as in the neighborhood of the other.
- **MinPoints** (`min_samples` in the code below): The minimum number of samples that must lie in a point's $\epsilon$-neighborhood for that point to be considered a core point.
- **Core Point**: A point with at least `MinPoints` points (counting itself) in its $\epsilon$-neighborhood.
- **Border Point**: A point that lies within $\epsilon$ of a core point but has fewer than `MinPoints` points in its own neighborhood; it joins the core point's cluster without extending it.
- **Noise Point**: Any point that is neither a core nor a border point.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN as SklearnDBSCAN

## 1. Implementation

In [None]:
class DBSCAN:
    """Density-based clustering (DBSCAN) with a brute-force neighbour search.

    Parameters
    ----------
    eps : float
        Radius of the neighbourhood around a sample (Euclidean distance,
        inclusive).
    min_samples : int
        Minimum neighbourhood size, counting the sample itself, for a sample
        to be a core point.

    Attributes
    ----------
    labels : numpy.ndarray of int, or None before ``fit``
        One label per sample: ``-1`` for noise and ``1, 2, ...`` for clusters.
        ``0`` is used internally for "not visited yet" and does not remain
        after ``fit`` returns.
    """

    def __init__(self, eps=0.5, min_samples=5):
        self.eps = eps
        self.min_samples = min_samples
        self.labels = None

    def fit(self, X):
        """Cluster the rows of ``X`` and store the result in ``self.labels``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to cluster; converted with ``np.asarray``.

        Returns
        -------
        DBSCAN
            ``self``, so calls can be chained.
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        # 0: unvisited, -1: noise, >0: cluster id
        self.labels = np.zeros(n_samples, dtype=int)
        cluster_id = 0

        for i in range(n_samples):
            if self.labels[i] != 0:
                # Already claimed by a cluster or marked as noise.
                continue

            neighbors = self._get_neighbors(X, i)

            if len(neighbors) < self.min_samples:
                # Provisional: a later cluster may still absorb this sample
                # as a border point.
                self.labels[i] = -1
            else:
                cluster_id += 1
                self._expand_cluster(X, i, neighbors, cluster_id)

        return self

    def _get_neighbors(self, X, i):
        """Return the indices of all samples within ``eps`` of sample ``i``.

        The distance from sample ``i`` to every row of ``X`` is computed, so
        the result always contains ``i`` itself.
        """
        distances = np.linalg.norm(X - X[i], axis=1)
        return np.where(distances <= self.eps)[0]

    def _expand_cluster(self, X, i, neighbors, cluster_id):
        """Grow cluster ``cluster_id`` outwards from core sample ``i``.

        ``neighbors`` is the eps-neighbourhood of ``i``. Every sample that is
        density-reachable from ``i`` gets ``cluster_id`` written into
        ``self.labels``.
        """
        self.labels[i] = cluster_id

        # Append-only FIFO frontier. It must never be re-sorted or
        # de-duplicated in place: the read position below is a plain index,
        # so reordering would move pending samples behind it and they would
        # never be visited.
        frontier = list(neighbors)
        head = 0
        while head < len(frontier):
            j = frontier[head]
            head += 1

            if self.labels[j] == -1:
                # Former noise within reach of a core point: border point.
                # It is claimed but not expanded.
                self.labels[j] = cluster_id
            elif self.labels[j] == 0:
                self.labels[j] = cluster_id
                reachable = self._get_neighbors(X, j)
                if len(reachable) >= self.min_samples:
                    # j is a core point, so its neighbourhood belongs to the
                    # cluster too. Only unvisited or noise samples still need
                    # processing; duplicates in the frontier are harmless
                    # because a labelled sample is skipped when dequeued.
                    pending = reachable[self.labels[reachable] <= 0]
                    frontier.extend(pending)

## 2. Testing on Non-Linear Clusters (Moons)

In [None]:
# Synthetic two-moons data set; the ground-truth labels are discarded because
# clustering is unsupervised.
X, _ = make_moons(n_samples=200, noise=0.05, random_state=42)

# fit() returns the estimator itself, so construction and fitting are chained.
dbscan = DBSCAN(eps=0.2, min_samples=5).fit(X)

# Colour each sample by the label our implementation assigned to it.
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(first_feature, second_feature, c=dbscan.labels, cmap='viridis')
plt.title("DBSCAN Clustering (Non-Linear Shapes)")
plt.show()

## 3. Comparison with Sklearn

In [None]:
# Reference run with the same hyper-parameters as our implementation.
sk_db = SklearnDBSCAN(eps=0.2, min_samples=5)
sk_labels = sk_db.fit_predict(X)

# NOTE: the two label sets use different numbering. Our DBSCAN numbers
# clusters from 1, scikit-learn from 0; both use -1 for noise. Compare the
# number of distinct labels rather than the raw values.
our_unique = np.unique(dbscan.labels)
sk_unique = np.unique(sk_labels)
print(f"Unique labels (Our): {our_unique}")
print(f"Unique labels (Sklearn): {sk_unique}")