In [2]:
import numpy as np
from io import StringIO
from collections import deque
import mylibrary as mylib
from mylibrary import euclidean_distance
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm

%matplotlib inline

In [3]:
class Point:
    """A data point together with the bookkeeping flags used while clustering.

    Attributes are plain instance fields:
        loc        -- the point's coordinates, stored exactly as passed in.
        is_visited -- visited flag, initially False.
        is_noise   -- noise flag, initially False.
        cluster_id -- id of the assigned cluster; -1 is the initial
                      "not assigned" value.
    """

    def __init__(self, loc):
        self.loc = loc
        self.is_visited = False
        self.is_noise = False
        self.cluster_id = -1

    def label_noise(self, is_noise=True):
        """Set the noise flag (default) or clear it with ``is_noise=False``."""
        self.is_noise = is_noise

    def label_cluster_id(self, c_id=-1):
        """Store ``c_id`` as the cluster id; the default -1 marks it unassigned."""
        self.cluster_id = c_id

    def label_is_visited(self, is_visited=True):
        """Set the visited flag (default) or clear it with ``is_visited=False``."""
        # Honour the argument: the flag used to be hard-coded to True, so
        # label_is_visited(False) could never reset a point.
        self.is_visited = is_visited

    def __repr__(self):
        return f"{self.loc!s} in cluster: {self.cluster_id!s}"

In [4]:
def preprocess(data):
    """Wrap each entry of ``data`` in a ``Point`` and return them as a list.

    The input order is kept; every entry becomes the ``loc`` of its Point.
    """
    return [Point(raw_loc) for raw_loc in data]

In [5]:
def region_query(data, pt, eps):
    """Return the points of ``data`` that lie within ``eps`` of ``pt``.

    Distance is whatever ``euclidean_distance`` reports for the two points'
    ``loc`` attributes, and the comparison is inclusive (``<= eps``).
    The result is a new list in the same order as ``data``.
    """
    return [
        candidate
        for candidate in data
        if euclidean_distance(candidate.loc, pt.loc) <= eps
    ]

In [6]:
def dbscan(data, eps, min_pts):
    """Label every point in ``data`` in place and return the cluster count.

    Points are scanned in order. A point whose ``cluster_id`` is still -1 has
    its eps-neighborhood looked up with ``region_query``; if that neighborhood
    holds fewer than ``min_pts`` points the point is flagged as noise,
    otherwise a new cluster id (1, 2, ...) is started and grown from it with
    ``expand_cluster``.

    Returns the last cluster id handed out, i.e. the number of clusters started.
    """
    n_clusters = 0
    for candidate in data:
        # Already claimed by a cluster that was grown earlier in the scan.
        if candidate.cluster_id != -1:
            continue

        nearby = region_query(data, candidate, eps)
        if len(nearby) < min_pts:
            # Too sparse to seed a cluster.
            candidate.label_noise(True)
            continue

        n_clusters += 1
        expand_cluster(data, candidate, nearby, n_clusters, eps, min_pts)
    return n_clusters

In [7]:
def expand_cluster(data, pt, neighbors, cluster_id, eps, min_pts):
    """Grow cluster ``cluster_id`` outward from the seed point ``pt``.

    ``pt`` is labelled first, then the remaining ``neighbors`` are processed
    breadth-first:
      * a point currently flagged as noise is un-flagged and joins the
        cluster, but is not expanded further;
      * a point with ``cluster_id == -1`` joins the cluster and, if its own
        eps-neighborhood (via ``region_query``) holds at least ``min_pts``
        points, that neighborhood is queued as well;
      * points that already carry a cluster id are left alone.

    Points are modified in place. ``neighbors`` itself is not modified.
    Returns ``data`` unchanged (same object).
    """
    pt.label_cluster_id(cluster_id)

    # Work on a filtered copy instead of neighbors.remove(pt): the caller's
    # list stays intact, and a seed that is missing from ``neighbors`` no
    # longer raises ValueError.
    queue = deque(other for other in neighbors if other is not pt)

    while queue:
        neighbor = queue.popleft()
        if neighbor.is_noise:
            neighbor.label_noise(False)
            neighbor.label_cluster_id(cluster_id)
        if neighbor.cluster_id == -1:
            neighbor.label_cluster_id(cluster_id)
            neighbor_neighbors = region_query(data, neighbor, eps)
            if len(neighbor_neighbors) >= min_pts:
                # Only queue points that can still change; points that already
                # have a cluster id (and are not noise) would be popped and
                # skipped anyway, so leaving them out just keeps the queue small.
                queue.extend(
                    other
                    for other in neighbor_neighbors
                    if other.cluster_id == -1 or other.is_noise
                )
    return data

## Test

In [8]:
# Test fixture: generate data with mylib and wrap each entry in a Point.
data = mylib.generate_data()
data = preprocess(data)  # note: `data` is rebound from the generated data to a list of Point objects
eps = 0.5  # neighborhood radius passed to dbscan in the next cell
pt = data[0]  # first point; not used by the cells visible here — TODO confirm it is still needed
min_pts = 5  # minimum neighborhood size passed to dbscan in the next cell

In [9]:
# Cluster the points in place; the cell output is dbscan's return value,
# i.e. the last cluster id it handed out.
dbscan(data, eps, min_pts)

4

In [10]:
# Show every point with its assigned cluster id (rendered via Point.__repr__).
data

[[0.82679503 0.55753602] in cluster: 1,
 [1.14800094 1.0665746 ] in cluster: 1,
 [1.13428448 0.95745916] in cluster: 1,
 [-0.03032689  0.17266964] in cluster: 1,
 [1.02846725 1.85834606] in cluster: 1,
 [0.80001344 1.71216282] in cluster: 1,
 [0.22608323 1.22477703] in cluster: 1,
 [1.71143456 0.87102596] in cluster: 1,
 [0.19877022 0.66565585] in cluster: 1,
 [0.3788093  1.07324723] in cluster: 1,
 [1.63684038 0.2461589 ] in cluster: 1,
 [0.87884682 0.33827679] in cluster: 1,
 [1.58327623 0.96517425] in cluster: 1,
 [0.99777135 1.46304385] in cluster: 1,
 [1.26421435 1.41995528] in cluster: 1,
 [0.3368284  1.26362059] in cluster: 1,
 [1.16739536 1.10379134] in cluster: 1,
 [1.42590506 1.50813708] in cluster: 1,
 [1.1464513  0.56138169] in cluster: 1,
 [0.90052813 0.79226112] in cluster: 1,
 [0.71151073 0.55326079] in cluster: 1,
 [1.06873632 1.21535153] in cluster: 1,
 [1.25529108 0.35385239] in cluster: 1,
 [1.29457478 1.08873061] in cluster: 1,
 [0.41419556 0.52199105] in cluster: 1