In [2]:
%matplotlib inline
import numpy
import matplotlib.pyplot
import math
from mpl_toolkits.mplot3d import Axes3D

from sklearn.cluster import DBSCAN
from sklearn import datasets
from sklearn import metrics

import pandas
iris = datasets.load_iris()
iris.keys()
iris_df = pandas.DataFrame(iris.data)
iris_df.columns = iris.feature_names
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [7]:
def distance_squared(x1,x2):
    squared_distance = 0
    for k in range(len(x1)):
        squared_distance += (x1[k] - x2[k])**2
    return squared_distance
class DBScan:
    def __init__(self, eps, min_sample, data):
        self.eps = eps
        self.min_sample = min_sample
        self.data = data
    def neighbors(self):
        #Lists are for quick implementation, probably should've used objects/classes
        self.neighbors_list = (list([] for i in range(len(self.data))))
        self.core_list = (list([] for i in range(len(self.data))))
        self.new_cores = (list([] for i in range(len(self.data))))
        self.label = list("" for i in range(len(self.data)))
        #The first for loop gets data points from the dataset and compares the distance 
        #between the values to determine if the point are lower than eps
        for x in range(0,len(self.data)):
            for y in range(x + 1, len(self.data)):
                rows = self.data.iloc[[x,y]]
                if distance_squared(rows.values[0], rows.values[1]) < self.eps**2:
                    self.neighbors_list[x].append(y)
                    self.neighbors_list[y].append(x)
        #neighbors_count = map(lambda x: len(x), self.neighbors_list)
        #Goes through all the unlabeled points and gives them a new label or labels them as "noise"
        for point in range(0, len(self.data)):
            cluster_label = point
            if (self.label[point] == ""):
                if len(self.neighbors_list[point]) < self.min_sample:
                    self.label[point] = "noise"
                else:
                    self.label[point] = cluster_label
                    #Once a point is given a new label, it'll give all its neighbors a new label
                    for neighbor in self.neighbors_list[point]:
                        if (self.label[neighbor] == "" or self.label[neighbor] == "noise"):
                            self.label[neighbor] = cluster_label
                            #Core points are placed into another list and they go through a similar process
                            if len(self.neighbors_list[neighbor]) > self.min_sample:
                                self.core_list[point].append(neighbor)
                                self.new_cores[point].append(neighbor)
                    self.core_loop(point, cluster_label)
        print(self.core_list)
        print(self.label)
    def core_loop(self, index, cluster_label):
        #more stupid for loop stuff
        #the core_list is to make sure points aren't iterated over again
        temp_new_cores = []
        for point in self.core_list[index]:
            for neighbor in self.neighbors_list[point]:
                if (self.label[neighbor] == "" or self.label[neighbor] == "noise"):
                    self.label[neighbor] = cluster_label
                    if len(self.neighbors_list[neighbor]) > self.min_sample:
                        if (neighbor not in self.core_list[index]):
                            self.core_list[index].append(neighbor)
                            temp_new_cores.append(neighbor)
        if temp_new_cores != []:
            #I couldn't figure out a way to avoid recursion, so this implementation would fail 
            #if there were enough data points to exceed stack limit
            self.new_cores[index] = temp_new_cores
            self.core_loop(index, cluster_label)
        
# dbscan = DBScan(0.43, 5, iris_df)
# dbscan.neighbors()
realModel = DBSCAN(eps = 0.43, min_samples = 5)
realModel.fit(iris_df)
print(iris.target)
print("My DBSCAN", metrics.adjusted_rand_score(iris.target, dbscan.label) )
print("Scikit Learn DBSCAN", metrics.adjusted_rand_score(iris.target, realModel.labels_))


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
My DBSCAN 0.6476102752155394
Scikit Learn DBSCAN 0.5058120433673868


I'm not quite sure why my DBSCAN would create 3 clusters as opposed to scikit learn's 2 clusters, my model somehow did better. However, I would assume scikit learn's might perform better with different datasets. Plus, even if scikit-learn is worse than my model, it still runs about 100x faster than my model. 