In [1]:
import re, csv, sys
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from scipy.spatial import distance
import numpy as np
import pandas as pd
import mnist
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

In [2]:


class Cluster(object):
    
    def __init__(self,  name, dim):
        self.name = name
        self.dim = dim
        self.points = []
    
    def add_point(self, point):
        self.points.append(point)
        
    def get_points(self):
        return self.points
    
    def erase(self):
        self.points = []
    
    def get_X(self):
        return [p[0] for p in self.points]
    
    def get_Y(self):
        return [p[1] for p in self.points]

    def get_Z(self):
        if self.dim > 2:
            return [p[2] for p in self.points]
        return None
    
    def has(self, point):
        return point in self.points
            
    def __str__(self):
        return "%s: %d points" % (self.name, len(self.points))
    

In [9]:
class DBScanner:
    
    def __init__(self, config):
        self.eps = config['eps']
        self.min_pts = config['min_pts']
        self.dim = config['dim']
        self.clusters = set()
        self.cluster_count = 0
        self.visited = []
        self.color = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
    
    def dbscan(self, data):
        self.init_params()
        self.data = data

        ## Setting up the plot
        #fig = plt.figure()

        #axis_proj = 'rectilinear'
        #if self.dim > 2:
        #    axis_proj = '%dd' % self.dim

        #ax = fig.add_subplot(111, projection = axis_proj)
        
        #default noise cluster
        noise = Cluster('Noise', self.dim)
        self.clusters.add(noise)
        count = 0
        data_len = len(data)
        
        for point in data:
            
            count += 1
            if (((count/data_len)*100 % 0.1) == 0): print ((count/data_len)*100)
            
            if point not in self.visited:
                self.visited.append(point)
                neighbour_pts = self.region_query(point)
                if len(neighbour_pts) < self.min_pts:
                    noise.add_point(point)
                else:
                    name = 'cluster-%d' % self.cluster_count
                    new_cluster = Cluster(name, self.dim)

                    self.cluster_count += 1
                    self.expand_cluster(new_cluster, point, neighbour_pts)
                    
                    #if self.dim == 2:
                        #ax.scatter(new_cluster.get_X(), new_cluster.get_Y(), c = self.color[self.cluster_count % len(self.color)],
                        #marker = 'o', label = name)
                    #elif self.dim == 3:
                        #ax.scatter(new_cluster.get_X(), new_cluster.get_Y(), new_cluster.get_Z(), marker = 'o', 
                        #c = self.color[self.cluster_count % len(self.color)], label = name)

                    #ax.hold(True)
        
        #if len(noise.get_points()) != 0:
            #if self.dim > 2:
            #    ax.scatter(noise.get_X(), noise.get_Y(), noise.get_Z(), marker = 'x', label = noise.name)
            #else:
            #    ax.scatter(noise.get_X(), noise.get_Y(), marker = 'x', label = noise.name)
        
        print ("Number of clusters found: %d" % self.cluster_count)
        
        #ax.hold(False)
        #ax.legend(loc='lower left')
        #ax.grid(True)
        #plt.title(r'DBSCAN Clustering', fontsize=18)
        #plt.show()
                    

    def expand_cluster(self, cluster, point, neighbour_pts):
        cluster.add_point(point)
        for p in neighbour_pts:
            if p not in self.visited:
                self.visited.append(p)
                np = self.region_query(p)
                if len(np) >= self.min_pts:
                    for n in np:
                        if n not in neighbour_pts:
                            neighbour_pts.append(n)
                    
                for other_cluster in self.clusters:
                    if not other_cluster.has(p):
                        if not cluster.has(p):
                            cluster.add_point(p)

                if self.cluster_count == 0:
                    if not cluster.has(p):
                        cluster.add_point(p)
                        
        self.clusters.add(cluster)
                    
    
    def get_distance(self, from_point, to_point):
        p1 = [from_point[k] for k in range(self.dim)]
        p2 = [to_point[k] for k in range(self.dim)]
        return distance.euclidean(p1, p2)

                     
    def region_query(self, point):
        result = []
        for d_point in self.data:
            if d_point != point:
                if self.get_distance(d_point, point) <= self.eps:
                    result.append(d_point)
        return result

    def init_params(self):
        self.clusters = set()
        self.cluster_count = 0
        self.visited = []


## Fashion MNIST

In [121]:
def get_fashion_data(fraction):
    
    fashion_train_df = pd.read_csv('../data/fashionmnist/fashion-mnist_train.csv').sample(frac=fraction)\
                        .reset_index().drop('index', axis=1)
    fashion_test_df = pd.read_csv('../data/fashionmnist/fashion-mnist_test.csv')    

    X_train = fashion_train_df.iloc[:,fashion_train_df.columns != 'label'].reset_index()
    X_test = fashion_test_df.iloc[:,fashion_test_df.columns != 'label']

    Y_train = fashion_train_df[['label']]
    Y_test =  fashion_test_df[['label']]    
    
    X_train.columns = ['id'] + [i for i in range(len(X_train.columns) -1)]
    
    return X_train.to_dict('records'), Y_train

## EPS 1000

In [166]:
%time
CONFIG = 'config'

data, labels = get_data(0.01)
config = {'eps':1250, 'min_pts': 10, 'dim': len(data[0].keys())-1}


dbc = DBScanner(config)
dbc.dbscan(data)

1.0
2.0
3.0
4.0
5.0
6.0
8.0
9.0
10.0
11.0
12.0
13.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
25.0
26.0
27.0
30.0
31.0
32.0
33.0
34.0
35.0
36.0
37.0
38.0
39.0
40.0
41.0
42.0
43.0
44.0
45.0
46.0
47.0
48.0
49.0
50.0
51.0
52.0
53.0
54.0
59.0
60.0
61.0
62.0
63.0
64.0
65.0
66.0
67.0
68.0
69.0
70.0
71.0
72.0
73.0
74.0
75.0
76.0
77.0
78.0
79.0
80.0
81.0
82.0
83.0
84.0
85.0
86.0
87.0
88.0
89.0
90.0
91.0
92.0
93.0
94.0
95.0
96.0
97.0
98.0
99.0
100.0
Number of clusters found: 2


In [168]:
[str(i) for i in list(dbc.clusters)]

['Noise: 475 points', 'cluster-1: 27 points', 'cluster-0: 98 points']

In [169]:
clusters = [None for i in range(len(data))]
cluster_names = [re.findall(r'\d+|Noise', str(i))[0] for i in list(dbc.clusters)]
i = 0

for cluster in list(dbc.clusters):
    for point in cluster.get_points():
        clusters[point['id']] = cluster_names[i]
    i += 1

In [170]:
train_labels = list(labels['label'])
purity_df = pd.DataFrame([clusters, train_labels]).transpose()
purity_df.columns = ['yhat_test', 'y_test']
purity_numerator = 0
purity_denominator = len(purity_df)

for cluster in purity_df['yhat_test'].unique():
    
    purity_numerator += int(purity_df[purity_df['yhat_test'] == cluster].groupby('y_test')\
                                .count()\
                                .sort_values('yhat_test', ascending=False)\
                                .iloc[0])
purity_numerator/purity_denominator

0.22833333333333333

In [171]:
gini_coeffs = np.zeros([len(purity_df['yhat_test'].unique()), 2])
i = 0

for cluster in clusters:

    if (i < len(set(clusters))):
        
        gini_coeffs[i][0] = sum(((purity_df[purity_df['yhat_test'] == cluster].groupby('y_test')\
                                    .count()\
                                    .sort_values('yhat_test', ascending=False)\
                            /len(purity_df[purity_df['yhat_test'] == cluster]))\
                            **2)\
                            ['yhat_test'])

        gini_coeffs[i][1] = len(purity_df[purity_df['yhat_test'] == cluster])

        i += 1
    
    else: break

sum(gini_coeffs[:,0] * gini_coeffs[:,1]) / sum(gini_coeffs[:,1])

0.11075457063711913

## EPS 1500

In [172]:
len(data[0].keys())

785

In [173]:
%time
CONFIG = 'config'

data, labels = get_data(0.01)
config = {'eps':1250, 'min_pts': 10, 'dim': len(data[0].keys())-1}


dbc = DBScanner(config)
dbc.dbscan(data)

1.0
2.0
3.0
4.0
5.0
6.0
8.0
9.0
10.0
11.0
12.0
13.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
25.0
26.0
27.0
30.0
31.0
32.0
33.0
34.0
35.0
36.0
37.0
38.0
39.0
40.0
41.0
42.0
43.0
44.0
45.0
46.0
47.0
48.0
49.0
50.0
51.0
52.0
53.0
54.0
59.0
60.0
61.0
62.0
63.0
64.0
65.0
66.0
67.0
68.0
69.0
70.0
71.0
72.0
73.0
74.0
75.0
76.0
77.0
78.0
79.0
80.0
81.0
82.0
83.0
84.0
85.0
86.0
87.0
88.0
89.0
90.0
91.0
92.0
93.0
94.0
95.0
96.0
97.0
98.0
99.0
100.0
Number of clusters found: 3


In [174]:
[str(i) for i in list(dbc.clusters)]

['cluster-1: 71 points',
 'cluster-0: 52 points',
 'cluster-2: 46 points',
 'Noise: 431 points']

In [175]:
clusters = [None for i in range(len(data))]
cluster_names = [re.findall(r'\d+|Noise', str(i))[0] for i in list(dbc.clusters)]
i = 0

for cluster in list(dbc.clusters):
    for point in cluster.get_points():
        clusters[point['id']] = cluster_names[i]
    i += 1

In [176]:
train_labels = list(labels['label'])
purity_df = pd.DataFrame([clusters, train_labels]).transpose()
purity_df.columns = ['yhat_test', 'y_test']
purity_numerator = 0
purity_denominator = len(purity_df)

for cluster in purity_df['yhat_test'].unique():
    
    purity_numerator += int(purity_df[purity_df['yhat_test'] == cluster].groupby('y_test')\
                                .count()\
                                .sort_values('yhat_test', ascending=False)\
                                .iloc[0])
purity_numerator/purity_denominator

0.2733333333333333

In [177]:
gini_coeffs = np.zeros([len(purity_df['yhat_test'].unique()), 2])
i = 0

for cluster in clusters:

    if (i < len(set(clusters))):
        
        gini_coeffs[i][0] = sum(((purity_df[purity_df['yhat_test'] == cluster].groupby('y_test')\
                                    .count()\
                                    .sort_values('yhat_test', ascending=False)\
                            /len(purity_df[purity_df['yhat_test'] == cluster]))\
                            **2)\
                            ['yhat_test'])

        gini_coeffs[i][1] = len(purity_df[purity_df['yhat_test'] == cluster])

        i += 1
    
    else: break

sum(gini_coeffs[:,0] * gini_coeffs[:,1]) / sum(gini_coeffs[:,1])

0.11106206361938188

## 20 NG

In [140]:
def get_20ng_data(fraction):
    
    news_train   = fetch_20newsgroups(data_home='../data/20newsgroups/', 
                                subset='train', 
                                remove=('headers', 'footers', 'quotes'))
    
    vectorizer      = CountVectorizer(binary=True, stop_words='english', min_df = fraction)
    
    X_train       = vectorizer.fit_transform(news_train.data)
    Y_train       = news_train.target 

    X_train    = pd.DataFrame(X_train.todense()).reset_index()
    
    X_train.columns = ['id'] + [i for i in range(len(X_train.columns) -1)]
    
    return X_train.to_dict('records'), Y_train

In [141]:
data, label = get_20ng_data(150)

In [195]:
config = {'eps':0.2, 'min_pts': 2, 'dim': len(data[0].keys())-2}

dbc = DBScanner(config)
data, labels = get_20ng_data(500)

dbc.dbscan(data)

50.0
100.0
Number of clusters found: 130


In [208]:
clusters = [None for i in range(len(data))]
cluster_names = [re.findall(r'\d+|Noise', str(i))[0] for i in list(dbc.clusters)]
i = 0

for cluster in list(dbc.clusters):
    for point in cluster.get_points():
        clusters[point['id']] = cluster_names[i]
    i += 1

In [209]:
train_labels = list(labels)
purity_df = pd.DataFrame([clusters, train_labels]).transpose()
purity_df.columns = ['yhat_test', 'y_test']
purity_numerator = 0
purity_denominator = len(purity_df)

for cluster in purity_df['yhat_test'].unique():
    
    purity_numerator += int(purity_df[purity_df['yhat_test'] == cluster].groupby('y_test')\
                                .count()\
                                .sort_values('yhat_test', ascending=False)\
                                .iloc[0])
purity_numerator/purity_denominator

0.07972423546049143

In [198]:
gini_coeffs = np.zeros([len(purity_df['yhat_test'].unique()), 2])
i = 0

for cluster in clusters:

    if (i < len(set(clusters))):
        
        gini_coeffs[i][0] = sum(((purity_df[purity_df['yhat_test'] == cluster].groupby('y_test')\
                                    .count()\
                                    .sort_values('yhat_test', ascending=False)\
                            /len(purity_df[purity_df['yhat_test'] == cluster]))\
                            **2)\
                            ['yhat_test'])

        gini_coeffs[i][1] = len(purity_df[purity_df['yhat_test'] == cluster])

        i += 1
    
    else: break

sum(gini_coeffs[:,0] * gini_coeffs[:,1]) / sum(gini_coeffs[:,1])

0.11106206361938188

In [211]:
purity_df

Unnamed: 0,yhat_test,y_test
0,Noise,7
1,Noise,4
2,Noise,4
3,Noise,1
4,Noise,14
5,Noise,16
6,Noise,13
7,Noise,3
8,Noise,2
9,Noise,4


## Housing Dataset

In [4]:
def get_housing_data(fraction):
    
    X_train = pd.read_csv('../data/household_power_consumption.txt', delimiter = ';').sample(frac=fraction)\
                        .reset_index().drop(['index', 'Date', 'Time'], axis=1).reset_index()
    
    X_train = X_train.loc[~ X_train['Global_active_power'].str.contains('\?', na=False),]
    
    X_train = X_train.astype('float')
    X_train.columns = ['id'] + [i for i in range(len(X_train.columns) -1)]
    
    return X_train.to_dict('records')
    return X_train

In [7]:
data = get_housing_data(0.001)
data

  if self.run_code(code, result):


[{0: 0.534, 1: 0.0, 2: 242.48, 3: 2.2, 4: 0.0, 5: 0.0, 6: 0.0, 'id': 0.0},
 {0: 0.246, 1: 0.104, 2: 241.57, 3: 1.0, 4: 0.0, 5: 1.0, 6: 1.0, 'id': 1.0},
 {0: 1.586, 1: 0.0, 2: 236.98, 3: 6.6, 4: 0.0, 5: 0.0, 6: 17.0, 'id': 2.0},
 {0: 0.284, 1: 0.062, 2: 244.24, 3: 1.2, 4: 0.0, 5: 0.0, 6: 0.0, 'id': 3.0},
 {0: 0.222, 1: 0.0, 2: 243.31, 3: 0.8, 4: 0.0, 5: 0.0, 6: 1.0, 'id': 4.0},
 {0: 1.392, 1: 0.206, 2: 238.63, 3: 5.8, 4: 0.0, 5: 1.0, 6: 18.0, 'id': 5.0},
 {0: 0.154, 1: 0.0, 2: 241.04, 3: 0.6, 4: 0.0, 5: 0.0, 6: 0.0, 'id': 6.0},
 {0: 0.44, 1: 0.114, 2: 241.09, 3: 1.8, 4: 0.0, 5: 0.0, 6: 1.0, 'id': 7.0},
 {0: 1.434, 1: 0.146, 2: 243.11, 3: 5.8, 4: 0.0, 5: 0.0, 6: 18.0, 'id': 8.0},
 {0: 0.412, 1: 0.198, 2: 241.22, 3: 1.8, 4: 0.0, 5: 1.0, 6: 1.0, 'id': 9.0},
 {0: 0.25, 1: 0.054, 2: 246.47, 3: 1.0, 4: 0.0, 5: 0.0, 6: 1.0, 'id': 10.0},
 {0: 1.126, 1: 0.052, 2: 243.65, 3: 4.6, 4: 0.0, 5: 0.0, 6: 1.0, 'id': 11.0},
 {0: 0.218, 1: 0.094, 2: 243.9, 3: 1.0, 4: 0.0, 5: 1.0, 6: 1.0, 'id': 12.0},
 {0:

In [10]:
config = {'eps':0.2, 'min_pts': 2, 'dim': len(data[0].keys())-2}

dbc = DBScanner(config)

dbc.dbscan(data)

Number of clusters found: 117


In [8]:
len(data)

2046

In [11]:
[str(i) for i in list(dbc.clusters)]

['cluster-29: 3 points',
 'cluster-72: 3 points',
 'cluster-39: 5 points',
 'cluster-88: 4 points',
 'cluster-48: 4 points',
 'cluster-50: 3 points',
 'cluster-65: 3 points',
 'cluster-31: 7 points',
 'cluster-51: 6 points',
 'cluster-34: 5 points',
 'cluster-83: 3 points',
 'cluster-49: 3 points',
 'cluster-76: 3 points',
 'cluster-71: 3 points',
 'cluster-32: 3 points',
 'cluster-59: 3 points',
 'cluster-84: 7 points',
 'cluster-57: 4 points',
 'cluster-38: 11 points',
 'cluster-89: 1 points',
 'cluster-58: 6 points',
 'cluster-35: 6 points',
 'cluster-30: 6 points',
 'cluster-53: 4 points',
 'cluster-77: 2 points',
 'cluster-44: 6 points',
 'cluster-47: 12 points',
 'cluster-80: 4 points',
 'cluster-78: 3 points',
 'cluster-69: 7 points',
 'cluster-28: 17 points',
 'cluster-33: 4 points',
 'cluster-90: 3 points',
 'cluster-75: 4 points',
 'cluster-85: 2 points',
 'cluster-43: 9 points',
 'cluster-63: 4 points',
 'cluster-52: 4 points',
 'cluster-54: 3 points',
 'cluster-81: 6 points