## Imports

In [1]:
import pandas as pd
from pylab import rcParams
import seaborn as sb
import matplotlib.pyplot as plt

import sklearn
from sklearn.cluster import DBSCAN
from collections import Counter
import datetime

##### Standard data visualisaton params for Jupyter

In [2]:

%matplotlib inline
rcParams['figure.figsize'] = 15, 0.1
sb.set_style('whitegrid')

## Getting the dataset

In [3]:
address = '../datasets/staandelamp_realistic.json'
df_data = pd.read_json(address)

##### Sort the data on timestamp

In [4]:
df_data = df_data.sort_values(by=['time'])

##### <font color='red'>TEMP</font> Cut off the dataset (Grab around 6 hours of timestamps) <font color='red'>TEMP</font>

In [5]:
df_data = df_data[:42]

##### Show the first 5 entrys of the dataset

In [6]:
df_data.head()

Unnamed: 0,name,state,time
2,Staande_Lamp_3,0,1509489940655
6,Staande_Lamp_5,1,1509490011225
0,Staande_Lamp_1,1,1509491943009
1,Staande_Lamp_2,0,1509492221471
3,Staande_Lamp_3,1,1509492826941


## Cleaning the dataset

##### Reformat the first column to iterables:
<font color='#ccc'>
    To prevent the following error when building the DBSCAN model<br>
    ValueError: could not convert string to float: 'Staande_Lamp_5'
</font> 

In [7]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

d = defaultdict(LabelEncoder)
df_fit = df_data.apply(lambda x: d[x.name].fit_transform(x))
df_fit['time'] = df_data['time']

##### Show the changes to the dataset

In [8]:
print('before:\n', df_data.head())
print('\n')
print('after:\n', df_fit.head())

before:
              name  state           time
2  Staande_Lamp_3      0  1509489940655
6  Staande_Lamp_5      1  1509490011225
0  Staande_Lamp_1      1  1509491943009
1  Staande_Lamp_2      0  1509492221471
3  Staande_Lamp_3      1  1509492826941


after:
    name  state           time
2     2      0  1509489940655
6     4      1  1509490011225
0     0      1  1509491943009
1     1      0  1509492221471
3     2      1  1509492826941


## Setup variables
(300000 milliseconds = 5 minutes)

In [9]:
five_minutes = 300000
min_samples_untill_its_a_cluster = 2


## Fit the model

In [10]:
model = DBSCAN(
    eps=five_minutes, 
    min_samples=min_samples_untill_its_a_cluster
).fit(df_fit)

## Clustering

In [11]:
# DEBUG DINGEN
print('datapoints per cluster (count)', Counter(model.labels_))
print('amount of clusters (count)', max(Counter(model.labels_)))
print('amount of data:', df_fit.shape[0])
print('amount of outliers:', df_fit[model.labels_==-1].shape[0])
print('\nsome of the outliers:\n', df_fit[model.labels_==-1].head())

datapoints per cluster (count) Counter({-1: 12, 2: 5, 6: 4, 10: 4, 7: 3, 0: 2, 1: 2, 3: 2, 4: 2, 5: 2, 8: 2, 9: 2})
amount of clusters (count) 10
amount of data: 42
amount of outliers: 12

some of the outliers:
     name  state           time
12     3      0  1509494311740
13     4      1  1509495178373
11     2      0  1509496790877
19     3      0  1509498728417
18     2      0  1509499036484


In [17]:
amount_of_clusters = max(Counter(model.labels_))
amount_of_clusters

10

## Add clusters to the dataset

In [13]:
df_data = df_data
df_data['cluster'] = model.labels_

##### Remove outliers

In [14]:
df_data = df_data.loc[df_data['cluster'] != -1]

In [15]:
df_data.head()

Unnamed: 0,name,state,time,cluster
2,Staande_Lamp_3,0,1509489940655,0
6,Staande_Lamp_5,1,1509490011225,0
0,Staande_Lamp_1,1,1509491943009,1
1,Staande_Lamp_2,0,1509492221471,1
3,Staande_Lamp_3,1,1509492826941,2


### Split up the clusters into multiple dataframes & remove the cluster column

In [34]:
cluster_dict = {}

for idx in range(amount_of_clusters):
    cluster_dict[idx] = df_data.loc[df_data['cluster'] == idx, ['name', 'state', 'time']]
    print('CLUSTER', idx)
    print(cluster_dict[idx])
    print('\n\n')


CLUSTER 0
             name  state           time
2  Staande_Lamp_3      0  1509489940655
6  Staande_Lamp_5      1  1509490011225



CLUSTER 1
             name  state           time
0  Staande_Lamp_1      1  1509491943009
1  Staande_Lamp_2      0  1509492221471



CLUSTER 2
             name  state           time
3  Staande_Lamp_3      1  1509492826941
4  Staande_Lamp_3      1  1509492860451
7  Staande_Lamp_1      1  1509492904889
5  Staande_Lamp_4      0  1509492910368
8  Staande_Lamp_2      0  1509492983553



CLUSTER 3
              name  state           time
9   Staande_Lamp_3      1  1509495776509
10  Staande_Lamp_3      1  1509496070897



CLUSTER 4
              name  state           time
16  Staande_Lamp_3      0  1509497372654
20  Staande_Lamp_5      0  1509497587456



CLUSTER 5
              name  state           time
14  Staande_Lamp_1      1  1509497980691
15  Staande_Lamp_2      0  1509498130379



CLUSTER 6
              name  state           time
27  Staande_Lamp_5    

In [64]:
too_large_clusters_dataframes_dict = {}

for idx, df in cluster_dict.items():
    
    first_time = df['time'].iloc[0]
    last_time = df['time'].iloc[df['time'].size - 1]
    
    diffrence_in_miliseconds = last_time - first_time
    diffrence_in_minutes = diffrence_in_miliseconds / 1000 / 60

    
    if diffrence_in_minutes > 5:
        print('cluster', idx)
        print('fisrt time in cluster -->', first_time)
        print('last time in cluster  -->', last_time)
        print('diff in milliseconds  -->', diffrence_in_miliseconds)
        print('diff in minutes       -->', diffrence_in_minutes)
        print('diff in minutes > 5 ? -->', diffrence_in_minutes > 5)
        print('\n')
        too_large_clusters_dataframes_dict[idx] = df

too_large_clusters_dataframes_dict

cluster 6
fisrt time in cluster --> 1509501685952
last time in cluster  --> 1509502163379
diff in milliseconds  --> 477427
diff in minutes       --> 7.957116666666667
diff in minutes > 5 ? --> True




{6:               name  state           time
 27  Staande_Lamp_5      1  1509501685952
 26  Staande_Lamp_4      1  1509501895396
 21  Staande_Lamp_1      1  1509502111561
 22  Staande_Lamp_2      1  1509502163379}

In [None]:
# count_dict = {}
# 
# for idx, row in df_data.iterrows():
#     if count_dict[row['cluster']] is None:
#         count_dict[row['cluster']] = 0
#         
#     count_dict[row['cluster']] += 1
# count_dict

##### Create a color dictionary for the scatter-plot

In [None]:
# COLOR_MAP = {}
# for n in set(model.labels_):
#     if n % 3 == 0:
#         COLOR_MAP[n] = 'firebrick'
#     elif n % 3 == 1:
#         COLOR_MAP[n] = 'darkgreen'
#     else:
#         COLOR_MAP[n] = 'darkblue'
# 
# #Outliers
# COLOR_MAP[-1] = 'lightgrey'
# 
# print('All of the datapoints:\n', model.labels_)
# print('\n')
# print('One color for each datapoint:\n', set(model.labels_))

##### Create the graph

In [None]:
# fig = plt.figure()
# ax = fig.add_axes([.1, .1, 1, 1])
# 
# colors = [COLOR_MAP[l] for l in model.labels_]
# 
# # colors = model.labels_
# 
# # use first column as X value
# X = df_fit.iloc[:,2]
# # set nr 1 as the value for y so it will be 1Dimentional
# Y = [1] * df_fit.shape[0]
# 
# ax.scatter(X, Y, c=colors, s=20)
# ax.set_xlabel = 'time'
# plt.title = 'Groupingz'

![title](../datasets/images/expected_6_hours_realistic_legend.png)
![title](../datasets/images/expected_6_hours_realistic_sub_clustering.png)