# Setup

### Imports

In [1]:
import pandas as pd
from pylab import rcParams
import seaborn as sb
import matplotlib.pyplot as plt

import sklearn
from sklearn.cluster import DBSCAN
from collections import Counter
import datetime

### Setup data visualisation params for Jupyter

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size as (width, height).
# NOTE(review): a height of 0.1 is unusually small — presumably meant for a
# thin, timeline-like strip plot; confirm this is intentional.
rcParams['figure.figsize'] = 15, 0.1
# Apply seaborn's 'whitegrid' style to subsequent plots.
sb.set_style('whitegrid')

### Getting the dataset

In [3]:
# Path to the JSON dataset, relative to this notebook's directory.
address = '../datasets/staandelamp_realistic.json'
# Load the JSON file into a DataFrame; the preview further down shows the
# columns 'name', 'state' and 'time'.
df_data = pd.read_json(address)

##### Sort the data on timestamp

In [4]:
# Order the events chronologically. get_too_large_clusters() measures a
# cluster's duration from its first and last rows, so it relies on this order.
df_data = df_data.sort_values(by=['time'])

##### <font color='red'>TEMP</font> Cut off the dataset (Grab around 6 hours of timestamps) <font color='red'>TEMP</font>

In [5]:
# TEMP: keep only the first 42 (time-sorted) rows; per the heading above this
# corresponds to roughly six hours of events.
df_data = df_data[:42]
# Preview the first rows of the truncated dataset.
df_data.head()

Unnamed: 0,name,state,time
2,Staande_Lamp_3,0,1509489940655
6,Staande_Lamp_5,1,1509490011225
0,Staande_Lamp_1,1,1509491943009
1,Staande_Lamp_2,0,1509492221471
3,Staande_Lamp_3,1,1509492826941


# Functions

### Cleaning the dataset

In [6]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

def clean_dataframe_for_fitting(df):
    """Return a numeric copy of ``df`` that can be passed to the clustering model.

    Every column is label-encoded with its own ``LabelEncoder`` (one encoder
    per column name). The ``'time'`` column is then restored to its original
    values so that distances along it keep their original scale.

    Args:
        df: DataFrame to encode; must contain a ``'time'`` column.

    Returns:
        A new DataFrame with the same columns, label-encoded except ``'time'``.
        The encoders themselves are not returned.
    """
    encoders = defaultdict(LabelEncoder)

    def _encode(column):
        # Accessing the defaultdict creates a fresh encoder for each column name.
        return encoders[column.name].fit_transform(column)

    encoded = df.apply(_encode)
    # Undo the encoding of the timestamps: keep the raw values.
    encoded['time'] = df['time']
    return encoded

### Fit the DBSCAN model

In [7]:
# Value passed to DBSCAN as ``min_samples`` by fit_model().
min_samples_untill_its_a_cluster = 2


def fit_model(df, eps_distance_in_milliseconds):
    """Fit a DBSCAN model on ``df`` and return it.

    Args:
        df: Numeric DataFrame to cluster (e.g. the output of
            ``clean_dataframe_for_fitting``).
        eps_distance_in_milliseconds: Value passed to DBSCAN as ``eps``.
            NOTE(review): DBSCAN is fitted on every column of ``df``, not only
            on ``'time'`` — confirm the other (encoded) columns are meant to
            contribute to the distance.

    Returns:
        The result of ``DBSCAN(...).fit(df)``.
    """
    clusterer = DBSCAN(
        min_samples=min_samples_untill_its_a_cluster,
        eps=eps_distance_in_milliseconds,
    )
    return clusterer.fit(df)

### Get information from the model

In [8]:
def get_model_info(model):
    """Summarise the labels of a fitted clustering model.

    Args:
        model: Fitted model exposing ``labels_`` (an array with one label per
            datapoint, where ``-1`` marks an outlier).

    Returns:
        dict with the keys:
            ``'amount_of_datapoints'``: number of labels.
            ``'amount_of_outliers'``: number of datapoints labelled ``-1``.
            ``'amount_of_clusters'``: highest cluster id + 1, or ``0`` when
                every datapoint is an outlier (or there are no datapoints).
            ``'datapoints_per_cluster_dict'``: ``Counter`` of datapoints per
                label, including the outlier label ``-1`` when present.
    """
    label_counts = Counter(model.labels_)

    # Outliers (-1) are not a cluster. Filtering them out, instead of popping
    # the key, also works when the model produced no outliers at all.
    cluster_ids = [label for label in label_counts if label != -1]
    amount_of_clusters = max(cluster_ids) + 1 if cluster_ids else 0

    return {
        'amount_of_datapoints': model.labels_.size,
        # A Counter returns 0 for a missing key, so this is safe without outliers.
        'amount_of_outliers': label_counts[-1],
        'amount_of_clusters': amount_of_clusters,
        'datapoints_per_cluster_dict': label_counts,
    }

In [9]:
def split_dataframe_on_cluster(model, df):
    """Split ``df`` into one DataFrame per cluster found by ``model``.

    Unlike a column assignment on ``df``, this does not modify the caller's
    DataFrame, so the same frame can be fitted and split again without a stale
    ``'cluster'`` column leaking into the features.

    Args:
        model: Fitted model exposing ``labels_`` with one label per row of
            ``df`` (``-1`` marks an outlier).
        df: The DataFrame the model was fitted on.

    Returns:
        dict mapping each cluster id in ``range(highest_id + 1)`` to a new
        DataFrame holding that cluster's rows. Outliers are left out and the
        returned frames never contain a ``'cluster'`` column. An empty dict is
        returned when there are no clusters.

    Raises:
        ValueError: if the number of labels differs from the number of rows.
    """
    # A plain array makes the boolean masks below select rows by position,
    # independent of df's index.
    labels = pd.Series(model.labels_).values
    if len(labels) != len(df):
        raise ValueError(
            'Length of model.labels_ (%d) does not match length of df (%d)'
            % (len(labels), len(df))
        )

    # Cluster ids start at 0; -1 (outliers) therefore yields zero clusters.
    amount_of_clusters = int(max(labels, default=-1)) + 1

    # Work on a copy without any pre-existing 'cluster' column so the output
    # only carries the feature columns.
    features = df.drop(columns=['cluster'], errors='ignore')

    return {
        idx: features.loc[labels == idx].copy()
        for idx in range(amount_of_clusters)
    }

In [10]:
def get_too_large_clusters(cluster_dict, limit_in_milliseconds):
    """Select the clusters whose time span exceeds ``limit_in_milliseconds``.

    The span of a cluster is the ``'time'`` value of its last row minus that of
    its first row, so each DataFrame is expected to be ordered by time.

    Args:
        cluster_dict: dict mapping cluster id to a DataFrame with a ``'time'``
            column.
        limit_in_milliseconds: Maximum allowed span; a cluster is selected only
            when its span is strictly greater than this value.

    Returns:
        dict mapping cluster id to the (unchanged) DataFrame of every cluster
        that is too large.
    """
    oversized = {}

    for cluster_id, cluster_df in cluster_dict.items():
        times = cluster_df['time']
        start = times.iloc[0]
        end = times.iloc[-1]

        span_in_milliseconds = end - start
        if span_in_milliseconds > limit_in_milliseconds:
            oversized[cluster_id] = cluster_df

    return oversized

# Program

- (300000 milliseconds = 5 minutes)

In [11]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

def do_shit(df, eps, iteration=0):
    """Cluster ``df`` and recursively re-cluster every cluster that is too large.

    The frame is fitted with ``fit_model`` using ``eps`` and split per cluster.
    Every cluster whose time span exceeds ``eps`` is clustered again with half
    the ``eps``. Progress is printed as an indented tree; nothing is returned.

    Args:
        df: Numeric DataFrame with a ``'time'`` column.
        eps: DBSCAN ``eps``; also used as the maximum allowed cluster span.
        iteration: Recursion depth, used only to indent the printed output.
    """
    # Four spaces per recursion level, followed by a tree marker.
    prefix = '    ' * iteration + '|--'

    clusters = split_dataframe_on_cluster(fit_model(df, eps), df)
    oversized = get_too_large_clusters(clusters, eps)

    if oversized:
        print(prefix, len(oversized), 'cluster(s) is/are too large with an EPS of ', eps, '\n')
    else:
        print(prefix, 'no clusters are too large :) :) with an EPS of ', eps, '\n')

    halved_eps = eps / 2
    for cluster_id, cluster_df in oversized.items():
        # NOTE(review): the cluster id is only reported when eps is exactly
        # 300000 — presumably meant as "top-level call only"; confirm whether
        # this should test the recursion depth instead.
        if eps == 300000:
            print(prefix, 'CLUSTER WITH ID', cluster_id, 'is too large')

        print(prefix, 'RUNNING IT AGAIN with an EPS of ', halved_eps)
        do_shit(cluster_df, halved_eps, iteration + 1)

In [12]:

# Initial DBSCAN eps in milliseconds: 300000 ms = 5 minutes.
five_minutes = 300000
# Encode the dataset into a numeric frame (the 'time' column keeps its raw values).
df_fit = clean_dataframe_for_fitting(df_data)

print('clustering bulshit...')
# Cluster, then recursively re-cluster any cluster spanning more than eps.
do_shit(df_fit, five_minutes)

clustering bulshit...
|-- 2 cluster(s) is/are too large with an EPS of  300000 

|-- CLUSTER WITH ID 6 is too large
|-- RUNNING IT AGAIN with an EPS of  150000.0
    |-- no clusters are too large :) :) with an EPS of  150000.0 

|-- CLUSTER WITH ID 10 is too large
|-- RUNNING IT AGAIN with an EPS of  150000.0
    |-- 1 cluster(s) is/are too large with an EPS of  150000.0 

    |-- RUNNING IT AGAIN with an EPS of  75000.0
        |-- no clusters are too large :) :) with an EPS of  75000.0 



### Expected cluster 7 (id=6) and cluster 11 (id=10) to be too large with an EPS of 300000 ms (5 minutes)

![title](../datasets/images/expected_6_hours_realistic_legend.png)
![title](../datasets/images/expected_6_hours_realistic_sub_clustering.png)