# Setup

### Imports

In [1]:
import pandas as pd
from pylab import rcParams
import seaborn as sb
import matplotlib.pyplot as plt

import sklearn
from sklearn.cluster import DBSCAN
from collections import Counter
import datetime

### Setup data visualisaton params for Jupyter

In [2]:
%matplotlib inline
rcParams['figure.figsize'] = 15, 0.1
sb.set_style('whitegrid')

### Getting the dataset

In [3]:
address = '../datasets/staandelamp_realistic.json'
df_data = pd.read_json(address)

##### Sort the data on timestamp

In [4]:
df_data = df_data.sort_values(by=['time'])

##### <font color='red'>TEMP</font> Cut off the dataset (Grab around 6 hours of timestamps) <font color='red'>TEMP</font>

In [5]:
df_data
df_data.head()

Unnamed: 0,name,state,time
2,Staande_Lamp_3,0,1509489940655
6,Staande_Lamp_5,1,1509490011225
0,Staande_Lamp_1,1,1509491943009
1,Staande_Lamp_2,0,1509492221471
3,Staande_Lamp_3,1,1509492826941


# Functions

### Cleaning the dataset

In [6]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

def clean_dataframe_for_fitting(df):
    d = defaultdict(LabelEncoder)
    df_fit = df.apply(lambda x: d[x.name].fit_transform(x))
    df_fit['state'] = df['state']
    df_fit['time'] = df['time']
    return df_fit

### Fit the DBSCAN model

In [7]:
min_samples_untill_its_a_cluster = 2

def fit_model(df, eps_distance_in_milliseconds):
    model = DBSCAN(
        eps=eps_distance_in_milliseconds, 
        min_samples=min_samples_untill_its_a_cluster
    ).fit(df)
    return model

### Get information from the model

In [8]:
def get_model_info(model):
    info_dict = {}
    info_dict['amount_of_datapoints'] = model.labels_.size
    info_dict['amount_of_outliers'] = Counter(model.labels_)[-1]
    
    
    cluster_data_count = Counter(model.labels_)
    if -1 in cluster_data_count:
        cluster_data_count.pop(-1) # don't count outliers as a cluster
    if (bool(cluster_data_count)):
        amount_of_clusters = max(cluster_data_count) + 1
    else:
        amount_of_clusters = 0;
    info_dict['amount_of_clusters'] = amount_of_clusters
    info_dict['datapoints_per_cluster_dict'] = Counter(model.labels_)
    return info_dict

In [9]:
def split_dataframe_on_cluster(model, df):
    df['cluster'] = model.labels_
    
    cluster_dict = {}
    
    amount_of_clusters = get_model_info(model)['amount_of_clusters']
    
    for idx in range(amount_of_clusters):
        cluster_dict[idx] = df.loc[df['cluster'] == idx].drop(columns=['cluster'])

    return cluster_dict

In [10]:
def get_too_large_clusters(cluster_dict, limit_in_milliseconds):
    too_large_clusters_dataframes_dict = {}

    for idx, df in cluster_dict.items():

        first_time = df['time'].iloc[0]
        last_time = df['time'].iloc[df['time'].size - 1]

        diffrence_in_miliseconds = last_time - first_time
        # diffrence_in_minutes = diffrence_in_miliseconds / 1000 / 60

        if diffrence_in_miliseconds > limit_in_milliseconds:
            too_large_clusters_dataframes_dict[idx] = df

    return too_large_clusters_dataframes_dict

In [11]:
def get_perfect_size_clusters(cluster_dict, limit_in_milliseconds):
    perfect_size_clusters_dataframes_dict = {}

    for idx, df in cluster_dict.items():

        first_time = df['time'].iloc[0]
        last_time = df['time'].iloc[df['time'].size - 1]

        diffrence_in_miliseconds = last_time - first_time
        # diffrence_in_minutes = diffrence_in_miliseconds / 1000 / 60

        if diffrence_in_miliseconds <= limit_in_milliseconds:
            perfect_size_clusters_dataframes_dict[idx] = df

    return perfect_size_clusters_dataframes_dict

# Program

- (300000 milliseconds = 5 minutes)

In [12]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

def do_shit(df, eps, iteration=0, cluster_arr=[]):
    
    model = fit_model(df, eps)
    cluster_dict = split_dataframe_on_cluster(model, df)
    
    too_large_clusters_dict = get_too_large_clusters(cluster_dict, eps * 1.5)
    perfect_size_clusters = get_perfect_size_clusters(cluster_dict, eps * 1.5)
    
    for idx, df in too_large_clusters_dict.items():
        cluster_arr + do_shit(too_large_clusters_dict[idx], eps / 2, iteration + 1, cluster_arr)
    
    for idx, df in perfect_size_clusters.items():
        cluster_arr.append(df)
    
    return cluster_arr



five_minutes = 300000

one_week_in_milliseconds = (1000 * 60 * 60 * 24 * 7)
last_timestamp = df_data['time'].max()

weeks_clusters = []
for week in range(3):
    df_week_x = df_data[df_data['time'] >= last_timestamp - ((week + 1) * one_week_in_milliseconds)]
    df_week_x = df_week_x[df_week_x['time'] < last_timestamp - (week * one_week_in_milliseconds)]
    df_fit_1 = clean_dataframe_for_fitting(df_week_x.loc[df_week_x['state'] == 1])
    df_fit_0 = clean_dataframe_for_fitting(df_week_x.loc[df_week_x['state'] == 0])
    cluster_arr1 = do_shit(df_fit_1, five_minutes)
    cluster_arr2 = do_shit(df_fit_0, five_minutes)
    cluster_arr = cluster_arr1 + cluster_arr2
    weeks_clusters.append(cluster_arr)


print(len(weeks_clusters[0]))
print(len(weeks_clusters[1]))
print(len(weeks_clusters[2]))

440
852
1268


In [13]:
from functools import reduce
week_hashcodes = []
# hashcodes_3_weeks = []
# hashcodes_last_week = []
for week, cluster_arr in enumerate(weeks_clusters):
    week_hashcodes.append([])
    for idx, df in enumerate(cluster_arr):
        cluster = []
        for row in df.iterrows():
            index, data = row
            cluster.append(data['name'].tolist())

        cluster = list(set(cluster))

        hashcodedingus = 0
        for lamp in cluster:
            hashcodedingus += pow(2, lamp)

        if(len(cluster) > 1):
            week_hashcodes[week].append(hashcodedingus)
            # hashcodes_3_weeks.append(hashcodedingus)
            # if week == 0:
            #     hashcodes_last_week.append(hashcodedingus)
    
print(len(week_hashcodes[0]))
print(len(week_hashcodes[1]))
print(len(week_hashcodes[2]))
print(week_hashcodes)

388
746
1102
[[20, 17, 5, 25, 6, 18, 6, 6, 6, 20, 12, 18, 6, 5, 20, 12, 18, 10, 12, 6, 24, 14, 22, 20, 5, 6, 5, 20, 9, 6, 21, 6, 20, 5, 5, 24, 6, 6, 5, 17, 6, 12, 5, 13, 12, 9, 22, 24, 20, 21, 17, 5, 10, 5, 10, 5, 20, 3, 10, 9, 6, 6, 12, 24, 10, 20, 10, 5, 20, 12, 5, 12, 20, 17, 6, 6, 12, 21, 13, 17, 12, 17, 7, 6, 12, 5, 6, 3, 20, 6, 17, 5, 5, 6, 9, 24, 6, 5, 12, 20, 5, 12, 10, 20, 18, 20, 13, 12, 6, 12, 5, 12, 20, 10, 20, 5, 12, 12, 20, 5, 17, 10, 25, 26, 17, 5, 10, 20, 21, 5, 6, 12, 5, 20, 22, 6, 5, 20, 6, 12, 9, 17, 6, 5, 6, 26, 20, 3, 6, 12, 28, 6, 9, 12, 10, 20, 5, 18, 12, 20, 24, 20, 20, 20, 6, 20, 28, 5, 5, 9, 9, 18, 22, 6, 20, 20, 5, 24, 22, 9, 6, 12, 7, 20, 12, 12, 5, 3, 20, 6, 12, 3, 20, 20, 20, 17, 5, 25, 6, 18, 6, 6, 6, 20, 12, 18, 6, 5, 20, 12, 18, 10, 12, 6, 24, 14, 22, 20, 5, 6, 5, 20, 9, 6, 21, 6, 20, 5, 5, 24, 6, 6, 5, 17, 6, 12, 5, 13, 12, 9, 22, 24, 20, 21, 17, 5, 10, 5, 10, 5, 20, 3, 10, 9, 6, 6, 12, 24, 10, 20, 10, 5, 20, 12, 5, 12, 20, 17, 6, 6, 12, 21, 13, 17, 12

In [37]:
count_dict = {}
for week, hashcodes_arr in enumerate(week_hashcodes):
    for i in hashcodes_arr:
        if i in count_dict:
            count_dict[i]['occurance_week_' + str(week)] += 1
        else:
            count_dict[i] = {}
            count_dict[i]['occurance_week_0'] = 0
            count_dict[i]['occurance_week_1'] = 0
            count_dict[i]['occurance_week_2'] = 0
            count_dict[i]['occurance_week_' + str(week)] += 1

count_dict

{20: {'occurance_week_0': 68,
  'occurance_week_1': 130,
  'occurance_week_2': 184},
 17: {'occurance_week_0': 20, 'occurance_week_1': 36, 'occurance_week_2': 52},
 5: {'occurance_week_0': 62, 'occurance_week_1': 122, 'occurance_week_2': 170},
 25: {'occurance_week_0': 4, 'occurance_week_1': 6, 'occurance_week_2': 10},
 6: {'occurance_week_0': 66, 'occurance_week_1': 118, 'occurance_week_2': 154},
 18: {'occurance_week_0': 12, 'occurance_week_1': 24, 'occurance_week_2': 36},
 12: {'occurance_week_0': 54, 'occurance_week_1': 84, 'occurance_week_2': 146},
 10: {'occurance_week_0': 22, 'occurance_week_1': 36, 'occurance_week_2': 56},
 24: {'occurance_week_0': 14, 'occurance_week_1': 28, 'occurance_week_2': 46},
 14: {'occurance_week_0': 2, 'occurance_week_1': 12, 'occurance_week_2': 18},
 22: {'occurance_week_0': 10, 'occurance_week_1': 14, 'occurance_week_2': 24},
 9: {'occurance_week_0': 18, 'occurance_week_1': 42, 'occurance_week_2': 68},
 21: {'occurance_week_0': 8, 'occurance_week_1'

In [47]:
threshold = 75
threshold_perc = 90
for key,val in count_dict.items():
    occurances_3_weeks = val['occurance_week_0'] + val['occurance_week_1'] + val['occurance_week_2']
    if occurances_3_weeks >= threshold:
        div = (occurances_3_weeks / threshold)
        count = 0
        perc = threshold_perc
        while div > 1:
            count += 1
            div /= 2
            perc += ((100 - threshold_perc) / 2) * (1 / count)
            count_dict[key]['is_predicted_group_percentage'] = perc
            
    else:
        count_dict[key]['is_predicted_group_percentage'] = round(
            (occurances_3_weeks / threshold) * threshold_perc, 2
        )

        
##### TODO: dat zeg ik, GAMMA!
for key,val in count_dict.items():
    count_dict[key]['is_relevant_group_percentage'] = 'unknown'
##### TODO: dat zeg ik, GAMMA!

pprint.pprint(count_dict)

{3: {'is_predicted_group_percentage': 95.0,
     'is_relevant_group_percentage': 'unknown',
     'occurance_week_0': 10,
     'occurance_week_1': 30,
     'occurance_week_2': 42},
 5: {'is_predicted_group_percentage': 99.16666666666667,
     'is_relevant_group_percentage': 'unknown',
     'occurance_week_0': 62,
     'occurance_week_1': 122,
     'occurance_week_2': 170},
 6: {'is_predicted_group_percentage': 99.16666666666667,
     'is_relevant_group_percentage': 'unknown',
     'occurance_week_0': 66,
     'occurance_week_1': 118,
     'occurance_week_2': 154},
 7: {'is_predicted_group_percentage': 28.8,
     'is_relevant_group_percentage': 'unknown',
     'occurance_week_0': 4,
     'occurance_week_1': 8,
     'occurance_week_2': 12},
 9: {'is_predicted_group_percentage': 95.0,
     'is_relevant_group_percentage': 'unknown',
     'occurance_week_0': 18,
     'occurance_week_1': 42,
     'occurance_week_2': 68},
 10: {'is_predicted_group_percentage': 95.0,
      'is_relevant_group_pe