# Setup

### Imports

In [1]:
import pandas as pd
from pylab import rcParams
import seaborn as sb
import matplotlib.pyplot as plt

import sklearn
from sklearn.cluster import DBSCAN
from collections import Counter
import datetime

### Setup data visualisaton params for Jupyter

In [2]:
%matplotlib inline
rcParams['figure.figsize'] = 15, 0.1
sb.set_style('whitegrid')

### Getting the dataset

In [3]:
address = '../datasets/staandelamp_realistic_huge.json'
df_data = pd.read_json(address)

##### Sort the data on timestamp

In [4]:
df_data = df_data.sort_values(by=['time'])

In [5]:
df_data.head()

Unnamed: 0,name,state,time
1,Staande_Lamp_2,1,1509490125797
2,Staande_Lamp_3,1,1509490363420
0,Staande_Lamp_1,0,1509491964532
6,Staande_Lamp_5,1,1509492113970
4,Staande_Lamp_3,0,1509492126316


# Functions

### Cleaning the dataset

In [6]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

def clean_dataframe_for_fitting(df):
    d = defaultdict(LabelEncoder)
    df_fit = df.apply(lambda x: d[x.name].fit_transform(x))
    df_fit['state'] = df['state']
    df_fit['time'] = df['time']
    return df_fit

### Fit the DBSCAN model

In [7]:
min_samples_untill_its_a_cluster = 2

def fit_model(df, eps_distance_in_milliseconds):
    model = DBSCAN(
        eps=eps_distance_in_milliseconds, 
        min_samples=min_samples_untill_its_a_cluster
    ).fit(df)
    return model

### Get information from the model

In [8]:
def get_model_info(model):
    info_dict = {}
    info_dict['amount_of_datapoints'] = model.labels_.size
    info_dict['amount_of_outliers'] = Counter(model.labels_)[-1]
    
    
    cluster_data_count = Counter(model.labels_)
    if -1 in cluster_data_count:
        cluster_data_count.pop(-1) # don't count outliers as a cluster
    if (bool(cluster_data_count)):
        amount_of_clusters = max(cluster_data_count) + 1
    else:
        amount_of_clusters = 0;
    info_dict['amount_of_clusters'] = amount_of_clusters
    info_dict['datapoints_per_cluster_dict'] = Counter(model.labels_)
    return info_dict

### Get multiple datasets (one per cluster)

In [9]:
def split_dataframe_on_cluster(model, df):
    df['cluster'] = model.labels_
    
    cluster_dict = {}
    
    amount_of_clusters = get_model_info(model)['amount_of_clusters']
    
    for idx in range(amount_of_clusters):
        cluster_dict[idx] = df.loc[df['cluster'] == idx].drop(columns=['cluster'])

    return cluster_dict

### Get all clusters (as dataframes) that are too large

In [10]:
def get_too_large_clusters(cluster_dict, limit_in_milliseconds):
    too_large_clusters_dataframes_dict = {}

    for idx, df in cluster_dict.items():

        first_time = df['time'].iloc[0]
        last_time = df['time'].iloc[df['time'].size - 1]

        diffrence_in_miliseconds = last_time - first_time
        # diffrence_in_minutes = diffrence_in_miliseconds / 1000 / 60

        if diffrence_in_miliseconds > limit_in_milliseconds:
            too_large_clusters_dataframes_dict[idx] = df

    return too_large_clusters_dataframes_dict

### Get all clusters (as dataframes) that are not too large

In [11]:
def get_perfect_size_clusters(cluster_dict, limit_in_milliseconds):
    perfect_size_clusters_dataframes_dict = {}

    for idx, df in cluster_dict.items():

        first_time = df['time'].iloc[0]
        last_time = df['time'].iloc[df['time'].size - 1]

        diffrence_in_miliseconds = last_time - first_time
        # diffrence_in_minutes = diffrence_in_miliseconds / 1000 / 60

        if diffrence_in_miliseconds <= limit_in_milliseconds:
            perfect_size_clusters_dataframes_dict[idx] = df

    return perfect_size_clusters_dataframes_dict

# Variables for the program

In [12]:
# minimum distance between datapoints
starting_eps = 5 * 60 * 1000
    # 5 minutes

    
next_eps_division = 2
    # 5 minutes -> 2.5 minutes -> 1.25 minutes

    
# eps * max_cluster_size is when a cluster is considered too large and will be split up
max_cluster_size_multiplication = (1.5) 
    # 5 minutes -> 7.5 minutes
    # 2.5 minutes -> 3.75 minutes

# amount of weeks to analyse
amount_of_weeks = 20

# lol
week_threshold = 25
threshold_perc = 90   
relevance_decay_strength = 0.5

# Program

- (300000 milliseconds = 5 minutes)

In [13]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

def do_shit(df, eps, iteration=0, cluster_arr=[]):
    
    model = fit_model(df, eps)
    cluster_dict = split_dataframe_on_cluster(model, df)
    
    too_large_clusters_dict = get_too_large_clusters(cluster_dict, eps * max_cluster_size_multiplication)
    perfect_size_clusters = get_perfect_size_clusters(cluster_dict, eps * max_cluster_size_multiplication)
    
    for idx, df in too_large_clusters_dict.items():
        cluster_arr + do_shit(too_large_clusters_dict[idx], eps / next_eps_division, iteration + 1, cluster_arr)
    
    for idx, df in perfect_size_clusters.items():
        cluster_arr.append(df)
    
    return cluster_arr



# five_minutes = 300000

one_week_in_milliseconds = (1000 * 60 * 60 * 24 * 7)
last_timestamp = df_data['time'].max()

weeks_clusters = []
for week in range(amount_of_weeks):
    df_week_x = df_data[df_data['time'] >= last_timestamp - ((week + 1) * one_week_in_milliseconds)]
    df_week_x = df_week_x[df_week_x['time'] < last_timestamp - (week * one_week_in_milliseconds)]
    df_fit_1 = clean_dataframe_for_fitting(df_week_x.loc[df_week_x['state'] == 1])
    df_fit_0 = clean_dataframe_for_fitting(df_week_x.loc[df_week_x['state'] == 0])
    cluster_arr1 = do_shit(df_fit_1, starting_eps)
    cluster_arr2 = do_shit(df_fit_0, starting_eps)
    cluster_arr = cluster_arr1 + cluster_arr2
    weeks_clusters.append(cluster_arr)


for week in range(amount_of_weeks):
    print('week', week, len(weeks_clusters[week]))

week 0 380
week 1 778
week 2 1242
week 3 1664
week 4 2044
week 5 2464
week 6 2898
week 7 3316
week 8 3722
week 9 4106
week 10 4486
week 11 4912
week 12 5344
week 13 5738
week 14 6150
week 15 6560
week 16 6992
week 17 7386
week 18 7820
week 19 8244


In [14]:
from functools import reduce
week_hashcodes = []
for week, cluster_arr in enumerate(weeks_clusters):
    week_hashcodes.append([])
    for idx, df in enumerate(cluster_arr):
        cluster = []
        for row in df.iterrows():
            index, data = row
            cluster.append(data['name'].tolist())

        cluster = list(set(cluster))

        hashcodedingus = 0
        for lamp in cluster:
            hashcodedingus += pow(2, lamp)

        if(len(cluster) > 1):
            week_hashcodes[week].append(hashcodedingus)

for week in range(amount_of_weeks):
    print('week', week, 'hashcodes', len(week_hashcodes[week]))

week 0 hashcodes 336
week 1 hashcodes 690
week 2 hashcodes 1098
week 3 hashcodes 1470
week 4 hashcodes 1808
week 5 hashcodes 2168
week 6 hashcodes 2538
week 7 hashcodes 2900
week 8 hashcodes 3250
week 9 hashcodes 3576
week 10 hashcodes 3908
week 11 hashcodes 4286
week 12 hashcodes 4662
week 13 hashcodes 5008
week 14 hashcodes 5374
week 15 hashcodes 5712
week 16 hashcodes 6094
week 17 hashcodes 6444
week 18 hashcodes 6832
week 19 hashcodes 7206


In [15]:
count_dict = {}
for week, hashcodes_arr in enumerate(week_hashcodes):
    for i in hashcodes_arr:
        if i in count_dict:
            count_dict[i]['occurance_week_' + str(week)] += 1
        else:
            count_dict[i] = {}
            for w in range(amount_of_weeks):
                count_dict[i]['occurance_week_' + str(w)] = 0

count_dict

{5: {'occurance_week_0': 59,
  'occurance_week_1': 118,
  'occurance_week_2': 182,
  'occurance_week_3': 234,
  'occurance_week_4': 276,
  'occurance_week_5': 328,
  'occurance_week_6': 374,
  'occurance_week_7': 432,
  'occurance_week_8': 476,
  'occurance_week_9': 550,
  'occurance_week_10': 604,
  'occurance_week_11': 672,
  'occurance_week_12': 730,
  'occurance_week_13': 786,
  'occurance_week_14': 840,
  'occurance_week_15': 896,
  'occurance_week_16': 960,
  'occurance_week_17': 1026,
  'occurance_week_18': 1082,
  'occurance_week_19': 1124},
 12: {'occurance_week_0': 67,
  'occurance_week_1': 116,
  'occurance_week_2': 176,
  'occurance_week_3': 238,
  'occurance_week_4': 282,
  'occurance_week_5': 332,
  'occurance_week_6': 394,
  'occurance_week_7': 438,
  'occurance_week_8': 494,
  'occurance_week_9': 546,
  'occurance_week_10': 584,
  'occurance_week_11': 648,
  'occurance_week_12': 698,
  'occurance_week_13': 754,
  'occurance_week_14': 822,
  'occurance_week_15': 876,
  '

In [16]:

for key,val in count_dict.items():
    threshold = week_threshold * amount_of_weeks
    
    total_occurances = 0
    for week in range(amount_of_weeks):
        total_occurances += val['occurance_week_' + str(week)]
        
    if total_occurances >= threshold:
        div = (total_occurances / threshold)
        count = 0
        perc = threshold_perc
        while div > 1:
            count += 1
            div /= 2
            perc += ((100 - threshold_perc) / 2) * (1 / count)
    else:
        perc = (total_occurances / threshold) * threshold_perc
    
    count_dict[key]['is_predicted_group_percentage'] = round(perc, 2)


for key,val in count_dict.items():
    total = 0
    current = 0
    for week in range(amount_of_weeks):
        
        perc = 0
        if val['occurance_week_' + str(week)] >= week_threshold:
            div = (val['occurance_week_' + str(week)] / week_threshold)
            count = 0
            perc = threshold_perc
            while div > 1:
                count += 1
                div /= 2
                perc += ((100 - threshold_perc) / 2) * (1 / count)
        else:
            perc = (val['occurance_week_' + str(week)] / week_threshold) * threshold_perc
        
        total += 100 * (0.5) / pow(2, week * relevance_decay_strength)
        current += perc * (0.5) / pow(2, week * relevance_decay_strength)

    count_dict[key]['is_relevant_group_percentage'] = round( (current / total) * 100, 2 )

pprint.pprint(count_dict)

{3: {'is_predicted_group_percentage': 100.42,
     'is_relevant_group_percentage': 93.14,
     'occurance_week_0': 23,
     'occurance_week_1': 44,
     'occurance_week_10': 222,
     'occurance_week_11': 246,
     'occurance_week_12': 272,
     'occurance_week_13': 276,
     'occurance_week_14': 288,
     'occurance_week_15': 306,
     'occurance_week_16': 326,
     'occurance_week_17': 336,
     'occurance_week_18': 354,
     'occurance_week_19': 374,
     'occurance_week_2': 68,
     'occurance_week_3': 92,
     'occurance_week_4': 104,
     'occurance_week_5': 134,
     'occurance_week_6': 162,
     'occurance_week_7': 178,
     'occurance_week_8': 194,
     'occurance_week_9': 212},
 5: {'is_predicted_group_percentage': 101.42,
     'is_relevant_group_percentage': 99.21,
     'occurance_week_0': 59,
     'occurance_week_1': 118,
     'occurance_week_10': 604,
     'occurance_week_11': 672,
     'occurance_week_12': 730,
     'occurance_week_13': 786,
     'occurance_week_14': 840,