# Setup

### Imports

In [1]:
import pandas as pd
from pylab import rcParams
import seaborn as sb
import matplotlib.pyplot as plt

import sklearn
from sklearn.cluster import DBSCAN
from collections import Counter
import datetime

### Setup data visualisation params for Jupyter

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size as (width, height) in inches.
# NOTE(review): a height of 0.1 inch is unusually small — confirm this is intended.
rcParams['figure.figsize'] = 15, 0.1
# Use seaborn's 'whitegrid' style for subsequent plots.
sb.set_style('whitegrid')

### Getting the dataset

In [3]:
# Path of the lamp event log, relative to the notebook's working directory.
address = '../datasets/staandelamp_realistic.json'
# Load the JSON file into a DataFrame (columns shown below: name, state, time).
df_data = pd.read_json(address)

##### Sort the data on timestamp

In [4]:
# Order the events chronologically. The cluster-size helpers further down measure
# a cluster's time span from its first and last row, so row order matters.
df_data = df_data.sort_values(by=['time'])

In [5]:
# Preview the first rows of the time-sorted data.
df_data.head()

Unnamed: 0,name,state,time
2,Staande_Lamp_3,0,1509489940655
6,Staande_Lamp_5,1,1509490011225
0,Staande_Lamp_1,1,1509491943009
1,Staande_Lamp_2,0,1509492221471
3,Staande_Lamp_3,1,1509492826941


# Functions

### Cleaning the dataset

In [6]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

def clean_dataframe_for_fitting(df):
    """Return a numeric copy of ``df`` that can be fed to the clustering model.

    Every column is label-encoded with its own ``LabelEncoder``; afterwards the
    ``state`` and ``time`` columns are restored to their original values, so in
    practice only the remaining columns (e.g. ``name``) end up encoded.

    Note that the encoders are fitted on the frame that is passed in, so the
    integer assigned to a value depends on which values occur in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains at least the columns ``state`` and ``time``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.
    """
    encoders = defaultdict(LabelEncoder)

    def encode_column(column):
        # One encoder per column name, created lazily by the defaultdict.
        return encoders[column.name].fit_transform(column)

    encoded = df.apply(encode_column)

    # These columns must keep their real values, not their label codes.
    for passthrough in ('state', 'time'):
        encoded[passthrough] = df[passthrough]
    return encoded

### Fit the DBSCAN model

In [7]:
# Number of neighbouring datapoints DBSCAN needs before it forms a cluster.
min_samples_untill_its_a_cluster = 2


def fit_model(df, eps_distance_in_milliseconds):
    """Fit a DBSCAN model on ``df`` and return the fitted estimator.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to cluster; every column takes part in the fit.
    eps_distance_in_milliseconds : float
        Value passed to DBSCAN as ``eps``.

    Returns
    -------
    sklearn.cluster.DBSCAN
        The fitted model; cluster labels are available as ``labels_``.
    """
    clusterer = DBSCAN(
        eps=eps_distance_in_milliseconds,
        min_samples=min_samples_untill_its_a_cluster,
    )
    clusterer.fit(df)
    return clusterer

### Get information from the model

In [8]:
def get_model_info(model):
    """Summarise the labelling produced by a fitted clustering model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``labels_`` as a numpy array in which ``-1``
        marks an outlier.

    Returns
    -------
    dict
        ``amount_of_datapoints``: number of labels.
        ``amount_of_outliers``: number of labels equal to ``-1``.
        ``amount_of_clusters``: highest non-outlier label + 1, or 0 when
        there is no non-outlier label.
        ``datapoints_per_cluster_dict``: ``Counter`` of all labels (the
        outlier label ``-1`` included).
    """
    labels = model.labels_
    label_counts = Counter(labels)

    # Outliers (-1) are not a cluster, so leave them out when numbering clusters.
    real_cluster_ids = [label for label in label_counts if label != -1]

    return {
        'amount_of_datapoints': labels.size,
        'amount_of_outliers': label_counts[-1],
        'amount_of_clusters': max(real_cluster_ids) + 1 if real_cluster_ids else 0,
        'datapoints_per_cluster_dict': label_counts,
    }

### Get multiple datasets (one per cluster)

In [9]:
def split_dataframe_on_cluster(model, df):
    """Split ``df`` into one DataFrame per cluster found by ``model``.

    The caller's ``df`` is left untouched: the cluster labels are attached to a
    copy, which avoids a hidden side effect (and pandas' SettingWithCopy
    warning when ``df`` is itself a slice of another frame).

    Parameters
    ----------
    model : object
        Fitted model exposing ``labels_`` as a numpy array with one label per
        row of ``df``; ``-1`` marks an outlier.
    df : pandas.DataFrame
        The frame the model was fitted on.

    Returns
    -------
    dict
        Maps each cluster id ``0 .. highest_label`` to the rows of ``df`` that
        carry that label. Outlier rows (label ``-1``) are not included.
    """
    labels = model.labels_

    # Attach the labels to a copy instead of mutating the caller's frame.
    labelled = df.assign(cluster=labels)

    # Cluster ids run from 0 up to the highest label; a maximum of -1 means
    # that every datapoint is an outlier and there are no clusters at all.
    amount_of_clusters = int(labels.max()) + 1 if labels.size else 0

    return {
        idx: labelled.loc[labelled['cluster'] == idx].drop(columns=['cluster'])
        for idx in range(amount_of_clusters)
    }

### Get all clusters (as dataframes) that are too large

In [10]:
def get_too_large_clusters(cluster_dict, limit_in_milliseconds):
    """Select the clusters whose time span exceeds the limit.

    The span of a cluster is the ``time`` of its last row minus the ``time`` of
    its first row, so each frame is expected to be ordered by time.

    Parameters
    ----------
    cluster_dict : dict
        Maps a cluster id to a non-empty DataFrame with a ``time`` column.
    limit_in_milliseconds : float
        Largest span that is still acceptable.

    Returns
    -------
    dict
        The entries of ``cluster_dict`` whose span is strictly greater than
        ``limit_in_milliseconds`` (same DataFrame objects, same order).
    """
    def time_span(cluster_df):
        timestamps = cluster_df['time']
        return timestamps.iloc[-1] - timestamps.iloc[0]

    return {
        cluster_id: cluster_df
        for cluster_id, cluster_df in cluster_dict.items()
        if time_span(cluster_df) > limit_in_milliseconds
    }

### Get all clusters (as dataframes) that are not too large

In [11]:
def get_perfect_size_clusters(cluster_dict, limit_in_milliseconds):
    """Select the clusters whose time span stays within the limit.

    The span of a cluster is the ``time`` of its last row minus the ``time`` of
    its first row, so each frame is expected to be ordered by time.

    Parameters
    ----------
    cluster_dict : dict
        Maps a cluster id to a non-empty DataFrame with a ``time`` column.
    limit_in_milliseconds : float
        Largest span that is still acceptable.

    Returns
    -------
    dict
        The entries of ``cluster_dict`` whose span is less than or equal to
        ``limit_in_milliseconds`` (same DataFrame objects, same order).
    """
    def time_span(cluster_df):
        timestamps = cluster_df['time']
        return timestamps.iloc[-1] - timestamps.iloc[0]

    return {
        cluster_id: cluster_df
        for cluster_id, cluster_df in cluster_dict.items()
        if time_span(cluster_df) <= limit_in_milliseconds
    }

# Variables for the program

In [12]:
# Starting eps for DBSCAN: the maximum distance between two datapoints for them
# to be treated as neighbours (5 minutes, expressed in milliseconds).
starting_eps = 5 * 60 * 1000

# Every time an oversized cluster is clustered again, eps is divided by this value
# (5 minutes -> 2.5 minutes -> 1.25 minutes).
next_eps_division = 2

# A cluster whose time span exceeds eps * max_cluster_size_multiplication is
# considered too large and will be split up
# (eps of 5 minutes -> limit of 7.5 minutes, eps of 2.5 minutes -> limit of 3.75 minutes).
max_cluster_size_multiplication = (1.5)

# Maximum amount of weeks to analyse, counted back from the newest timestamp.
amount_of_weeks = 50

# Scoring parameters for the lamp groups.
# week_threshold: occurrences per week at which a group scores threshold_perc.
week_threshold = 25
# threshold_perc: percentage awarded when the occurrence count equals its threshold.
threshold_perc = 90
# relevance_decay_strength: week w (0 = most recent) is weighted by
# 0.5 / 2 ** (w * relevance_decay_strength), so older weeks count for less.
relevance_decay_strength = 0.5

# Program

- (300000 milliseconds = 5 minutes)

In [14]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

def do_shit(df, eps, iteration=0, cluster_arr=None):
    """Recursively cluster ``df`` until no cluster spans too much time.

    ``df`` is clustered with DBSCAN using ``eps``. Clusters whose time span is
    at most ``eps * max_cluster_size_multiplication`` are collected as they
    are; larger clusters are clustered again with ``eps / next_eps_division``.
    Datapoints that DBSCAN labels as outliers are not collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric, time-ordered frame with a ``time`` column.
    eps : float
        DBSCAN ``eps`` for this recursion level.
    iteration : int, optional
        Recursion depth; only passed along for bookkeeping.
    cluster_arr : list, optional
        List that receives the accepted clusters. A new list is created when
        omitted.

    Returns
    -------
    list
        ``cluster_arr`` (the same object when one was passed in), extended
        with one DataFrame per accepted cluster.
    """
    if cluster_arr is None:
        cluster_arr = []

    # DBSCAN cannot be fitted on zero samples; an empty frame has no clusters.
    if df.empty:
        return cluster_arr

    model = fit_model(df, eps)
    cluster_dict = split_dataframe_on_cluster(model, df)

    size_limit = eps * max_cluster_size_multiplication
    too_large_clusters = get_too_large_clusters(cluster_dict, size_limit)
    perfect_size_clusters = get_perfect_size_clusters(cluster_dict, size_limit)

    # The recursive call appends straight into the shared cluster_arr, so its
    # return value does not have to be concatenated again.
    for oversized_df in too_large_clusters.values():
        do_shit(oversized_df, eps / next_eps_division, iteration + 1, cluster_arr)

    cluster_arr.extend(perfect_size_clusters.values())
    return cluster_arr

one_week_in_milliseconds = (1000 * 60 * 60 * 24 * 7)
last_timestamp = df_data['time'].max()

# Encode the lamp names once, over the complete dataset, so that a given lamp
# maps to the same integer in every week and for both states. Encoding each
# weekly subset separately would shift the codes whenever a lamp is missing
# from a subset, which makes lamp codes incomparable between weeks.
df_fit_all = clean_dataframe_for_fitting(df_data)

requested_amount_of_weeks = amount_of_weeks
weeks_clusters = []
for week in range(requested_amount_of_weeks):
    # Week 0 is the most recent week. Each window is (week_start, week_end],
    # so the newest datapoint (time == last_timestamp) belongs to week 0 and
    # every datapoint falls into exactly one week.
    week_end = last_timestamp - (week * one_week_in_milliseconds)
    week_start = week_end - one_week_in_milliseconds
    in_week = (df_fit_all['time'] > week_start) & (df_fit_all['time'] <= week_end)
    df_week_x = df_fit_all[in_week]

    if df_week_x.empty:
        # Stop at the first week without data; later cells rely on
        # amount_of_weeks matching len(weeks_clusters).
        amount_of_weeks = week
        break

    # Cluster "switched on" and "switched off" events separately. The copies
    # keep later column assignments from acting on a slice of df_fit_all.
    df_fit_1 = df_week_x.loc[df_week_x['state'] == 1].copy()
    df_fit_0 = df_week_x.loc[df_week_x['state'] == 0].copy()

    # DBSCAN cannot be fitted on an empty frame, so skip a state without events.
    cluster_arr1 = do_shit(df_fit_1, starting_eps) if not df_fit_1.empty else []
    cluster_arr2 = do_shit(df_fit_0, starting_eps) if not df_fit_0.empty else []

    cluster_arr = cluster_arr1 + cluster_arr2

    weeks_clusters.append(cluster_arr)

# Only warn when the amount of weeks really had to be reduced.
if amount_of_weeks != requested_amount_of_weeks:
    print('WARNING!!! amount_of_weeks HAS BEEN CHANGED TO', amount_of_weeks)

for week in range(amount_of_weeks):
    print('week', week, len(weeks_clusters[week]))

week 0 220
week 1 206
week 2 208
week 3 208
week 4 198
week 5 216
week 6 220
week 7 209
week 8 183
week 9 225
week 10 210
week 11 168


In [None]:
from functools import reduce
# For every week, turn each cluster into a bitmask ("hashcode") of the lamps it
# contains: lamp code n contributes 2 ** n. Clusters that contain only a single
# distinct lamp are not a group and are skipped.
week_hashcodes = []
for cluster_arr in weeks_clusters:
    hashcodes_this_week = []
    for df in cluster_arr:
        lamps_in_cluster = set(df['name'].tolist())
        if len(lamps_in_cluster) > 1:
            hashcodes_this_week.append(sum(pow(2, lamp) for lamp in lamps_in_cluster))
    week_hashcodes.append(hashcodes_this_week)

for week in range(amount_of_weeks):
    print('week', week, 'hashcodes', len(week_hashcodes[week]))

In [None]:
# Count, per lamp group (hashcode), how often the group occurred in each week.
# Resulting shape: {hashcode: {'occurance_week': {'0': n, '1': n, ...}}}
count_dict = {}
for week, hashcodes_arr in enumerate(week_hashcodes):
    for i in hashcodes_arr:
        if i not in count_dict:
            # First sighting of this group: start every week at zero ...
            count_dict[i] = {
                'occurance_week': {str(w): 0 for w in range(amount_of_weeks)}
            }
        # ... and count the current occurrence as well. Incrementing only for
        # already-known groups would leave each group one occurrence short.
        count_dict[i]['occurance_week'][str(week)] += 1

count_dict

In [None]:

def _occurrences_to_percentage(occurrences, threshold):
    """Map an occurrence count to a percentage that saturates towards 100.

    Below ``threshold`` the percentage grows linearly from 0 up to
    ``threshold_perc``. From the threshold on it starts at ``threshold_perc``
    and every doubling of ``occurrences / threshold`` adds half of the
    previous step, so the result approaches 100.
    """
    if occurrences < threshold:
        return (occurrences / threshold) * threshold_perc

    div = occurrences / threshold
    count = 1
    perc = threshold_perc
    while div > 1:
        div /= 2
        perc += ((100 - threshold_perc) / 2) * (1 / count)
        count *= 2
    return perc


# Threshold for the whole analysed period; it is the same for every group.
threshold = week_threshold * amount_of_weeks

for key, val in count_dict.items():
    weekly_occurances = [val['occurance_week'][str(week)] for week in range(amount_of_weeks)]

    # How certain is it that this set of lamps is a group at all, judged on the
    # total amount of occurrences over all weeks?
    total_occurances = sum(weekly_occurances)
    perc = _occurrences_to_percentage(total_occurances, threshold)
    count_dict[key]['is_predicted_group_percentage'] = round(perc, 2)

    # How relevant is the group right now? Every week is scored on its own and
    # weighted so that older weeks (higher week number) count for less.
    total = 0
    current = 0
    for week, occurances in enumerate(weekly_occurances):
        decay = pow(2, week * relevance_decay_strength)
        perc = _occurrences_to_percentage(occurances, week_threshold)

        total += 100 * (0.5) / decay
        current += perc * (0.5) / decay

    count_dict[key]['is_relevant_group_percentage'] = round((current / total) * 100, 2)

pprint.pprint(count_dict)