In [1]:
import pandas as pd
from pylab import rcParams
import seaborn as sb
import matplotlib.pyplot as plt

import sklearn
from sklearn.cluster import DBSCAN
from collections import Counter
import datetime

from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from functools import reduce

In [2]:
class DataFrameValidator:
    def __init__(self, df):
        self.df = df
    
    def validate(columns = [], minimumRowAmount=1):
        # TODO: validate columns
        # TODO: validate rows
        pass

In [3]:
to_milliseconds = lambda seconds : seconds * 60 * 1000

class BinaryDataAnalysis:
    def __init__(self,
                 eps=5, #minutes
                 cluster_degregation=2,
                 max_cluster_distance=7.5,   #minutes
                 weeks=5,
                 decay_strength=0.5,
                 cluster_threshold=25,
                 threshold_percentage=90):
        self.eps =                  eps
        self.cluster_degregation =  cluster_degregation
        self.max_cluster_distance = max_cluster_distance
        self.weeks =                weeks
        self.decay_strength =       decay_strength
        self.cluster_threshold =    cluster_threshold
        self.threshold_percentage = threshold_percentage
        
    def analyze(self, df):
        
        self.lookup_table = self.create_lookup_table(
            df=df
        )
        
        df_fit = self.clean_dataframe(
            df=df
        )
        week_hashcodes = self.get_week_clusters_hash_codes(
            df=df_fit
        )
        hashcode_occurances = self.get_hashcode_occurances_per_week(
            week_hashcodes=week_hashcodes
        )
        predicted_groups = self.calculate_groups(
            hashcode_occurances_per_week=hashcode_occurances
        )
        
        result = []
        for key in predicted_groups:
            items = self.get_lookup_values(
                hashcode=key
            )
            result.append({
                'item_ids': items,
                'is_predicted_group_percentage': predicted_groups[key]['is_predicted_group_percentage'],
                'is_relevant_group_percentage': predicted_groups[key]['is_relevant_group_percentage']
            })
        
        return result
    
    def create_lookup_table(self, df):
                
        df_lookup = pd.DataFrame(data={ 'id': pd.Series(df['id']).unique() })
        
        df_lookup['hashcode'] = self.clean_dataframe(
            df=df_lookup
        )['id']
        lookup_dict = dict()
        for index, row in df_lookup.iterrows():
            lookup_dict[row['hashcode']] = row['id']
        return lookup_dict
    
    def clean_dataframe(self, df):
        """Convert non-nummeric values in the dataframe to numbers so that the dataframe can be used to fit a model

        Args:
            df: The dataframe to clean.

        Returns:
            df_fit: The dataframe with nummeric values
        """
        d = defaultdict(LabelEncoder)
        df_fit = df.apply(lambda x: d[x.name].fit_transform(x))
        if 'state' in df.columns:
            df_fit['state'] = df['state']
        if 'time' in df.columns:
            df_fit['time'] = df['time']
        return df_fit
    
    def get_week_clusters_hash_codes(self, df):
        """Get Cluster for a dataframe per week

        Args:
            df: The dataframe with more than one week of timestamps to cluster.

        Returns:
            TODO: CHANGE: cluster_arr: An array of weeks (arrays) that each hold 0 or more dataframes (clusters)
        """
        one_week_in_milliseconds = (1000 * 60 * 60 * 24 * 7)
        last_timestamp = df['time'].max()
        week_hashcodes = []
        for week in range(self.weeks):
            week_hashcodes.append([])
            df_week = df[df['time'] >= last_timestamp - ((week + 1) * one_week_in_milliseconds)]
            df_week = df_week[df_week['time'] < last_timestamp - (week * one_week_in_milliseconds)]

            if not df_week.empty:
                cluster_arr = self.split_dataframe_on_state_and_get_cluster_arr(
                    df=df_week, 
                    starting_eps=self.eps
                )
                for idx, df_week in enumerate(cluster_arr):
                    cluster = []
                    for row in df_week.iterrows():
                        index, data = row
                        cluster.append(data['id'].tolist())

                    cluster = list(set(cluster))

                    hashcode = 0
                    for lamp in cluster:
                        hashcode += pow(2, lamp)

                    if(len(cluster) > 1):
                        week_hashcodes[week].append(hashcode)
            else:
                print('WARNING!!! There are not', self.weeks, 'weeks in the dataset... amount_of_weeks HAS BEEN CHANGED TO', week)
                self.weeks = week
                break
        return week_hashcodes
    
    def split_dataframe_on_state_and_get_cluster_arr(self, df, starting_eps):
        """Split a dataframe into 2 seperate dataframes (one with state=0, the other with state=1) 
           and get the clusters for both of the dataframes

        Args:
            df: The dataframe to split & get clusters from.

        Returns:
            cluster_arr: an array that holds 0 or more dataframes (clusters)
        """
        df_1 = df.loc[df['state'] == 1]
        df_0 = df.loc[df['state'] == 0]
        cluster_arr1 = self.get_clusters_recursive(df=df_1.copy(), eps=self.eps)
        cluster_arr2 = self.get_clusters_recursive(df=df_0.copy(), eps=self.eps)
        cluster_arr = cluster_arr1 + cluster_arr2
        return cluster_arr
    
    def get_clusters_recursive(self, df, eps, iteration=0, cluster_arr=None):
        if cluster_arr is None:
            cluster_arr = []
        
        model = self.fit_model(df, eps)
        cluster_dict = self.get_clusters(df=df, model=model)
        
        for idx, df in cluster_dict['too_large'].items():
            cluster_arr + self.get_clusters_recursive(
                df=cluster_dict['too_large'][idx], 
                eps=eps / self.cluster_degregation, 
                iteration=iteration + 1, 
                cluster_arr=cluster_arr
            )
    
        for idx, df in cluster_dict['perfect_size'].items():
            cluster_arr.append(df)
        return cluster_arr
    
    
    def fit_model(self, df, eps):
        model = DBSCAN(
            eps=to_milliseconds(eps),
            min_samples=2
        ).fit(df)
        return model
    
    
    def get_clusters(self, df, model):
        
        df['cluster'] = model.labels_
        
        cluster_dict_too_large = {}
        cluster_dict_perfect_size = {}
        
        
        # Calculate amount of clusters
        cluster_data_count = Counter(model.labels_)
        if -1 in cluster_data_count:
            cluster_data_count.pop(-1) # don't count outliers as a cluster
        if (bool(cluster_data_count)):
            amount_of_clusters = max(cluster_data_count) + 1
        else:
            amount_of_clusters = 0;
        
        
        for idx in range(amount_of_clusters):
            cluster_df = df.loc[df['cluster'] == idx].drop(columns=['cluster'])
            
            first_time = cluster_df['time'].iloc[0]
            last_time = cluster_df['time'].iloc[cluster_df['time'].size - 1]
            diffrence_in_miliseconds = last_time - first_time
            if diffrence_in_miliseconds > to_milliseconds(self.max_cluster_distance):
                cluster_dict_too_large[idx] = cluster_df
            else:
                cluster_dict_perfect_size[idx] = cluster_df
        
        return {
            'too_large': cluster_dict_too_large,
            'perfect_size': cluster_dict_perfect_size
        }
        
    
    def get_hashcode_occurances_per_week(self, week_hashcodes):
        count_dict = {}
        for week, hashcodes_arr in enumerate(week_hashcodes):
            for i in hashcodes_arr:
                if i in count_dict:
                    count_dict[i]['occurance_week'][str(week)] += 1
                else:
                    count_dict[i] = {}
                    count_dict[i]['occurance_week'] = {}
                    for w in range(self.weeks):
                        count_dict[i]['occurance_week'][str(w)] = 0
        return count_dict
    
    def calculate_groups(self, hashcode_occurances_per_week):
        
        # TODO: clean this up
        
        count_dict = hashcode_occurances_per_week
        for key,val in count_dict.items():
            threshold = self.cluster_threshold * self.weeks

            total_occurances = 0
            for week in range(self.weeks):
                total_occurances += val['occurance_week'][str(week)]

            if total_occurances >= threshold:
                div = (total_occurances / threshold)
                count = 1
                perc = self.threshold_percentage

                while div > 1:
                    div /= 2
                    perc += ((100 - self.threshold_percentage) / 2) * (1 / count)
                    count *= 2

            else:
                perc = (total_occurances / threshold) * self.threshold_percentage

            count_dict[key]['is_predicted_group_percentage'] = round(perc, 2)


        for key,val in count_dict.items():
            total = 0
            current = 0
            for week in range(self.weeks):

                perc = 0
                if val['occurance_week'][str(week)] >= self.cluster_threshold:
                    div = (val['occurance_week'][str(week)] / self.cluster_threshold)
                    count = 1
                    perc = self.threshold_percentage
                    while div > 1:
                        div /= 2
                        perc += ((100 - self.threshold_percentage) / 2) * (1 / count)
                        count *= 2
                else:
                    perc = (val['occurance_week'][str(week)] / self.cluster_threshold) * self.threshold_percentage

                total += 100 * (0.5) / pow(2, week * self.decay_strength)
                current += perc * (0.5) / pow(2, week * self.decay_strength)

            count_dict[key]['is_relevant_group_percentage'] = round((current / total) * 100, 2)
            count_dict[key].pop('occurance_week', None)
        return count_dict
    
    def get_lookup_values(self, hashcode):
        def bitfield(n):
            return [int(digit) for digit in bin(n)[2:]]
            
        bits = bitfield(hashcode)[::-1]
        
        items = []
        for idx, bit in enumerate(bits):
            if bit == 1:
                items.append(self.lookup_table[idx])
        return items
        
        


In [4]:
address = '../datasets/staandelamp_realistic.json'
df_data = pd.read_json(address)
df_data = df_data.sort_values(by=['time'])
df_data['id'] = df_data['name']
df_data = df_data.drop(columns=['name'])
print(df_data.shape)
df_data.head()

(14000, 3)


Unnamed: 0,state,time,id
2,0,1509489940655,Staande_Lamp_3
6,1,1509490011225,Staande_Lamp_5
0,1,1509491943009,Staande_Lamp_1
1,0,1509492221471,Staande_Lamp_2
3,1,1509492826941,Staande_Lamp_3


In [5]:
BDASCAN = BinaryDataAnalysis()
result = BDASCAN.analyze(df_data)
result

[{'item_ids': ['Staande_Lamp_3', 'Staande_Lamp_5'],
  'is_predicted_group_percentage': 95.0,
  'is_relevant_group_percentage': 92.63},
 {'item_ids': ['Staande_Lamp_1', 'Staande_Lamp_5'],
  'is_predicted_group_percentage': 32.4,
  'is_relevant_group_percentage': 31.89},
 {'item_ids': ['Staande_Lamp_1', 'Staande_Lamp_3'],
  'is_predicted_group_percentage': 95.0,
  'is_relevant_group_percentage': 92.38},
 {'item_ids': ['Staande_Lamp_1', 'Staande_Lamp_4', 'Staande_Lamp_5'],
  'is_predicted_group_percentage': 2.88,
  'is_relevant_group_percentage': 3.47},
 {'item_ids': ['Staande_Lamp_2', 'Staande_Lamp_3'],
  'is_predicted_group_percentage': 87.84,
  'is_relevant_group_percentage': 87.26},
 {'item_ids': ['Staande_Lamp_2', 'Staande_Lamp_5'],
  'is_predicted_group_percentage': 21.6,
  'is_relevant_group_percentage': 20.51},
 {'item_ids': ['Staande_Lamp_3', 'Staande_Lamp_4'],
  'is_predicted_group_percentage': 90,
  'is_relevant_group_percentage': 80.89},
 {'item_ids': ['Staande_Lamp_2', 'Staan