In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import numpy as np
import time

In [2]:
from pylab import rcParams
import seaborn as sb

# Standard data visualisation params for Jupyter:
#   - render matplotlib figures inline in the notebook output,
#   - default every figure to a wide 15 x 4 canvas,
#   - use seaborn's 'whitegrid' style for all subsequent plots.
%matplotlib inline
rcParams['figure.figsize'] = 15, 4
sb.set_style('whitegrid')

In [3]:
address = '../datasets/staandelamp_realistic.json'

# Load the lamp events, order them chronologically and keep only the first
# 42 rows, all in one expression so that re-running this cell always starts
# again from the file on disk.
### TEMP ### the 42-row cut-off is a temporary development subset
### (per the original note: around 6 hours of timestamps).
df_data = (
    pd.read_json(address)
    .sort_values(by=['time'])
    [:42]
)

df_data.head()

Unnamed: 0,name,state,time
2,Staande_Lamp_3,0,1509489940655
6,Staande_Lamp_5,1,1509490011225
0,Staande_Lamp_1,1,1509491943009
1,Staande_Lamp_2,0,1509492221471
3,Staande_Lamp_3,1,1509492826941


In [4]:
import datetime
import time


def _epoch_ms_to_ctime(epoch_ms):
    """Format a millisecond Unix timestamp as a `time.ctime` string."""
    return time.ctime(epoch_ms / 1000)


def _epoch_ms_to_iso(epoch_ms):
    """Format a millisecond Unix timestamp as a naive ISO 8601 string."""
    return datetime.datetime.fromtimestamp(epoch_ms / 1000).isoformat()


# NOTE: both formatters above render the timestamp in the local timezone of
# the machine running the kernel, so the displayed strings can differ
# between machines.

# Work on a deep copy so that df_data itself keeps only its raw columns.
df_printable_dates = df_data.copy(deep=True)
event_times_ms = df_data['time']
df_printable_dates['dates'] = event_times_ms.map(_epoch_ms_to_ctime)
df_printable_dates['printable_dates'] = event_times_ms.map(_epoch_ms_to_iso)

df_printable_dates.head()

Unnamed: 0,name,state,time,dates,printable_dates
2,Staande_Lamp_3,0,1509489940655,Tue Oct 31 23:45:40 2017,2017-10-31T23:45:40.655000
6,Staande_Lamp_5,1,1509490011225,Tue Oct 31 23:46:51 2017,2017-10-31T23:46:51.225000
0,Staande_Lamp_1,1,1509491943009,Wed Nov 1 00:19:03 2017,2017-11-01T00:19:03.009000
1,Staande_Lamp_2,0,1509492221471,Wed Nov 1 00:23:41 2017,2017-11-01T00:23:41.471000
3,Staande_Lamp_3,1,1509492826941,Wed Nov 1 00:33:46 2017,2017-11-01T00:33:46.941000


In [5]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

# One LabelEncoder per column, created lazily the first time a column name
# is looked up; keeping them in `d` leaves the fitted encoders available
# after this cell has run.
d = defaultdict(LabelEncoder)


def _label_encode(column):
    """Fit the column's own encoder on it and return the integer codes."""
    encoder = d[column.name]
    return encoder.fit_transform(column)


print('before:\n', df_data.head())

# apply() runs the encoder over every column, 'time' included; the encoded
# 'time' is then replaced by the raw timestamps so that column keeps its
# original values.
df_fit = df_data.apply(_label_encode)
df_fit['time'] = df_data['time']

print('after:\n', df_fit.head())

before:
              name  state           time
2  Staande_Lamp_3      0  1509489940655
6  Staande_Lamp_5      1  1509490011225
0  Staande_Lamp_1      1  1509491943009
1  Staande_Lamp_2      0  1509492221471
3  Staande_Lamp_3      1  1509492826941
after:
    name  state           time
2     2      0  1509489940655
6     4      1  1509490011225
0     0      1  1509491943009
1     1      0  1509492221471
3     2      1  1509492826941


In [17]:
from sklearn.cluster import AgglomerativeClustering

# Number of rows (events) in the encoded frame.
amountOfDataRows = df_fit['name'].size

# Ward-linkage agglomerative clustering asked for one cluster fewer than
# there are rows.
#
# Only the settings that matter are passed. The previous call also spelled
# out affinity='euclidean', compute_full_tree='auto', connectivity=None,
# memory=None and pooling_func='deprecated'. Those were all default values,
# but `pooling_func` has since been removed from scikit-learn and `affinity`
# has been renamed to `metric`, so passing them raises TypeError on current
# releases. Ward linkage only supports Euclidean distances, so the distance
# setting does not need to be given explicitly.
model = AgglomerativeClustering(
    n_clusters=amountOfDataRows - 1,
    linkage='ward',
).fit(df_fit)

model

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None, n_clusters=41,
            pooling_func='deprecated')