In [1]:
# importing most of the libraries 
import numpy as np
import pandas as pd
import collections

import matplotlib.pyplot as plt
% matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [2]:
# read in the data
power = pd.read_csv("power.csv")
#power_factor = pd.read_csv("power_factor.csv")
data_for_clustering = pd.DataFrame(data = [power["207"]])

In [3]:
from sklearn.cluster import KMeans
data = np.array(data_for_clustering).T
kmeans = KMeans(n_clusters = 2)
kmeans.fit(data)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [4]:
#we will need this later on
labels = kmeans.predict(data)
centroids = kmeans.cluster_centers_

In [5]:
# To prove that choosing 2 as n_clusters was correct 
centroids

array([[ 2394.7431584 ],
       [  210.54941943]])

In [6]:
labels = list(labels)

In [7]:
# Step 2, creating sub states based on the time intervals
def addtime(enter):
    
    copy = enter[:]
    num = 0
    for i in range(0,len(enter)):
        if i == 0:
            num += 30
        elif enter[i] == enter[i-1]:
            num+= 30
        else:
            num = 30
        copy[i] = num
    return copy

In [8]:
sub_states = addtime(labels)

In [9]:
df = pd.DataFrame(data = [labels,sub_states])
df = df.T
df.columns = ["labels", "substates"]
df.head(20)

Unnamed: 0,labels,substates
0,1,30
1,0,30
2,0,60
3,0,90
4,0,120
5,0,150
6,1,30
7,1,60
8,1,90
9,1,120


In [10]:
#now lets divide the data
ones = df[df['labels'] == 1]
zeros = df[df['labels'] == 0]

In [11]:
# Choosing the best sub states by elimiating one that are not useful
def best_sub_states(dat):
    
    a = dat.substates.values
    counttype=collections.Counter(a)
    print("How the Distribution looks like")
    print("**"*40)
    print(counttype)
    print("Visualising")
    print("**"*40)
    keytype = list(counttype.keys())
    populationtype = list(counttype.values())

    dataa = [go.Bar(
                y= populationtype,
                x = keytype,
                width = 1.5,
                marker=dict(
                   color = populationtype,
                colorscale='Portland',
                showscale=True,
                reversescale = False
                ),
                opacity=0.6
            )]
    
    layout= go.Layout(
        autosize= True,
        title= 'Distribution of Sub_States',
        hovermode= 'closest',
        yaxis=dict(
            title= 'Number of repetitions',
            ticklen= 5,
            gridwidth= 2
        ),
        showlegend= False
    )
    fig = go.Figure(data=dataa, layout=layout)
    py.iplot(fig, filename='barplottype')
    

In [12]:
best_sub_states(ones)

How the Distribution looks like
********************************************************************************
Counter({30: 1755, 60: 1754, 150: 1753, 120: 1753, 90: 1753, 180: 1750, 210: 70, 360: 1, 330: 1, 300: 1, 270: 1, 240: 1})
Visualising
********************************************************************************


In [13]:
best_sub_states(zeros)

How the Distribution looks like
********************************************************************************
Counter({30: 1754, 60: 1754, 90: 1754, 120: 1581, 150: 1002, 180: 642, 210: 468, 240: 378, 270: 303, 300: 254, 330: 211, 360: 188, 390: 167, 420: 148, 450: 135, 480: 115, 510: 104, 540: 96, 570: 88, 600: 81, 630: 70, 660: 66, 690: 57, 720: 54, 750: 47, 780: 42, 810: 38, 840: 37, 870: 37, 900: 32, 930: 29, 960: 27, 990: 27, 1020: 26, 1050: 23, 1080: 23, 1110: 21, 1140: 20, 1170: 18, 1200: 17, 1290: 16, 1230: 16, 1260: 16, 1320: 15, 1350: 15, 1380: 14, 1410: 13, 1440: 12, 1470: 12, 1500: 11, 1560: 10, 1530: 10, 1590: 9, 1620: 9, 1650: 8, 1680: 8, 1800: 7, 1830: 7, 1710: 7, 1740: 7, 1770: 7, 2310: 6, 2070: 6, 2340: 6, 2100: 6, 2370: 6, 1860: 6, 2130: 6, 1890: 6, 2160: 6, 1920: 6, 2190: 6, 1950: 6, 2220: 6, 1980: 6, 2250: 6, 2010: 6, 2280: 6, 2040: 6, 2580: 5, 2610: 5, 2400: 5, 2430: 5, 2460: 5, 2490: 5, 2520: 5, 2550: 5, 2640: 4, 2670: 4, 2700: 4, 2730: 4, 2760: 4, 2790: 4, 282

In [14]:
# Step 4, checking hte frequency of state change
def checkfreq(labels):
    frequency = []
    one2one = 0
    one2two = 0
    two2one = 0
    two2two = 0
    
    for i in range(1,len(labels)):
        if labels[i] == 1 and labels[i-1] == 1:
            one2one = one2one + 1
        if labels[i] == 0 and labels[i-1] == 1:
            one2two = one2two + 1
        if labels[i] == 0 and labels[i-1] == 0:
            two2two = two2two + 1
        if labels[i] == 1 and labels[i-1] == 0:
            two2one = two2one + 1 
    frequency.append(one2one)
    frequency.append(one2two)
    frequency.append(two2one)
    frequency.append(two2two)
    return frequency

In [15]:
Amat =  checkfreq(labels)

In [16]:
prob1to1 = Amat[0] / (Amat[0] + Amat[1])
prob1to0 = Amat[1] / (Amat[0] + Amat[1])
prob0to1 = Amat[2] / (Amat[2] + Amat[3])
prob0to0 = Amat[3] / (Amat[2] + Amat[3])

In [17]:
print(prob1to1)
print(prob1to0)
print(prob0to1)
print(prob0to0)

0.8344033232628398
0.1655966767371601
0.14117836445589182
0.8588216355441082


In [None]:
from sklearn.cluster import KMeans, DBSCAN
def checkstep1(data):
    