In [1]:
# importing most of the libraries 
import numpy as np
import pandas as pd
import collections

import matplotlib.pyplot as plt
% matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [2]:
# read in the data
power = pd.read_csv("power.csv")
#power_factor = pd.read_csv("power_factor.csv")
data_for_clustering = pd.DataFrame(data = [power["207"]])

In [3]:
from sklearn.cluster import KMeans, DBSCAN
data = np.array(data_for_clustering).T

#dividing data into train and test

train = data[0:round(len(data)*0.7)]
test = data[round(len(data)*0.7):]

#fitting clusters on clean train data
kmeans = KMeans(n_clusters = 2)
kmeans.fit(train)

db = DBSCAN(eps=0.3, min_samples=10)
db.fit(train)

DBSCAN(algorithm='auto', eps=0.3, leaf_size=30, metric='euclidean',
    min_samples=10, n_jobs=1, p=None)

In [4]:
#we will need this later on
labels = kmeans.predict(train)
centroids = kmeans.cluster_centers_

In [5]:
# To prove that choosing 2 as n_clusters was correct 
centroids

array([[  209.98380727],
       [ 2385.07949742]])

In [6]:
labels = list(labels)

In [7]:
# Step 2, creating sub states based on the time intervals
def addtime(enter):
    copy = enter[:]
    num = 0
    for i in range(0,len(enter)):
        if i == 0:
            num += 30
        elif enter[i] == enter[i-1]:
            num+= 30
        else:
            num = 30
        copy[i] = num
    return copy

In [8]:
sub_states = addtime(labels)

In [9]:
#Some Visualization
df = pd.DataFrame(data = [labels,sub_states])
df = df.T
df.columns = ["labels", "substates"]
df.head(20)

Unnamed: 0,labels,substates
0,0,30
1,1,30
2,1,60
3,1,90
4,1,120
5,1,150
6,0,30
7,0,60
8,0,90
9,0,120


In [10]:
# Learn Step 3 for periodicity
def getstep3(enter_input):
    info_list = []
    info_list.append(np.mean(enter_input))
    info_list.append(max(enter_input))
    info_list.append(min(enter_input))
    info_list.append(np.std(enter_input))
    return info_list

In [11]:
#checking step 3 
getstep3(sub_states)

[182.47455312810328, 3330, 30, 306.43696687937683]

In [12]:
step3 = getstep3(sub_states)

In [13]:
#now lets divide the data
ones = df[df['labels'] == 1]
zeros = df[df['labels'] == 0]

In [14]:
# Choosing the best sub states by elimiating one that are not useful
def best_sub_states(dat):
    
    a = dat.substates.values
    counttype=collections.Counter(a)
    print("How the Distribution looks like")
    print("**"*40)
    print(counttype)
    print("Visualising")
    print("**"*40)
    keytype = list(counttype.keys())
    populationtype = list(counttype.values())

    dataa = [go.Bar(
                y= populationtype,
                x = keytype,
                width = 1.5,
                marker=dict(
                   color = populationtype,
                colorscale='Portland',
                showscale=True,
                reversescale = False
                ),
                opacity=0.6
            )]
    
    layout= go.Layout(
        autosize= True,
        title= 'Distribution of Sub_States',
        hovermode= 'closest',
        yaxis=dict(
            title= 'Number of repetitions',
            ticklen= 5,
            gridwidth= 2
        ),
        showlegend= False
    )
    fig = go.Figure(data=dataa, layout=layout)
    py.iplot(fig, filename='barplottype')
    
    best_sub = []
    for i in range(len(populationtype)):
        if populationtype[i] > 20:
            best_sub.append(keytype[i])
    
    return best_sub

In [15]:
best_ones = best_sub_states(ones)

How the Distribution looks like
********************************************************************************
Counter({30: 1258, 60: 1258, 90: 1258, 120: 1115, 150: 679, 180: 432, 210: 316, 240: 248, 270: 198, 300: 160, 330: 133, 360: 118, 390: 105, 420: 90, 450: 84, 480: 74, 510: 65, 540: 59, 570: 55, 600: 51, 630: 44, 660: 40, 690: 36, 720: 33, 750: 27, 780: 23, 810: 21, 840: 21, 870: 21, 900: 20, 930: 17, 960: 16, 990: 16, 1020: 16, 1050: 15, 1080: 15, 1110: 13, 1140: 13, 1170: 12, 1290: 11, 1200: 11, 1230: 11, 1260: 11, 1320: 10, 1350: 10, 1380: 10, 1410: 9, 1440: 8, 1470: 8, 1560: 7, 1590: 7, 1620: 7, 1500: 7, 1530: 7, 1800: 6, 1830: 6, 1650: 6, 1680: 6, 1710: 6, 1740: 6, 1770: 6, 2310: 5, 2070: 5, 2340: 5, 2100: 5, 2370: 5, 1860: 5, 2130: 5, 1890: 5, 2160: 5, 1920: 5, 2190: 5, 1950: 5, 2220: 5, 1980: 5, 2250: 5, 2010: 5, 2280: 5, 2040: 5, 2580: 4, 2610: 4, 2400: 4, 2430: 4, 2460: 4, 2490: 4, 2520: 4, 2550: 4, 2640: 3, 2670: 3, 2700: 3, 2730: 3, 2760: 3, 2790: 3, 2820: 2, 2850:

In [16]:
best_zero = best_sub_states(zeros)

How the Distribution looks like
********************************************************************************
Counter({30: 1259, 150: 1258, 120: 1258, 90: 1258, 60: 1258, 180: 1254, 210: 46, 360: 1, 330: 1, 300: 1, 270: 1, 240: 1})
Visualising
********************************************************************************


In [17]:
# Step 4, checking hte frequency of state change
def checkfreq(labels):
    frequency = []
    one2one = 0
    one2two = 0
    two2one = 0
    two2two = 0
    
    for i in range(1,len(labels)):
        if labels[i] == 1 and labels[i-1] == 1:
            one2one = one2one + 1
        if labels[i] == 0 and labels[i-1] == 1:
            one2two = one2two + 1
        if labels[i] == 0 and labels[i-1] == 0:
            two2two = two2two + 1
        if labels[i] == 1 and labels[i-1] == 0:
            two2one = two2one + 1 
    frequency.append(one2one)
    frequency.append(one2two)
    frequency.append(two2one)
    frequency.append(two2two)
    return frequency

In [18]:
Amat =  checkfreq(labels)

In [19]:
prob1to1 = Amat[0] / (Amat[0] + Amat[1])
prob1to0 = Amat[1] / (Amat[0] + Amat[1])
prob0to1 = Amat[2] / (Amat[2] + Amat[3])
prob0to0 = Amat[3] / (Amat[2] + Amat[3])

In [20]:
print(prob1to1)
print(prob1to0)
print(prob0to1)
print(prob0to0)

0.8522780648191639
0.14772193518083607
0.16563528637261357
0.8343647136273864


In [21]:
#end of learning phase

In [22]:
# We start testing now

In [23]:
from sklearn.cluster import KMeans, DBSCAN
# If DBSCAN returns -1 then it is not part of any cluster and it is an anomaly 
def checkstep1(data):
    anomaly_array = []
    step1_output = []
    test_labels_kmeans = kmeans.predict(data)
    db.fit(test)
    test_labels_dbscan = db.labels_
    for i in range(len(test_labels_kmeans)):
        
        if test_labels_dbscan[i] == -1:
            anomaly_array.append(-1)
            step1_output.append("Malfunction")
        else:
            anomaly_array.append(test_labels_kmeans[i])
            step1_output.append("Step 1 Cleared")
            
    return anomaly_array, step1_output

In [24]:
anomaly, s1_output = checkstep1(test)

In [25]:
def checkstep2(enter_input):
    checkstep2_output = []
    test_labels_kmeans = kmeans.predict(enter_input)
    test_labels_kmeans = list(test_labels_kmeans)
    new_sub_states = addtime(test_labels_kmeans)
    #print(test_labels_kmeans)
    for i in range(len(test_labels_kmeans)):
        if test_labels_kmeans[i] == 1:
            
            if new_sub_states[i] in best_ones:
                checkstep2_output.append("Step2 Cleared")
                #print(test_labels_kmeans[i])
                #print(new_sub_states[i])
            else:
                checkstep2_output.append("Step2_malfunction")
        else:
            if new_sub_states[i] in best_zero:
                checkstep2_output.append("Step2 Cleared")
            else:
                checkstep2_output.append("Step2_malfunction")
        
    return checkstep2_output

In [26]:
step2 = checkstep2(test)

In [27]:
def checkstep3(enter_input, tolerance):
    test_labels_kmeans = kmeans.predict(enter_input)
    test_labels_kmeans = list(test_labels_kmeans)
    new_sub_states = addtime(test_labels_kmeans)
    info_list = []
    info_list.append(np.mean(enter_input))
    info_list.append(max(enter_input))
    info_list.append(min(enter_input))
    info_list.append(np.std(enter_input))
    
    if np.std(info_list) > np.std(step3)*(1 + tolerance) :
        return "Step 3 problem, Greater"
    elif np.std(info_list) < np.std(step3)* (1 - tolerance):
        return "Step 3 problem, Lower"
    else:
        return "No Problem"

In [28]:
# as we want 2 sigma tolerance
checkstep3(test, 0.66)

'No Problem'

In [29]:
def checkstep4(enter_input, tolerance):
    
    test_labels_kmeans = kmeans.predict(enter_input)
    test_labels_kmeans = list(test_labels_kmeans)
    Bmat =  checkfreq(test_labels_kmeans)
    print(np.std(Bmat))
    print(np.std(Amat))
    
    if np.std(Bmat) > np.std(Amat)*(1 + tolerance) :
        return "Step 4 problem, Greater"
    elif np.std(Bmat) < np.std(Amat)* (1 - tolerance):
        return "Step 4 problem, Lower"
    else:
        return "No Problem"

In [30]:
checkstep4(test,0.5)

1271.74073616
2788.82505502


'Step 4 problem, Lower'

In [31]:
full_output = pd.DataFrame(data = [s1_output, step2])

In [32]:
#this is how the output will look like
# Step 3 and 4 will just be one line reporting if the whole data passed the step or not
full_output = full_output.T
full_output.columns = ["Step1", "Step2"]
full_output.head()

Unnamed: 0,Step1,Step2
0,Malfunction,Step2 Cleared
1,Malfunction,Step2 Cleared
2,Step 1 Cleared,Step2 Cleared
3,Step 1 Cleared,Step2 Cleared
4,Step 1 Cleared,Step2 Cleared
