## Using DBSCAN directly on the features

### Imports

In [31]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import os
import ntpath
import pickle as pkl
import xlrd
import time
import string
import math

from os import listdir
from os.path import isfile, join
from collections import Counter
from sklearn.cluster import DBSCAN

### Notebook options

In [19]:
# Show full cell contents without truncation. `None` is the supported way to
# disable the width limit; the older `-1` sentinel was deprecated in pandas 1.0
# and is rejected by newer pandas versions.
pd.set_option('display.max_colwidth', None) # Column width
# Default figure size (width, height in inches) for every plot in this notebook.
plt.rc('figure', figsize=[15, 10])

### General Assumptions

In [20]:
# Naming conventions shared across the notebook.
# NOTE(review): presumably DataFrame column names; none of these are referenced
# by the cells visible here -- confirm they are still needed.
time_col, time_gran_col = 'datetime', 'datetime_gran'
value_col, scaled_value_col = 'val', 'scaled_val'
time_granularity = 'min'

In [21]:
# Timestamp format and the reference dates parsed with it.
# base_date and start_date are deliberately parsed from the same timestamp.
fmt = '%Y-%m-%d %H:%M:%S'
base_date, start_date, stop_date = (
    datetime.strptime(stamp, fmt)
    for stamp in (
        '2016-01-01 00:00:01',
        '2016-01-01 00:00:01',
        '2019-01-01 00:00:01',
    )
)

### Data Directories

In [28]:
# Every data directory follows the layout ../data/<stage>/<pump>/ (with a
# trailing slash, because later cells build paths by string concatenation).
pump = 'P6302B'
RAW_DATA_DIR, PROC_DATA_DIR, INT_DATA_DIR, RESULTS_DIR = (
    '../data/{}/{}/'.format(stage, pump)
    for stage in ('raw', 'processed', 'interim', 'results')
)

### Read Data

In [34]:
# Load the table inspected below so this section works on a fresh kernel
# (no earlier cell defines df_all).
# NOTE(review): presumably this is the same aggregated file the grid-search
# cell reads for agg_val = 1 -- confirm the path before relying on it.
df_all = pd.read_csv(INT_DATA_DIR + 'agg_runstat/agg1.csv')
df_all.shape

(1609025, 26)

In [35]:
# Preview the first five rows of the loaded table.
df_all.head(n=5)

Unnamed: 0,agg1,TT63109.PV,PT61A98.PV,TT61B02.PV,05GTWY_BN06:XT61B18.PNT,05GTWY_BN06:XT61B20.PNT,TT61B05.PV,PT63112.PV,FT61A99.PV,05GTWY_BN06:XT61B17.PNT,...,TT61B06.PV,05GTWY_BN06:ZT61B14.PNT,TT61B01.PV,05GTWY_BN06:XT61B11.PNT,TT61B03.PV,PT61B00.PV,05GTWY_BN06:XT61B13.PNT,05GTWY_BN06:XT61B10.PNT,05GTWY_BN06:ZT61B15.PNT,TT61B04.PV
0,29680,1.941281,0.825295,1.914424,0.183875,0.870151,0.592684,-0.558801,0.874481,0.57118,...,0.73045,1.166622,1.899783,0.7661,1.384925,0.900861,0.869104,0.74323,0.388229,1.02156
1,29681,1.996454,0.827962,1.936154,0.170595,0.928795,0.585121,-0.065803,0.87382,0.603177,...,0.729313,1.163214,1.911033,0.68286,1.387541,0.858828,0.888271,0.734662,0.376195,1.033388
2,29682,2.012111,0.83063,1.925289,0.344498,1.054797,0.588362,0.427196,0.87415,0.431124,...,0.731018,1.166622,1.909158,0.793627,1.378385,0.816796,0.859063,0.775941,0.395635,1.027474
3,29683,2.027769,0.811958,1.914424,0.144035,0.970796,0.591603,0.628929,0.874481,0.373169,...,0.732724,1.165073,1.907283,0.661886,1.369229,0.914085,0.898312,0.709738,0.387304,1.02156
4,29684,2.090399,0.832725,1.936154,0.37169,0.96287,0.594844,0.343635,0.874481,0.590499,...,0.73386,1.167861,1.929778,0.793627,1.384925,0.946201,0.917479,0.693383,0.390081,1.033388


In [None]:
# Parameter grid for the DBSCAN sweep: every eps value is combined with every
# min_samples value, once per aggregation level in agg_val_list.
# NOTE: dbscan_gridsearch is defined in a cell further down this notebook;
# that cell has to be executed before this one.
eps_list = [0.55, 0.60, 0.7, 0.8, 0.9, 0.95]
min_pts_list = [3, 4, 5, 7]
agg_val_list = [1]

# Neither open(..., 'w') nor DataFrame.to_csv creates missing directories.
os.makedirs(RESULTS_DIR, exist_ok=True)

for agg_val in agg_val_list:

    agg_col = 'agg' + str(agg_val)
    input_file = INT_DATA_DIR + 'agg_runstat/' + agg_col + '.csv'

    # read_csv opens the path itself, so no separate file handle is needed.
    df_all = pd.read_csv(input_file)
    # The aggregation column is dropped so it is not clustered on.
    X = df_all.drop(columns=[agg_col])

    res_file = RESULTS_DIR + 'dbscan_gridsearch_agg' + str(agg_val) + '.txt'

    dbscan_gridsearch(X=X, eps_list=eps_list, min_pts_list=min_pts_list, agg_val=agg_val, op_file=res_file)
    print('Completed Grid Search on agg_val = ' + str(agg_val))
    

### GridSearch-like parameter sweep for DBSCAN

In [29]:
def dbscan_gridsearch(X, eps_list, min_pts_list, agg_val, op_file):
    """Run DBSCAN for every (eps, min_samples) pair and record a summary of each run.

    For each combination the function fits ``DBSCAN`` on ``X``, appends one
    table row (eps, min_samples, number of clusters, number of noise points,
    fit time in seconds) to ``op_file``, prints the same row, and writes the
    features together with a ``labels`` column to a per-combination CSV file
    under ``RESULTS_DIR``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table to cluster. It is left unmodified, so every parameter
        combination is fitted on exactly the same features.
    eps_list : iterable of float
        Candidate values for DBSCAN's ``eps``.
    min_pts_list : iterable of int
        Candidate values for DBSCAN's ``min_samples``.
    agg_val
        Only used (via ``str``) in the names of the per-combination CSV files.
    op_file : str
        Path of the summary text file. It is overwritten at the start of the
        call and then appended to after every fit.
    """
    # Start a fresh summary table; mode 'w' discards any previous contents.
    with open(op_file, 'w') as res_f:
        res_f.write('Instances = ' + str(len(X)) + '\n')
        res_f.write('| Eps | \tmpts | \tClus | \tAnom | \tTime |\n')
        res_f.write('| -- | -- | -- | -- | -- |\n')

    for eps in eps_list:
        for min_samples in min_pts_list:

            tic = time.time()
            clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
            total_time = time.time() - tic

            labels = clustering.labels_

            # DBSCAN labels noise points with -1; they are counted as
            # anomalies and must not be counted as a cluster. Excluding them
            # explicitly keeps the cluster count right even when a run
            # produces no noise at all.
            is_noise = labels == -1
            outliers_count = int(np.sum(is_noise))
            n_clusters = len(np.unique(labels[~is_noise]))

            row = ('|' + str(eps) + '|\t' + str(min_samples) + '|\t'
                   + str(n_clusters) + '|\t' + str(outliers_count) + '|\t' + str(total_time) + '|')

            # Append after every fit so partial results survive an interrupted run.
            with open(op_file, 'a') as res_f:
                res_f.write(row + '\n')
            print(row)

            # Attach the labels to a copy: writing them into X itself would
            # feed this run's labels to the next fit as an extra feature.
            dbscan_results_file = RESULTS_DIR + 'dbscan_runstat_eps'+str(eps)+'_mpts'+str(min_samples)+'_agg'+str(agg_val)+'.csv'
            X.assign(labels=labels).to_csv(dbscan_results_file, header=True, index=False)