## Using Running Status for Data Alignment

### Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import os
import ntpath
import pickle as pkl
import xlrd
import time
import string
import math

from os import listdir
from os.path import isfile, join
from collections import Counter
from sklearn.cluster import DBSCAN

### Notebook options

In [2]:
pd.set_option('display.max_colwidth', -1) # Column width
plt.rcParams['figure.figsize'] = [15, 10] # Size of the plots

### General Assumptions

In [3]:
time_col = 'datetime'
time_gran_col = 'datetime_gran'
value_col = 'val'
scaled_value_col = 'scaled_val'
time_granularity = 'min'

In [4]:
fmt = '%Y-%m-%d %H:%M:%S'
base_date = datetime.strptime('2014-01-01 00:00:01', fmt)
start_date = datetime.strptime('2014-01-01 00:00:01', fmt)
stop_date = datetime.strptime('2019-01-01 00:00:01', fmt)

### Data Directories

In [5]:
pump = 'P6302B'
RAW_DATA_DIR = '../data/raw/'+pump+'/'
PROC_DATA_DIR = '../data/processed/'+pump+'/'
INT_DATA_DIR = '../data/interim/'+pump+'/'
RESULTS_DIR = '../data/results/'+pump+'/'

### Read Standardized Single Minute Data

In [6]:
agg_val = 1
agg_col = 'agg' + str(agg_val) 
input_file = INT_DATA_DIR + 'agg_single/' + agg_col + '.csv'
with open(input_file, 'rb') as f:
    df_agg = pd.read_csv(input_file)

In [7]:
df_agg.shape

(2629440, 26)

In [8]:
df_agg.columns.values

array(['agg1', 'PC63112E.AV', '05GTWY_BN06:XT61B19.PNT', 'TT61B06.PV',
       'FT61A99.PV', 'PT63112.PV', '05GTWY_BN06:XT61B18.PNT',
       'TT61B05.PV', '05GTWY_BN06:XT61B17.PNT', '05GTWY_BN06:XT61B12.PNT',
       'TT63109.PV', '05GTWY_BN06:ZT61B14.PNT', '05GTWY_BN06:XT61B10.PNT',
       '05GTWY_BN06:ZT61B15.PNT', '05GTWY_BN06:XT61B11.PNT', 'TT61B02.PV',
       'TT61B01.PV', '05GTWY_BN06:XT61B13.PNT', 'F61221', 'TT61B03.PV',
       'TC63109E.AV', 'PT63103.PV', 'PT61A98.PV',
       '05GTWY_BN06:XT61B20.PNT', 'PT61B00.PV', 'TT61B04.PV'],
      dtype=object)

In [9]:
tag_run_status = 'P6302BDI.PV'

In [10]:
from utils import read_data
input_dir = INT_DATA_DIR + 'standardized/'
df_features_std = read_data(input_dir, True)

22GTWY_E403:FALE22E23SP.PNT Error 2017-12-30 23:48:05
Number of files found in ../data/interim/P6302B/standardized/ is 60 
Number of features extracted from 60 files is 60 


In [11]:
df_run_status = df_features_std[tag_run_status]

In [12]:
df_run_status.head()

Unnamed: 0,datetime_gran,val,scaled_val,std_val
25,26,1.0,1.0,0.773235
26,27,1.0,1.0,0.773235
27,28,1.0,1.0,0.773235
28,29,1.0,1.0,0.773235
29,30,1.0,1.0,0.773235


### Change running status either 1 or 0

In [13]:
# Values that are non-zero are interpolated.
# Convert anything that is not 1 to 0

df_run_status['val'] = df_run_status['val'].apply(lambda x: 0 if x < 1 else 1)

In [15]:
df_run_status.drop(columns=['scaled_val', 'std_val'], inplace=True)

In [16]:
df_run_status.head()

Unnamed: 0,datetime_gran,val
25,26,1
26,27,1
27,28,1
28,29,1
29,30,1


In [17]:
print(df_run_status.groupby('val')['val'].count())

val
0    1000306
1    1629110
Name: val, dtype: int64


## Remove 'n' minutes before and after of each row

In [36]:
df_non_running_times = df_run_status.loc[df_run_status['val'] == 0]

In [37]:
non_running_times_withbuffer = df_non_running_times[time_gran_col]
non_running_times_withbuffer = set(non_running_times_withbuffer)

In [38]:
buffer = 2 # 2 mins before and after the running of the machine

for i in range(1, buffer+1):
    a = set(df_non_running_times[time_gran_col].apply(lambda x:x+i))
    b = set(df_non_running_times[time_gran_col].apply(lambda x:x-i))

    non_running_times_withbuffer = non_running_times_withbuffer | a | b

In [39]:
df_run_status = df_run_status.loc[~df_run_status[time_gran_col].isin(non_running_times_withbuffer)]
df_run_status = df_run_status[['datetime_gran', 'val']]

In [40]:
df_run_status.shape

(1625367, 2)

### Use data with only running status as 1

In [41]:
df_agg_run = pd.merge(df_agg, df_run_status, how='inner', left_on='agg1', right_on='datetime_gran')

In [42]:
df_agg_run.shape

(1625367, 28)

In [43]:
df_agg_run.head()

Unnamed: 0,agg1,PC63112E.AV,05GTWY_BN06:XT61B19.PNT,TT61B06.PV,FT61A99.PV,PT63112.PV,05GTWY_BN06:XT61B18.PNT,TT61B05.PV,05GTWY_BN06:XT61B17.PNT,05GTWY_BN06:XT61B12.PNT,...,F61221,TT61B03.PV,TC63109E.AV,PT63103.PV,PT61A98.PV,05GTWY_BN06:XT61B20.PNT,PT61B00.PV,TT61B04.PV,datetime_gran,val
0,26,,0.687327,0.225171,0.87382,-0.44331,0.157315,-0.779361,0.590499,0.330673,...,0.811948,0.869886,,0.60081,0.875594,0.82815,0.783579,0.725859,26,1
1,27,0.054785,0.866717,0.226876,0.874811,-0.713979,0.070048,-0.778281,0.469157,0.451374,...,0.81325,0.87026,,0.431101,0.881691,0.92087,0.770197,0.725859,27,1
2,28,0.054785,0.813155,0.225739,0.875802,-0.046705,0.224347,-0.788544,0.520471,0.40241,...,0.814552,0.870634,,0.822021,0.901696,0.946229,0.788144,0.726516,28,1
3,29,0.054785,0.848864,0.224603,0.875802,0.349903,0.08396,-0.798807,0.475798,0.511724,...,0.814552,0.871007,,0.958884,0.871593,0.996155,0.775864,0.727173,29,1
4,30,0.054785,0.839511,0.226308,0.875802,0.746511,0.023252,-0.80745,0.431124,0.523111,...,0.813901,0.871381,,1.095747,0.84149,0.870151,0.763585,0.727831,30,1


In [44]:
df_agg_run.drop(columns=['val', 'datetime_gran'], inplace=True)

In [45]:
df_agg_run.dropna(inplace=True)

In [46]:
df_agg_run.shape

(1605319, 26)

In [47]:
df_agg_run.head()

Unnamed: 0,agg1,PC63112E.AV,05GTWY_BN06:XT61B19.PNT,TT61B06.PV,FT61A99.PV,PT63112.PV,05GTWY_BN06:XT61B18.PNT,TT61B05.PV,05GTWY_BN06:XT61B17.PNT,05GTWY_BN06:XT61B12.PNT,...,TT61B01.PV,05GTWY_BN06:XT61B13.PNT,F61221,TT61B03.PV,TC63109E.AV,PT63103.PV,PT61A98.PV,05GTWY_BN06:XT61B20.PNT,PT61B00.PV,TT61B04.PV
20048,29681,0.054785,0.713682,0.729313,0.87382,-0.065803,0.170595,0.585121,0.603177,0.861299,...,1.911033,0.888271,0.814552,1.387541,0.586338,0.468408,0.827962,0.928795,0.858828,1.033388
20049,29682,0.054785,0.821657,0.731018,0.87415,0.427196,0.344498,0.588362,0.431124,0.704161,...,1.909158,0.859063,0.814552,1.378385,0.586338,0.639537,0.83063,1.054797,0.816796,1.027474
20050,29683,0.054785,0.87607,0.732724,0.874481,0.628929,0.144035,0.591603,0.373169,0.788424,...,1.907283,0.898312,0.814552,1.369229,0.586338,0.719423,0.811958,0.970796,0.914085,1.02156
20051,29684,0.054785,0.758742,0.73386,0.874481,0.343635,0.37169,0.594844,0.590499,0.775898,...,1.929778,0.917479,0.814552,1.384925,0.586338,0.616625,0.832725,0.96287,0.946201,1.033388
20052,29685,0.054785,0.677976,0.73386,0.874481,0.058342,0.277466,0.588362,0.590499,0.740599,...,1.931653,0.801561,0.814552,1.390158,0.586338,0.513826,0.853493,1.021514,0.978316,1.033388


### Write down the dataframe in files

In [48]:
output_file = INT_DATA_DIR + 'agg_runstat/' + 'agg1_buf2.csv'
with open(output_file, 'wb') as f:
    df_agg_run.to_csv(output_file, header=True, index=False)
    print('Writing to ', output_file)

Writing to  ../data/interim/P6302B/agg_runstat/agg1_buf2.csv


## Aggregation at 2/3/4/5 minute levels

In [49]:
from utils import lcl_divmul ## Aggregation at 2/3/4/5 minute levels

agg_val_list = [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 45, 60]

for agg_val in agg_val_list:
    
    print('Started processing for agg_val= ' + str(agg_val), end='')
    agg_col = 'agg' + str(agg_val)
    df_agg_temp = df_agg_run.copy()
    df_agg_temp[agg_col] = df_agg_temp['agg1'].apply(lambda x: lcl_divmul(x, agg_val, agg_val))
    df_agg_temp.drop(columns=['agg1'], inplace=True)
    
    df_agg_temp = df_agg_temp.groupby(by=[agg_col]).mean()
    df_agg_temp = df_agg_temp.reset_index(level=0, inplace=False)
        
    
    output_file = INT_DATA_DIR + 'agg_runstat/' + 'agg'+str(agg_val)+ '_buf' + str(buffer) + '.csv'
    with open(output_file, 'wb') as f:
        df_agg_temp.to_csv(output_file, header=True, index=False)
        print('Completed: Written to ' + output_file)

Started processing for agg_val= 2Completed: Written to ../data/interim/P6302B/agg_runstat/agg2_buf2.csv
Started processing for agg_val= 3Completed: Written to ../data/interim/P6302B/agg_runstat/agg3_buf2.csv
Started processing for agg_val= 4Completed: Written to ../data/interim/P6302B/agg_runstat/agg4_buf2.csv
Started processing for agg_val= 5Completed: Written to ../data/interim/P6302B/agg_runstat/agg5_buf2.csv
Started processing for agg_val= 6Completed: Written to ../data/interim/P6302B/agg_runstat/agg6_buf2.csv
Started processing for agg_val= 7Completed: Written to ../data/interim/P6302B/agg_runstat/agg7_buf2.csv
Started processing for agg_val= 8Completed: Written to ../data/interim/P6302B/agg_runstat/agg8_buf2.csv
Started processing for agg_val= 9Completed: Written to ../data/interim/P6302B/agg_runstat/agg9_buf2.csv
Started processing for agg_val= 10Completed: Written to ../data/interim/P6302B/agg_runstat/agg10_buf2.csv
Started processing for agg_val= 15Completed: Written to ../dat

In [39]:
agg_val_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

for agg_val in agg_val_list:
    
    agg_col = 'agg' + str(agg_val)
    input_file = INT_DATA_DIR + 'agg_runstat/' + 'agg'+str(agg_val)+'.csv'
    
    df = pd.read_csv(input_file)
    
    print(df.shape)

(1609025, 26)
(805490, 26)
(537629, 26)
(403702, 26)
(323357, 26)
(269775, 26)
(231494, 26)
(202821, 26)
(180468, 26)
(162608, 26)


In [33]:
df_agg_run.head()

Unnamed: 0,agg1,TT63109.PV,PT61A98.PV,TT61B02.PV,05GTWY_BN06:XT61B18.PNT,05GTWY_BN06:XT61B20.PNT,TT61B05.PV,PT63112.PV,FT61A99.PV,05GTWY_BN06:XT61B17.PNT,...,TT61B06.PV,05GTWY_BN06:ZT61B14.PNT,TT61B01.PV,05GTWY_BN06:XT61B11.PNT,TT61B03.PV,PT61B00.PV,05GTWY_BN06:XT61B13.PNT,05GTWY_BN06:XT61B10.PNT,05GTWY_BN06:ZT61B15.PNT,TT61B04.PV
20085,29680,1.941281,0.825295,1.914424,0.183875,0.870151,0.592684,-0.558801,0.874481,0.57118,...,0.73045,1.166622,1.899783,0.7661,1.384925,0.900861,0.869104,0.74323,0.388229,1.02156
20086,29681,1.996454,0.827962,1.936154,0.170595,0.928795,0.585121,-0.065803,0.87382,0.603177,...,0.729313,1.163214,1.911033,0.68286,1.387541,0.858828,0.888271,0.734662,0.376195,1.033388
20087,29682,2.012111,0.83063,1.925289,0.344498,1.054797,0.588362,0.427196,0.87415,0.431124,...,0.731018,1.166622,1.909158,0.793627,1.378385,0.816796,0.859063,0.775941,0.395635,1.027474
20088,29683,2.027769,0.811958,1.914424,0.144035,0.970796,0.591603,0.628929,0.874481,0.373169,...,0.732724,1.165073,1.907283,0.661886,1.369229,0.914085,0.898312,0.709738,0.387304,1.02156
20089,29684,2.090399,0.832725,1.936154,0.37169,0.96287,0.594844,0.343635,0.874481,0.590499,...,0.73386,1.167861,1.929778,0.793627,1.384925,0.946201,0.917479,0.693383,0.390081,1.033388
