# P6302B - Data wrangling based on the tags available in the data string

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import os
import ntpath
import pickle as pkl
import xlrd
import time
import string
from os.path import basename

from utils import parse

In [3]:
# Never truncate cell contents when frames are displayed. None is the supported
# "no limit" value; negative values were deprecated in pandas 1.0 and are
# rejected by newer releases.
pd.set_option('display.max_colwidth', None)

In [4]:
# Pump whose data this notebook processes; every data directory is derived from it.
pump = 'P6302B'
RAW_DATA_DIR, PROC_DATA_DIR, INT_DATA_DIR = (
    '../data/{}/{}/'.format(stage, pump)
    for stage in ('raw', 'processed', 'interim')
)

### Reading of Data - Using tags to uniquely identify

In [None]:
# Read every CSV in RAW_DATA_DIR, split each parsed row by its tag and build
# one DataFrame per tag in `data`. A markdown table listing the tags found in
# each file is printed along the way.
data = {} # Contains mapping between tags and data
tag_files = {} # Contains mapping between tags and the file names (not filled in this cell)
files_tag = {} # Contains the mapping between file names and tags (not filled in this cell)

# tag -> list of per-file frames. They are concatenated once per tag after the
# loop instead of growing a DataFrame with repeated (quadratic) appends.
frames_by_tag = {}

dir_files = os.listdir(RAW_DATA_DIR)

print('No of files in %s is %d' % (RAW_DATA_DIR, len(dir_files)))

print('| File No. | Data Files | Tags Available |')
print('| -- | -- | -- |')

idx = 1
for current_base in dir_files:

    # Only CSV files carry data strings; anything else in the directory is skipped.
    if not current_base.endswith('.csv'):
        continue

    current_file = RAW_DATA_DIR + current_base

    print('| ' + str(idx) + ' | ', current_base, end=' | ')
    df = pd.read_csv(current_file, header=None)
    idx += 1

    # parse() yields a (feature, value, datetime) triple for every raw string in column 0.
    df['feature'], df['val'], df['datetime'] = zip(*df[0].map(parse))

    # Keep only rows where both the tag and the timestamp were recovered.
    # (An elementwise `!= None` comparison does not filter missing values in
    # pandas, so notnull() is used for both columns.)
    df = df[df['feature'].notnull() & df['datetime'].notnull()]

    tags = np.unique(df['feature'])

    print(', '.join(map(str,tags)) + '|')

    # groupby visits the tags in sorted order (same order as np.unique) and
    # keeps the original row order and index inside every group.
    for tag, df_tag in df.groupby('feature', sort=True):
        frames_by_tag.setdefault(tag, []).append(df_tag)

# Build the final per-tag frames. Popping releases the per-file pieces as soon
# as they have been merged, which keeps peak memory down.
for tag in list(frames_by_tag):
    data[tag] = pd.concat(frames_by_tag.pop(tag))

No of files in ../data/raw/P6302B/ is 81
| File No. | Data Files | Tags Available |
| -- | -- | -- |
1 |  P6302B 2016 Item 31-40.csv | PT61A98.PV, PT61B00.PV, PT63103.PV, PT63105.PV, TA61B47.PV, TA63109.PV|
2 |  P6302B 2016 Item 5-11 Jul Dec.csv | 05GTWY_BN06:XT61B16.PNT, 05GTWY_BN06:XT61B17.PNT, 05GTWY_BN06:XT61B18.PNT, 05GTWY_BN06:XT61B19.PNT, 05GTWY_BN06:XT61B20.PNT, 05GTWY_BN06:ZT61B14.PNT, 05GTWY_BN06:ZT61B15.PNT|
3 |  P6302B 2014 Item 51-63.csv | 07DATASCRCP1:AI07003.PNT, F61221VP, PC63112.AV, PC63112E.AV, PT63112.PV, T6150.PV, T6151.PV, T6152.PV, T6153.PV, TC63109.AV, TC63109E.AV, TT61B06.PV, TT63109.PV|
4 |  P6302B 2015 Item 7-11 Nov.csv | 05GTWY_BN06:XT61B18.PNT, 05GTWY_BN06:XT61B19.PNT, 05GTWY_BN06:XT61B20.PNT, 05GTWY_BN06:ZT61B14.PNT, 05GTWY_BN06:ZT61B15.PNT|
5 |  TT63109 PV Jan to Jun.csv | TT63109.PV|
6 |  P6302B 2015 Item 21-30.csv | PA61B223.PV, PA63110.PV, PA63112.PV, PA63113.PV, PAH61A98.PV, PAL61A98.PV, PC61A98.AV, PC61A98E.AV, PDA61B21.PV|
7 |  P6302B 2018 Item 11-20

In [92]:
# Preview the first rows collected for tag PA61B223.PV (raw string plus parsed columns).
data['PA61B223.PV'].head()

Unnamed: 0,0,feature,val,datetime
0,@{PIPoint=PA61B223.PV; Value=NORMAL; Timestamp=12/30/2015 11:48:15 PM},PA61B223.PV,NORMAL,2015-12-30 23:48:15
1,@{PIPoint=PA61B223.PV; Value=NORMAL; Timestamp=12/30/2015 11:18:15 PM},PA61B223.PV,NORMAL,2015-12-30 23:18:15
2,@{PIPoint=PA61B223.PV; Value=NORMAL; Timestamp=12/30/2015 10:48:15 PM},PA61B223.PV,NORMAL,2015-12-30 22:48:15
3,@{PIPoint=PA61B223.PV; Value=NORMAL; Timestamp=12/30/2015 10:19:14 PM},PA61B223.PV,NORMAL,2015-12-30 22:19:14
4,@{PIPoint=PA61B223.PV; Value=NORMAL; Timestamp=12/30/2015 9:49:14 PM},PA61B223.PV,NORMAL,2015-12-30 21:49:14


In [93]:
# A dict's length is its number of keys, i.e. the number of distinct tags read.
print('Number of distinct tags available are ', len(data))

Number of distinct tags available are  63


In [94]:
# Show every tag name as one comma-separated string
', '.join(str(tag_name) for tag_name in data)

'PT61A98.PV, PT61B00.PV, PT63103.PV, PT63105.PV, TA61B47.PV, TA63109.PV, 05GTWY_BN06:XT61B16.PNT, 05GTWY_BN06:XT61B17.PNT, 05GTWY_BN06:XT61B18.PNT, 05GTWY_BN06:XT61B19.PNT, 05GTWY_BN06:XT61B20.PNT, 05GTWY_BN06:ZT61B14.PNT, 05GTWY_BN06:ZT61B15.PNT, 07DATASCRCP1:AI07003.PNT, F61221VP, PC63112.AV, PC63112E.AV, PT63112.PV, T6150.PV, T6151.PV, T6152.PV, T6153.PV, TC63109.AV, TC63109E.AV, TT61B06.PV, TT63109.PV, PA61B223.PV, PA63110.PV, PA63112.PV, PA63113.PV, PAH61A98.PV, PAL61A98.PV, PC61A98.AV, PC61A98E.AV, PDA61B21.PV, FA61A99.PV, FIE61A99.PV, FT61A99.PV, LA63114.PV, LAL63114.PV, LT63114.PV, P6302BDI.PV, P6302BSD.PV, PA61B00.PV, F61221, 05GTWY_BN06:XT61B10.PNT, 05GTWY_BN06:XT61B11.PNT, 05GTWY_BN06:XT61B12.PNT, 05GTWY_BN06:XT61B13.PNT, PIE61B00.PV, PIE61B22.PV, PIE61B23.PV, PIE63113.PV, PIE61608.PV, TAE61B47.PV, TT61B01.PV, TT61B02.PV, TT61B03.PV, TT61B04.PV, TT61B05.PV, XA61B34.PV, XA61B58.PV, XAE61B34.PV'

In [95]:
# Same tag names again, displayed as the dict's key view.
data.keys()

dict_keys(['PT61A98.PV', 'PT61B00.PV', 'PT63103.PV', 'PT63105.PV', 'TA61B47.PV', 'TA63109.PV', '05GTWY_BN06:XT61B16.PNT', '05GTWY_BN06:XT61B17.PNT', '05GTWY_BN06:XT61B18.PNT', '05GTWY_BN06:XT61B19.PNT', '05GTWY_BN06:XT61B20.PNT', '05GTWY_BN06:ZT61B14.PNT', '05GTWY_BN06:ZT61B15.PNT', '07DATASCRCP1:AI07003.PNT', 'F61221VP', 'PC63112.AV', 'PC63112E.AV', 'PT63112.PV', 'T6150.PV', 'T6151.PV', 'T6152.PV', 'T6153.PV', 'TC63109.AV', 'TC63109E.AV', 'TT61B06.PV', 'TT63109.PV', 'PA61B223.PV', 'PA63110.PV', 'PA63112.PV', 'PA63113.PV', 'PAH61A98.PV', 'PAL61A98.PV', 'PC61A98.AV', 'PC61A98E.AV', 'PDA61B21.PV', 'FA61A99.PV', 'FIE61A99.PV', 'FT61A99.PV', 'LA63114.PV', 'LAL63114.PV', 'LT63114.PV', 'P6302BDI.PV', 'P6302BSD.PV', 'PA61B00.PV', 'F61221', '05GTWY_BN06:XT61B10.PNT', '05GTWY_BN06:XT61B11.PNT', '05GTWY_BN06:XT61B12.PNT', '05GTWY_BN06:XT61B13.PNT', 'PIE61B00.PV', 'PIE61B22.PV', 'PIE61B23.PV', 'PIE63113.PV', 'PIE61608.PV', 'TAE61B47.PV', 'TT61B01.PV', 'TT61B02.PV', 'TT61B03.PV', 'TT61B04.PV', 'TT

### Remove data that resulted in default values

In [96]:
# Keep only rows stamped after the cutoff below and reduce every frame to the
# three working columns, dropping the raw string column.
fmt = '%Y-%m-%d %H:%M:%S'
remove_before = datetime.strptime('2010-01-01 00:00:01', fmt)
kept_columns = ['val', 'datetime', 'feature']
for tag in list(data):
    tag_frame = data[tag]
    after_cutoff = tag_frame['datetime'] > remove_before
    data[tag] = tag_frame.loc[after_cutoff, kept_columns]

In [97]:
# Preview PA61B223.PV after the cutoff filter; only val, datetime and feature remain.
data['PA61B223.PV'].head()

Unnamed: 0,val,datetime,feature
0,NORMAL,2015-12-30 23:48:15,PA61B223.PV
1,NORMAL,2015-12-30 23:18:15,PA61B223.PV
2,NORMAL,2015-12-30 22:48:15,PA61B223.PV
3,NORMAL,2015-12-30 22:19:14,PA61B223.PV
4,NORMAL,2015-12-30 21:49:14,PA61B223.PV


### Find the First and Last Date for features

In [98]:
# Print a markdown table with the earliest and latest timestamp of every tag;
# this is used to find the base year shared by all datasets.
print('| Tag | First Date | Last Date |')
print('| -- | -- | -- |')
for tag, df in data.items():

    min_date = df['datetime'].min()
    max_date = df['datetime'].max()

    print('| ', tag, ' | ', min_date, ' | ', max_date , ' |')

| Tag | First Data | Last Date |
| -- | -- | -- |
|  PT61A98.PV  |  2014-01-01 00:00:26  |  2018-11-30 09:15:58  |
|  PT61B00.PV  |  2014-01-01 00:00:26  |  2018-11-26 11:44:22  |
|  PT63103.PV  |  2014-01-01 00:00:26  |  2018-11-26 11:44:22  |
|  PT63105.PV  |  2014-01-21 14:37:21  |  2018-11-30 09:15:58  |
|  TA61B47.PV  |  2014-01-21 14:37:31  |  2018-11-26 11:44:31  |
|  TA63109.PV  |  2014-01-21 14:37:31  |  2018-11-26 11:44:31  |
|  05GTWY_BN06:XT61B16.PNT  |  2014-01-01 00:00:00  |  2018-11-30 08:07:55  |
|  05GTWY_BN06:XT61B17.PNT  |  2014-01-01 00:00:00  |  2018-11-30 08:07:55  |
|  05GTWY_BN06:XT61B18.PNT  |  2014-01-01 00:00:00  |  2018-11-30 08:07:55  |
|  05GTWY_BN06:XT61B19.PNT  |  2014-01-01 00:00:00  |  2018-11-30 08:07:55  |
|  05GTWY_BN06:XT61B20.PNT  |  2014-01-01 00:00:00  |  2018-11-30 08:07:05  |
|  05GTWY_BN06:ZT61B14.PNT  |  2015-01-01 00:00:21  |  2018-11-30 08:07:05  |
|  05GTWY_BN06:ZT61B15.PNT  |  2015-01-01 00:00:21  |  2018-11-30 09:08:45  |
|  07DATASCRCP

In [99]:
# Re-display PA61B223.PV (same expression as the preview after the cutoff filter).
data['PA61B223.PV'].head()

Unnamed: 0,val,datetime,feature
0,NORMAL,2015-12-30 23:48:15,PA61B223.PV
1,NORMAL,2015-12-30 23:18:15,PA61B223.PV
2,NORMAL,2015-12-30 22:48:15,PA61B223.PV
3,NORMAL,2015-12-30 22:19:14,PA61B223.PV
4,NORMAL,2015-12-30 21:49:14,PA61B223.PV


In [100]:
# Hand-maintained list of tag names; the summary table printed afterwards
# follows this order, and tags found in `data` but not listed here are
# reported separately as extra features.
meta_feature = [
    '05GTWY_BN06:XT61B10.PNT',
    '05GTWY_BN06:XT61B11.PNT',
    '05GTWY_BN06:XT61B12.PNT',
    '05GTWY_BN06:XT61B13.PNT',
    '05GTWY_BN06:XT61B16.PNT',
    '05GTWY_BN06:XT61B17.PNT',
    '05GTWY_BN06:XT61B18.PNT',
    '05GTWY_BN06:XT61B19.PNT',
    '05GTWY_BN06:XT61B20.PNT',
    '05GTWY_BN06:ZT61B14.PNT',
    '05GTWY_BN06:ZT61B15.PNT',
    'FA61A99.PV',
    'FIE61A99.PV',
    'FT61A99.PV',
    'LA63114.PV',
    'LAL63114.PV',
    'LT63114.PV',
    'P6302BDI.PV',
    'P6302BSD.PV',
    'PA61B00.PV',
    'PA61B223.PV',
    'PA63110.PV',
    'PA63112.PV',
    'PA63113.PV',
    'PAH61A98.PV',
    'PAL61A98.PV',
    'PC61A98.AV',
    'PC61A98E.AV',
    'PDA61B21.PV',
    'PIE61608.PV',
    'PIE61B00.PV',
    'PIE61B22.PV',
    'PIE61B23.PV',
    'PIE63113.PV',
    'PT61A98.PV',
    'PT61B00.PV',
    'PT63103.PV',
    'PT63105.PV',
    'TA61B47.PV',
    'TA63109.PV',
    'TAE61B47.PV',
    'TT61B01.PV',
    'TT61B02.PV',
    'TT61B03.PV',
    'TT61B04.PV',
    'XA61B34.PV',
    'XA61B58.PV',
    'XAE61B34.PV',
    'F61221',
    'TT61B05.PV',
    'TT61B06.PV',
    'TC63109E.AV',
    'TT63109.PV',
    'PC63112.AV',
    'PC63112E.AV',
    'PT63112.PV',
    'TC63109.AV',
    'F61221VP',
    'T6150.PV',
    'T6151.PV',
    'T6152.PV',
    'T6153.PV',
    '07DATASCRCP1:AI07003.PNT'
]

def print_tag_summary(tags):
    """Print a markdown table with the row count and first/last timestamp per tag.

    Parameters
    ----------
    tags : iterable of str
        Keys of the notebook-level ``data`` dict to report, in output order.

    Raises
    ------
    KeyError
        If a tag is not a key of ``data``.
    """
    print('| Tag | Size | First Date | Last Date |')
    print('| -- | -- | -- | -- |')
    for tag in tags:
        df = data[tag]
        print('|', tag, '|', len(df), '|', df['datetime'].min(), '|', df['datetime'].max(), '|')


# Tags present in the data but missing from meta_feature. Sorted so that the
# second table comes out in the same order on every run (set order is arbitrary).
extra_features = sorted(set(data.keys()) - set(meta_feature))

print_tag_summary(meta_feature)

print()  # blank line separating the two tables
print_tag_summary(extra_features)

| Tag | Size | First Date | Last Date |
| -- | -- | -- | -- |
| 05GTWY_BN06:XT61B10.PNT | 4133086 | 2014-01-01 00:00:00 | 2018-11-30 08:07:55 |
| 05GTWY_BN06:XT61B11.PNT | 4039732 | 2014-01-01 00:00:00 | 2018-11-30 08:07:55 |
| 05GTWY_BN06:XT61B12.PNT | 4006519 | 2014-01-01 00:00:00 | 2018-11-30 08:07:55 |
| 05GTWY_BN06:XT61B13.PNT | 4026078 | 2014-01-01 00:00:00 | 2018-11-30 08:07:55 |
| 05GTWY_BN06:XT61B16.PNT | 3315919 | 2014-01-01 00:00:00 | 2018-11-30 08:07:55 |
| 05GTWY_BN06:XT61B17.PNT | 4052945 | 2014-01-01 00:00:00 | 2018-11-30 08:07:55 |
| 05GTWY_BN06:XT61B18.PNT | 4349675 | 2014-01-01 00:00:00 | 2018-11-30 08:07:55 |
| 05GTWY_BN06:XT61B19.PNT | 4387554 | 2014-01-01 00:00:00 | 2018-11-30 08:07:55 |
| 05GTWY_BN06:XT61B20.PNT | 4334858 | 2014-01-01 00:00:00 | 2018-11-30 08:07:05 |
| 05GTWY_BN06:ZT61B14.PNT | 4125992 | 2015-01-01 00:00:21 | 2018-11-30 08:07:05 |
| 05GTWY_BN06:ZT61B15.PNT | 4102902 | 2015-01-01 00:00:21 | 2018-11-30 09:08:45 |
| FA61A99.PV | 549030 | 2014-01-01 0

### Writing to Pickle Files

In [101]:
from utils import write_feature_dict

# Hand the per-tag frames in `data` to write_feature_dict, targeting the
# 'initial/' folder below the interim data directory.
# NOTE(review): file naming and overwrite behaviour are defined in utils — confirm there.
dir_path = INT_DATA_DIR + 'initial/'
write_feature_dict(dir_path, data)

Writing to file  ../data/interim/P6302B/initial/PT61A98.PV.pkl (1949625, 3) [ PT61A98.PV ]
Writing to file  ../data/interim/P6302B/initial/PT61B00.PV.pkl (2097026, 3) [ PT61B00.PV ]
Writing to file  ../data/interim/P6302B/initial/PT63103.PV.pkl (1316380, 3) [ PT63103.PV ]
Writing to file  ../data/interim/P6302B/initial/PT63105.PV.pkl (689899, 3) [ PT63105.PV ]
Writing to file  ../data/interim/P6302B/initial/TA61B47.PV.pkl (267, 3) [ TA61B47.PV ]
Writing to file  ../data/interim/P6302B/initial/TA63109.PV.pkl (221, 3) [ TA63109.PV ]
Writing to file  ../data/interim/P6302B/initial/05GTWY_BN06-XT61B16.PNT.pkl (3315919, 3) [ 05GTWY_BN06:XT61B16.PNT ]
Writing to file  ../data/interim/P6302B/initial/05GTWY_BN06-XT61B17.PNT.pkl (4052945, 3) [ 05GTWY_BN06:XT61B17.PNT ]
Writing to file  ../data/interim/P6302B/initial/05GTWY_BN06-XT61B18.PNT.pkl (4349675, 3) [ 05GTWY_BN06:XT61B18.PNT ]
Writing to file  ../data/interim/P6302B/initial/05GTWY_BN06-XT61B19.PNT.pkl (4387554, 3) [ 05GTWY_BN06:XT61B19.P

### Read data from Pickle Files

In [102]:
from utils import read_data_withfeature
# Load the frames stored under 'initial/'; the result is indexed by tag name in later cells.
# NOTE(review): the meaning of the second positional argument (True) is defined in utils — confirm.
df_features_all = read_data_withfeature(INT_DATA_DIR + 'initial/', True)

Number of files found in ../data/interim/P6302B/initial/ is 63 
../data/interim/P6302B/initial/TT61B05.PV.pkl ['TT61B05.PV']
../data/interim/P6302B/initial/TAE61B47.PV.pkl ['TAE61B47.PV']
../data/interim/P6302B/initial/PA61B223.PV.pkl ['PA61B223.PV']
../data/interim/P6302B/initial/PA63113.PV.pkl ['PA63113.PV']
../data/interim/P6302B/initial/PT63105.PV.pkl ['PT63105.PV']
../data/interim/P6302B/initial/TC63109E.AV.pkl ['TC63109E.AV']
../data/interim/P6302B/initial/PT61B00.PV.pkl ['PT61B00.PV']
../data/interim/P6302B/initial/LA63114.PV.pkl ['LA63114.PV']
../data/interim/P6302B/initial/PC63112E.AV.pkl ['PC63112E.AV']
../data/interim/P6302B/initial/T6153.PV.pkl ['T6153.PV']
../data/interim/P6302B/initial/XA61B58.PV.pkl ['XA61B58.PV']
../data/interim/P6302B/initial/PIE61B23.PV.pkl ['PIE61B23.PV']
../data/interim/P6302B/initial/PAH61A98.PV.pkl ['PAH61A98.PV']
../data/interim/P6302B/initial/TT61B03.PV.pkl ['TT61B03.PV']
../data/interim/P6302B/initial/LT63114.PV.pkl ['LT63114.PV']
../data/inter

In [77]:
# Preview the reloaded frame for tag PAL61A98.PV.
df_features_all['PAL61A98.PV'].head()

Unnamed: 0,val,datetime,feature
2381781,NORMAL,2015-12-30 23:48:15,PAL61A98.PV
2381782,NORMAL,2015-12-30 23:18:15,PAL61A98.PV
2381783,NORMAL,2015-12-30 22:48:15,PAL61A98.PV
2381784,NORMAL,2015-12-30 22:19:14,PAL61A98.PV
2381785,NORMAL,2015-12-30 21:49:14,PAL61A98.PV


### Find distinct dates for a feature

In [39]:
# Derive a calendar-date column for one tag so its distinct days can be listed.
feature = 'PAL61A98.PV'
date_col = 'date'
# Work on a copy: assigning the helper column to the frame stored in
# df_features_all would add it to the shared data that is later written to disk.
df = df_features_all[feature].copy()
# to_datetime leaves datetime columns unchanged and also accepts an object
# column of timestamps; .dt.date gives the same datetime.date values as calling
# .date() on each element.
df[date_col] = pd.to_datetime(df['datetime']).dt.date

In [40]:
# Sorted array of the distinct calendar dates on which the tag has records.
np.unique(df[date_col])

array([datetime.date(2014, 1, 1), datetime.date(2014, 1, 2),
       datetime.date(2014, 1, 3), ..., datetime.date(2018, 11, 28),
       datetime.date(2018, 11, 29), datetime.date(2018, 11, 30)],
      dtype=object)

# Find Distinct Values in Alarms

In [103]:
# Tags treated as alarms, grouped under the 'alarm' category.
tag_categ = {
    'alarm': [
        'FA61A99.PV',
        'LA63114.PV',
        'LAL63114.PV',
        'PA61B00.PV',
        'PA61B223.PV',
        'PA63110.PV',
        'PA63112.PV',
        'PA63113.PV',
        'PAH61A98.PV',
        'PAL61A98.PV',
        'PDA61B21.PV',
        'TA61B47.PV',
        'TA63109.PV',
        'TAE61B47.PV',
        'XA61B34.PV',
        'XA61B58.PV',
        'XAE61B34.PV',
        'P6302BDI.PV',
        'P6302BSD.PV',
    ],
}

# Independent copy of every alarm frame, so edits to df_alarm leave
# df_features_all untouched.
df_alarm = {
    alarm_tag: df_features_all[alarm_tag].copy()
    for alarm_tag in tag_categ['alarm']
}

In [104]:
# For each alarm tag: total number of records, then how often each value occurs.
for tag, df in df_alarm.items():
    total_records = len(df)
    per_value = df.groupby('val')['val'].count()
    print(tag, '( Total Records:', total_records, ')')
    print(per_value)
    print('-------------------')

FA61A99.PV ( Total Records: 549030 )
val
Arc Off-line    1     
Bad             72    
Configure       32    
Error           10    
Failed          25    
NORMAL          539207
Scan Off        1     
TRIP            9682  
Name: val, dtype: int64
-------------------
LA63114.PV ( Total Records: 549182 )
val
Arc Off-line    1     
Bad             78    
Configure       27    
Error           8     
Failed          25    
LOW             24    
NORMAL          549018
Scan Off        1     
Name: val, dtype: int64
-------------------
LAL63114.PV ( Total Records: 549159 )
val
Arc Off-line    1     
Bad             78    
Configure       27    
Error           8     
Failed          25    
NORMAL          549010
Scan Off        1     
TRIP            9     
Name: val, dtype: int64
-------------------
PA61B00.PV ( Total Records: 549159 )
val
Arc Off-line    1     
Bad             78    
Configure       25    
Error           8     
Failed          25    
NORMAL          548987
Scan Off     

In [105]:
# Normalise the alarm values to 0/1 and print the value counts before and after.
#
# NOTE: dict.copy() is shallow. df_alarm_test is a new dict, but it holds the
# very same DataFrame objects as df_alarm, so the replacement below also
# changes the frames reachable through df_alarm. The later cell that writes
# df_alarm to disk relies on this; switching to per-frame copies here would
# leave df_alarm unmapped.
df_alarm_test = df_alarm.copy()

# Lower-cased, stripped raw value -> binary state (1 only for normal readings).
# '0' and '1' are included so that re-running this cell on already mapped
# frames (whose integers stringify to '0'/'1') yields integers again instead
# of leaving them behind as strings.
mapping = {
    'alm':0,
    '??? 0':0,
    '??? 1':1,
    '0':0,
    '1':1,
    '0.0':0,
    '1.0':1,
    'arc off-line':0,
    'bad':0,
    'closed':0,
    'configure':0,
    'error':0,
    'failed':0,
    'fail':0,
    'high':0,
    'low':0,
    'normal':1,
    'norm':1,
    'scan off':0,
    'trip':0,
}

for tag, df in df_alarm_test.items():
    print(df.groupby('val')['val'].count())
    # Compare case-insensitively and ignore surrounding whitespace.
    df['val'] = df['val'].apply(lambda x: str(x).lower().strip())
    # Values without an entry in `mapping` are left as their normalised string.
    df['val'] = df['val'].replace(mapping)
    print(df.groupby('val')['val'].count())
    print('---------===========----------')

val
Arc Off-line    1     
Bad             72    
Configure       32    
Error           10    
Failed          25    
NORMAL          539207
Scan Off        1     
TRIP            9682  
Name: val, dtype: int64
val
0    9823  
1    539207
Name: val, dtype: int64
val
Arc Off-line    1     
Bad             78    
Configure       27    
Error           8     
Failed          25    
LOW             24    
NORMAL          549018
Scan Off        1     
Name: val, dtype: int64
val
0    164   
1    549018
Name: val, dtype: int64
val
Arc Off-line    1     
Bad             78    
Configure       27    
Error           8     
Failed          25    
NORMAL          549010
Scan Off        1     
TRIP            9     
Name: val, dtype: int64
val
0    149   
1    549010
Name: val, dtype: int64
val
Arc Off-line    1     
Bad             78    
Configure       25    
Error           8     
Failed          25    
NORMAL          548987
Scan Off        1     
fail            34    
Name: val, dtype: in

In [106]:
# Preview the PAL61A98.PV alarm frame held in df_alarm.
df_alarm['PAL61A98.PV'].head()

Unnamed: 0,val,datetime,feature
2381781,1,2015-12-30 23:48:15,PAL61A98.PV
2381782,1,2015-12-30 23:18:15,PAL61A98.PV
2381783,1,2015-12-30 22:48:15,PAL61A98.PV
2381784,1,2015-12-30 22:19:14,PAL61A98.PV
2381785,1,2015-12-30 21:49:14,PAL61A98.PV


### Write down alarm files into initial directory

In [107]:
from utils import write_feature_dict

# Hand the alarm frames in df_alarm to write_feature_dict, targeting the same
# 'initial/' folder used for the full tag set.
# NOTE(review): remove_existing=False presumably keeps the files of the other tags — confirm in utils.
dir_path = INT_DATA_DIR + 'initial/'
write_feature_dict(dir_path, df_alarm, remove_existing=False)

Writing to file  ../data/interim/P6302B/initial/FA61A99.PV.pkl (549030, 3) [ FA61A99.PV ]
Writing to file  ../data/interim/P6302B/initial/LA63114.PV.pkl (549182, 3) [ LA63114.PV ]
Writing to file  ../data/interim/P6302B/initial/LAL63114.PV.pkl (549159, 3) [ LAL63114.PV ]
Writing to file  ../data/interim/P6302B/initial/PA61B00.PV.pkl (549159, 3) [ PA61B00.PV ]
Writing to file  ../data/interim/P6302B/initial/PA61B223.PV.pkl (549160, 3) [ PA61B223.PV ]
Writing to file  ../data/interim/P6302B/initial/PA63110.PV.pkl (549059, 3) [ PA63110.PV ]
Writing to file  ../data/interim/P6302B/initial/PA63112.PV.pkl (549174, 3) [ PA63112.PV ]
Writing to file  ../data/interim/P6302B/initial/PA63113.PV.pkl (549156, 3) [ PA63113.PV ]
Writing to file  ../data/interim/P6302B/initial/PAH61A98.PV.pkl (549158, 3) [ PAH61A98.PV ]
Writing to file  ../data/interim/P6302B/initial/PAL61A98.PV.pkl (549163, 3) [ PAL61A98.PV ]
Writing to file  ../data/interim/P6302B/initial/PDA61B21.PV.pkl (549049, 3) [ PDA61B21.PV ]


### Read data from Pickle Files

In [5]:
from utils import read_data_withfeature
# Load the frames stored under 'initial/' again, rebinding df_features_all.
# NOTE(review): the meaning of the second positional argument (True) is defined in utils — confirm.
df_features_all = read_data_withfeature(INT_DATA_DIR + 'initial/', True)

Number of files found in ../data/interim/P6302B/initial/ is 63 
../data/interim/P6302B/initial/TT61B05.PV.pkl ['TT61B05.PV']
../data/interim/P6302B/initial/TAE61B47.PV.pkl ['TAE61B47.PV']
../data/interim/P6302B/initial/PA61B223.PV.pkl ['PA61B223.PV']
../data/interim/P6302B/initial/PA63113.PV.pkl ['PA63113.PV']
../data/interim/P6302B/initial/PT63105.PV.pkl ['PT63105.PV']
../data/interim/P6302B/initial/TC63109E.AV.pkl ['TC63109E.AV']
../data/interim/P6302B/initial/PT61B00.PV.pkl ['PT61B00.PV']
../data/interim/P6302B/initial/LA63114.PV.pkl ['LA63114.PV']
../data/interim/P6302B/initial/PC63112E.AV.pkl ['PC63112E.AV']
../data/interim/P6302B/initial/T6153.PV.pkl ['T6153.PV']
../data/interim/P6302B/initial/XA61B58.PV.pkl ['XA61B58.PV']
../data/interim/P6302B/initial/PIE61B23.PV.pkl ['PIE61B23.PV']
../data/interim/P6302B/initial/PAH61A98.PV.pkl ['PAH61A98.PV']
../data/interim/P6302B/initial/TT61B03.PV.pkl ['TT61B03.PV']
../data/interim/P6302B/initial/LT63114.PV.pkl ['LT63114.PV']
../data/inter

In [16]:
from utils import get_tags, is_number

# Trial run on a single tag: count the 'Error' rows, convert the value column
# to floats, drop incomplete rows and compare the row counts.
value_col = 'val'
df_tag = df_features_all['TT61B02.PV'].copy()

error_rows = df_tag.loc[df_tag[value_col] == 'Error']
print(len(error_rows))
print(len(df_tag))


def _float_or_none(raw):
    """Return raw as a float when is_number accepts it, otherwise None."""
    if is_number(raw):
        return float(raw)
    return None


df_tag[value_col] = df_tag[value_col].apply(_float_or_none)
df_tag = df_tag.dropna()
print(len(df_tag))

24
1587413
1587388


### Removing non-float values from non-alarm tags

In [18]:
from utils import get_tags, is_number

non_alarm_tag_types = ['vibration', 'temperature', 'pressure', 'level', 'flow', 'setpoint']
value_col = 'val'


def _float_or_none(raw):
    """Return raw as a float when is_number accepts it, otherwise None."""
    if is_number(raw):
        return float(raw)
    return None


# Convert the value column of every loaded non-alarm tag. The frames stored in
# df_features_all are updated directly; no rows are dropped here.
for non_alarm_tag_type in non_alarm_tag_types:
    for tag in get_tags(non_alarm_tag_type):
        if tag not in df_features_all.keys():
            continue
        df_tag = df_features_all[tag]
        df_tag[value_col] = df_tag[value_col].apply(_float_or_none)

### Writing to Pickle Files

In [20]:
from utils import write_feature_dict

# Hand every frame in df_features_all to write_feature_dict, targeting the
# 'initial/' folder it was loaded from.
# NOTE(review): the exact effect of remove_existing=False is defined in utils — confirm there.
dir_path = INT_DATA_DIR + 'initial/'
write_feature_dict(dir_path, df_features=df_features_all, remove_existing=False)

Writing to file  ../data/interim/P6302B/initial/TT61B05.PV.pkl (1638071, 3) [ TT61B05.PV ]
Writing to file  ../data/interim/P6302B/initial/TAE61B47.PV.pkl (229, 3) [ TAE61B47.PV ]
Writing to file  ../data/interim/P6302B/initial/PA61B223.PV.pkl (549160, 3) [ PA61B223.PV ]
Writing to file  ../data/interim/P6302B/initial/PA63113.PV.pkl (549156, 3) [ PA63113.PV ]
Writing to file  ../data/interim/P6302B/initial/PT63105.PV.pkl (689849, 3) [ PT63105.PV ]
Writing to file  ../data/interim/P6302B/initial/TC63109E.AV.pkl (362, 3) [ TC63109E.AV ]
Writing to file  ../data/interim/P6302B/initial/PT61B00.PV.pkl (2096958, 3) [ PT61B00.PV ]
Writing to file  ../data/interim/P6302B/initial/LA63114.PV.pkl (549182, 3) [ LA63114.PV ]
Writing to file  ../data/interim/P6302B/initial/PC63112E.AV.pkl (551172, 3) [ PC63112E.AV ]
Writing to file  ../data/interim/P6302B/initial/T6153.PV.pkl (952149, 3) [ T6153.PV ]
Writing to file  ../data/interim/P6302B/initial/XA61B58.PV.pkl (282, 3) [ XA61B58.PV ]
Writing to fi