# P1201 - Data Wrangling based on the tags available in Data string

In [3]:
import ntpath
import os
import pickle as pkl
import string
import time
from datetime import datetime
from os.path import basename

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xlrd

from utils import parse, read_data_withfeature, write_feature_dict

In [4]:
# Show full cell contents (tag names are long). None is the supported
# "no truncation" value; the legacy -1 sentinel is rejected by current pandas.
pd.set_option('display.max_colwidth', None)

In [5]:
# Pump identifier; every data directory below is namespaced by it.
pump = 'P1201'
RAW_DATA_DIR = '../data/raw/{}/'.format(pump)
PROC_DATA_DIR = '../data/processed/{}/'.format(pump)
INT_DATA_DIR = '../data/interim/{}/'.format(pump)

### Reading the Data - Using tags to uniquely identify each series

In [5]:
data = {} # Contains mapping between tags and data
tag_files = {} # Contains mapping between tags and the file names
files_tag = {} # Contains the mapping between file names and tags

dir_files = os.listdir(RAW_DATA_DIR)

print('No of files in %s is %d' % (RAW_DATA_DIR, len(dir_files)))

# Per-tag list of partial frames; concatenated once after the loop instead of
# growing a DataFrame with .append() on every file (quadratic, and
# DataFrame.append no longer exists in pandas 2.x).
frames_by_tag = {}

for current_base in dir_files:

    # Only CSV files are parsed. Skipping here (rather than guarding just the
    # read) prevents a non-CSV entry from re-processing the previous file's df.
    if not current_base.endswith('.csv'):
        continue

    current_file = RAW_DATA_DIR + current_base
    print('Processing ', current_file, '\t', end='')
    df = pd.read_csv(current_file, header=None)

    if df.empty:
        # zip(*...) below cannot be unpacked into three columns for zero rows.
        print('Tags Found = ', np.array([], dtype=object))
        continue

    # parse() is applied to the first column of every row and is unpacked
    # into the three columns (feature, val, datetime).
    df['feature'], df['val'], df['datetime'] = zip(*df[0].map(parse))

    # Drop rows where no tag or no timestamp could be extracted.
    # (`!= None` compares element-wise and does not filter missing values.)
    df = df[df['feature'].notnull() & df['datetime'].notnull()]

    tags = np.unique(df['feature'])
    print('Tags Found = ', tags)

    # groupby sorts its keys, so tags are registered in the same order as
    # np.unique reports them.
    for tag, df_tag in df.groupby('feature'):
        frames_by_tag.setdefault(tag, []).append(df_tag)

# Original row indices are kept (no ignore_index), matching the old behaviour.
for tag, parts in frames_by_tag.items():
    data[tag] = pd.concat(parts)

No of files in ../data/raw/P1201/ is 30
Processing  ../data/raw/P1201/PI29562018.csv 	Tags Found =  ['12P01BCP4:PI12956.PNT']
Processing  ../data/raw/P1201/FC12847OUTS2018.csv 	Tags Found =  ['12P01BCP4:FC12847.OUT']
Processing  ../data/raw/P1201/PI12026BT2018.csv 	Tags Found =  ['HART_P1CP01:PI12026BT.MEAS']
Processing  ../data/raw/P1201/PI29552018.csv 	Tags Found =  ['12P01BCP4:PI12955.PNT']
Processing  ../data/raw/P1201/P1201_2_2018.csv 	Tags Found =  ['HART_P1CP01:PI12026BT.MEAS' 'HART_P1CP01:PI12037BT.MEAS'
 'HART_P1CP01:PI12046BT.MEAS' 'HART_P1CP04:FI12002BT.MEAS'
 'HART_P1CP04:FY12847FB.MEAS' 'HART_P1CP04:HY12017FB.MEAS'
 'HART_P1CP04:LY12001FB.MEAS' 'HART_P1CP04:PI12003BT.MEAS'
 'HART_P1CP04:PX12003FB.MEAS' 'HART_P1CP04:PY12003AFB.MEAS'
 'HART_P1CP04:PY12109FB.MEAS' 'HART_P1CP04:PY12110FB.MEAS'
 'HART_P1CP04:PY12516FB.MEAS' 'HART_U1CP03:PX12003BT.MEAS'
 'HART_U1CP03:PY24092FB.MEAS']
Processing  ../data/raw/P1201/P1201_3_2018.csv 	Tags Found =  ['12P01BCP4:FC12847.MEAS' '12P01BC

In [8]:
# A dict's length is its number of keys, i.e. the number of distinct tags loaded.
print('Number of distince tags available are ', len(data))

Number of distince tags available are  42


In [12]:
# Iterating a dict yields its keys: show every tag in insertion order.
list(data)

['12P01BCP4:PI12956.PNT',
 '12P01BCP4:FC12847.OUT',
 'HART_P1CP01:PI12026BT.MEAS',
 '12P01BCP4:PI12955.PNT',
 'HART_P1CP01:PI12037BT.MEAS',
 'HART_P1CP01:PI12046BT.MEAS',
 'HART_P1CP04:FI12002BT.MEAS',
 'HART_P1CP04:FY12847FB.MEAS',
 'HART_P1CP04:HY12017FB.MEAS',
 'HART_P1CP04:LY12001FB.MEAS',
 'HART_P1CP04:PI12003BT.MEAS',
 'HART_P1CP04:PX12003FB.MEAS',
 'HART_P1CP04:PY12003AFB.MEAS',
 'HART_P1CP04:PY12109FB.MEAS',
 'HART_P1CP04:PY12110FB.MEAS',
 'HART_P1CP04:PY12516FB.MEAS',
 'HART_U1CP03:PX12003BT.MEAS',
 'HART_U1CP03:PY24092FB.MEAS',
 '12P01BCP4:FC12847.MEAS',
 '12P01BCP4:FC12847.SPT',
 '12P01BCP4:XI12597.CIN',
 '12GTWY_E101:E12101FI.CIN',
 '12GTWY_E101:FALE12404FI.CIN',
 '12GTWY_E101:FALE12404SP.PNT',
 '12GTWY_E101:FIE12404.PNT',
 '12GTWY_E101:FTE12404MS.CIN',
 '12GTWY_E101:ZAE12131FI.CIN',
 '12GTWY_E104:E12104FI.CIN',
 '12GTWY_E104:HAE12131FI.CIN',
 '12GTWY_E428:HAE12646FI.CIN',
 '12GTWY_FG404:AAH12554BSP.PNT',
 '12GTWY_FG404:AAH12555BSP.PNT',
 '12GTWY_FG404:AAH12569SP.PNT',
 '12

### Remove data that resulted in default values

In [9]:
# Keep only rows stamped after the cutoff and narrow every frame to the three
# working columns. Rows at or before the cutoff are treated as parser defaults
# (per the section heading) -- TODO confirm against utils.parse.
fmt = '%Y-%m-%d %H:%M:%S'
remove_before = datetime.strptime('2010-01-01 00:00:01', fmt)
kept_columns = ['val', 'datetime', 'feature']
for tag, df in data.items():
    after_cutoff = df['datetime'] > remove_before
    df = df.loc[after_cutoff][kept_columns]
    # Re-assigning an existing key is safe while iterating over items().
    data[tag] = df

### Find the First and Last Date for features

In [8]:
# This is to get the base year found among all datasets:
# print the earliest and latest timestamp available for every tag.
for tag, df in data.items():

    min_date = df['datetime'].min()
    max_date = df['datetime'].max()

    print('Tag = ', tag, '\t', min_date, '\t', max_date)

Tag =  12P01BCP4:PI12956.PNT 	 2018-01-01 00:00:09 	 2018-11-02 14:20:40
Tag =  12P01BCP4:FC12847.OUT 	 2017-01-01 00:18:40 	 2018-11-02 14:20:40
Tag =  HART_P1CP01:PI12026BT.MEAS 	 2016-10-13 01:03:15 	 2018-11-02 13:08:35
Tag =  12P01BCP4:PI12955.PNT 	 2018-01-01 00:00:09 	 2018-11-02 14:20:40
Tag =  HART_P1CP01:PI12037BT.MEAS 	 2016-10-13 01:03:15 	 2018-11-02 13:08:35
Tag =  HART_P1CP01:PI12046BT.MEAS 	 2016-10-13 01:03:15 	 2018-11-02 13:08:35
Tag =  HART_P1CP04:FI12002BT.MEAS 	 2016-10-13 01:03:42 	 2018-11-02 13:08:35
Tag =  HART_P1CP04:FY12847FB.MEAS 	 2016-10-28 01:06:07 	 2018-11-02 13:08:35
Tag =  HART_P1CP04:HY12017FB.MEAS 	 2016-10-13 01:03:42 	 2018-11-02 13:08:40
Tag =  HART_P1CP04:LY12001FB.MEAS 	 2016-10-13 01:03:42 	 2018-11-02 13:08:40
Tag =  HART_P1CP04:PI12003BT.MEAS 	 2016-10-13 01:03:42 	 2018-11-02 13:08:40
Tag =  HART_P1CP04:PX12003FB.MEAS 	 2016-10-13 01:03:42 	 2018-11-02 13:08:35
Tag =  HART_P1CP04:PY12003AFB.MEAS 	 2016-10-13 01:03:42 	 2018-11-02 13:08:35


In [None]:
meta_feature = [
'12GTWY_E101:FALE12404SP.PNT',
'12GTWY_E101:FIE12404.PNT',
'12P01BCP4:FC12847.MEAS',
'12P01BCP4:FC12847.OUT',
'12P01BCP4:FC12847.SPT',
'12P01BCP4:PI12955.PNT',
'12P01BCP4:PI12956.PNT',
'12P01BCP4:XI12597.CIN',
'HART_P1CP04:FY12847FB.MEAS',
'12V04CP4:FC12006.MEAS',
'12V04CP4:FC12006.OUT',
'12V04CP4:PC12007.MEAS',
'12V05CP4:PC12073.MEAS',
'12V04CP4:LC12005A.MEAS',
'12V08CP4:FC12351.MEAS',
'12DATASCRCP1:TI12813.PNT']

# Every loaded tag that is not listed in meta_feature. Derived from `data`
# (instead of a hard-coded list) so it cannot drift from what was actually
# read; sorted so the table order is the same on every run.
extra_features = sorted(set(data.keys()) - set(meta_feature))


def print_tag_summary(frames, tags):
    """Print a markdown table with size and first/last timestamp per tag.

    Parameters
    ----------
    frames : mapping of tag -> DataFrame with a 'datetime' column.
    tags : iterable of tag names to report, in the order given.

    A tag that has no entry in ``frames`` is reported as a row with size 0
    and '-' for both dates instead of raising KeyError.
    """
    print('| Tag | Size | First Date | Last Date |')
    print('| -- | -- | -- | -- |')
    for tag in tags:
        df_tag = frames.get(tag)
        if df_tag is None:
            print('|', tag, '|', 0, '|', '-', '|', '-', '|')
            continue

        min_date = df_tag['datetime'].min()
        max_date = df_tag['datetime'].max()

        print('|', tag, '|', len(df_tag), '|', min_date, '|', max_date, '|')


print_tag_summary(data, meta_feature)
print()  # blank line between the two tables
print_tag_summary(data, extra_features)

### Writing to Pickle Files

In [9]:
# Persist the per-tag frames under the interim "initial" directory.
# write_feature_dict comes from the notebook's top import cell.
dir_path = INT_DATA_DIR + 'initial/'
write_feature_dict(dir_path, data)

Writing to file  ../data/interim/P1201/initial/12P01BCP4-PI12956.PNT.pkl (784472, 3)
Writing to file  ../data/interim/P1201/initial/12P01BCP4-FC12847.OUT.pkl (61945, 3)
Writing to file  ../data/interim/P1201/initial/HART_P1CP01-PI12026BT.MEAS.pkl (59611, 3)
Writing to file  ../data/interim/P1201/initial/12P01BCP4-PI12955.PNT.pkl (784487, 3)
Writing to file  ../data/interim/P1201/initial/HART_P1CP01-PI12037BT.MEAS.pkl (59610, 3)
Writing to file  ../data/interim/P1201/initial/HART_P1CP01-PI12046BT.MEAS.pkl (59617, 3)
Writing to file  ../data/interim/P1201/initial/HART_P1CP04-FI12002BT.MEAS.pkl (59688, 3)
Writing to file  ../data/interim/P1201/initial/HART_P1CP04-FY12847FB.MEAS.pkl (163747, 3)
Writing to file  ../data/interim/P1201/initial/HART_P1CP04-HY12017FB.MEAS.pkl (59739, 3)
Writing to file  ../data/interim/P1201/initial/HART_P1CP04-LY12001FB.MEAS.pkl (59735, 3)
Writing to file  ../data/interim/P1201/initial/HART_P1CP04-PI12003BT.MEAS.pkl (59737, 3)
Writing to file  ../data/interim/

### Read data from Pickle Files

In [6]:
# Reload the frames written to the interim "initial" directory, so the notebook
# can be resumed here after running only the import and configuration cells.
# read_data_withfeature comes from the notebook's top import cell.
# NOTE(review): the meaning of the positional True flag is defined in utils -- confirm.
df_features_all = read_data_withfeature(INT_DATA_DIR + 'initial/', True)

Number of files found in ../data/interim/P1201/initial/ is 42 
../data/interim/P1201/initial/HART_P1CP04-PY12003AFB.MEAS.pkl ['HART_P1CP04:PY12003AFB.MEAS']
../data/interim/P1201/initial/12GTWY_E101-ZAE12131FI.CIN.pkl ['12GTWY_E101:ZAE12131FI.CIN']
../data/interim/P1201/initial/HART_P1CP04-FY12847FB.MEAS.pkl ['HART_P1CP04:FY12847FB.MEAS']
../data/interim/P1201/initial/HART_U1CP03-PY24092FB.MEAS.pkl ['HART_U1CP03:PY24092FB.MEAS']
../data/interim/P1201/initial/12GTWY_FG404-AAH12570SP.PNT.pkl ['12GTWY_FG404:AAH12570SP.PNT']
../data/interim/P1201/initial/HART_P1CP01-PI12046BT.MEAS.pkl ['HART_P1CP01:PI12046BT.MEAS']
../data/interim/P1201/initial/12GTWY_FG404-AAH12569SP.PNT.pkl ['12GTWY_FG404:AAH12569SP.PNT']
../data/interim/P1201/initial/12V04CP4-PC12007.MEAS.pkl ['12V04CP4:PC12007.MEAS']
../data/interim/P1201/initial/HART_P1CP04-PY12109FB.MEAS.pkl ['HART_P1CP04:PY12109FB.MEAS']
../data/interim/P1201/initial/12GTWY_FG404-AAH12555BSP.PNT.pkl ['12GTWY_FG404:AAH12555BSP.PNT']
../data/interim/P

### Find distinct dates for a feature

In [8]:
# Pick one tag and derive a calendar-date column from its timestamps.
feature = '12P01BCP4:FC12847.MEAS'
date_col = 'date'
df = df_features_all[feature]
# NOTE(review): df is the object stored in df_features_all, so the new column
# is presumably added to that stored frame as well -- confirm this is intended.
df[date_col] = df['datetime'].map(lambda stamp: stamp.date())

In [10]:
# Sorted array of the distinct calendar dates present for the selected tag.
np.unique(df[date_col].values)

array([datetime.date(2018, 1, 1), datetime.date(2018, 1, 2),
       datetime.date(2018, 1, 3), datetime.date(2018, 1, 4),
       datetime.date(2018, 1, 5), datetime.date(2018, 1, 6),
       datetime.date(2018, 1, 7), datetime.date(2018, 1, 8),
       datetime.date(2018, 1, 9), datetime.date(2018, 1, 10),
       datetime.date(2018, 1, 11), datetime.date(2018, 1, 12),
       datetime.date(2018, 1, 13), datetime.date(2018, 1, 14),
       datetime.date(2018, 1, 15), datetime.date(2018, 1, 16),
       datetime.date(2018, 1, 17), datetime.date(2018, 1, 18),
       datetime.date(2018, 1, 19), datetime.date(2018, 1, 20),
       datetime.date(2018, 1, 21), datetime.date(2018, 1, 22),
       datetime.date(2018, 1, 23), datetime.date(2018, 1, 24),
       datetime.date(2018, 1, 25), datetime.date(2018, 1, 26),
       datetime.date(2018, 1, 27), datetime.date(2018, 1, 28),
       datetime.date(2018, 1, 29), datetime.date(2018, 1, 30),
       datetime.date(2018, 1, 31), datetime.date(2018, 2, 1),
  