# P1201 - Data Wrangling based on the tags available in Data string

In [54]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import os
import ntpath
import pickle as pkl
import xlrd
import time
import string
from os.path import basename

In [55]:
pd.set_option('display.max_colwidth', -1)

In [56]:
RAW_DATA_DIR = '../data/raw/P1201/'
PROC_DATA_DIR = '../data/processed/P1201/'
INT_DATA_DIR = '../data/interim/P1201/'

In [57]:
def remove_punctuation(x):
    table = str.maketrans({key: None for key in string.punctuation})
    return x.translate(table)

### Functions

In [58]:
def remove_punctuation(x):
    table = str.maketrans({key: None for key in string.punctuation})
    return x.translate(table)

In [59]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

In [60]:
def get_time(dt_str):
    dt_str = dt_str.strip()
    dtobj = datetime.strptime(dt_str, '%m/%d/%Y %I:%M:%S %p')
    return dtobj

In [61]:
def parse(txt):
    '''
    @{PIPoint=SCTM:22GTWY_E403:FALE22E23SP.PNT; Value=60; Timestamp=12/30/2017 11:48:05 PM}
    '''
    pi_point, val, time  = None, None, None
    delimiter = ';'
    sub_delimiter = '='
    
    txt = txt[txt.find('{')+1:txt.find('}')]    
    parsed_vals = txt.split(';')
    
    if len(parsed_vals) >= 3:
        pi_point = parsed_vals[0].split(sub_delimiter)[1]
    
        values = parsed_vals[1].split(sub_delimiter)
        if len(values) >= 2:
            val = values[1]
            if is_number(val):
                val = float(val)
            else:
                val = None
        else:
            val = None

        time_vals = parsed_vals[2].split(sub_delimiter)
        if len(time_vals) >= 2:
            time = time_vals[1]
            time = get_time(time)
        else:
            return None, None, None
        
    pi_point = pi_point.replace('SCTM:', '')
    
    return pi_point, val, time    

In [62]:
def scale_val(val, min_val, max_val):
    if val is not None:
        return (val-min_val)/(max_val-min_val + 1e-7)
    return None

In [63]:
def get_minutes_after(current_date, base_date):
    
    base_ts = time.mktime(base_date.timetuple()) # Converting to Unix timestamp
    current_ts = time.mktime(current_date.timetuple())
    time_diff_min = round((current_ts - base_ts) / 60.0)
    
    return time_diff_min

In [64]:
fmt = '%Y-%m-%d %H:%M:%S'
base_date = datetime.strptime('2017-01-01 00:00:01', fmt)

### Reading of Data - Using tags to uniquely identify

In [70]:
data = {} # Contains mapping between tags and data
tag_files = {} # Contains mapping between tags and the file names
files_tag = {} # Contains the mapping between file names and tags

dir_files = os.listdir(RAW_DATA_DIR)
print('No of files in %s is %d' % (RAW_DATA_DIR, len(dir_files)))

for current_file in dir_files:
    
    current_base = current_file
    current_file = RAW_DATA_DIR + '/' + current_file
    
    print('Processing ', current_file, '\t', end='')
    df = pd.read_csv(current_file, header=None)
    
    df['feature'], df['val'], df['datetime'] = zip(*df[0].map(parse))
       
    df = df[df['feature'] != None]
    df = df[df['datetime'].notnull()]
        
    tags = np.unique(df['feature'])
    print('Tags Found = ', len(tags))
    
    for tag in tags:
        df_tag = df.loc[df['feature'] == tag]
        print('tag = ', tag, '\tsize =', df_tag.shape)
        
        if tag not in data.keys():
            data[tag] = pd.DataFrame(columns=df_tag.columns.values)
        data[tag] = data[tag].append(df_tag)

No of files in ../data/raw/P1201/ is 28
Processing  ../data/raw/P1201//PI29562018.csv 	Tags Found =  1
tag =  12P01BCP4:PI12956.PNT 	size = (387615, 4)
Processing  ../data/raw/P1201//FC12847OUTS2018.csv 	Tags Found =  1
tag =  12P01BCP4:FC12847.OUT 	size = (23799, 4)
Processing  ../data/raw/P1201//PI12026BT2018.csv 	Tags Found =  1
tag =  HART_P1CP01:PI12026BT.MEAS 	size = (22594, 4)
Processing  ../data/raw/P1201//PI29552018.csv 	Tags Found =  1
tag =  12P01BCP4:PI12955.PNT 	size = (387625, 4)
Processing  ../data/raw/P1201//P1201_2_2018.csv 	Tags Found =  15
tag =  HART_P1CP01:PI12026BT.MEAS 	size = (14901, 4)
tag =  HART_P1CP01:PI12037BT.MEAS 	size = (14901, 4)
tag =  HART_P1CP01:PI12046BT.MEAS 	size = (14901, 4)
tag =  HART_P1CP04:FI12002BT.MEAS 	size = (14899, 4)
tag =  HART_P1CP04:FY12847FB.MEAS 	size = (41969, 4)
tag =  HART_P1CP04:HY12017FB.MEAS 	size = (14907, 4)
tag =  HART_P1CP04:LY12001FB.MEAS 	size = (14907, 4)
tag =  HART_P1CP04:PI12003BT.MEAS 	size = (14907, 4)
tag =  HART

In [47]:
print('Number of distince tags available are ', len(data.keys()))

Number of distince tags available are  35


### Find the Minimum date for the items

In [48]:
# This is to get the base year found among all datasets
for tag, df in data.items():
    
    min_date = df['datetime'].min()
    max_date = df['datetime'].max()
    
    df_temp = df.loc[df['date_time'] == min_date]
    print('Tag = ', tag, '\tShape = ', df_temp.shape, '\t', min_date, '\t', max_date)

Tag =  12P01BCP4:PI12956.PNT 	Shape =  (1, 4) 	 1970-01-01 00:00:00 	 2018-11-02 14:20:40
Tag =  12P01BCP4:FC12847.OUT 	Shape =  (1, 4) 	 2017-01-01 00:18:40 	 2018-11-02 14:20:40
Tag =  HART_P1CP01:PI12026BT.MEAS 	Shape =  (1, 4) 	 2016-10-13 01:03:15 	 2018-11-02 13:08:35
Tag =  12P01BCP4:PI12955.PNT 	Shape =  (1, 4) 	 1970-01-01 00:00:00 	 2018-11-02 14:20:40
Tag =  HART_P1CP01:PI12037BT.MEAS 	Shape =  (1, 4) 	 2016-10-13 01:03:15 	 2018-11-02 13:08:35
Tag =  HART_P1CP01:PI12046BT.MEAS 	Shape =  (1, 4) 	 2016-10-13 01:03:15 	 2018-11-02 13:08:35
Tag =  HART_P1CP04:FI12002BT.MEAS 	Shape =  (1, 4) 	 2016-10-13 01:03:42 	 2018-11-02 13:08:35
Tag =  HART_P1CP04:FY12847FB.MEAS 	Shape =  (1, 4) 	 2016-10-28 01:06:07 	 2018-11-02 13:08:35
Tag =  HART_P1CP04:HY12017FB.MEAS 	Shape =  (1, 4) 	 2016-10-13 01:03:42 	 2018-11-02 13:08:40
Tag =  HART_P1CP04:LY12001FB.MEAS 	Shape =  (1, 4) 	 2016-10-13 01:03:42 	 2018-11-02 13:08:40
Tag =  HART_P1CP04:PI12003BT.MEAS 	Shape =  (1, 4) 	 2016-10-13 0

### Writing to Pickle Files

In [73]:
data['HART_P1CP04:PY12003AFB.MEAS'].head()

Unnamed: 0,0,feature,val,datetime
161529,@{PIPoint=HART_P1CP04:PY12003AFB.MEAS; Value=28.0625; Timestamp=11/2/2018 1:08:35 PM},HART_P1CP04:PY12003AFB.MEAS,28.0625,2018-11-02 13:08:35
161530,@{PIPoint=HART_P1CP04:PY12003AFB.MEAS; Value=28.04688; Timestamp=11/2/2018 1:08:15 PM},HART_P1CP04:PY12003AFB.MEAS,28.0469,2018-11-02 13:08:15
161531,@{PIPoint=HART_P1CP04:PY12003AFB.MEAS; Value=28.04688; Timestamp=11/2/2018 1:07:05 PM},HART_P1CP04:PY12003AFB.MEAS,28.0469,2018-11-02 13:07:05
161532,@{PIPoint=HART_P1CP04:PY12003AFB.MEAS; Value=28.04688; Timestamp=11/2/2018 1:05:55 PM},HART_P1CP04:PY12003AFB.MEAS,28.0469,2018-11-02 13:05:55
161533,@{PIPoint=HART_P1CP04:PY12003AFB.MEAS; Value=28.0625; Timestamp=11/2/2018 1:04:45 PM},HART_P1CP04:PY12003AFB.MEAS,28.0625,2018-11-02 13:04:45


In [71]:
fmt = '%Y-%m-%d %H:%M:%S'
base_date = datetime.strptime('2016-01-01 00:00:01', fmt)### Combining Dataframes from both years and Normalization

for tag, df in data.items():
    
    df = df[['feature', 'datetime', 'val']]
    f_name = tag.replace(':', '_')
    pkl_file = INT_DATA_DIR + '/' + f_name + '.pkl'
    print('Writing to file ', pkl_file)
    
    with open(pkl_file, 'wb') as f:
        pkl.dump(df, f, protocol=pkl.HIGHEST_PROTOCOL)

Writing to file  ../data/interim/P1201//12P01BCP4_PI12956.PNT.pkl
Writing to file  ../data/interim/P1201//12P01BCP4_FC12847.OUT.pkl
Writing to file  ../data/interim/P1201//HART_P1CP01_PI12026BT.MEAS.pkl
Writing to file  ../data/interim/P1201//12P01BCP4_PI12955.PNT.pkl
Writing to file  ../data/interim/P1201//HART_P1CP01_PI12037BT.MEAS.pkl
Writing to file  ../data/interim/P1201//HART_P1CP01_PI12046BT.MEAS.pkl
Writing to file  ../data/interim/P1201//HART_P1CP04_FI12002BT.MEAS.pkl
Writing to file  ../data/interim/P1201//HART_P1CP04_FY12847FB.MEAS.pkl
Writing to file  ../data/interim/P1201//HART_P1CP04_HY12017FB.MEAS.pkl
Writing to file  ../data/interim/P1201//HART_P1CP04_LY12001FB.MEAS.pkl
Writing to file  ../data/interim/P1201//HART_P1CP04_PI12003BT.MEAS.pkl
Writing to file  ../data/interim/P1201//HART_P1CP04_PX12003FB.MEAS.pkl
Writing to file  ../data/interim/P1201//HART_P1CP04_PY12003AFB.MEAS.pkl
Writing to file  ../data/interim/P1201//HART_P1CP04_PY12109FB.MEAS.pkl
Writing to file  ../

In [76]:
data['HART_U1CP03:PY24092FB.MEAS'].head()

Unnamed: 0,0,feature,val,datetime
1030531,@{PIPoint=HART_U1CP03:PY24092FB.MEAS; Value=78; Timestamp=11/2/2018 1:08:35 PM},HART_U1CP03:PY24092FB.MEAS,78.0,2018-11-02 13:08:35
1030532,@{PIPoint=HART_U1CP03:PY24092FB.MEAS; Value=85.125; Timestamp=11/2/2018 1:07:45 PM},HART_U1CP03:PY24092FB.MEAS,85.125,2018-11-02 13:07:45
1030533,@{PIPoint=HART_U1CP03:PY24092FB.MEAS; Value=88.0625; Timestamp=11/2/2018 1:06:35 PM},HART_U1CP03:PY24092FB.MEAS,88.0625,2018-11-02 13:06:35
1030534,@{PIPoint=HART_U1CP03:PY24092FB.MEAS; Value=86.4375; Timestamp=11/2/2018 1:05:35 PM},HART_U1CP03:PY24092FB.MEAS,86.4375,2018-11-02 13:05:35
1030535,@{PIPoint=HART_U1CP03:PY24092FB.MEAS; Value=87.9375; Timestamp=11/2/2018 1:04:25 PM},HART_U1CP03:PY24092FB.MEAS,87.9375,2018-11-02 13:04:25
