# P2253 - Data Wrangling based on the tags available in Data string

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import os
import ntpath
import pickle as pkl
import xlrd
import time
import string
from os.path import basename

In [2]:
pd.set_option('display.max_colwidth', -1)

In [3]:
RAW_DATA_DIR = '../data/raw/P2253/'
PROC_DATA_DIR = '../data/processed/P2253/'
INT_DATA_DIR = '../data/interim/P2253/'

In [4]:
def remove_punctuation(x):
    table = str.maketrans({key: None for key in string.punctuation})
    return x.translate(table)

### Functions

In [5]:
def remove_punctuation(x):
    table = str.maketrans({key: None for key in string.punctuation})
    return x.translate(table)

In [6]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

In [7]:
def get_time(dt_str):
    dt_str = dt_str.strip()
    dtobj = datetime.strptime(dt_str, '%m/%d/%Y %I:%M:%S %p')
    return dtobj

In [8]:
def parse(txt):
    '''
    @{PIPoint=SCTM:22GTWY_E403:FALE22E23SP.PNT; Value=60; Timestamp=12/30/2017 11:48:05 PM}
    '''
    pi_point, val, time  = None, None, None
    delimiter = ';'
    sub_delimiter = '='
    
    txt = txt[txt.find('{')+1:txt.find('}')]    
    parsed_vals = txt.split(';')
    
    if len(parsed_vals) >= 3:
        pi_point = parsed_vals[0].split(sub_delimiter)[1]
    
        if pi_point is not None:
            values = parsed_vals[1].split(sub_delimiter)
            if len(values) >= 2:
                val = values[1]
                if is_number(val):
                    val = float(val)
                else:
                    val = None
            else:
                val = None

            time_vals = parsed_vals[2].split(sub_delimiter)
            if len(time_vals) >= 2:
                time = time_vals[1]
                time = get_time(time)
            else:
                return None, None, None

    if pi_point is not None:
        pi_point = pi_point.replace('SCTM:', '')
    
    return pi_point, val, time    

In [17]:
def longestSubstringFinder(string1, string2):
    '''
    Code from https://stackoverflow.com/questions/18715688/find-common-substring-between-two-strings    
    '''
    answer = ""
    len1, len2 = len(string1), len(string2)
    for i in range(len1):
        match = ""
        for j in range(len2):
            if (i + j < len1 and string1[i + j] == string2[j]):
                match += string2[j]
            else:
                if (len(match) > len(answer)): answer = match
                match = ""
    return answer

### Reading of Data - Using tags to uniquely identify

In [18]:
data = {} # Contains mapping between tags and data
tag_files = {} # Contains mapping between tags and the file names
files_tag = {} # Contains the mapping between file names and tags

dir_files = os.listdir(RAW_DATA_DIR)

print('No of files in %s is %d' % (RAW_DATA_DIR, len(dir_files)))

for current_file in dir_files:

    current_base = current_file
    current_file = RAW_DATA_DIR + current_file

    if current_base.endswith('.csv'):
        print('Processing ', current_file, '\t', end='')
        df = pd.read_csv(current_file, header=None)

    df['feature'], df['val'], df['datetime'] = zip(*df[0].map(parse))

    df = df[df['feature'] != None]
    df = df[df['datetime'].notnull()]

    tags = np.unique(df['feature'])

    
    print('Tags Found = ', tags)

    for tag in tags:
        df_tag = df.loc[df['feature'] == tag]

        print('Intersection = ', longestSubstringFinder(tag, current_base))

        if tag not in data.keys():
            data[tag] = pd.DataFrame(columns=df_tag.columns.values)
        data[tag] = data[tag].append(df_tag)

No of files in ../data/raw/P2253/ is 96
Processing  ../data/raw/P2253/IX22E472018.csv 	Tags Found =  ['22PM53CPM:IX22E47.PNT']
Intersection =  IX22E47
Processing  ../data/raw/P2253/XI22F26Y2017.csv 	Tags Found =  ['UBNV05CPB:XI22F26Y.PNT']
Intersection =  XI22F26Y
Processing  ../data/raw/P2253/22E24PNT2017.csv 	Tags Found =  ['22GTWY_E403:PDIE22E24.PNT']
Intersection =  22E24
Processing  ../data/raw/P2253/E23SP2018.csv 	Tags Found =  ['22GTWY_E402:PALE22F32SP.PNT']
Intersection =  E2
Processing  ../data/raw/P2253/FQI22E222018.csv 	Tags Found =  ['22P53CP4:FQI22E22.OUT']
Intersection =  FQI22E22
Processing  ../data/raw/P2253/XI22F30X2017.csv 	Tags Found =  ['UBNV05CPB:XI22F30X.PNT']
Intersection =  XI22F30X
Processing  ../data/raw/P2253/PI22F312018.csv 	Tags Found =  ['22P53CP4:PI22F31.PNT']
Intersection =  PI22F31
Processing  ../data/raw/P2253/XI22F29X2018.csv 	Tags Found =  ['UBNV05CPB:XI22F29X.PNT']
Intersection =  XI22F29X
Processing  ../data/raw/P2253/TI22F15A2018.csv 	Tags Found =

In [19]:
print('Number of distince tags available are ', len(data.keys()))

Number of distince tags available are  54


### Find the Minimum date for the items

In [20]:
# This is to get the base year found among all datasets
for tag, df in data.items():
    
    min_date = df['datetime'].min()
    max_date = df['datetime'].max()
    
    df_temp = df.loc[df['datetime'] == min_date]
    print('Tag = ', tag, '\tShape = ', df_temp.shape, '\t', min_date, '\t', max_date)

Tag =  22PM53CPM:IX22E47.PNT 	Shape =  (1, 4) 	 2018-01-01 00:00:29 	 2018-09-21 08:13:30
Tag =  UBNV05CPB:XI22F26Y.PNT 	Shape =  (1, 4) 	 2017-06-22 05:10:45 	 2018-09-21 12:49:15
Tag =  22GTWY_E403:PDIE22E24.PNT 	Shape =  (1, 4) 	 2017-06-21 21:16:40 	 2018-09-18 11:21:10
Tag =  22GTWY_E402:PALE22F32SP.PNT 	Shape =  (3, 4) 	 2018-01-01 00:12:45 	 2018-09-18 11:12:14
Tag =  22P53CP4:FQI22E22.OUT 	Shape =  (1, 4) 	 2017-06-21 21:18:50 	 2018-09-18 12:03:15
Tag =  UBNV05CPB:XI22F30X.PNT 	Shape =  (1, 4) 	 2017-06-22 05:10:45 	 2018-09-21 13:14:15
Tag =  22P53CP4:PI22F31.PNT 	Shape =  (1, 4) 	 2017-06-21 21:18:50 	 2018-09-18 13:19:10
Tag =  UBNV05CPB:XI22F29X.PNT 	Shape =  (1, 4) 	 2017-06-22 05:10:45 	 2018-09-21 12:57:15
Tag =  22PM53CPM:TI22F15A.PNT 	Shape =  (2, 4) 	 2017-06-21 21:29:35 	 2018-09-21 08:14:25
Tag =  22P53CP4:TI22F12.PNT 	Shape =  (1, 4) 	 2017-06-21 21:19:15 	 2018-09-20 14:59:14
Tag =  TMP:HCU_P2253_Flow_Balance.Cal 	Shape =  (1, 4) 	 2017-06-22 05:03:00 	 2018-09-2

### Writing to Pickle Files

In [25]:
for tag, df in data.items():
    
    df = df[['feature', 'datetime', 'val']]
    f_name = tag.replace(':', '_')
    pkl_file = INT_DATA_DIR + '/' + f_name + '.pkl'
    print('Writing to file ', pkl_file)
    
    with open(pkl_file, 'wb') as f:
        pkl.dump(df, f, protocol=pkl.HIGHEST_PROTOCOL)

Writing to file  ../data/interim/P2253//22PM53CPM_IX22E47.PNT.pkl
Writing to file  ../data/interim/P2253//UBNV05CPB_XI22F26Y.PNT.pkl
Writing to file  ../data/interim/P2253//22GTWY_E403_PDIE22E24.PNT.pkl
Writing to file  ../data/interim/P2253//22GTWY_E402_PALE22F32SP.PNT.pkl
Writing to file  ../data/interim/P2253//22P53CP4_FQI22E22.OUT.pkl
Writing to file  ../data/interim/P2253//UBNV05CPB_XI22F30X.PNT.pkl
Writing to file  ../data/interim/P2253//22P53CP4_PI22F31.PNT.pkl
Writing to file  ../data/interim/P2253//UBNV05CPB_XI22F29X.PNT.pkl
Writing to file  ../data/interim/P2253//22PM53CPM_TI22F15A.PNT.pkl
Writing to file  ../data/interim/P2253//22P53CP4_TI22F12.PNT.pkl
Writing to file  ../data/interim/P2253//TMP_HCU_P2253_Flow_Balance.Cal.pkl
Writing to file  ../data/interim/P2253//22P53CP4_TC22F38.SPT.pkl
Writing to file  ../data/interim/P2253//22P53CP4_FXC22E22.MEAS.pkl
Writing to file  ../data/interim/P2253//22P53CP4_FC22E04.MEAS.pkl
Writing to file  ../data/interim/P2253//UBNV05CPB_ZI22F