# Data Imputation - P6302B

### Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import os
import ntpath
import pickle as pkl
import xlrd
import time
import string

from os import listdir
from os.path import isfile, join

### Notebook options

In [2]:
# Show full cell contents. None is the supported "no limit" value; the former
# sentinel -1 is deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)
# Default figure size (width, height in inches) for every plot in this notebook.
plt.rc('figure', figsize=[15, 10])

### General Assumptions

In [3]:
# Column names used by the per-feature dataframes in this notebook.
time_col, time_gran_col = 'datetime', 'datetime_gran'
value_col, scaled_value_col = 'val', 'scaled_val'

# Granularity passed to generate_master_df when the master time frame is built.
time_granularity = 'min'

In [4]:
# Timestamp layout used to parse the date-window literals below.
fmt = '%Y-%m-%d %H:%M:%S'

# base_date and start_date are parsed from the same literal; stop_date closes the window.
base_date, start_date, stop_date = (
    datetime.strptime(stamp, fmt)
    for stamp in ('2014-01-01 00:00:01',
                  '2014-01-01 00:00:01',
                  '2019-01-01 00:00:01')
)

### Data Directories

In [5]:
# Tag of the pump this notebook processes; every data directory is derived from it.
pump = 'P6302B'

RAW_DATA_DIR = '../data/raw/{}/'.format(pump)
PROC_DATA_DIR = '../data/processed/{}/'.format(pump)
INT_DATA_DIR = '../data/interim/{}/'.format(pump)

### Read the aggregated data

In [6]:
from utils import read_data

# Load the aggregated per-feature files of this pump; the result is keyed by
# feature (tag) name.
# NOTE(review): the second positional argument looks like a verbosity flag
# (the recorded output reports file/feature counts) - confirm against utils.
input_dir = '{}aggregated/'.format(INT_DATA_DIR)
df_features = read_data(input_dir, True)

Number of files found in ../data/interim/P6302B/aggregated/ is 45 
Number of features extracted from 45 files is 45 


In [7]:
# One entry per feature, so the number of keys is the number of features.
print('Number of features = ', len(df_features.keys()))

Number of features =  45


In [8]:
# Display the feature (tag) names that were loaded.
[*df_features.keys()]

['TT61B05.PV',
 'PT63105.PV',
 'TC63109E.AV',
 'PT61B00.PV',
 'PC63112E.AV',
 'T6153.PV',
 'PIE61B23.PV',
 'TT61B03.PV',
 'LT63114.PV',
 '05GTWY_BN06:XT61B17.PNT',
 'T6151.PV',
 'TT61B01.PV',
 '05GTWY_BN06:XT61B16.PNT',
 '05GTWY_BN06:XT61B20.PNT',
 'PT63103.PV',
 'PA63110.PV',
 'PDA61B21.PV',
 '05GTWY_BN06:XT61B13.PNT',
 'PIE61B00.PV',
 'TT61B04.PV',
 'F61221VP',
 '05GTWY_BN06:XT61B12.PNT',
 'F61221',
 'TT61B06.PV',
 'PC61A98E.AV',
 'TT61B02.PV',
 '05GTWY_BN06:XT61B10.PNT',
 'PC61A98.AV',
 'PIE61B22.PV',
 'T6152.PV',
 'FT61A99.PV',
 '05GTWY_BN06:ZT61B14.PNT',
 'TC63109.AV',
 '05GTWY_BN06:XT61B19.PNT',
 'PT63112.PV',
 '05GTWY_BN06:XT61B18.PNT',
 'TT63109.PV',
 'FIE61A99.PV',
 'PIE63113.PV',
 '05GTWY_BN06:ZT61B15.PNT',
 'PC63112.AV',
 '05GTWY_BN06:XT61B11.PNT',
 'PT61A98.PV',
 'PIE61608.PV',
 'T6150.PV']

### Generate Master Dataframe for time

In [9]:
from utils import generate_master_df

# time_granularity, time_gran_col, base_date and stop_date all come from the
# "General Assumptions" cells above. They are intentionally not re-declared
# here, so the configuration has a single source of truth and cannot drift
# between cells.
print(time_granularity, base_date, stop_date)

# Build the master time frame for the configured window; it is the left side
# of the merge on time_gran_col in the "Generate Master Features" cell.
# NOTE(review): presumably one row per time_granularity step between
# base_date and stop_date - confirm against utils.generate_master_df.
df_master = generate_master_df(time_granularity=time_granularity,
                               time_gran_col=time_gran_col,
                               base_date=base_date,
                               end_date=stop_date)

min 2014-01-01 00:00:01 2019-01-01 00:00:01


In [10]:
# Row count of the master time frame.
print('Size of the master df', len(df_master.index))

Size of the master df 2629441


In [11]:
# Row count of every feature before it is aligned onto the master time frame.
for feature in df_features.keys():
    print(feature, ' - Length: ', len(df_features[feature]))

TT61B05.PV  - Length:  1454769
PT63105.PV  - Length:  539886
TC63109E.AV  - Length:  339
PT61B00.PV  - Length:  896784
PC63112E.AV  - Length:  319592
T6153.PV  - Length:  834740
PIE61B23.PV  - Length:  64965
TT61B03.PV  - Length:  1384539
LT63114.PV  - Length:  1505140
05GTWY_BN06:XT61B17.PNT  - Length:  2137985
T6151.PV  - Length:  721069
TT61B01.PV  - Length:  1363987
05GTWY_BN06:XT61B16.PNT  - Length:  1534695
05GTWY_BN06:XT61B20.PNT  - Length:  2086795
PT63103.PV  - Length:  1207584
PA63110.PV  - Length:  317423
PDA61B21.PV  - Length:  317418
05GTWY_BN06:XT61B13.PNT  - Length:  2117248
PIE61B00.PV  - Length:  156957
TT61B04.PV  - Length:  1330282
F61221VP  - Length:  1248236
05GTWY_BN06:XT61B12.PNT  - Length:  2098925
F61221  - Length:  1163319
TT61B06.PV  - Length:  1326600
PC61A98E.AV  - Length:  319870
TT61B02.PV  - Length:  1404932
05GTWY_BN06:XT61B10.PNT  - Length:  2200962
PC61A98.AV  - Length:  1613066
PIE61B22.PV  - Length:  60417
T6152.PV  - Length:  751592
FT61A99.PV  - L

### Generate Master Features

In [12]:
# Align every feature onto the master time frame. The left merge keeps every
# row of df_master; rows without a matching sample in the feature get NaN in
# the feature's columns, which the interpolation step fills afterwards.
# pd.merge always returns a new frame, so no defensive copy of the (large)
# input frames is needed.
df_master_features = {}

for feature, df_feature in df_features.items():
    df_master_features[feature] = pd.merge(df_master, df_feature,
                                           how='left', on=time_gran_col)

### Interpolation - Linear

In [13]:
# Emit a markdown table: per feature, the row count and the number of missing
# values before and after linear interpolation.
print('| Tag | Total | NANs before Interpolation | NANs after interpolation |')
print('| -- | -- | -- | -- |')

for feature in df_master_features.keys():
    # Work on a copy so the frame produced by the merge step is not mutated.
    df = df_master_features[feature].copy()

    nans_before = int(df[value_col].isna().sum())
    print('|', feature, '|', len(df), '|', nans_before, end='|')

    # Interpolate only the value column (default method: linear). Running
    # interpolate on the whole frame would also touch the time key column,
    # which is not numeric data and is not meant to be interpolated.
    # Leading NaNs have no earlier sample to interpolate from and remain;
    # the next cell drops them.
    df[value_col] = df[value_col].interpolate()

    nans_after = int(df[value_col].isna().sum())
    print(nans_after, '|')

    # Re-binding an existing key does not change the dict size, so this is
    # safe while iterating over its keys.
    df_master_features[feature] = df

TT61B05.PV Total= 2629441  NANs=  1174672 --> 1
PT63105.PV Total= 2629441  NANs=  2089555 --> 29677
TC63109E.AV Total= 2629441  NANs=  2629102 --> 29678
PT61B00.PV Total= 2629441  NANs=  1732657 --> 0
PC63112E.AV Total= 2629441  NANs=  2309849 --> 26
T6153.PV Total= 2629441  NANs=  1794701 --> 1
PIE61B23.PV Total= 2629441  NANs=  2564476 --> 2224943
TT61B03.PV Total= 2629441  NANs=  1244902 --> 1
LT63114.PV Total= 2629441  NANs=  1124301 --> 0
05GTWY_BN06:XT61B17.PNT Total= 2629441  NANs=  491456 --> 0
T6151.PV Total= 2629441  NANs=  1908372 --> 1
TT61B01.PV Total= 2629441  NANs=  1265454 --> 2
05GTWY_BN06:XT61B16.PNT Total= 2629441  NANs=  1094746 --> 0
05GTWY_BN06:XT61B20.PNT Total= 2629441  NANs=  542646 --> 0
PT63103.PV Total= 2629441  NANs=  1421857 --> 0
PA63110.PV Total= 2629441  NANs=  2312018 --> 26
PDA61B21.PV Total= 2629441  NANs=  2312023 --> 26
05GTWY_BN06:XT61B13.PNT Total= 2629441  NANs=  512193 --> 0
PIE61B00.PV Total= 2629441  NANs=  2472484 --> 2224943
TT61B04.PV Tota

In [14]:
# Drop the rows whose value is still NaN after interpolation and report the
# row count before and after for every feature. The column is addressed via
# value_col, like everywhere else in the notebook, instead of a hard-coded name.
for feature in list(df_master_features):
    df = df_master_features[feature]
    print(feature, '\tSize=', len(df), end=' --> ')

    # Store a new frame instead of mutating the existing one in place.
    df = df.dropna(subset=[value_col])
    df_master_features[feature] = df

    print(len(df))

TT61B05.PV 	Size= 2629441 --> 2629440
PT63105.PV 	Size= 2629441 --> 2599764
TC63109E.AV 	Size= 2629441 --> 2599763
PT61B00.PV 	Size= 2629441 --> 2629441
PC63112E.AV 	Size= 2629441 --> 2629415
T6153.PV 	Size= 2629441 --> 2629440
PIE61B23.PV 	Size= 2629441 --> 404498
TT61B03.PV 	Size= 2629441 --> 2629440
LT63114.PV 	Size= 2629441 --> 2629441
05GTWY_BN06:XT61B17.PNT 	Size= 2629441 --> 2629441
T6151.PV 	Size= 2629441 --> 2629440
TT61B01.PV 	Size= 2629441 --> 2629439
05GTWY_BN06:XT61B16.PNT 	Size= 2629441 --> 2629441
05GTWY_BN06:XT61B20.PNT 	Size= 2629441 --> 2629441
PT63103.PV 	Size= 2629441 --> 2629441
PA63110.PV 	Size= 2629441 --> 2629415
PDA61B21.PV 	Size= 2629441 --> 2629415
05GTWY_BN06:XT61B13.PNT 	Size= 2629441 --> 2629441
PIE61B00.PV 	Size= 2629441 --> 404498
TT61B04.PV 	Size= 2629441 --> 2629440
F61221VP 	Size= 2629441 --> 2629441
05GTWY_BN06:XT61B12.PNT 	Size= 2629441 --> 2629441
F61221 	Size= 2629441 --> 2629441
TT61B06.PV 	Size= 2629441 --> 2629439
PC61A98E.AV 	Size= 2629441 -->

### Write imputed files

In [15]:
from utils import write_feature_dict

# Persist the imputed per-feature frames under the interim "imputed" directory
# (the recorded output shows one .pkl file per feature).
int_dir = '{}imputed/'.format(INT_DATA_DIR)
write_feature_dict(int_dir, df_master_features)

Writing to file  ../data/interim/P6302B/imputed/TT61B05.PV.pkl (2629440, 2) [ TT61B05.PV ]
Writing to file  ../data/interim/P6302B/imputed/PT63105.PV.pkl (2599764, 2) [ PT63105.PV ]
Writing to file  ../data/interim/P6302B/imputed/TC63109E.AV.pkl (2599763, 2) [ TC63109E.AV ]
Writing to file  ../data/interim/P6302B/imputed/PT61B00.PV.pkl (2629441, 2) [ PT61B00.PV ]
Writing to file  ../data/interim/P6302B/imputed/PC63112E.AV.pkl (2629415, 2) [ PC63112E.AV ]
Writing to file  ../data/interim/P6302B/imputed/T6153.PV.pkl (2629440, 2) [ T6153.PV ]
Writing to file  ../data/interim/P6302B/imputed/PIE61B23.PV.pkl (404498, 2) [ PIE61B23.PV ]
Writing to file  ../data/interim/P6302B/imputed/TT61B03.PV.pkl (2629440, 2) [ TT61B03.PV ]
Writing to file  ../data/interim/P6302B/imputed/LT63114.PV.pkl (2629441, 2) [ LT63114.PV ]
Writing to file  ../data/interim/P6302B/imputed/05GTWY_BN06-XT61B17.PNT.pkl (2629441, 2) [ 05GTWY_BN06:XT61B17.PNT ]
Writing to file  ../data/interim/P6302B/imputed/T6151.PV.pkl (2

### Read imputed files

In [16]:
# Reload the imputed files that were just written; this doubles as a check of
# read_data and provides the frames that the normalization step works on.
input_dir = '{}imputed/'.format(INT_DATA_DIR)
df_features_norm = read_data(input_dir, True)

Number of files found in ../data/interim/P6302B/imputed/ is 45 
Number of features extracted from 45 files is 45 


In [17]:
# Row count of every reloaded (imputed) feature.
for feature, df_imputed in df_features_norm.items():
    print(feature, '--', len(df_imputed))

TT61B05.PV -- 2629440
PT63105.PV -- 2599764
TC63109E.AV -- 2599763
PT61B00.PV -- 2629441
PC63112E.AV -- 2629415
T6153.PV -- 2629440
PIE61B23.PV -- 404498
TT61B03.PV -- 2629440
LT63114.PV -- 2629441
05GTWY_BN06:XT61B17.PNT -- 2629441
T6151.PV -- 2629440
TT61B01.PV -- 2629439
05GTWY_BN06:XT61B16.PNT -- 2629441
05GTWY_BN06:XT61B20.PNT -- 2629441
PT63103.PV -- 2629441
PA63110.PV -- 2629415
PDA61B21.PV -- 2629415
05GTWY_BN06:XT61B13.PNT -- 2629441
PIE61B00.PV -- 404498
TT61B04.PV -- 2629440
F61221VP -- 2629441
05GTWY_BN06:XT61B12.PNT -- 2629441
F61221 -- 2629441
TT61B06.PV -- 2629439
PC61A98E.AV -- 2629415
TT61B02.PV -- 2629440
05GTWY_BN06:XT61B10.PNT -- 2629441
PC61A98.AV -- 2629441
PIE61B22.PV -- 404498
T6152.PV -- 2629381
FT61A99.PV -- 2629441
05GTWY_BN06:ZT61B14.PNT -- 2103841
TC63109.AV -- 2629440
05GTWY_BN06:XT61B19.PNT -- 2629441
PT63112.PV -- 2629441
05GTWY_BN06:XT61B18.PNT -- 2629441
TT63109.PV -- 2629440
FIE61A99.PV -- 404498
PIE63113.PV -- 404498
05GTWY_BN06:ZT61B15.PNT -- 210384

### Normalization

In [18]:
from utils import scale_val

# Same column names as declared in the "General Assumptions" cell.
scaled_value_col, value_col = 'scaled_val', 'val'

for feature in df_features_norm.keys():
    df = df_features_norm[feature]

    # Remove incomplete rows in place so the frame held by the dict is the
    # one that receives the scaled column below.
    df.dropna(inplace=True)

    # Per-feature bounds handed to scale_val for every value.
    min_val = df[value_col].min()
    max_val = df[value_col].max()

    # NOTE(review): scale_val is defined in utils; presumably min-max scaling.
    # Confirm how it behaves when max_val == min_val (constant feature).
    df[scaled_value_col] = df[value_col].apply(
        lambda raw: scale_val(raw, min_val, max_val)
    )

    # Feature name, remaining rows, and NaN count of the raw value column.
    remaining_nans = df[value_col].isna().sum()
    print(feature, ' -- ', len(df), ' -- ', remaining_nans)

TT61B05.PV  --  2629440  --  0
PT63105.PV  --  2599764  --  0
TC63109E.AV  --  2599763  --  0
PT61B00.PV  --  2629441  --  0
PC63112E.AV  --  2629415  --  0
T6153.PV  --  2629440  --  0
PIE61B23.PV  --  404498  --  0
TT61B03.PV  --  2629440  --  0
LT63114.PV  --  2629441  --  0
05GTWY_BN06:XT61B17.PNT  --  2629441  --  0
T6151.PV  --  2629440  --  0
TT61B01.PV  --  2629439  --  0
05GTWY_BN06:XT61B16.PNT  --  2629441  --  0
05GTWY_BN06:XT61B20.PNT  --  2629441  --  0
PT63103.PV  --  2629441  --  0
PA63110.PV  --  2629415  --  0
PDA61B21.PV  --  2629415  --  0
05GTWY_BN06:XT61B13.PNT  --  2629441  --  0
PIE61B00.PV  --  404498  --  0
TT61B04.PV  --  2629440  --  0
F61221VP  --  2629441  --  0
05GTWY_BN06:XT61B12.PNT  --  2629441  --  0
F61221  --  2629441  --  0
TT61B06.PV  --  2629439  --  0
PC61A98E.AV  --  2629415  --  0
TT61B02.PV  --  2629440  --  0
05GTWY_BN06:XT61B10.PNT  --  2629441  --  0
PC61A98.AV  --  2629441  --  0
PIE61B22.PV  --  404498  --  0
T6152.PV  --  2629381  --  0


### Write normalized files

In [19]:
# Persist the frames, now carrying the scaled column, under the interim
# "normalized" directory.
int_dir = '{}normalized/'.format(INT_DATA_DIR)
write_feature_dict(int_dir, df_features_norm)

Writing to file  ../data/interim/P6302B/normalized/TT61B05.PV.pkl (2629440, 3) [ TT61B05.PV ]
Writing to file  ../data/interim/P6302B/normalized/PT63105.PV.pkl (2599764, 3) [ PT63105.PV ]
Writing to file  ../data/interim/P6302B/normalized/TC63109E.AV.pkl (2599763, 3) [ TC63109E.AV ]
Writing to file  ../data/interim/P6302B/normalized/PT61B00.PV.pkl (2629441, 3) [ PT61B00.PV ]
Writing to file  ../data/interim/P6302B/normalized/PC63112E.AV.pkl (2629415, 3) [ PC63112E.AV ]
Writing to file  ../data/interim/P6302B/normalized/T6153.PV.pkl (2629440, 3) [ T6153.PV ]
Writing to file  ../data/interim/P6302B/normalized/PIE61B23.PV.pkl (404498, 3) [ PIE61B23.PV ]
Writing to file  ../data/interim/P6302B/normalized/TT61B03.PV.pkl (2629440, 3) [ TT61B03.PV ]
Writing to file  ../data/interim/P6302B/normalized/LT63114.PV.pkl (2629441, 3) [ LT63114.PV ]
Writing to file  ../data/interim/P6302B/normalized/05GTWY_BN06-XT61B17.PNT.pkl (2629441, 3) [ 05GTWY_BN06:XT61B17.PNT ]
Writing to file  ../data/interim/