### Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import os
import ntpath
import pickle as pkl
import xlrd
import time
import string

from os import listdir
from os.path import isfile, join

### Notebook options

In [2]:
# Never truncate cell text when displaying dataframes.
# None is the supported "no limit" value; the old -1 sentinel is rejected by current pandas.
pd.set_option('display.max_colwidth', None)
# Default size of every matplotlib figure: width 15, height 10
plt.rcParams['figure.figsize'] = (15, 10)

### General Assumptions

In [3]:
# Names shared by the cells below
time_col = "datetime"
time_gran_col = "datetime_gran"    # integer time-step column used as the merge key
value_col = "val"                  # column holding the feature value
scaled_value_col = "scaled_val"    # column that receives the normalized value
time_granularity = "min"           # unit passed to the granularity helpers

In [4]:
# Reference date for the time-step counters and the start/stop dates
fmt = '%Y-%m-%d %H:%M:%S'
base_date, start_date, stop_date = (
    datetime.strptime(stamp, fmt)
    for stamp in ('2017-01-01 00:00:01', '2017-01-01 00:00:01', '2019-01-01 00:00:01')
)

### Data Directories

In [5]:
# Data locations, relative to the notebook; the trailing slash is relied on
# by the cells that build paths with string concatenation.
RAW_DATA_DIR = "../data/raw/P2253/"
PROC_DATA_DIR = "../data/processed/P2253/"
INT_DATA_DIR = "../data/interim/P2253/"

### Functions

In [6]:
def get_seconds_after(current_date, base_date):
    '''
    Whole seconds elapsed from base_date to current_date.

    The two datetimes are subtracted directly instead of being converted with
    time.mktime: mktime interprets naive datetimes in the machine's local
    timezone, so a span that crosses a daylight-saving change came out an hour
    off and the result depended on the host's timezone.

    current_date - datetime to measure
    base_date    - reference datetime
    Returns an int: 0 for identical inputs, negative when current_date is earlier.
    '''
    # Sub-second parts are dropped first, as the previous timetuple() conversion did
    elapsed = current_date.replace(microsecond=0) - base_date.replace(microsecond=0)

    return round(elapsed.total_seconds())

In [7]:
def get_minutes_after(current_date, base_date):
    '''
    1-based count of minutes from base_date to current_date.

    The elapsed time is rounded to the nearest minute and 1 is added, so
    base_date itself maps to step 1.

    The two datetimes are subtracted directly instead of being converted with
    time.mktime: mktime interprets naive datetimes in the machine's local
    timezone, so a span that crosses a daylight-saving change came out 60
    minutes off and the result depended on the host's timezone.

    current_date - datetime to measure
    base_date    - reference datetime
    Returns an int.
    '''
    # Sub-second parts are dropped first, as the previous timetuple() conversion did
    elapsed = current_date.replace(microsecond=0) - base_date.replace(microsecond=0)

    return round(elapsed.total_seconds() / 60.0) + 1

In [8]:
def get_hours_after(current_date, base_date):
    '''
    1-based count of hours from base_date to current_date.

    The elapsed time is rounded to the nearest hour and 1 is added, so
    base_date itself maps to step 1.

    The two datetimes are subtracted directly instead of being converted with
    time.mktime: mktime interprets naive datetimes in the machine's local
    timezone, so a span that crosses a daylight-saving change came out one
    hour off and the result depended on the host's timezone.

    current_date - datetime to measure
    base_date    - reference datetime
    Returns an int.
    '''
    # Sub-second parts are dropped first, as the previous timetuple() conversion did
    elapsed = current_date.replace(microsecond=0) - base_date.replace(microsecond=0)

    return round(elapsed.total_seconds() / 60.0 / 60.0) + 1

In [9]:
def get_days_after(current_date, base_date):
    '''
    1-based count of days from base_date to current_date.

    The elapsed time is rounded to the nearest day and 1 is added, so
    base_date itself maps to step 1.

    The two datetimes are subtracted directly instead of being converted with
    time.mktime: mktime interprets naive datetimes in the machine's local
    timezone, so the result depended on the host's timezone and on any
    daylight-saving change inside the span.

    current_date - datetime to measure
    base_date    - reference datetime
    Returns an int.
    '''
    # Sub-second parts are dropped first, as the previous timetuple() conversion did
    elapsed = current_date.replace(microsecond=0) - base_date.replace(microsecond=0)

    return round(elapsed.total_seconds() / 60.0 / 60.0 / 24) + 1

In [10]:
def get_months_after(current_date, base_date):
    '''
    1-based count of calendar months from base_date to current_date.
    Only the year and month fields are compared; the same month gives 1.
    '''
    year_gap = current_date.year - base_date.year
    month_gap = current_date.month - base_date.month
    return (year_gap * 12) + month_gap + 1

In [11]:
def get_years_after(current_date, base_date):
    '''
    1-based count of calendar years from base_date to current_date.
    Only the year fields are compared; the same year gives 1.
    '''
    return current_date.year - base_date.year + 1

In [12]:
def get_time_from_minutes(time_in_mins, base_date):
    '''
    Returns the datetime that lies time_in_mins minutes after base_date.
    '''
    offset = timedelta(minutes=time_in_mins)
    return base_date + offset

#### Debug

In [13]:
# Sanity check: span between the start and stop dates at every granularity
fmt = '%Y-%m-%d %H:%M:%S'
start_date = datetime.strptime('2017-01-01 00:00:01', fmt)
stop_date = datetime.strptime('2019-01-01 00:00:01', fmt)

td_min = get_minutes_after(stop_date, start_date)
td_hours = get_hours_after(stop_date, start_date)
td_days = get_days_after(stop_date, start_date)
td_mon = get_months_after(stop_date, start_date)
td_year = get_years_after(stop_date, start_date)

# One value per line, from the finest to the coarsest unit
print(td_min, td_hours, td_days, td_mon, td_year, sep='\n')

1051201
17521
731
25
3


### Reading Aggregated Data

In [14]:
def read_data(input_path, print_debug = False):
    '''
    Loads the feature files found at input_path into a dict keyed by feature name.

    input_path  - a directory (every .pkl / .csv file directly inside it is read)
                  or the path of a single file
    print_debug - when True, prints how many files were found and loaded

    Returns {feature: loaded object}, where feature is the file name without
    its extension.
    Raises FileNotFoundError when input_path is neither a directory nor a file.

    NOTE: .pkl files are unpickled, which can execute arbitrary code - only
    point this at files produced by this project.
    '''

    if os.path.isdir(input_path):
        input_dir = input_path
        input_files = [f for f in listdir(input_dir)
                       if isfile(join(input_dir, f)) and f.endswith(('.pkl', '.csv'))]
    elif os.path.isfile(input_path):
        # Single file: loop over a one-element list, not over the characters of the path
        input_dir, file_name = os.path.split(input_path)
        input_files = [file_name]
    else:
        raise FileNotFoundError('%s is neither a directory nor a file' % input_path)

    if print_debug:
        print('Number of files found in %s is %d ' % (input_path, len(input_files)))

    df_features = {}

    for input_file in input_files:

        feature, extension = os.path.splitext(input_file)
        full_path = join(input_dir, input_file)  # works with or without a trailing separator

        if extension == '.csv':
            # CSV files cannot be unpickled.
            # NOTE(review): the CSV layout is not visible here; default read_csv parsing is assumed - confirm.
            df_features[feature] = pd.read_csv(full_path)
        else:
            with open(full_path, 'rb') as f:
                df_features[feature] = pkl.load(f)

    if print_debug:
        print('Number of features extracted from %d files is %d ' % (len(input_files), len(df_features)))

    return df_features

In [15]:
# Load every aggregated feature file, printing the file and feature counts
input_dir = INT_DATA_DIR + 'aggregated/'
df_features = read_data(input_dir, print_debug=True)

Number of files found in ../data/interim/P2253/aggregated/ is 51 
Number of features extracted from 51 files is 51 


In [16]:
# A dict's length is its number of keys
print('Number of features = ', len(df_features))

Number of features =  51


In [17]:
def get_granulairty_function(granularity, current_date, base_date):
    '''
    Distance from base_date to current_date in the requested unit.

    granularity - 'sec', 'min', 'hr', 'day', 'mon' or 'yr';
                  any other value is treated as 'min'
    Returns whatever the matching get_*_after helper returns.
    '''
    converters = (
        ('sec', get_seconds_after),
        ('min', get_minutes_after),
        ('hr', get_hours_after),
        ('day', get_days_after),
        ('mon', get_months_after),
        ('yr', get_years_after),
    )

    for unit, convert in converters:
        if granularity == unit:
            return convert(current_date, base_date)

    # Unknown granularity: fall back to minutes
    return get_minutes_after(current_date, base_date)

### Generate Master Dataframe for time

In [18]:
from datetime import datetime
def generate_master_df(time_granularity,
                       time_gran_col,
                       base_date,
                       end_date,
                       print_debug = False):
    '''
    Builds the master time axis: a dataframe with a single integer column
    holding 1, 2, ..., N, where N is the distance from base_date to end_date
    in the chosen unit (as computed by get_granulairty_function).

    time_granularity - one of 'sec' (seconds), 'min' (minutes), 'hr' (hours),
                       'day' (days), 'mon' (months), 'yr' (years)
    time_gran_col    - name given to the integer column
    base_date        - reference date from which the time steps are counted
    end_date         - last date covered by the axis
    print_debug      - when True, prints the inputs and the resulting shape
    '''

    if print_debug:
        print('Granularity is', time_granularity, '\tStart Date = ', base_date, '\tEnd Date = ', end_date)

    last_step = get_granulairty_function(time_granularity, end_date, base_date)
    time_steps = list(range(1, last_step + 1))

    df_master = pd.DataFrame(columns=[time_gran_col])
    df_master[time_gran_col] = time_steps

    if print_debug:
        print('Shape of the master dataframe is ', df_master.shape, 'with columns ', df_master.columns.values)

    return df_master

In [19]:
time_granularity = 'min'
time_gran_col = 'datetime_gran'

fmt = '%Y-%m-%d %H:%M:%S'
# base_date and stop_date are reused from the earlier cells
# (2017-01-01 00:00:01 and 2019-01-01 00:00:01)

print(time_granularity, base_date, stop_date)

# One row per minute between the base date and the stop date
df_master = generate_master_df(
    time_granularity=time_granularity,
    time_gran_col=time_gran_col,
    base_date=base_date,
    end_date=stop_date,
)

min 2017-01-01 00:00:01 2019-01-01 00:00:01


In [20]:
# Row count of the master time axis
print('Size of the master df', df_master.shape[0])

Size of the master df 1051201


In [21]:
# Number of rows available for each feature before alignment
for feature in df_features:
    df = df_features[feature]
    print(feature, ' - Length: ', df.shape[0])

22P53CP4:LIE22F36.PNT  - Length:  415301
22P53CP4:PI22F31.PNT  - Length:  579932
22GTWY_E402:PIE22F32.PNT  - Length:  233274
UBNV05CPB:XI22F30Y.PNT  - Length:  583870
22P53CP4:FC22E22.MEAS  - Length:  556892
22P53CP4:PDI22F33.PNT  - Length:  415224
22PM53CPM:TI22F16A.PNT  - Length:  23389
22PM53CPM:TI22F17A.PNT  - Length:  23389
22GTWY_E403:PDALE22E24SP.PNT  - Length:  23192
UBNV05CPB:XI22F26X.PNT  - Length:  583422
22P53CP4:FC22E22.OUT  - Length:  556897
UBNV05CPB:XI22F26Y.PNT  - Length:  584072
22P53CP4:FC22E04.OUT  - Length:  352609
22GTWY_E403:FIE22E23.PNT  - Length:  329490
UBNV05CPB:XI22F30X.PNT  - Length:  583923
22P53CP4:TI22F14.PNT  - Length:  583934
TMP:HCU_P2253_Flow_Balance.Cal  - Length:  637308
22PM53CPM:TI22F16B.PNT  - Length:  23389
22PM53CPM:TI22F17B.PNT  - Length:  23391
22P53CP4:FXC22E22.MEAS  - Length:  556922
UBNV05CPB:ZI22F28.PNT  - Length:  584096
22P53CP4:TC22F38.SPT  - Length:  23820
UBNV05CPB:SI22F23.PNT  - Length:  227340
22P53CP4:TC22F38.MEAS  - Length:  579

In [22]:
# First five rows of one aggregated feature
df_features['22P53CP4:PI22F31.PNT'].head(n=5)

Unnamed: 0,datetime_gran,val
0,247460,104.6885
1,247461,104.7289
2,247462,104.7692
3,247463,104.7329
4,247464,104.7773


### Generate Master Features

In [23]:
# Left-join every feature onto the master time axis: each result has one row
# per master time step, with NaN in the feature's columns where the feature
# has no row for that step.
df_master_features = {}

for feature, df_feature in df_features.items():
    # merge() returns a new frame and leaves its inputs untouched, so no defensive copy is needed
    df_master_features[feature] = pd.merge(df_master, df_feature, how='left', on=time_gran_col)

### Interpolation - Linear

In [24]:
# Fill gaps with pandas' default (linear) interpolation and report the
# number of missing values before --> after for every feature.
for feature in df_master_features.keys():
    frame = df_master_features[feature]

    missing_before = frame[value_col].isna().sum()
    print(feature, 'Total=', len(frame),  ' NANs= ', missing_before, end=' --> ')

    # interpolate() returns a new frame; the stored one is replaced below
    frame = frame.interpolate()

    missing_after = frame[value_col].isna().sum()
    print(missing_after)

    df_master_features[feature] = frame

22P53CP4:LIE22F36.PNT Total= 1051201  NANs=  635900 --> 525601
22P53CP4:PI22F31.PNT Total= 1051201  NANs=  471269 --> 247459
22GTWY_E402:PIE22F32.PNT Total= 1051201  NANs=  817927 --> 247457
UBNV05CPB:XI22F30Y.PNT Total= 1051201  NANs=  467331 --> 247931
22P53CP4:FC22E22.MEAS Total= 1051201  NANs=  494309 --> 247459
22P53CP4:PDI22F33.PNT Total= 1051201  NANs=  635977 --> 525601
22PM53CPM:TI22F16A.PNT Total= 1051201  NANs=  1027812 --> 247470
22PM53CPM:TI22F17A.PNT Total= 1051201  NANs=  1027812 --> 247470
22GTWY_E403:PDALE22E24SP.PNT Total= 1051201  NANs=  1028009 --> 247470
UBNV05CPB:XI22F26X.PNT Total= 1051201  NANs=  467779 --> 247931
22P53CP4:FC22E22.OUT Total= 1051201  NANs=  494304 --> 247459
UBNV05CPB:XI22F26Y.PNT Total= 1051201  NANs=  467129 --> 247931
22P53CP4:FC22E04.OUT Total= 1051201  NANs=  698592 --> 247470
22GTWY_E403:FIE22E23.PNT Total= 1051201  NANs=  721711 --> 525601
UBNV05CPB:XI22F30X.PNT Total= 1051201  NANs=  467278 --> 247931
22P53CP4:TI22F14.PNT Total= 1051201 

In [25]:
# First five rows of one feature after interpolation
df_master_features['22P53CP4:TC22F38.MEAS'].head(n=5)

Unnamed: 0,datetime_gran,val
0,1,
1,2,
2,3,
3,4,
4,5,


In [26]:
# Drop the rows that are still missing a value after interpolation,
# printing the row count before --> after for every feature.
for feature in df_master_features:
    frame = df_master_features[feature]
    rows_before = len(frame)
    print(feature, '\tSize=', rows_before, end=' --> ')
    frame.dropna(subset=['val'], inplace=True)  # in place: the dict keeps the same frame object
    print(len(frame))

22P53CP4:LIE22F36.PNT 	Size= 1051201 --> 525600
22P53CP4:PI22F31.PNT 	Size= 1051201 --> 803742
22GTWY_E402:PIE22F32.PNT 	Size= 1051201 --> 803744
UBNV05CPB:XI22F30Y.PNT 	Size= 1051201 --> 803270
22P53CP4:FC22E22.MEAS 	Size= 1051201 --> 803742
22P53CP4:PDI22F33.PNT 	Size= 1051201 --> 525600
22PM53CPM:TI22F16A.PNT 	Size= 1051201 --> 803731
22PM53CPM:TI22F17A.PNT 	Size= 1051201 --> 803731
22GTWY_E403:PDALE22E24SP.PNT 	Size= 1051201 --> 803731
UBNV05CPB:XI22F26X.PNT 	Size= 1051201 --> 803270
22P53CP4:FC22E22.OUT 	Size= 1051201 --> 803742
UBNV05CPB:XI22F26Y.PNT 	Size= 1051201 --> 803270
22P53CP4:FC22E04.OUT 	Size= 1051201 --> 803731
22GTWY_E403:FIE22E23.PNT 	Size= 1051201 --> 525600
UBNV05CPB:XI22F30X.PNT 	Size= 1051201 --> 803270
22P53CP4:TI22F14.PNT 	Size= 1051201 --> 803742
TMP:HCU_P2253_Flow_Balance.Cal 	Size= 1051201 --> 803278
22PM53CPM:TI22F16B.PNT 	Size= 1051201 --> 803731
22PM53CPM:TI22F17B.PNT 	Size= 1051201 --> 803731
22P53CP4:FXC22E22.MEAS 	Size= 1051201 --> 803742
UBNV05CPB:ZI2

### Write imputed files

In [27]:
# Persist every imputed feature as <feature>.pkl under the imputed/ directory
int_dir = INT_DATA_DIR + 'imputed/'

# open(..., 'wb') does not create missing directories, so make sure the target exists
os.makedirs(int_dir, exist_ok=True)

for feature, df in df_master_features.items():
    op_file = os.path.join(int_dir, feature + '.pkl')
    with open(op_file, 'wb') as f:
        pkl.dump(df, f, protocol=pkl.HIGHEST_PROTOCOL)
    

### Read imputed files

In [28]:
# Reload the imputed features from disk, printing the file and feature counts
input_dir = INT_DATA_DIR + 'imputed/'
df_features_norm = read_data(input_dir, print_debug=True)

Number of files found in ../data/interim/P2253/imputed/ is 51 
Number of features extracted from 51 files is 51 


In [29]:
# Row count of every reloaded feature
for feature in df_features_norm:
    row_count = len(df_features_norm[feature])
    print(feature, '--', row_count)

22P53CP4:LIE22F36.PNT -- 525600
22P53CP4:PI22F31.PNT -- 803742
22GTWY_E402:PIE22F32.PNT -- 803744
UBNV05CPB:XI22F30Y.PNT -- 803270
22P53CP4:FC22E22.MEAS -- 803742
22P53CP4:PDI22F33.PNT -- 525600
22PM53CPM:TI22F16A.PNT -- 803731
22PM53CPM:TI22F17A.PNT -- 803731
22GTWY_E403:PDALE22E24SP.PNT -- 803731
UBNV05CPB:XI22F26X.PNT -- 803270
22P53CP4:FC22E22.OUT -- 803742
UBNV05CPB:XI22F26Y.PNT -- 803270
22P53CP4:FC22E04.OUT -- 803731
22GTWY_E403:FIE22E23.PNT -- 525600
UBNV05CPB:XI22F30X.PNT -- 803270
22P53CP4:TI22F14.PNT -- 803742
TMP:HCU_P2253_Flow_Balance.Cal -- 803278
22PM53CPM:TI22F16B.PNT -- 803731
22PM53CPM:TI22F17B.PNT -- 803731
22P53CP4:FXC22E22.MEAS -- 803742
UBNV05CPB:ZI22F28.PNT -- 803270
22P53CP4:TC22F38.SPT -- 803731
UBNV05CPB:SI22F23.PNT -- 803271
22P53CP4:TC22F38.MEAS -- 803742
22P53CP4:FXC22E22.SPT -- 803731
22PM53CPM:TI22F18B.PNT -- 803731
22P53CP4:TC22F38.OUT -- 803731
22GTWY_E402:PALE22F32SP.PNT -- 525588
UBNV05CPB:XI22F25Y.PNT -- 803270
22P53CP4:FQI22E22.OUT -- 803742
22P53CP

### Normalization

In [30]:
def scale_val(val, min_val, max_val):
    '''
    Min-max scales val against the range [min_val, max_val].
    1e-7 is added to the range so a constant column (max == min) does not divide by zero.
    Returns None when val is None.
    '''
    if val is None:
        return None

    span = max_val - min_val + 1e-7
    return (val - min_val) / span

In [31]:
scaled_value_col = 'scaled_val'
value_col = 'val'

# Min-max scale every feature into a new scaled_val column.
for feature, df in df_features_norm.items():

    # In place on purpose: df is the frame stored in df_features_norm
    df.dropna(inplace=True)

    min_val = df[value_col].min()
    max_val = df[value_col].max()

    # Whole-column arithmetic instead of a Python call per row; the formula is the
    # one in scale_val (1e-7 keeps a constant column from dividing by zero).
    df[scaled_value_col] = (df[value_col] - min_val) / (max_val - min_val + 1e-7)

    # feature -- row count -- number of values still missing
    print(feature, ' -- ', len(df), ' -- ', df[value_col].isna().sum())

22P53CP4:LIE22F36.PNT  --  525600  --  0
22P53CP4:PI22F31.PNT  --  803742  --  0
22GTWY_E402:PIE22F32.PNT  --  803744  --  0
UBNV05CPB:XI22F30Y.PNT  --  803270  --  0
22P53CP4:FC22E22.MEAS  --  803742  --  0
22P53CP4:PDI22F33.PNT  --  525600  --  0
22PM53CPM:TI22F16A.PNT  --  803731  --  0
22PM53CPM:TI22F17A.PNT  --  803731  --  0
22GTWY_E403:PDALE22E24SP.PNT  --  803731  --  0
UBNV05CPB:XI22F26X.PNT  --  803270  --  0
22P53CP4:FC22E22.OUT  --  803742  --  0
UBNV05CPB:XI22F26Y.PNT  --  803270  --  0
22P53CP4:FC22E04.OUT  --  803731  --  0
22GTWY_E403:FIE22E23.PNT  --  525600  --  0
UBNV05CPB:XI22F30X.PNT  --  803270  --  0
22P53CP4:TI22F14.PNT  --  803742  --  0
TMP:HCU_P2253_Flow_Balance.Cal  --  803278  --  0
22PM53CPM:TI22F16B.PNT  --  803731  --  0
22PM53CPM:TI22F17B.PNT  --  803731  --  0
22P53CP4:FXC22E22.MEAS  --  803742  --  0
UBNV05CPB:ZI22F28.PNT  --  803270  --  0
22P53CP4:TC22F38.SPT  --  803731  --  0
UBNV05CPB:SI22F23.PNT  --  803271  --  0
22P53CP4:TC22F38.MEAS  --  8037

### Write normalized files

In [32]:
# Persist every normalized feature as <feature>.pkl under the normalized/ directory
int_dir = INT_DATA_DIR + 'normalized/'

# open(..., 'wb') does not create missing directories, so make sure the target exists
os.makedirs(int_dir, exist_ok=True)

for feature, df in df_features_norm.items():

    pkl_file = os.path.join(int_dir, feature + '.pkl')
    with open(pkl_file, 'wb') as f:
        pkl.dump(df, f, protocol=pkl.HIGHEST_PROTOCOL)

In [33]:
# First five rows of a normalized feature (raw and scaled values side by side)
df_features_norm['UBNV05CPB:XI22F25X.PNT'].head(n=5)

Unnamed: 0,datetime_gran,val,scaled_val
247931,247932,30.86054,0.709421
247932,247933,29.21335,0.670829
247933,247934,29.50403,0.677639
247934,247935,30.37607,0.69807
247935,247936,32.79841,0.754824


In [34]:
# First five rows of a second normalized feature
df_features_norm['UBNV05CPB:ZI22F28.PNT'].head(n=5)

Unnamed: 0,datetime_gran,val,scaled_val
247931,247932,-0.01158,0.842038
247932,247933,-0.010223,0.845991
247933,247934,-0.008866,0.849943
247934,247935,-0.009545,0.847967
247935,247936,-0.008576,0.85079


### Useful features

In [35]:
# Features kept for later use. The following were removed:
#   a) every .OUT tag
#   b) every .CIN tag
#   c) every .SPT tag
#   d) .PNT tags whose graph is a flat line
usable_features = [
    'UBNV05CPB:XI22F25Y.PNT',
    'UBNV05CPB:XI22F25X.PNT',
    '22P53CP4:PI22F31.PNT',
    '22P53CP4:FX22E22SPF.MEAS',
    'UBNV05CPB:XI22F29X.PNT',
    'UBNV05CPB:ZI22F28.PNT',
    'UBNV05CPB:XI22F29Y.PNT',
    '22P53CP4:FXC22E22.MEAS',
    '22P53CP4:TI22F14.PNT',
    '22P53CP4:FC22E04.MEAS',
    '22P53CP4:PI22E05.PNT',
    'UBNV05CPB:XI22F30Y.PNT',
    '22GTWY_E402:PIE22F32.PNT',
    '22P53CP4:TI22F11.PNT',
    '22P53CP4:FC22E22.MEAS',
    'UBNV05CPB:XI22F26X.PNT',
    'UBNV05CPB:XI22F26Y.PNT',
    'UBNV05CPB:XI22F30X.PNT',
    '22GTWY_E403:FIE22E23.PNT',
    'UBNV05CPB:ZI22F27.PNT',
    '22P53CP4:TI22F12.PNT',
    '22P53CP4:TI22F13.PNT',
    '22P53CP4:TC22F38.MEAS',
    '22PM53CPM:II22E47.PNT',
    '22GTWY_E403:PDIE22E24.PNT',
]

feature_count = len(usable_features)
print('Number of usable features = ', feature_count)

Number of usable features =  25
