# Data Aggregation and Dimensionality Reduction

### Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import os
import ntpath
import pickle as pkl
import xlrd
import time
import string
import math

from os import listdir
from os.path import isfile, join

### Notebook options

In [2]:
# Never truncate long cell contents when displaying dataframes.
# `None` is the supported "no limit" value; the old `-1` sentinel was
# deprecated in pandas 1.0 and is no longer accepted by later releases.
pd.set_option('display.max_colwidth', None)
# Default size of every figure drawn in this notebook.
plt.rc('figure', figsize=[15, 10])

### General Assumptions

In [3]:
# Column names and time resolution shared by the cells below.
time_col, time_gran_col = 'datetime', 'datetime_gran'  # time_gran_col is the merge key further down
value_col, scaled_value_col = 'val', 'scaled_val'      # scaled_value_col is the column merged per feature
time_granularity = 'min'                               # handed to generate_master_df

In [4]:
# Layout of every date literal parsed in this notebook.
fmt = '%Y-%m-%d %H:%M:%S'

# base_date and stop_date bound the master time axis generated later;
# start_date is parsed with the same layout.
base_date, start_date, stop_date = (
    datetime.strptime(stamp, fmt)
    for stamp in (
        '2016-01-01 00:00:01',
        '2016-01-01 00:00:01',
        '2019-01-01 00:00:01',
    )
)

### Data Directories

In [5]:
# All data folders are keyed by the pump identifier.
pump = 'P1201'
RAW_DATA_DIR = f'../data/raw/{pump}/'
PROC_DATA_DIR = f'../data/processed/{pump}/'
INT_DATA_DIR = f'../data/interim/{pump}/'

### Reading Normalized Data

In [6]:
from utils import read_data

# Load the normalized per-feature data from the interim folder.
# NOTE(review): the meaning of the positional `True` flag is defined in
# utils.read_data and is not visible here - confirm before changing it.
input_dir = f'{INT_DATA_DIR}normalized/'
df_features_norm = read_data(input_dir, True)

Number of files found in ../data/interim/P1201/normalized/ is 16 
Number of features extracted from 16 files is 16 


### Selected Features

In [7]:
# Names of every feature that was loaded.
list(df_features_norm)

['HART_P1CP04:FY12847FB.MEAS',
 '12V04CP4:PC12007.MEAS',
 '12V04CP4:LC12005A.MEAS',
 '12P01BCP4:FC12847.OUT',
 '12V04CP4:FC12006.OUT',
 '12P01BCP4:XI12597.CIN',
 '12P01BCP4:FC12847.SPT',
 '12V05CP4:PC12073.MEAS',
 '12P01BCP4:PI12956.PNT',
 '12V08CP4:FC12351.MEAS',
 '12DATASCRCP1:TI12813.PNT',
 '12V04CP4:FC12006.MEAS',
 '12GTWY_E101:FIE12404.PNT',
 '12P01BCP4:PI12955.PNT',
 '12P01BCP4:FC12847.MEAS',
 '12GTWY_E101:FALE12404SP.PNT']

In [8]:
# Hand-picked subset of the loaded features. Dropped from the full list:
#   a) every .OUT tag
#   b) every .CIN tag
#   c) every .SPT tag
#   d) .PNT tags whose graph is a flat line
usable_features = [
    'HART_P1CP04:FY12847FB.MEAS',
    '12V04CP4:PC12007.MEAS',
    '12V04CP4:LC12005A.MEAS',
    '12V05CP4:PC12073.MEAS',
    '12P01BCP4:PI12956.PNT',
    '12V08CP4:FC12351.MEAS',
    '12DATASCRCP1:TI12813.PNT',
    '12V04CP4:FC12006.MEAS',
    '12GTWY_E101:FIE12404.PNT',
    '12P01BCP4:PI12955.PNT',
    '12P01BCP4:FC12847.MEAS',
]

print(f'Number of usable features =  {len(usable_features)}')

Number of usable features =  11


### Generate Master Dataframe for time

In [9]:
from utils import generate_master_df

# time_granularity, time_gran_col, base_date and stop_date come from the
# "General Assumptions" cells; they are deliberately not re-declared here so
# there is a single place to change them.
print(time_granularity, base_date, stop_date)

# Master frame covering base_date..stop_date; every selected feature is
# left-joined onto it on `time_gran_col` further down.
df_master = generate_master_df(
    time_granularity=time_granularity,
    time_gran_col=time_gran_col,
    base_date=base_date,
    end_date=stop_date,
)

min 2016-01-01 00:00:01 2019-01-01 00:00:01


In [10]:
# Row count of the master time axis.
print(f'Size of the master df {len(df_master)}')

Size of the master df 1578241


### Use only selected features

In [11]:
# Independent copies of the usable features, keyed by feature name, so later
# steps cannot touch the frames held in df_features_norm.
df_features_sel = {
    feature: df_features_norm[feature].copy()
    for feature in usable_features
}

In [12]:
# Show which feature names were carried over into the selection.
df_features_sel.keys()

dict_keys(['HART_P1CP04:FY12847FB.MEAS', '12V04CP4:PC12007.MEAS', '12V04CP4:LC12005A.MEAS', '12V05CP4:PC12073.MEAS', '12P01BCP4:PI12956.PNT', '12V08CP4:FC12351.MEAS', '12DATASCRCP1:TI12813.PNT', '12V04CP4:FC12006.MEAS', '12GTWY_E101:FIE12404.PNT', '12P01BCP4:PI12955.PNT', '12P01BCP4:FC12847.MEAS'])

### Convert individual features into single dataframe

In [13]:
# Left-join every selected feature onto the master time axis, producing one
# column per feature named after the feature itself.
for feature, df_feature in df_features_sel.items():

    # Skip features that are already present so re-running this cell does not
    # append duplicate columns to df_master.
    if feature in df_master.columns:
        continue

    # Rename the value column before merging; this replaces the in-place
    # rename on the merged frame and needs no defensive copy.
    feature_values = df_feature[[time_gran_col, scaled_value_col]].rename(
        columns={scaled_value_col: feature}
    )
    df_master = pd.merge(df_master, feature_values, how='left', on=time_gran_col)

In [14]:
# Re-display the selected feature names; the merge loop only reads from
# df_features_sel, so the keys are the same as before the merge.
df_features_sel.keys()

dict_keys(['HART_P1CP04:FY12847FB.MEAS', '12V04CP4:PC12007.MEAS', '12V04CP4:LC12005A.MEAS', '12V05CP4:PC12073.MEAS', '12P01BCP4:PI12956.PNT', '12V08CP4:FC12351.MEAS', '12DATASCRCP1:TI12813.PNT', '12V04CP4:FC12006.MEAS', '12GTWY_E101:FIE12404.PNT', '12P01BCP4:PI12955.PNT', '12P01BCP4:FC12847.MEAS'])

In [15]:
# Preview the first rows of the merged master frame.
df_master.head()

Unnamed: 0,datetime_gran,HART_P1CP04:FY12847FB.MEAS,12V04CP4:PC12007.MEAS,12V04CP4:LC12005A.MEAS,12V05CP4:PC12073.MEAS,12P01BCP4:PI12956.PNT,12V08CP4:FC12351.MEAS,12DATASCRCP1:TI12813.PNT,12V04CP4:FC12006.MEAS,12GTWY_E101:FIE12404.PNT,12P01BCP4:PI12955.PNT,12P01BCP4:FC12847.MEAS
0,1,,,,,,,,,,,
1,2,,,,,,,,,,,
2,3,,,,,,,,,,,
3,4,,,,,,,,,,,
4,5,,,,,,,,,,,


### Aggregation

In [16]:
from utils import get_minutes_after

# Analysis window, parsed with the notebook-wide timestamp layout.
start_date_analysis = datetime.strptime('2018-01-01 00:00:01', fmt)
stop_date_analysis = datetime.strptime('2018-11-01 00:00:01', fmt)

# Convert both window boundaries onto the master time axis.
# NOTE(review): presumably the offset from base_date in minutes - confirm in utils.
start, stop = (
    get_minutes_after(base_date=base_date, current_date=boundary)
    for boundary in (start_date_analysis, stop_date_analysis)
)

# Strict comparisons: rows sitting exactly on `start` or `stop` are excluded.
time_index = df_master[time_gran_col]
df_master_date = df_master.loc[time_index.gt(start) & time_index.lt(stop)]

In [17]:
df_master_date.head()

Unnamed: 0,datetime_gran,HART_P1CP04:FY12847FB.MEAS,12V04CP4:PC12007.MEAS,12V04CP4:LC12005A.MEAS,12V05CP4:PC12073.MEAS,12P01BCP4:PI12956.PNT,12V08CP4:FC12351.MEAS,12DATASCRCP1:TI12813.PNT,12V04CP4:FC12006.MEAS,12GTWY_E101:FIE12404.PNT,12P01BCP4:PI12955.PNT,12P01BCP4:FC12847.MEAS
1052641,1052642,0.982966,0.616575,0.516241,0.371921,0.715633,0.351119,0.069354,0.332981,0.114017,0.496189,0.000338
1052642,1052643,0.982966,0.624924,0.513221,0.37242,0.714983,0.352894,0.067677,0.332231,0.115755,0.496189,0.000358
1052643,1052644,0.982966,0.626434,0.51213,0.372087,0.715307,0.353981,0.068198,0.329184,0.106135,0.495362,0.000398
1052644,1052645,0.982966,0.622968,0.512718,0.373418,0.715794,0.354564,0.060073,0.331178,0.109651,0.495527,0.000318
1052645,1052646,0.982966,0.625279,0.511166,0.373916,0.715388,0.355147,0.054184,0.330455,0.109651,0.495527,0.000348


In [18]:
from utils import lcl_divmul

# Aggregation window sizes, in units of the time index.
agg_val_list = [1, 5, 10, 15, 30, 60]

# Create the target folder up front so to_csv cannot fail on a fresh checkout.
output_dir = INT_DATA_DIR + 'agg_single/'
os.makedirs(output_dir, exist_ok=True)

for agg_val in agg_val_list:

    agg_col = 'agg' + str(agg_val)

    # Bucket key for every row, derived from the time index by lcl_divmul.
    # NOTE(review): presumably maps each index onto a multiple of agg_val -
    # confirm against utils.lcl_divmul.
    agg_keys = df_master_date[time_gran_col].apply(
        lambda minute, step=agg_val: lcl_divmul(minute, step, step)
    )

    # drop()/assign() return new frames, so df_master_date stays untouched and
    # the feature columns keep their original order. The earlier set
    # difference made the column order vary from run to run.
    df_master_date_analysis = (
        df_master_date
        .drop(columns=[time_gran_col])
        .assign(**{agg_col: agg_keys})
    )

    # Mean of every feature per bucket, with the bucket key as a regular column.
    df_agg = df_master_date_analysis.groupby(by=[agg_col]).mean().reset_index()

    # to_csv opens and closes the file itself; no separate handle is needed.
    output_file = output_dir + agg_col + '.csv'
    df_agg.to_csv(output_file, header=True, index=False)
    print('Writing to ', output_file)

Writing to  ../data/interim/P1201/agg_single/agg1.csv
Writing to  ../data/interim/P1201/agg_single/agg5.csv
Writing to  ../data/interim/P1201/agg_single/agg10.csv
Writing to  ../data/interim/P1201/agg_single/agg15.csv
Writing to  ../data/interim/P1201/agg_single/agg30.csv
Writing to  ../data/interim/P1201/agg_single/agg60.csv


### Reading back the files to check if the data was written correctly

In [21]:
# Read one aggregated file back from disk to verify the write.
agg_val = 1
agg_col = 'agg' + str(agg_val)
input_file = INT_DATA_DIR + 'agg_single/' + agg_col + '.csv'

# read_csv opens the path itself, so no separate file handle is needed.
df_agg = pd.read_csv(input_file)

In [23]:
# (rows, columns) of the frame read back from disk. The columns should be the
# aggregation key plus one column per usable feature.
df_agg.shape

(437699, 12)