# WDNN - Hyperparameter Tuning

# Imports

In [0]:
# Standard imports
import numpy as np 
import pandas as pd 
import os 

# For loading data from Drive
from google.colab import drive

# Plotting 
import matplotlib.pyplot as plt 
from matplotlib import style
style.use('ggplot')
%matplotlib inline 
from seaborn import distplot, heatmap

# Scikit-Learn imports
from sklearn.model_selection import RandomizedSearchCV, train_test_split 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Tensorflow and Keras
from tensorflow import keras

# Loading Data

In [2]:
# Mount Google Drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
# Defining filepaths
ROOT_DIR = "/content/drive/My Drive"
FYP_DIR = "EE 16-17 FYP DL Energy Theft Detection"
DATA_DIR = "Data/full-processing-data"
FILE_NAME = "outliersRemoved.csv"

# Create filepath
FILE_PATH = os.path.join(ROOT_DIR, FYP_DIR, DATA_DIR, FILE_NAME)

In [4]:
# Print the filepath for checking
print(FILE_PATH)

/content/drive/My Drive/EE 16-17 FYP DL Energy Theft Detection/Data/full-processing-data/outliersRemoved.csv


In [0]:
df = pd.read_csv(FILE_PATH)

In [6]:
# Did it work?
df.head()

Unnamed: 0,CONS_NO,FLAG,2014-01-01,2014-01-02,2014-01-03,2014-01-04,2014-01-05,2014-01-06,2014-01-07,2014-01-08,2014-01-09,2014-01-10,2014-01-11,2014-01-12,2014-01-13,2014-01-14,2014-01-15,2014-01-16,2014-01-17,2014-01-18,2014-01-19,2014-01-20,2014-01-21,2014-01-22,2014-01-23,2014-01-24,2014-01-25,2014-01-26,2014-01-27,2014-01-28,2014-01-29,2014-01-30,2014-01-31,2014-02-01,2014-02-02,2014-02-03,2014-02-04,2014-02-05,2014-02-06,2014-02-07,...,2016-09-22,2016-09-23,2016-09-24,2016-09-25,2016-09-26,2016-09-27,2016-09-28,2016-09-29,2016-09-30,2016-10-01,2016-10-02,2016-10-03,2016-10-04,2016-10-05,2016-10-06,2016-10-07,2016-10-08,2016-10-09,2016-10-10,2016-10-11,2016-10-12,2016-10-13,2016-10-14,2016-10-15,2016-10-16,2016-10-17,2016-10-18,2016-10-19,2016-10-20,2016-10-21,2016-10-22,2016-10-23,2016-10-24,2016-10-25,2016-10-26,2016-10-27,2016-10-28,2016-10-29,2016-10-30,2016-10-31
0,0387DD8A07E07FDA6271170F86AD9151,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.56,10.48,8.2,12.53,7.9,8.58,10.12,9.96,7.6,18.19,10.93,11.41,19.92,10.7,9.2,6.77,6.76,6.15,6.5,8.08,7.97,8.81,7.37,11.72,11.02,8.18,7.33,6.71,8.52,6.31,7.18,8.07,8.09,9.53,5.48,8.75,9.3,7.54,9.16,6.74
1,01D6177B5D4FFE0CABA9EF17DAFC2B84,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4B75AC4F2D8434CFF62DB64D0BB43103,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.34,13.483126,13.483126,13.483126,13.483126,12.73,13.483126,13.483126,13.483126,13.483126,13.483126,13.483126,13.483126,13.483126,13.483126,10.95,13.483126,13.483126,13.483126,13.483126,13.483126,13.483126,13.483126,13.483126,13.483126
3,B32AC8CC6D5D805AC053557AB05F5343,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,17.3,16.95,8.25,22.76,14.07,20.7,6.5,9.99,18.59,16.79,26.82,14.7,16.05,24.1,12.64,10.85,11.77,13.03,12.26,15.66,13.43,15.03,14.61,13.97,15.07,14.11,14.77,11.72,11.73,11.98,12.81,15.12,17.26,14.91,19.59,20.79,17.95,19.26,14.46,11.72
4,EDFC78B07BA2908B3395C4EB2304665E,1,2.9,5.64,6.99,3.32,3.61,5.35,4.73,3.68,3.53,3.42,3.81,4.58,3.56,4.25,3.86,3.53,3.41,0.85,0.88,3.81,6.51,9.19,7.79,7.02,7.25,6.37,6.43,7.89,7.46,7.95,7.59,4.84,6.06,5.6,6.72,7.29,5.6,6.28,...,9.56,9.48,10.6,10.06,10.79,10.91,17.77,10.37,13.51,14.13,17.44,15.96,12.18,18.54,13.44,11.68,9.15,9.16,10.19,10.31,8.03,10.08,10.34,11.24,15.14,14.36,12.39,10.36,9.29,7.91,14.21,10.22,8.47,6.11,6.1,6.73,7.52,10.89,9.86,8.72


# Extracting features and labels

In [0]:
features, labels = df.iloc[:, 2:], df.iloc[:, 1]

# Train-Test Split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, 
                                                    random_state=0, stratify=labels.values)

In [9]:
# Checking shapes of training and test data
print("X_train", X_train.shape)
print("X_test", X_test.shape)
print("y_train", y_train.shape)
print("y_test", y_test.shape)

X_train (33897, 1034)
X_test (8475, 1034)
y_train (33897,)
y_test (8475,)


In [35]:
# Checking the types of the features and labels
print("X_train", type(X_train))
print("y_train", type(X_train))
print("X_test", type(X_train))
print("y_test", type(X_train))

X_train <class 'pandas.core.frame.DataFrame'>
y_train <class 'pandas.core.frame.DataFrame'>
X_test <class 'pandas.core.frame.DataFrame'>
y_test <class 'pandas.core.frame.DataFrame'>


# Pipeline 1 - 1D Data

The pipeline should
1. **`row_to_col`**: Transpose all features arrays so that rows (consumers) become columns.
2. **`scaler`**: Apply a user-specified scaling strategy to the transposed data on a column-wise (consumer-wise) basis.
3. **`col_to_row`**: Retranspose the scaled data so that consumers are once again along the rows axis.

In [0]:
pipeline_1D = Pipeline([
             # Convert all matrices to numpy array for faster processing
             ('to_numpy', FunctionTransformer(pd.DataFrame.to_numpy)),

             # Transpose once so that consumers are now along column axis
             ('row_to_col', FunctionTransformer(np.transpose)), 

             # Use scikit-learn scaler of your choice to scale kWhs on a consumer basis
             ('scaler', StandardScaler()),

             # Retranspose so that consumers are once again along the rows axis
             ('col_to_row', FunctionTransformer(np.transpose)),
])

# Pipeline 2 - 2D Data

Transform standard scaler data from the `pipeline_1D` into 2D data where each consumer's readings are `(148, 7, 1)`-dimensional matrix. 

In [0]:
class ZeroPadder(BaseEstimator, TransformerMixin):
  def __init__(self):
    """Initialise some attributes to be used for padding with zeros"""
    self.days_per_consumer = 0
    self.days_per_week = 7
    self.weeks_per_consumer = 0
    self.padding = 0
    self.pad_value = 0.0

  def fit(self, X, y=None):
    """Update the attributes based on X""" 
    # Find the number of days of kWh entries for each consumer
    self.days_per_consumer = X.shape[-1]

    # Find the number of weeks in these days rounded to the nearest whole number 
    self.weeks_per_consumer = np.ceil(self.days_per_consumer / self.days_per_week).astype('uint8')

    # Number of zeros for padding = number of extra days required to turn days_per_consumer
    # into a number that is equivalent to a whole number of weeks
    self.padding = self.weeks_per_consumer * self.days_per_week - self.days_per_consumer

    # Once parameters have been fit, just return
    return self

  def transform(self, X, y=None):
    """Transform the data according to the specified attributes"""
    # Add the right number of zeroes to the end of each row in the numpy array `X`
    # return np.apply_along_axis(np.pad(pad_width=(0, self.padding), 
    #                                   mode='constant', 
    #                                   constant_values=0.0),
    #                             axis=1)
    X.apply()

In [0]:
def daily_to_weekly(arr):
  """Simple function that converts daily kWh numpy arrays to weekly matrices"""
  # Find the number of kWh entries per consumer
  days_per_consumer = arr.length[-1]

  # This is constant
  days_per_week = 7

  # Round to the nearest number of whole weeks > number of days
  weeks_per_consumer = np.ceil(days_per_consumer / days_per_week).astype('uint8')

  # CHANNEL is always 1 - must provide this for CNN
  channels = 1
  return arr.reshape((weeks_per_consumer, days_per_weeks, channels))

In [0]:
pipeline_2D = Pipeline([
          # Pad each row with zeroes for reshaping 
          ('ZeroPadder', ZeroPadder()),

          # # Reshape to a weekly data that can be fed to a CNN
          # ('Reshaper_2D', FunctionTransformer(daily_to_weekly))
])

In [0]:
X_train_2D = pipeline_2D.fit_transform(X_train_1D), y_train.copy()