# Feature Engineering v1

In [1]:
import pandas as pd
import numpy as np

# Read the hourly energy CSV with the "datetime" column parsed as timestamps
# and used as the row index.
df = pd.read_csv(
    "../data/processed/hourly_energy.csv",
    index_col="datetime",
    parse_dates=["datetime"],
)
# Order rows by timestamp so later positional operations follow time order.
df = df.sort_index()

df.head()

Unnamed: 0_level_0,Global_active_power
datetime,Unnamed: 1_level_1
2006-12-16 17:00:00,4.2229
2006-12-16 18:00:00,3.6322
2006-12-16 19:00:00,3.4002
2006-12-16 20:00:00,3.2686
2006-12-16 21:00:00,3.0565


In [2]:
# Column that supplies the target series `y` and the values the lag features
# are shifted from.
target: str = "Global_active_power"

### Time Features

In [3]:
# Build the feature frame as a new object (df itself is left untouched) with
# calendar fields read from the datetime index.
df_feat = df.assign(
    hour=df.index.hour,
    dayofweek=df.index.dayofweek,  # 0=Mon
    month=df.index.month,
)

In [4]:
# Spot-check the three calendar columns on the first rows.
calendar_cols = ["hour", "dayofweek", "month"]
df_feat.loc[:, calendar_cols].head()

Unnamed: 0_level_0,hour,dayofweek,month
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2006-12-16 17:00:00,17,5,12
2006-12-16 18:00:00,18,5,12
2006-12-16 19:00:00,19,5,12
2006-12-16 20:00:00,20,5,12
2006-12-16 21:00:00,21,5,12


### Lag Features

In [5]:
# Lags are taken with a positional shift (no `freq`), so "lag_24" means
# "24 rows earlier". That only equals "24 hours earlier" when the index is a
# strictly increasing hourly grid with no gaps or duplicates, so check that
# first instead of silently building misaligned features.
hourly_steps = df_feat.index.to_series().diff().dropna()
assert (hourly_steps == pd.Timedelta(hours=1)).all(), (
    "df_feat index is not a gap-free hourly grid; positional lags would be misaligned"
)

# Previous hour, same hour on the previous day, same hour one week earlier.
LAG_HOURS = (1, 24, 168)
for lag_hours in LAG_HOURS:
    # The first `lag_hours` rows of each new column are NaN (no earlier value).
    df_feat[f"lag_{lag_hours}"] = df_feat[target].shift(lag_hours)

In [6]:
# Show the last 10 of the first 200 rows: far enough into the series that the
# 168-row lag has earlier values to draw from.
lag_preview_cols = [target, "lag_1", "lag_24", "lag_168"]
df_feat.loc[:, lag_preview_cols].head(200).tail(10)

Unnamed: 0_level_0,Global_active_power,lag_1,lag_24,lag_168
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006-12-24 15:00:00,3.0968,1.3715,4.0491,2.9854
2006-12-24 16:00:00,2.0071,3.0968,4.3491,3.326
2006-12-24 17:00:00,1.6865,2.0071,5.4525,3.4068
2006-12-24 18:00:00,0.5052,1.6865,3.8794,3.6971
2006-12-24 19:00:00,0.454,0.5052,4.1178,2.9084
2006-12-24 20:00:00,0.4741,0.454,4.1814,3.3615
2006-12-24 21:00:00,0.484,0.4741,3.2884,3.0408
2006-12-24 22:00:00,0.4798,0.484,4.3279,1.518
2006-12-24 23:00:00,0.4863,0.4798,5.5625,0.4377
2006-12-25 00:00:00,0.9996,0.4863,4.028,0.2764


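### Drop Rows with NaNs

Shifting the target leaves NaNs in the lag columns at the start of the series (the first 168 rows for `lag_168`), so those rows are removed.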

In [7]:
# Discard every row that has a NaN in any column, then report (rows, columns).
df_feat = df_feat.dropna(axis="index", how="any")
df_feat.shape

(34421, 7)

### Define Features (X) and Target (y)

In [8]:
# Model inputs: the calendar fields followed by the three lagged target values.
feature_cols = ["hour", "dayofweek", "month"] + ["lag_1", "lag_24", "lag_168"]

# X holds the input columns; y is the target column as a Series on the same index.
X = df_feat.loc[:, feature_cols]
y = df_feat.loc[:, target]

In [9]:
# Preview the first rows of the feature matrix and the target. display() renders
# each object with its own rich repr; a bare `X.head(), y.head()` tuple falls
# back to one plain-text repr of the whole tuple.
display(X.head(), y.head())

(                     hour  dayofweek  month   lag_1  lag_24  lag_168
 datetime                                                            
 2006-12-23 17:00:00    17          5     12  4.3491  1.4968   4.2229
 2006-12-23 18:00:00    18          5     12  5.4525  2.6870   3.6322
 2006-12-23 19:00:00    19          5     12  3.8794  3.9382   3.4002
 2006-12-23 20:00:00    20          5     12  4.1178  3.5361   3.2686
 2006-12-23 21:00:00    21          5     12  4.1814  4.5487   3.0565,
 datetime
 2006-12-23 17:00:00    5.4525
 2006-12-23 18:00:00    3.8794
 2006-12-23 19:00:00    4.1178
 2006-12-23 20:00:00    4.1814
 2006-12-23 21:00:00    3.2884
 Name: Global_active_power, dtype: float64)

### Time-Based Split (repeated after feature engineering)

In [10]:
# Positional 70% / 15% / 15% split: the earliest rows train, the next block
# validates, and the final block tests. Rows are never shuffled.
n = len(df_feat)
train_end, val_end = int(n * 0.70), int(n * 0.85)

train_rows = slice(None, train_end)
val_rows = slice(train_end, val_end)
test_rows = slice(val_end, None)

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_val, y_val = X.iloc[val_rows], y.iloc[val_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]

# Row counts of the three partitions.
len(X_train), len(X_val), len(X_test)

(24094, 5163, 5164)

### Quick Checks

In [11]:
# Count missing values per feature column.
X.isna().sum(axis=0)

hour         0
dayofweek    0
month        0
lag_1        0
lag_24       0
lag_168      0
dtype: int64

In [12]:
# Summary statistics per feature (count, mean, std, min, quartiles, max).
X.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,hour,dayofweek,month,lag_1,lag_24,lag_168
count,34421.0,34421.0,34421.0,34421.0,34421.0,34421.0
mean,11.501089,2.998954,6.423462,1.085834,1.086971,1.088756
std,6.922395,2.000072,3.402078,0.893214,0.894575,0.895875
min,0.0,0.0,1.0,0.124,0.124,0.124
25%,6.0,1.0,3.0,0.3421,0.3422,0.3422
50%,12.0,3.0,6.0,0.8019,0.8023,0.8036
75%,18.0,5.0,9.0,1.571,1.5726,1.5752
max,23.0,6.0,12.0,6.5605,6.5605,6.5605
