In [None]:
%load_ext autoreload
%autoreload 2

import urllib.request as urllib2
from io import BytesIO
from zipfile import ZipFile
from IPython.display import display

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

TODO:

- [ ] Put all the imports at the top
- [ ] Create a requirements.txt

# loading in the data

This example uses the public UCI [power consumption](https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption) dataset.

This dataset contains measurements of electric power consumption in one household, sampled once per minute over a period of almost 4 years. Different electrical quantities and some sub-metering values are available.

In [None]:
# Location of the zipped dataset and the name of the text file inside the archive.
zip_url: str = "https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip"
zipped_file_name: str = "household_power_consumption.txt"

# Download the archive into memory; the context manager closes the HTTP
# response as soon as the payload has been read.
with urllib2.urlopen(zip_url) as response:
    zip_buffer = BytesIO(response.read())

# Parse the CSV straight from the archive member; the archive and the member
# handle are both closed once parsing has finished.
with ZipFile(zip_buffer) as archive, archive.open(zipped_file_name) as raw_file:
    df_power_consumption: pd.DataFrame = pd.read_csv(
        raw_file,
        sep=";",
        # merge the "Date" and "Time" columns into one "timestamp" column
        parse_dates={"timestamp": ["Date", "Time"]},
        infer_datetime_format=True,
        low_memory=False,
        # these markers are read as NaN
        na_values=["nan", "?"],
        index_col="timestamp",
        dtype="float32",
    )

# Show three random rows and the column dtypes / memory usage.
display(df_power_consumption.sample(3))
df_power_consumption.info()

# Exploratory Data Analysis (EDA)

**first step**: Read the [dataset description](https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption) (or gather all available information about the dataset)

**global variables**:
* `global_active_power`: household global **minute averaged** active power (kilowatt)
* `global_reactive_power`: household global **minute averaged** reactive power (kilowatt)
* `global_intensity`: household global **minute averaged** current (ampere)
* `voltage`: minute-averaged voltage (volt)

**sub meterings**:
* `sub_metering_1`: **kitchen** - dishwasher & microwave - (in watt-hour of **active energy**)
* `sub_metering_2`: **laundry room** - washing machine, tumble drier, refrigerator & light (in watt-hour of **active energy**)
* `sub_metering_3`: electric water-heater & air conditioner (in watt-hour of **active energy**)


As the user is only billed for the **active power**, we will use this variable as target.

## General data statistics

In [None]:
# Show the first rows of the loaded frame.
df_power_consumption.head()

In [None]:
# Show the last rows of the loaded frame.
df_power_consumption.tail()

In [None]:
# Report the frame dimensions with thousands separators.
n_rows, n_cols = df_power_consumption.shape
print(f'rows={n_rows:,}', f'cols={n_cols:,}')
print('-' * 80)
# Count the missing values (NaN = "not a number") per column.
print('NaN sum:')
df_power_consumption.isna().sum()

In [None]:
# Summary statistics per column, rounded to 2 decimals and cast to strings
# (presumably so the table renders without scientific notation).
df_power_consumption.describe().round(2).astype("str")

In [None]:
# Count how often each time delta between consecutive timestamps occurs;
# a single delta value means the data is regularly sampled.
df_power_consumption.index.to_series().diff().value_counts()

In [None]:
# After dropping the rows that contain NaN values the sampling is no longer regular:
# show 3 randomly picked time deltas (and their counts) that then occur.
df_power_consumption.dropna().index.to_series().diff().value_counts().sample(3)

## Visualizations

In [None]:
# Remove the rows that contain NaN values.
# NOTE: this rebinds `df_power_consumption`, so cells above give different results when re-run afterwards.
df_power_consumption = df_power_consumption.dropna()

In [None]:
# Pairwise correlations; multiplying by a strictly lower-triangular mask of ones
# zeroes the diagonal and the redundant upper triangle.
n_cols = len(df_power_consumption.columns)
corr = df_power_consumption.corr() * np.tril(np.ones((n_cols, n_cols)), k=-1)
# Use the fully qualified option key: the bare "precision" pattern is ambiguous on
# pandas versions that also define "styler.format.precision" and raises an OptionError there.
pd.set_option("display.precision", 3)
# Format explicitly so the styled table shows 3 decimals regardless of which
# precision option the installed Styler consults.
corr.style.format("{:.3f}").background_gradient(cmap="coolwarm", axis=None)

In [None]:
from plotly_resampler import FigureResampler
from plotly.subplots import make_subplots
from plotly_resampler.downsamplers import LTTB
import plotly.graph_objs as go

# Interactive overview of the measured signals: three stacked subplots that share
# the x-axis; rows 1 and 3 additionally get a secondary y-axis.
fig = FigureResampler(
    make_subplots(
        rows=3, cols=1, shared_xaxes=True,
        specs=[[{'secondary_y': True}], [{}], [{'secondary_y': True}]],
        subplot_titles=['Globals', 'Voltage', 'Sub metering']
    ),
    default_n_shown_samples=1000,
    default_downsampler=LTTB(interleave_gaps=True),
)

# (column name, subplot row, initial visibility) for every trace on a primary y-axis;
# 'legendonly' traces are listed in the legend but start hidden.
for c, row, visible in [('Global_active_power', 1, 1), ('Global_reactive_power', 1, 'legendonly'), ('Voltage', 2, 1),
    ('Sub_metering_1', 3, 1), ('Sub_metering_2', 3, 1), ('Sub_metering_3', 3, 1)]:
    fig.add_trace(
        go.Scattergl(name=c, visible=visible),
        hf_x=df_power_consumption.index,
        hf_y=df_power_consumption[c],
        row=row,
        col=1
    )

# The current is drawn on the secondary y-axis of the first row and starts hidden.
for c in ['Global_intensity']:
    fig.add_trace(
        go.Scattergl(name=c, visible='legendonly'),
        hf_x=df_power_consumption.index,
        hf_y=df_power_consumption[c],
        secondary_y=True,
        row=1,
        col=1
    )

# add a shaded weekend region on the lowest row
# One point per day between the first and the last timestamp; `weekend` is 1 on
# Saturdays/Sundays (weekday 5 and 6) and 0 otherwise.
datelist = pd.date_range(df_power_consumption.index[0].date(), df_power_consumption.index[-1], freq='D')
weekend = datelist.weekday.isin([5, 6]).astype(int)
# Step-shaped, fully transparent line whose area down to zero is filled.
# `max_n_samples=len(weekend)` presumably keeps this trace from being downsampled -- TODO confirm.
fig.add_trace(
    go.Scattergl(line_shape='hv', name='Weekend', showlegend=False, line_color ='rgba(0,0,0,0)', fillcolor='rgba(99, 110, 250, 0.15)', fill='tozeroy'),
    hf_x=datelist,
    hf_y=weekend,
    limit_to_view=True,
    max_n_samples=len(weekend),
    secondary_y=True,
    row=3,
    col=1
)


fig.update_layout(height=800)
# Serve the figure through a Dash app on port 8051.
fig.show_dash(mode='external', port=8051)

## Creating an objective

Our objective is:
> Estimate the average `Global active power` over the last 15 minutes by **only** using past sub-metering values.

`TODO` decide whether global intensity will be used<br>
**remark**:
We explicitly did not use global variables (like `global_intensity`), as these show high correlations with the global power consumption.

In [None]:
# The client wants the average power consumption of the next 15-minute period,
# and wants to know it 5 minutes before that period starts.
avg_window_min = 15
shift_min = -(5 + avg_window_min)

avg_col = "GAP_avg{}min".format(avg_window_min)
target_col = "{}_shift{}min".format(avg_col, shift_min)

# (1) Trailing average over `avg_window_min` rows of the global active power.
# NOTE(review): the window and the shift below count rows, not minutes; they only
# match the nominal durations where the rows are exactly one minute apart -- verify.
gap_rolling = df_power_consumption.rolling(avg_window_min)["Global_active_power"]
df_power_consumption[avg_col] = gap_rolling.aggregate(np.nanmean)

# (2) A negative shift pulls future values back, so every row holds the average
# that is reached `abs(shift_min)` rows later: this is the value to forecast.
df_power_consumption[target_col] = df_power_consumption[avg_col].shift(shift_min)

 ### 🚨 Perform visual inspection 🔍

In [None]:
from plotly_resampler import FigureResampler
from plotly.subplots import make_subplots
import plotly.graph_objs as go

# Single-panel figure to compare the raw signal, its rolling average and the shifted target.
base_fig = make_subplots(rows=1, cols=1, shared_xaxes=True)
fig = FigureResampler(base_fig)

# (column name, subplot row, initial visibility) per trace.
inspection_traces = [
    ('Global_active_power', 1, 1),
    (avg_col, 1, 'legendonly'),
    (target_col, 1, 1),
]
for c, row, visible in inspection_traces:
    trace = go.Scattergl(name=c, visible=visible)
    fig.add_trace(
        trace,
        hf_x=df_power_consumption.index,
        hf_y=df_power_consumption[c],
        row=row,
        col=1,
    )

fig.update_layout(height=400)
fig.show_dash(mode='external', port=8051)

# ML time

## Train-test split

In [None]:
# Model inputs: the three sub-metering signals plus timestamp, current and voltage.
train_columns = [f"Sub_metering_{i}" for i in range(1, 4)] + ["timestamp", "Global_intensity", "Voltage"]

# The fraction of the data used for testing
test_pct = 0.2
# Gap (in days) between the last train row and the first test row
# (presumably to keep the window-based features and the shifted target from leaking -- TODO confirm).
day_margin = 3

# add the timestamp col
df_power_consumption["timestamp"] = df_power_consumption.index

# Temporal split: use the last `test_pct` of the rows as test data.
# The split position is computed explicitly, because slicing with `[: -0]` would
# return an empty train set whenever the number of test rows rounds down to zero.
n_test_rows = int(len(df_power_consumption) * test_pct)
df_train = df_power_consumption.iloc[: len(df_power_consumption) - n_test_rows].copy()
X_train, y_train = df_train[train_columns], df_train[target_col]

# The test set starts `day_margin` days after the last train timestamp.
df_test = df_power_consumption[df_train.index[-1] + pd.Timedelta(days=day_margin) :]
X_test, y_test = df_test[train_columns], df_test[target_col]

In [None]:
import sys
# Put the parent directory on the import path
# (presumably so the imports below resolve to a local checkout -- TODO confirm).
sys.path.append("../")

In [None]:
import scipy.stats as ss

from tsflex.chunking import chunk_data
from tsflex.features import FeatureCollection, MultipleFeatureDescriptors
from tsflex.features.utils import make_robust

## Feature extraction with tsflex

In [None]:
# !pip install holidays
import holidays

In [None]:
# some feature functions
def slope(x):
    """Relative change between the first and the last element of ``x``.

    Returns ``(x[-1] - x[0]) / x[0]``, or ``0`` when ``x[0]`` is falsy
    (e.g. zero), which avoids a division by zero.
    """
    first = x[0]
    if not first:
        return 0
    return (x[-1] - first) / first
def abs_diff_mean(x):
    """Mean absolute difference between consecutive elements of ``x``.

    Returns ``0`` when ``x`` holds fewer than two elements.
    """
    if len(x) <= 1:
        return 0
    consecutive_deltas = x[1:] - x[:-1]
    return np.mean(np.abs(consecutive_deltas))
def diff_std(x):
    """Standard deviation of the differences between consecutive elements of ``x``.

    Returns ``0`` when ``x`` holds fewer than two elements.
    """
    if len(x) <= 1:
        return 0
    consecutive_deltas = x[1:] - x[:-1]
    return np.std(consecutive_deltas)


# time based features
def time_float(x) -> float:
    """Time of day of the last element of ``x`` in fractional hours.

    The hour plus the minutes expressed as a fraction of an hour
    (e.g. 10:30 -> 10.5), returned as ``np.float32``.
    """
    last_ts = pd.Timestamp(x[-1])
    fractional_hours = last_ts.hour + last_ts.minute / 60
    return np.float32(fractional_hours)

def day_of_week(x) -> int:
    """Day of the week of the last element of ``x`` (Monday = 0, ..., Sunday = 6)."""
    last_ts = pd.Timestamp(x[-1])
    return last_ts.day_of_week

def is_holiday(x) -> bool:
    """Whether the last timestamp of ``x`` is contained in ``holidays.France()``."""
    last_ts = pd.Timestamp(x[-1])
    french_holidays = holidays.France()
    return last_ts in french_holidays

def yesterday_holiday(x) -> bool:
    """Whether the day before the last timestamp of ``x`` is contained in ``holidays.France()``."""
    one_day = pd.Timedelta(days=1)
    previous_day = pd.Timestamp(x[-1]) - one_day
    return previous_day in holidays.France()

def tomorrow_holiday(x) -> bool:
    """Whether the day after the last timestamp of ``x`` is contained in ``holidays.France()``.

    The day offset is added; subtracting it would test the previous day and
    make this feature a duplicate of ``yesterday_holiday``.
    """
    next_day = pd.Timestamp(x[-1]) + pd.Timedelta(days=1)
    return next_day in holidays.France()

# Feature functions for the numeric series, each wrapped with tsflex's `make_robust`.
funcs = list(
    map(
        make_robust,
        [np.min, np.max, np.std, np.mean, slope, ss.skew, abs_diff_mean, diff_std, sum, len],
    )
)
# Feature functions for the timestamp series, wrapped the same way.
time_funcs = list(
    map(
        make_robust,
        [time_float, day_of_week, is_holiday, yesterday_holiday, tomorrow_holiday],
    )
)

# Create the feature collection
fc = FeatureCollection(
    feature_descriptors=[
        # Statistical features on every numeric input series, for three window sizes.
        MultipleFeatureDescriptors(
            functions=funcs,
            # TODO -> maybe also use the `intensity`
            # Every train column except the timestamp, built as an ordered list
            # (instead of via a set) so the series order is the same on every run.
            series_names=[col for col in train_columns if col != "timestamp"],
            windows=["15min", "30min", "1h"],# "6h", "12h", "24h"],
            strides="15min",
        ),
        # Calendar / time-of-day features, computed on the timestamp series only.
        MultipleFeatureDescriptors(
            functions=time_funcs,
            series_names="timestamp",
            windows=["15min"],
            strides="15min",
        ),
    ]
)

### Chunking train data

In [None]:
# %%time
# Split the train data into chunks of at most `max_chunk_dur` each.
# (original author's note: this also omits the gaps in the data)
chunks = chunk_data(
    data=df_train,
    max_chunk_dur="365 days",
    chunk_range_margin="10 min",
    sub_chunk_overlap="15min",
)

We will now use these yearly chunks to extract the features.

In [None]:
%%time
# Compute the features for every chunk separately and stack the resulting frames.
chunk_feature_frames = []
for chunk in chunks:
    chunk_feature_frames.append(
        fc.calculate(chunk, show_progress=True, return_df=True, approve_sparsity=True, n_jobs=None)
    )
df_train_feats = pd.concat(chunk_feature_frames)

In [None]:
# Column dtypes, non-null counts and memory usage of the train features.
df_train_feats.info()

In [None]:
# Show two randomly picked feature rows.
df_train_feats.sample(2)

In [None]:
# Keep only the first row for every index value, printing the shape before and
# after (duplicates presumably come from the overlap between chunks -- TODO confirm).
print(df_train_feats.shape)
is_duplicate = df_train_feats.index.duplicated()
df_train_feats = df_train_feats[~is_duplicate]
print(df_train_feats.shape)

## Constructing the pipeline

In [None]:
# All extracted features are used as model inputs.
selected_cols = df_train_feats.columns
# Attach the target to the features and drop the observations without a target.
# NOTE: this rebinds `df_train`, which held the raw train split up to this point.
df_train = df_train_feats.join(y_train).dropna(subset=[y_train.name])

In [None]:
import catboost

In [None]:
# Gradient-boosted regressor for the point forecast (`verbose=100` is passed to control logging).
pipe = catboost.CatBoostRegressor(verbose=100)

train_features = df_train[selected_cols]
train_target = df_train[y_train.name]

# as this is a lot of data, this might take a minute or 5
pipe.fit(train_features, train_target)

---

## Prediction on `df_test`

In [None]:
# Compute the same features on the test split and drop the rows in which
# every feature is NaN.
test_feats_raw = fc.calculate(
    df_test, show_progress=True, return_df=True, approve_sparsity=True, n_jobs=None
)
df_test_feats = test_feats_raw.dropna(how='all', axis=0)

In [None]:
# Attach the test target to the test features (joined on the index).
df_test_tot = df_test_feats.join(y_test)

In [None]:
# Point predictions for every test row, stored next to the features and the target.
out = pipe.predict(df_test_tot[selected_cols])
df_test_tot["predictions"] = out

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

In [None]:
# Score the model on the train features; only rows with a known target are evaluated.
df_train_feats_ = df_train_feats.join(y_train)
train_predictions = pipe.predict(df_train_feats_[selected_cols])
nan_mask = df_train_feats_[y_train.name].notna()
y_true = df_train_feats_[y_train.name][nan_mask]
y_pred = train_predictions[nan_mask]

train_metrics = [
    ("MSE  [TRAIN]: ", mean_squared_error),
    ("MAE  [TRAIN]: ", mean_absolute_error),
    ("MAPE [TRAIN]: ", mean_absolute_percentage_error),
    ("R2   [TRAIN]: ", r2_score),
]
for metric_label, metric_fn in train_metrics:
    print(metric_label, round(metric_fn(y_true, y_pred), 3))

In [None]:
# Score the model on the test rows that have a known target.
nan_mask = df_test_tot[y_test.name].notna()
test_scored = df_test_tot[nan_mask]
y_true, y_pred = test_scored[y_test.name], test_scored["predictions"]

test_metrics = [
    ("MSE  [TEST]: ", mean_squared_error),
    ("MAE  [TEST]: ", mean_absolute_error),
    ("MAPE [TEST]: ", mean_absolute_percentage_error),
    ("R2   [TEST]: ", r2_score),
]
for metric_label, metric_fn in test_metrics:
    print(metric_label, round(metric_fn(y_true, y_pred), 3))

## Shap

In [None]:
# !pip install shap

In [None]:
import shap

In [None]:
import matplotlib.pyplot as plt

In [None]:
# SHAP values of the fitted model for all train feature rows, shown as a summary plot.
shap_inputs = df_train_feats[selected_cols]

plt.figure()
explainer = shap.TreeExplainer(pipe)
shap_values = explainer.shap_values(shap_inputs)
shap.summary_plot(
    shap_values,
    shap_inputs,
    max_display=50,
    auto_size_plot=True,
    show=False,
    color_bar=False,
)
plt.show()

In [None]:
# Mean absolute SHAP value per feature, sorted from most to least important.
vals = np.abs(shap_values).mean(0)
feature_importance = (
    pd.DataFrame(list(zip(selected_cols, vals)), columns=['col_name', 'feature_importance_vals'])
    .sort_values(by=['feature_importance_vals'], ascending=False)
    .reset_index(drop=True)
)
feature_importance.head()

In [None]:
# Names of the `n` features with the highest mean absolute SHAP value.
n = 50
important_cols = feature_importance[:n]['col_name'].values

In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline

In [None]:
# (rows, columns) of the train feature frame.
df_train_feats.shape

In [None]:
# gmm_pipe = Pipeline([
#     ('scaler', PowerTransformer()),
#     ('gmm', GaussianMixture(n_components=25, covariance_type='diag', random_state=42)),
# ])
# gmm_pipe.fit(df_train_feats[important_cols].dropna(how='any'))
# loglh = gmm_pipe.score_samples(df_test_feats[important_cols].dropna(how='any'))
# loglh = pd.Series(index=df_test_feats[important_cols].dropna(how='any').index, data=loglh)

# AE = (df_test_tot['predictions'] - df_test_tot[y_test.name]).abs()

# from scipy.stats import pearsonr

# joined = AE.rename('MAE').to_frame().join((loglh * -1).rename('loglh')).dropna(how='any')
# pearsonr(joined.MAE, joined.loglh)

## Uncertainty

In [None]:
# Two quantile regressors whose predictions bound a 95% interval:
# the 97.5% quantile as upper bound and the 2.5% quantile as lower bound.
quantile_features = df_train[selected_cols]
quantile_target = df_train[y_train.name]

pipe_upper = catboost.CatBoostRegressor(verbose=100, loss_function='Quantile:alpha=0.975')
pipe_lower = catboost.CatBoostRegressor(verbose=100, loss_function='Quantile:alpha=0.025')

pipe_upper.fit(quantile_features, quantile_target)
pipe_lower.fit(quantile_features, quantile_target)

In [None]:
def PICP(y_true, y_lower, y_upper):
    """Percentage of ``y_true`` values that lie inside ``[y_lower, y_upper]`` (bounds inclusive)."""
    above_lower = y_lower <= y_true
    below_upper = y_true <= y_upper
    n_covered = np.logical_and(above_lower, below_upper).sum()
    return n_covered / len(y_true) * 100
    
def NMPIW(y_true, y_lower, y_upper):
    """Mean width of the interval ``[y_lower, y_upper]`` divided by the value range of ``y_true``."""
    mean_width = np.mean(y_upper - y_lower)
    target_range = np.max(y_true) - np.min(y_true)
    return mean_width / target_range
    
def MPIW(y_true, y_lower, y_upper):
    """Mean width of the interval ``[y_lower, y_upper]``; ``y_true`` is accepted but not used."""
    interval_widths = y_upper - y_lower
    return np.mean(interval_widths)

#def pred_Crossing(y_pred,y_lower,y_upper):
#    return np.logical_or(y_upper<y_pred,y_pred<y_lower).sum()/len(y_true)*100

#def quantile_Crossing(y_lower,y_upper):
#    return (y_upper<y_lower).sum()/len(y_true)*100

In [None]:
# Interval bounds for all train feature rows.
train_predictions_upper = pipe_upper.predict(df_train_feats_[selected_cols])
train_predictions_lower = pipe_lower.predict(df_train_feats_[selected_cols])

# Only rows with a known target can be evaluated.
nan_mask = df_train_feats_[y_train.name].notna()
y_true = df_train_feats_[y_train.name][nan_mask]
y_pred_upper = train_predictions_upper[nan_mask]
y_pred_lower = train_predictions_lower[nan_mask]

picp = np.round(PICP(y_true.values, y_pred_lower, y_pred_upper), 2)
nmpiw = np.round(NMPIW(y_true.values, y_pred_lower, y_pred_upper), 2)
mpiw = np.round(MPIW(y_true.values, y_pred_lower, y_pred_upper), 2)

# These metrics are computed on the train data, so they are labelled [TRAIN].
print(f"[TRAIN] PICP = {picp} % -- NMPIW = {nmpiw} -- MPIW = {mpiw} kW")

In [None]:
# Interval bounds for every test row, stored next to the point predictions.
out_upper = pipe_upper.predict(df_test_tot[selected_cols])
out_lower = pipe_lower.predict(df_test_tot[selected_cols])

df_test_tot["predictions_upper"] = out_upper
df_test_tot["predictions_lower"] = out_lower

# Only rows with a known target can be evaluated.
nan_mask = df_test_tot[y_test.name].notna()
test_scored = df_test_tot[nan_mask]
y_true = test_scored[y_test.name]
y_pred_upper = test_scored["predictions_upper"]
y_pred_lower = test_scored["predictions_lower"]

picp = np.round(PICP(y_true.values, y_pred_lower.values, y_pred_upper.values), 2)
nmpiw = np.round(NMPIW(y_true.values, y_pred_lower.values, y_pred_upper.values), 2)
mpiw = np.round(MPIW(y_true.values, y_pred_lower.values, y_pred_upper.values), 2)

print("[TEST] PICP = " + str(picp) + " % -- NMPIW = " + str(nmpiw) + " -- MPIW = " + str(mpiw) + " kW")

## Visualizing predictions

In [None]:
# `plotly.graph_objects` is the correct module name (the misspelled name raises ModuleNotFoundError).
import plotly.graph_objects as go
from plotly_resampler.downsamplers import EveryNthPoint 

# Two stacked subplots sharing the x-axis; every trace below is drawn in the first row.
fig = FigureResampler(make_subplots(rows=2, shared_xaxes=True, specs=[[{}], [{'secondary_y': True}]]), default_n_shown_samples=1000)

# Ground truth
fig.add_trace(
    go.Scattergl(name='target'),
    hf_x=y_test.index, hf_y=y_test, row=1, col=1
)

# Point forecast
fig.add_trace(
    go.Scattergl(name="predictions", marker_color="red"),
    hf_x=df_test_tot.index, hf_y=df_test_tot.predictions, row=1, col=1
)

# Upper interval bound: drawn with a zero-width line, it only serves as the
# edge that the next trace fills towards.
fig.add_trace(
    go.Scatter(
        x=df_test_tot.index,
        y=df_test_tot.predictions_upper,
        name="upper_bound",
        showlegend=False,
        marker_color='black',
        line=dict(width=0),
        mode='lines',
    ),
    downsampler=EveryNthPoint(),
    row=1, col=1
)

# Lower interval bound: `fill='tonexty'` shades the area up to the previously
# added trace (the upper bound), which visualises the prediction interval.
fig.add_trace(
    go.Scatter(
        x=df_test_tot.index,
        y=df_test_tot.predictions_lower,
        name="lower bound",
        showlegend=False,
        marker_color='black',
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(255, 0, 0, 0.2)',
        fill='tonexty'
    ),
    downsampler=EveryNthPoint(),
    row=1, col=1
)

fig.update_layout(title="Power consumption predictions", title_x=0.5, hovermode='x')
fig.update_xaxes(title="Time")
fig.update_yaxes(title="AVG power consumption (kW)")
fig.show_dash(mode='external', height=700)

**notes**:
* this notebook is far from complete and serves as a "quick" first iteration; possible next steps are:
  * using a linear model as baseline
  * trying other windows / strides

In [None]:
# Serialize the feature collection to the file 'fc.pkl' (relative to the current working directory).
fc.serialize('fc.pkl')