### Time Series Workshop 
# 4. Air Pollutants &#x1F525;: Forecasting

In this notebook, we will use the preprocessed data from the feature engineering notebook and, finally, perform some forecasting!

In [None]:
%config InlineBackend.figure_format='retina'
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from timeseries.data import load_air_quality
from timeseries.utils import print_metrics
from sklearn.linear_model import Lasso
from feature_engine.creation import CyclicalFeatures
from sklearn.pipeline import Pipeline
from timeseries import preprocessing as pp
from sklearn.preprocessing import FunctionTransformer as ftrf


# Folder containing the CSV inputs, one level above the working directory.
DATA_DIR = Path("..", "data")

## Load data

In [None]:
# CSV produced by the feature engineering notebook.
FILE_PATH = DATA_DIR / "air_quality_processed.csv"

# Names of the two input series; reused by the feature_engine transformers
# defined near the end of this notebook.
variables = ["co_sensor", "humidity"]

# Parse `date_time` as datetimes and use it as the index of the frame.
df_in = pd.read_csv(FILE_PATH, index_col=["date_time"], parse_dates=["date_time"])
df_in.head(2)

# Train-test split

* We will train the model on a certain portion of the data and leave another part out to evaluate the model.

* Unlike in regular machine learning problems, it is necessary to split strictly on time in order to avoid leakage.

* We will use roughly the first 90% of the data for training and the remaining 10% for testing.

In [None]:
# Split configuration: index values <= SPLIT_DATE are used for training,
# everything after it for testing.
SPLIT_DATE = "2005-02-01"
TARGET_COL = "co_sensor"

# Strict chronological split on the datetime index.
in_train = df_in.index <= SPLIT_DATE
in_test = df_in.index > SPLIT_DATE
X_train = df_in.loc[in_train]
X_test = df_in.loc[in_test]

# pop() removes the target column from each feature frame and returns it.
y_train = X_train.pop(TARGET_COL)
y_test = X_test.pop(TARGET_COL)

# Share of all rows that ended up in each partition.
n_rows = len(df_in)
train_ratio = len(y_train) / n_rows
test_ratio = len(y_test) / n_rows

print(f"Train vs. test ratios: {train_ratio:.2%} vs. {test_ratio:.2%}")


In [None]:
# Build gap-free hourly indexes spanning each partition. Left-merging the
# targets onto them leaves NaN for hours that are absent from the data.
# The step is given as a Timedelta rather than the "1H" string alias, because
# the uppercase hour alias is deprecated in recent pandas releases.
hourly_step = pd.Timedelta(hours=1)

train_range = pd.DataFrame(
    index=pd.date_range(y_train.index.min(), y_train.index.max(), freq=hourly_step)
)
test_range = pd.DataFrame(
    index=pd.date_range(y_test.index.min(), y_test.index.max(), freq=hourly_step)
)

# `test_range` and `y_test_plt` are reused by the prediction plots further down.
y_train_plt = train_range.merge(y_train, left_index=True, right_index=True, how="left")
y_test_plt = test_range.merge(y_test, left_index=True, right_index=True, how="left")

# Plot both partitions on one time axis.
_, ax = plt.subplots(figsize=(15, 3))
_ = ax.plot(y_train_plt, label="train")
_ = ax.plot(y_test_plt, label="test")
_ = plt.legend(loc="upper left")

# Random Forest Regressor

In [None]:
# Input columns for the random forest, grouped by kind.
FOREST_FEATURES = [
    # Calendar features
    "month", "week", "day", "day_of_week", "hour", "is_weekend",
    # Lagged values of both series
    "co_sensor_lag_1", "co_sensor_lag_2", "co_sensor_lag_3", "co_sensor_lag_24",
    "humidity_lag_1", "humidity_lag_2", "humidity_lag_3", "humidity_lag_24",
    # Window statistics of both series
    "co_sensor_win_mean", "co_sensor_win_min", "co_sensor_win_max", "co_sensor_win_std",
    "humidity_win_mean", "humidity_win_min", "humidity_win_max", "humidity_win_std",
]

# Fixed random_state keeps the fitted forest reproducible between runs.
forest_params = {"n_estimators": 200, "max_depth": 5, "random_state": 0}
rf_model = RandomForestRegressor(**forest_params).fit(X_train[FOREST_FEATURES], y_train)

# Score the held-out period.
y_pred = rf_model.predict(X_test[FOREST_FEATURES])
print_metrics(y_test, y_pred)

In [None]:
# Importances exposed by the fitted forest, labelled with the feature names.
rf_importances = pd.Series(data=rf_model.feature_importances_, index=FOREST_FEATURES)

# Horizontal bar chart, one bar per feature.
importance_ax = rf_importances.plot.barh(figsize=(7, 5))
importance_ax.set_xlabel("Feature Importance")
plt.tight_layout()

In [None]:
# Label the predictions with the test timestamps, then place them on the
# gap-free hourly test index; hours without a prediction stay NaN.
pred_series = pd.Series(y_pred, index=y_test.index, name="pred")
y_pred_plt = test_range.merge(pred_series, how="left", left_index=True, right_index=True)

# Overlay actual and predicted values for the test period.
_, ax = plt.subplots(figsize=(15, 3))
for curve, curve_label in ((y_test_plt, "test"), (y_pred_plt, "pred")):
    ax.plot(curve, label=curve_label)
_ = ax.legend(loc="upper left")

# Linear Regression

In [None]:
# Input columns for the linear model, grouped by kind. Month and hour enter
# through their sine/cosine encodings instead of the raw integer columns.
LINEAR_FEATURES = [
    "is_weekend",
    # Lagged values of both series
    "co_sensor_lag_1", "co_sensor_lag_2", "co_sensor_lag_3", "co_sensor_lag_24",
    "humidity_lag_1", "humidity_lag_2", "humidity_lag_3", "humidity_lag_24",
    # Window statistics of both series
    "co_sensor_win_mean", "co_sensor_win_min", "co_sensor_win_max", "co_sensor_win_std",
    "humidity_win_mean", "humidity_win_min", "humidity_win_max", "humidity_win_std",
    # Cyclical encodings
    "month_sin", "month_cos", "hour_sin", "hour_cos",
]

# L1-regularised linear regression.
linear_model = Lasso(alpha=2, random_state=0).fit(X_train[LINEAR_FEATURES], y_train)

# Score the held-out period.
y_pred = linear_model.predict(X_test[LINEAR_FEATURES])
print_metrics(y_test, y_pred)

In [None]:
# Absolute size of each fitted coefficient, labelled with the feature names.
coefs = pd.Series(linear_model.coef_, index=LINEAR_FEATURES).abs()

# Horizontal bar chart, one bar per feature.
coef_ax = coefs.plot.barh(figsize=(7, 5))
coef_ax.set_xlabel("Feature Importance")
plt.tight_layout()

In [None]:
# Label the predictions with the test timestamps, then place them on the
# gap-free hourly test index; hours without a prediction stay NaN.
pred_series = pd.Series(y_pred, index=y_test.index, name="pred")
y_pred_plt = test_range.merge(pred_series, how="left", left_index=True, right_index=True)

# Overlay actual and predicted values for the test period.
_, ax = plt.subplots(figsize=(15, 3))
for curve, curve_label in ((y_test_plt, "test"), (y_pred_plt, "pred")):
    ax.plot(curve, label=curve_label)
_ = ax.legend(loc="upper left")

# Putting it all together
This endless stream of notebook cells doesn't exactly correspond to the usual best practices.

Here's an example of the whole thing put together in a single cell:

In [None]:
# End-to-end example: load the raw data, engineer features, split on time,
# fit a Lasso model and report metrics on the held-out period.

# Define params:
FILE_PATH = DATA_DIR / "air_quality.csv"
SPLIT_DATE = "2005-02-01"
INPUT_COLS = ["co_sensor", "humidity"]
CYCLIC_COLS = ["month", "hour"]
TARGET_COL = "co_sensor"
# Columns kept after preprocessing. The target stays in the frame until the
# train-test split below, where it is popped off.
FEATURE_COLS = [
    TARGET_COL,
    "is_weekend",
    "co_sensor_lag_1",
    "co_sensor_lag_2",
    "co_sensor_lag_3",
    "co_sensor_lag_24",
    "humidity_lag_1",
    "humidity_lag_24",
    "co_sensor_win_min",
    "co_sensor_win_std",
    "humidity_win_max",
    "month_sin",
    "month_cos",
    "hour_sin",
    "hour_cos",
]

# Load original data:
# NOTE: this rebinds `df_in`, which earlier cells bound to the processed CSV.
# (A bare `df_in.head()` used to follow here; it displayed nothing because
# only the last expression of a cell is rendered, so it was removed.)
df_in = load_air_quality(FILE_PATH)[INPUT_COLS]

# Define preprocessing pipeline:
# Each project helper from `timeseries.preprocessing` is wrapped in a
# FunctionTransformer so it can act as a Pipeline step; `kw_args` are passed
# to the wrapped function as keyword arguments.
preprocessing_steps = [
    ("remove_negative_values", ftrf(pp.remove_negative_values)),
    ("time_features", ftrf(pp.time_features)),
    (
        "lag_features",
        ftrf(pp.lag_features, kw_args={"columns": INPUT_COLS, "lags": [1, 2, 3, 24]}),
    ),
    ("window_features", ftrf(pp.window_features, kw_args={"columns": INPUT_COLS})),
    (
        "cyclical_features",
        CyclicalFeatures(
            variables=CYCLIC_COLS,
            drop_original=False,
        ),
    ),
    ("select_cols", ftrf(pp.select_columns, kw_args={"columns": FEATURE_COLS})),
    ("remove_na_values", ftrf(pp.remove_na)),
]
preprocessing_pipe = Pipeline(preprocessing_steps)

# Apply preprocessing pipeline:
df_processed = preprocessing_pipe.fit_transform(df_in)

# Train-test split:
# Explicit copies, so that pop() below only ever modifies the split frames.
X_train = df_processed[df_processed.index <= SPLIT_DATE].copy()
X_test = df_processed[df_processed.index > SPLIT_DATE].copy()
y_train = X_train.pop(TARGET_COL)
y_test = X_test.pop(TARGET_COL)

# Fit simple model:
linear_model = Lasso(alpha=5, random_state=0)
linear_model.fit(X_train, y_train)

# Predict:
y_pred = linear_model.predict(X_test)
print_metrics(y_test, y_pred)

Now this looks pretty amazing and all. It's just a one-hour forecast, though. Anyway, amazing! &#x1F973;

But what should we have done first?

#### Did we forget something? &#x1F6A8;


## Post Scriptum: Refactored feature engineering
- The code above is quite messy. Let's clean it up a bit.

In [None]:
from feature_engine.creation import CyclicalFeatures
from feature_engine.datetime import DatetimeFeatures
from feature_engine.imputation import DropMissingData
from feature_engine.selection import DropFeatures
from feature_engine.timeseries.forecasting import (
    LagFeatures,
    WindowFeatures,
)

# Calendar features extracted from the frame's datetime index.
datetime_features = DatetimeFeatures(
    features_to_extract=["month", "week", "day_of_week", "day_of_month", "hour", "weekend"],
    variables="index",
)

# Lagged versions of the input variables, at offsets of 1 hour and 24 hours.
lag_offsets = ["1H", "24H"]
lag_features = LagFeatures(
    variables=variables,
    freq=lag_offsets,
    missing_values="ignore",
)

# Statistics over a 3-hour window of the input variables, with freq="1H".
window_stats = ["mean", "min", "max", "std"]
window_features = WindowFeatures(
    variables=variables,
    functions=window_stats,
    window="3H",
    freq="1H",
    missing_values="ignore",
)

# Sine/cosine encodings of month and hour; the original columns are kept.
cyclic_features = CyclicalFeatures(drop_original=False, variables=["month", "hour"])

# Drops incomplete rows (default settings).
dropnas = DropMissingData()

# Removes the raw input columns (to avoid look-ahead bias); only their lagged
# and windowed versions remain as features.
drop_features = DropFeatures(features_to_drop=variables)

In [None]:
# Steps run in list order. The cyclical encoding needs the month/hour columns
# created by the datetime step, and the raw inputs are dropped last, after
# the lag and window steps have read them.
pipeline_steps = [
    ("datetime_features", datetime_features),
    ("lag_features", lag_features),
    ("window_features", window_features),
    ("cyclic_features", cyclic_features),
    ("dropnas", dropnas),
    ("drop_features", drop_features),
]
pipe = Pipeline(steps=pipeline_steps)
pipe

In [None]:
# Run the pipeline on a copy of `df_in` (as last assigned by the cells above);
# `df` itself is reused for the train-test split in the next code cell.
df = df_in.copy(deep=True)
df_processed = pipe.fit_transform(X=df)
df_processed.head(n=2)

Ah, way better and not too cluttered. &#x1F9D8;

In [None]:
# Chronological split of `df`. NOTE: this cell uses a strict `<` for training,
# so index values equal to SPLIT_DATE go to the test partition, whereas the
# earlier cells in this notebook use `<=` / `>`.
before_split = df.index < SPLIT_DATE
from_split = df.index >= SPLIT_DATE
df_train = df.loc[before_split]
df_test = df.loc[from_split]

# The targets come from the untransformed frames; the feature frames are
# independent copies handed to the pipeline.
y_train, y_test = df_train[TARGET_COL], df_test[TARGET_COL]
X_train, X_test = df_train.copy(), df_test.copy()

# Fit the preprocessing on the training period only, then apply it to test.
X_train_t = pipe.fit_transform(X_train)
X_test_t = pipe.transform(X_test)

# The pipeline can drop rows, so realign the targets with the rows it kept.
y_train_t = y_train.loc[X_train_t.index]
y_test_t = y_test.loc[X_test_t.index]

# Fit a simple model and evaluate it on the held-out period.
linear_model = Lasso(alpha=1, random_state=0).fit(X_train_t, y_train_t)

y_pred = linear_model.predict(X_test_t)
print_metrics(y_test_t, y_pred)