### Time Series Workshop 
# 3. Air Pollutants Forecasting

In [12]:
%config InlineBackend.figure_format='retina'
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from timeseries.data import load_air_quality

DATA_DIR = Path("..") / Path("data")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load and process data

In [10]:
FILE_PATH = DATA_DIR / "air_quality.csv"

df_in = load_air_quality(FILE_PATH)
df_in.head()

variables = ["co_sensor", "humidity"]
df_in = df_in[variables].copy()

for var in variables:
    df_in = df_in[df_in[var] >= 0]

df_in.head()

Unnamed: 0_level_0,co_sensor,humidity
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-04-04 00:00:00,1224.0,56.5
2004-04-04 01:00:00,1215.0,59.2
2004-04-04 02:00:00,1115.0,62.4
2004-04-04 03:00:00,1124.0,65.0
2004-04-04 04:00:00,1028.0,65.3


## Time related features

In [17]:
df = df_in.copy()

df["month"] = df.index.month
df["week"] = df.index.isocalendar().week
df["day"] = df.index.day
df["day_of_week"] = df.index.day_of_week
df["hour"] = df.index.hour
df["is_weekend"] = np.where(df["day_of_week"]>4, 1, 0)
df.head()

Unnamed: 0_level_0,co_sensor,humidity,month,week,day,day_of_week,hour,is_weekend
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-04-04 00:00:00,1224.0,56.5,4,14,4,6,0,1
2004-04-04 01:00:00,1215.0,59.2,4,14,4,6,1,1
2004-04-04 02:00:00,1115.0,62.4,4,14,4,6,2,1
2004-04-04 03:00:00,1124.0,65.0,4,14,4,6,3,1
2004-04-04 04:00:00,1028.0,65.3,4,14,4,6,4,1


## Lag features
Lag features are past values of the variable that we can use to predict future values.

Here, we will use the following lag features to predict the next hour's pollutant concentration:
- The pollutant concentration for the previous hour (t-1).
- The pollutant concentration for the same hour on the previous day (t-24).

The reasoning behind this is that pollutant concentrations do not change quickly and, as previously demonstrated, have a 24-hour seasonality.

In [19]:
# Shift the data forward 1 Hr.

for h in range(1,4):
    tmp = df[variables].shift(freq="1H")

# Names for the new variables.
tmp.columns = [v + "_lag_1" for v in variables]

# Add the variables to the original data.
print("data size before")
print(df.shape)

df = df.merge(tmp, left_index=True, right_index=True, how="left")

print("data size after")
print(df.shape)

df.head()

data size before
(7393, 10)
data size after
(7393, 12)


Unnamed: 0_level_0,co_sensor,humidity,month,week,day,day_of_week,hour,is_weekend,co_sensor_lag_1_x,humidity_lag_1_x,co_sensor_lag_1_y,humidity_lag_1_y
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-04-04 00:00:00,1224.0,56.5,4,14,4,6,0,1,,,,
2004-04-04 01:00:00,1215.0,59.2,4,14,4,6,1,1,1224.0,56.5,1224.0,56.5
2004-04-04 02:00:00,1115.0,62.4,4,14,4,6,2,1,1215.0,59.2,1215.0,59.2
2004-04-04 03:00:00,1124.0,65.0,4,14,4,6,3,1,1115.0,62.4,1115.0,62.4
2004-04-04 04:00:00,1028.0,65.3,4,14,4,6,4,1,1124.0,65.0,1124.0,65.0
