In [2]:
# Notebook: Answers to Question 2
# Author: Ben Bernal
# Date: 08JUL2024

### Preliminaries

#### Libraries

In [3]:
import numpy as np
import pandas as pd
from os import path, listdir
from pprint import pprint
import re
import plotly.express as px
import plotly.graph_objects as go
from dataclasses import make_dataclass
from icecream import ic
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.metrics import mean_absolute_percentage_error as mape
from datetime import datetime as dt

#### Utilities

In [4]:
from utilities import json as util_json

#### Configuration

In [5]:
# Environment settings for this project; the 'root' entry is used below to
# locate the data files. (util_json.to_dict presumably parses the JSON file
# into a dict -- confirm against the utilities module.)
env_config = util_json.to_dict(file_path="../../config/env.json")

# Dataset

## Pre-processing

In [6]:
# Labels applied to the CSV columns, in file order.
co2_column_names = [
    'year',
    'month',
    'date_excel',
    'date_decimal',
    'CO2_[ppm]',
    'CO2_[ppm]_seasonality_adjusted',
    "CO2_[ppm]_fit",
    "CO2_[ppm]_seasonality_adjusted_fit",
    "CO2_[ppm]_filled",
    "CO2_[ppm]_seasonality_adjusted_filled",
]

# Location of the CO2 file, resolved against the configured project root.
co2_csv_path = path.normpath(
    path.join(
        env_config['root'],
        "modules/m4/release_time_series_report_data_nops/CO2.csv",
    )
)

# header=56 makes pandas treat row 56 as the header line (everything before it
# is skipped); `names` then replaces the labels found on that row.
raw_frame = pd.read_csv(co2_csv_path, header=56, names=co2_column_names)

print(raw_frame.head(3).to_markdown(tablefmt="grid"))

+----+--------+---------+--------------+----------------+-------------+----------------------------------+-----------------+--------------------------------------+--------------------+-----------------------------------------+
|    |   year |   month |   date_excel |   date_decimal |   CO2_[ppm] |   CO2_[ppm]_seasonality_adjusted |   CO2_[ppm]_fit |   CO2_[ppm]_seasonality_adjusted_fit |   CO2_[ppm]_filled |   CO2_[ppm]_seasonality_adjusted_filled |
|  0 |   1958 |       1 |        21200 |        1958.04 |      -99.99 |                           -99.99 |          -99.99 |                               -99.99 |             -99.99 |                                  -99.99 |
+----+--------+---------+--------------+----------------+-------------+----------------------------------+-----------------+--------------------------------------+--------------------+-----------------------------------------+
|  1 |   1958 |       2 |        21231 |        1958.13 |      -99.99 |                     

In [7]:
# Build the modelling frame:
#   * month_fraction_index -- position of the record in the raw file
#   * month_fraction       -- mid-month time in years since the first record,
#                             (index + 0.5) / 12
#   * drop the derived CO2 columns that are not used here
#   * drop rows with missing values and rows carrying the -99.99 placeholder
#     (the first raw rows show -99.99 in every CO2 column)
frame = (
    raw_frame
    .assign(
        month_fraction_index=lambda X: X.index,
        month_fraction=lambda X: (X['month_fraction_index'] + 0.5) / 12,
    )
    .drop(
        columns=[
            'CO2_[ppm]_seasonality_adjusted',
            "CO2_[ppm]_fit",
            "CO2_[ppm]_seasonality_adjusted_fit",
            "CO2_[ppm]_filled",
            "CO2_[ppm]_seasonality_adjusted_filled",
        ]
    )
    .dropna(subset=['year', 'month', 'CO2_[ppm]'], how='any')
    .query("`CO2_[ppm]` != -99.99")
)

print(frame.head(20).to_markdown(tablefmt="grid"))

+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|    |   year |   month |   date_excel |   date_decimal |   CO2_[ppm] |   month_fraction_index |   month_fraction |
|  2 |   1958 |       3 |        21259 |        1958.2  |      315.7  |                      2 |         0.208333 |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  3 |   1958 |       4 |        21290 |        1958.29 |      317.45 |                      3 |         0.291667 |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  4 |   1958 |       5 |        21320 |        1958.37 |      317.51 |                      4 |         0.375    |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  6 |   1958 |       7 |        21381 |        1958.54 |      315.86 | 

In [8]:
# Compare row/column counts before and after cleaning; ic echoes each
# expression together with its value.
ic(raw_frame.shape)
ic(frame.shape)

ic| raw_frame.shape: (744, 10)
ic| frame.shape: (734, 7)


(734, 7)

## Train - Test Split 

In [9]:
# Time-ordered splitter producing 4 folds; each fold's test block follows its
# training window in time.
tscv = TimeSeriesSplit(n_splits=4)

In [10]:

# Materialise the folds once so they can be both inspected and reused.
folds = list(tscv.split(X = frame))

# Print the first positions and the size of every fold's train/test index arrays.
for train, test in folds:
    ic(train[:5])
    ic(test[:5,])
    ic(train.shape)
    ic(test.shape)

# The cells below build train_frame / test_frame from `train` and `test`.
# Select the final (largest) fold explicitly instead of relying on the loop
# variables leaking out of the last iteration.
train, test = folds[-1]

ic| train[:5]: array([0, 1, 2, 3, 4])
ic| test[:5,]: array([150, 151, 152, 153, 154])
ic| train.shape: (150,)
ic| test.shape: (146,)
ic| train[:5]: array([0, 1, 2, 3, 4])
ic| test[:5,]: array([296, 297, 298, 299, 300])
ic| train.shape: (296,)
ic| test.shape: (146,)
ic| train[:5]: array([0, 1, 2, 3, 4])
ic| test[:5,]: array([442, 443, 444, 445, 446])
ic| train.shape: (442,)
ic| test.shape: (146,)
ic| train[:5]: array([0, 1, 2, 3, 4])
ic| test[:5,]: array([588, 589, 590, 591, 592])
ic| train.shape: (588,)
ic| test.shape: (146,)


In [11]:
# Training rows: positional indices in `train` (last TimeSeriesSplit fold),
# re-indexed from 0.
train_frame = frame.iloc[train].reset_index(drop=True)

# Show both ends of the training window.
for preview in (train_frame.head(), train_frame.tail()):
    print(preview.to_markdown(tablefmt="grid"))

+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|    |   year |   month |   date_excel |   date_decimal |   CO2_[ppm] |   month_fraction_index |   month_fraction |
|  0 |   1958 |       3 |        21259 |        1958.2  |      315.7  |                      2 |         0.208333 |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  1 |   1958 |       4 |        21290 |        1958.29 |      317.45 |                      3 |         0.291667 |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  2 |   1958 |       5 |        21320 |        1958.37 |      317.51 |                      4 |         0.375    |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  3 |   1958 |       7 |        21381 |        1958.54 |      315.86 | 

In [12]:
# Held-out rows: positional indices in `test` (last TimeSeriesSplit fold),
# re-indexed from 0.
test_frame = frame.iloc[test].reset_index(drop=True)

# Show both ends of the held-out window.
for preview in (test_frame.head(), test_frame.tail()):
    print(preview.to_markdown(tablefmt="grid"))

+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|    |   year |   month |   date_excel |   date_decimal |   CO2_[ppm] |   month_fraction_index |   month_fraction |
|  0 |   2007 |       8 |        39309 |        2007.62 |      381.84 |                    595 |          49.625  |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  1 |   2007 |       9 |        39340 |        2007.71 |      380.86 |                    596 |          49.7083 |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  2 |   2007 |      10 |        39370 |        2007.79 |      380.86 |                    597 |          49.7917 |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  3 |   2007 |      11 |        39401 |        2007.87 |      382.36 | 

# Linear Model

In [13]:
# Linear trend (F1): CO2 regressed on time in years (month_fraction).
# Both X and y are passed as one-column frames.
reg = LinearRegression().fit(
    train_frame[['month_fraction']],
    train_frame[['CO2_[ppm]']],
)

In [14]:
# In-sample fitted values of the linear trend.
train_frame['CO2_[ppm]_nominal_linear_model'] = reg.predict(X=train_frame[['month_fraction']])

# Residual = observed - fitted.
train_frame['residuals_nominal_linear_model'] = train_frame['CO2_[ppm]'].sub(
    train_frame['CO2_[ppm]_nominal_linear_model']
)

In [15]:
# Observed CO2 against the linear trend over the training window.
fig = go.Figure(
    data=[
        go.Scatter(
            x=train_frame['date_decimal'],
            y=train_frame['CO2_[ppm]'],
            name='CO2 [ppm]',
        ),
        go.Scatter(
            x=train_frame['date_decimal'],
            y=train_frame['CO2_[ppm]_nominal_linear_model'],
            name='Nominal Linear Regreession (F1)',
        ),
    ]
)
fig.show()

In [16]:
# Residuals of the linear trend versus time (red markers).
fig = go.Figure(
    data=[
        go.Scatter(
            x=train_frame['month_fraction'],
            y=train_frame['residuals_nominal_linear_model'],
            mode='markers',
            marker={'color': 'red'},
        )
    ]
)
# Trailing expression so the notebook renders the figure.
fig

In [17]:
# Slope (per unit of month_fraction, i.e. per year) and intercept of the
# linear trend. Both come back as arrays because y was fitted as a
# one-column frame.
ic(reg.coef_)
ic(reg.intercept_)

ic| reg.coef_: array([[1.40788184]])
ic| reg.intercept_: array([308.97162006])


array([308.97162006])

In [18]:
# Out-of-sample predictions of the linear trend on the held-out window.
test_frame['CO2_[ppm]_nominal_linear_model'] = reg.predict(
    X=test_frame[['month_fraction']]
)

# Spot-check a few rows. The sample is seeded so the displayed rows are the
# same on every Restart & Run All.
print(test_frame.sample(5, random_state=0).to_markdown(tablefmt='grid'))

+-----+--------+---------+--------------+----------------+-------------+------------------------+------------------+----------------------------------+
|     |   year |   month |   date_excel |   date_decimal |   CO2_[ppm] |   month_fraction_index |   month_fraction |   CO2_[ppm]_nominal_linear_model |
|  32 |   2010 |       4 |        40283 |        2010.29 |      392.38 |                    627 |          52.2917 |                          382.592 |
+-----+--------+---------+--------------+----------------+-------------+------------------------+------------------+----------------------------------+
|  47 |   2011 |       7 |        40739 |        2011.54 |      392.59 |                    642 |          53.5417 |                          384.352 |
+-----+--------+---------+--------------+----------------+-------------+------------------------+------------------+----------------------------------+
|  95 |   2015 |       7 |        42200 |        2015.54 |      401.61 |                

In [19]:
# Held-out RMSE of the linear trend (rmse is sklearn's root_mean_squared_error).
ic(
    rmse(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_nominal_linear_model']
    )
)

ic| rmse(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_nominal_linear_model']
    ): 10.623211641288632


10.623211641288632

In [20]:
# Held-out MAPE of the linear trend (mape is sklearn's
# mean_absolute_percentage_error; the value is a fraction, not a percentage).
ic(
    mape(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_nominal_linear_model']
    )
)

ic| mape(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_nominal_linear_model']
    ): 0.02445351279724722


0.02445351279724722

# Quadratic Model

In [21]:
# Feature expansion t -> [t, t^2]; no constant column is generated.
quadratic_transform = PolynomialFeatures(degree=2, include_bias=False)

In [22]:
# Fit the expansion on the training times and build the [t, t^2] design matrix.
quadratic_train_array = quadratic_transform.fit_transform(
    X=train_frame[['month_fraction']]
)

# First three rows of the design matrix.
print(quadratic_train_array[:3])

[[0.20833333 0.04340278]
 [0.29166667 0.08506944]
 [0.375      0.140625  ]]


In [23]:
# Quadratic trend (F2): least-squares fit on the [t, t^2] design matrix.
reg_quadratic = LinearRegression().fit(
    quadratic_train_array,
    train_frame[['CO2_[ppm]']],
)

In [24]:
# Coefficients of t and t^2 (in that order) and the intercept of the quadratic trend.
ic(reg_quadratic.coef_)
ic(reg_quadratic.intercept_)

ic| reg_quadratic.coef_: array([[0.80137882, 0.01213543]])
ic| reg_quadratic.intercept_: array([314.10517372])


array([314.10517372])

In [25]:
# In-sample fitted values of the quadratic trend.
train_frame['CO2_[ppm]_quadratic_model'] = reg_quadratic.predict(quadratic_train_array)

In [26]:
# Observed CO2 against the quadratic trend over the training window.
fig = go.Figure(
    data=[
        go.Scatter(
            x=train_frame['date_decimal'],
            y=train_frame['CO2_[ppm]'],
            name='CO2 [ppm]',
        ),
        go.Scatter(
            x=train_frame['date_decimal'],
            y=train_frame['CO2_[ppm]_quadratic_model'],
            name='Quadratic Model (F2)',
        ),
    ]
)
fig.show()

In [27]:
# Residual of the quadratic trend = observed - fitted.
train_frame['residuals_quadratic_model'] = train_frame['CO2_[ppm]'].sub(
    train_frame['CO2_[ppm]_quadratic_model']
)

In [28]:
# Residuals of the quadratic trend versus time (red markers).
fig = go.Figure(
    data=[
        go.Scatter(
            x=train_frame['month_fraction'],
            y=train_frame['residuals_quadratic_model'],
            mode='markers',
            marker={'color': 'red'},
        )
    ]
)
# Trailing expression so the notebook renders the figure.
fig

In [29]:
# Expand the held-out times with the transformer that was fitted on the
# training data. Only transform() is called here: held-out data must never
# refit a preprocessing step.
quadratic_test_array = quadratic_transform.transform(
    X = test_frame[['month_fraction']]
)

In [30]:
# Out-of-sample predictions of the quadratic trend.
test_frame['CO2_[ppm]_quadratic_model'] = reg_quadratic.predict(quadratic_test_array)

In [31]:
# Held-out RMSE of the quadratic trend.
ic(
    rmse(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_quadratic_model']
    )
)

ic| rmse(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_quadratic_model']
    ): 2.5018919596730487


2.5018919596730487

In [32]:
# Held-out MAPE of the quadratic trend (a fraction, not a percentage).
ic(
    mape(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_quadratic_model']
    )
)

ic| mape(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_quadratic_model']
    ): 0.005327124480114718


0.005327124480114718

# Cubic Polynomial

In [33]:
# Feature expansion t -> [t, t^2, t^3]; no constant column is generated.
cubic_transform = PolynomialFeatures(degree=3, include_bias=False)

In [34]:
# Fit the cubic expansion on the training times only ...
train_cubic_array = cubic_transform.fit_transform(
    X = train_frame[['month_fraction']]
)

# ... and reuse the fitted transformer for the held-out times. Only
# transform() is called here: held-out data must never refit a preprocessing
# step.
test_cubic_array = cubic_transform.transform(
    X=test_frame[['month_fraction']]
)

In [35]:
# Cubic trend (F3): least-squares fit on the [t, t^2, t^3] design matrix.
reg_cubic = LinearRegression().fit(
    train_cubic_array,
    train_frame[['CO2_[ppm]']],
)

In [36]:
# Coefficients of t, t^2 and t^3 (in that order) and the intercept of the cubic trend.
ic(reg_cubic.coef_)
ic(reg_cubic.intercept_)

ic| reg_cubic.coef_: array([[ 6.28211675e-01,  2.07598357e-02, -1.15243612e-04]])
ic| reg_cubic.intercept_: array([314.8470801])


array([314.8470801])

In [37]:
# Fitted values of the cubic trend, in-sample and on the held-out window.
train_frame['CO2_[ppm]_cubic_model'] = reg_cubic.predict(train_cubic_array)
test_frame['CO2_[ppm]_cubic_model'] = reg_cubic.predict(test_cubic_array)

In [38]:
# Observed CO2 against the cubic trend over the training window.
fig = go.Figure(
    data=[
        go.Scatter(
            x=train_frame['date_decimal'],
            y=train_frame['CO2_[ppm]'],
            name='CO2 [ppm]',
        ),
        go.Scatter(
            x=train_frame['date_decimal'],
            y=train_frame['CO2_[ppm]_cubic_model'],
            name='Cubic Model (F3)',
        ),
    ]
)
fig.show()

In [39]:
# Residual of the cubic trend = observed - fitted.
train_frame['residuals_cubic_model'] = train_frame['CO2_[ppm]'].sub(
    train_frame['CO2_[ppm]_cubic_model']
)

In [40]:
# Residuals of the cubic trend versus time (red markers).
fig = go.Figure(
    data=[
        go.Scatter(
            x=train_frame['month_fraction'],
            y=train_frame['residuals_cubic_model'],
            mode='markers',
            marker={'color': 'red'},
        )
    ]
)
# Trailing expression so the notebook renders the figure.
fig

In [41]:
# Held-out RMSE of the cubic trend.
ic(
    rmse(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_cubic_model']
    )
)

ic| rmse(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_cubic_model']
    ): 4.078412538346104


4.078412538346104

In [42]:
# Held-out MAPE of the cubic trend (a fraction, not a percentage).
ic(
    mape(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_cubic_model']
    )
)

ic| mape(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_cubic_model']
    ): 0.008342509400571723


0.008342509400571723

# Periodic Signal

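The detrended time series is the residual of the quadratic trend fit, $R_i = C_i - F_2(t_i)$. Averaging these residuals by calendar month over the training window gives the periodic signal $P_i$.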

In [43]:
# Peek at one training row to confirm the model and residual columns exist.
# 'grid' matches the table format used by the other preview cells (the former
# 'gid' was not a known format and fell back to the plain layout). The sample
# is seeded so the displayed row is reproducible.
print(train_frame.sample(1, random_state=0).to_markdown(tablefmt='grid'))

       year    month    date_excel    date_decimal    CO2_[ppm]    month_fraction_index    month_fraction    CO2_[ppm]_nominal_linear_model    residuals_nominal_linear_model    CO2_[ppm]_quadratic_model    residuals_quadratic_model    CO2_[ppm]_cubic_model    residuals_cubic_model
---  ------  -------  ------------  --------------  -----------  ----------------------  ----------------  --------------------------------  --------------------------------  ---------------------------  ---------------------------  -----------------------  -----------------------
273    1981        5         29721         1981.37       342.91                     280            23.375                           341.881                           1.02914                      339.468                      3.44191                  339.403                  3.50737


In [44]:
# Monthly profile over the training window: for each calendar month, the mean
# of the observed CO2, the quadratic-trend fit and the quadratic residual.
# The three columns are selected before aggregating so that unrelated columns
# are never averaged (and a future non-numeric column cannot break the mean).
periodict_frame = (
    train_frame
    .groupby(
        by=[
            'month'
        ]
    )[
        [
            'CO2_[ppm]',
            'CO2_[ppm]_quadratic_model',
            'residuals_quadratic_model'
        ]
    ]
    .mean()
)

In [None]:
# Display the monthly-average table (one row per calendar month).
periodict_frame

In [45]:
# Monthly mean of the quadratic residuals, plotted against the month number.
fig = go.Figure(
    data=[
        go.Scatter(
            x=periodict_frame.index,
            y=periodict_frame['residuals_quadratic_model'],
            name='Periodic Signal',
        )
    ]
)
fig.update_layout(
    title_text="Periodict Signal (Pi)",
    template='plotly_white',
    xaxis_title='Month Index',
    yaxis_title='Monthly Average of the Residuals: Pi = Ci - Fn(ti)',
)
fig.show()

# Final Model

In [51]:
# Attach the monthly averages to every training row by calendar month.
# All three averaged columns also exist in train_frame, so each incoming
# column receives the "_monthly_average" suffix.
train_frame = train_frame.merge(
    right=periodict_frame.reset_index(names='month'),
    on='month',
    how='left',
    suffixes=(None, "_monthly_average"),
    validate="m:1",
)

In [52]:
# Attach the monthly averages (learned on the training window) to every
# held-out row by calendar month.
#
# pandas only applies `suffixes` to column names that collide. test_frame has
# no 'residuals_quadratic_model' column of its own, so that column would come
# through unsuffixed and the later rename to "periodict_signal" would find
# nothing to rename. Name it explicitly so the merged columns match those
# produced by the train_frame merge.
test_frame = test_frame.merge(
    right=(
        periodict_frame
        .reset_index(
            drop=False,
            names='month'
        )
        .rename(
            columns={
                'residuals_quadratic_model': 'residuals_quadratic_model_monthly_average'
            }
        )
    ),
    left_on='month',
    right_on='month',
    suffixes=(None, "_monthly_average"),
    how='left',
    validate="m:1"
)

In [53]:
# Final model on the training window: quadratic trend + monthly periodic signal.
# The two columns are added as whole Series rather than row by row with
# apply(axis=1); the values are the same and no Python-level loop runs.
train_frame = (
    train_frame
    .rename(
        columns = {
            'residuals_quadratic_model_monthly_average':"periodict_signal"
        }
    )
    .assign(
        co2_fitted_model = lambda X: X['CO2_[ppm]_quadratic_model'] + X['periodict_signal']
    )
)

In [None]:
# Final model on the held-out window: quadratic trend + monthly periodic signal.
#
# The monthly-average residual may arrive under either of two names. pandas
# only suffixes colliding columns during a merge, and test_frame has no
# residual column of its own, so the column can come through as plain
# 'residuals_quadratic_model'. Prefer the suffixed name when it is present and
# fall back to the unsuffixed one; otherwise `periodict_signal` would never be
# created and the addition below would fail.
test_frame = (
    test_frame
    .pipe(
        lambda X: X.rename(
            columns = {
                (
                    'residuals_quadratic_model_monthly_average'
                    if 'residuals_quadratic_model_monthly_average' in X.columns
                    else 'residuals_quadratic_model'
                ): "periodict_signal"
            }
        )
    )
    .assign(
        # Whole-Series addition instead of a row-wise apply(axis=1).
        co2_fitted_model = lambda X: X['CO2_[ppm]_quadratic_model'] + X['periodict_signal']
    )
)

In [None]:
# Preview the first rows only; dumping the whole frame bloats the notebook output.
train_frame.head()

In [56]:
# Compare the observed records with the final model (quadratic trend +
# periodic signal) over the training window. The title promises a
# "records versus model" comparison, so the observed series is plotted
# alongside the fitted one.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x = train_frame.date_decimal,
        y = train_frame['CO2_[ppm]'],
        name='CO2 [ppm]',
    )
)
fig.add_trace(
    go.Scatter(
        x = train_frame.date_decimal,
        y = train_frame.co2_fitted_model,
        name='[CO2] Fitted Model',
    )
)
fig.update_layout(
    title_text = "[CO2] Time Series Records versus Quadratic Model + Periodict Component",
    template = 'plotly_white',
    xaxis_title = 'Date of Record',
    yaxis_title = '[CO2]'
)
fig.show()