In [1]:
# Notebook: Answers to Question 2
# Author: Ben Bernal
# Date: 08JUL2024

### Preliminaries

#### Libraries

In [45]:
import numpy as np
import pandas as pd
from os import path, listdir
from pprint import pprint
import re
import plotly.express as px
import plotly.graph_objects as go
from dataclasses import make_dataclass
from icecream import ic
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.metrics import mean_absolute_percentage_error as mape

#### Utilities

In [3]:
from utilities import json as util_json

#### Configuration

In [4]:
# Read the environment settings JSON via the project helper; later cells look
# up env_config['root'] to locate the data files.
env_config = util_json.to_dict(file_path="../../config/env.json")

# Dataset

## Pre-processing

In [5]:
# Labels applied to the CSV columns, in file order.
co2_column_names = [
    'year',
    'month',
    'date_excel',
    'date_decimal',
    'CO2_[ppm]',
    'CO2_[ppm]_seasonality_adjusted',
    "CO2_[ppm]_fit",
    "CO2_[ppm]_seasonality_adjusted_fit",
    "CO2_[ppm]_filled",
    "CO2_[ppm]_seasonality_adjusted_filled",
]

# Resolve the CSV location relative to the configured project root.
co2_csv_path = path.normpath(
    path.join(
        env_config['root'],
        "modules/m4/release_time_series_report_data_nops/CO2.csv",
    )
)

# header=56 points pandas at the header row (everything above it is skipped);
# the labels passed through `names` are used in place of that row's contents.
raw_frame = pd.read_csv(co2_csv_path, header=56, names=co2_column_names)

# Preview the first three rows as a grid table.
print(raw_frame.head(3).to_markdown(tablefmt="grid"))

+----+--------+---------+--------------+----------------+-------------+----------------------------------+-----------------+--------------------------------------+--------------------+-----------------------------------------+
|    |   year |   month |   date_excel |   date_decimal |   CO2_[ppm] |   CO2_[ppm]_seasonality_adjusted |   CO2_[ppm]_fit |   CO2_[ppm]_seasonality_adjusted_fit |   CO2_[ppm]_filled |   CO2_[ppm]_seasonality_adjusted_filled |
|  0 |   1958 |       1 |        21200 |        1958.04 |      -99.99 |                           -99.99 |          -99.99 |                               -99.99 |             -99.99 |                                  -99.99 |
+----+--------+---------+--------------+----------------+-------------+----------------------------------+-----------------+--------------------------------------+--------------------+-----------------------------------------+
|  1 |   1958 |       2 |        21231 |        1958.13 |      -99.99 |                     

In [23]:
# Value written in the CO2 column for months without a measurement
# (visible in the first rows of raw_frame).
CO2_MISSING_SENTINEL = -99.99

# Build the modelling frame from raw_frame:
#   1. month_fraction_index keeps the original row position. It is assigned
#      BEFORE any rows are removed, so gaps left by dropped months are
#      preserved in the time axis.
#   2. month_fraction converts that position to years, using the mid-point of
#      each month: (index + 0.5) / 12. Computed as a vectorised expression
#      rather than a per-element Python lambda.
#   3. The derived CO2 columns are not used here and are dropped.
#   4. Rows lacking year, month or CO2, or carrying the missing-value
#      sentinel, are removed.
frame = (
    raw_frame
    .assign(
        month_fraction_index = lambda X: X.index,
        month_fraction = lambda X: (X.month_fraction_index + 0.5) / 12,
    )
    .drop(
        columns=[
            'CO2_[ppm]_seasonality_adjusted',
            "CO2_[ppm]_fit",
            "CO2_[ppm]_seasonality_adjusted_fit",
            "CO2_[ppm]_filled",
            "CO2_[ppm]_seasonality_adjusted_filled",
        ]
    )
    .dropna(
        subset=[
            'year',
            'month',
            'CO2_[ppm]',
        ],
        how='any'
    )
    # Boolean mask instead of a query string: avoids back-tick quoting of a
    # column name that contains brackets.
    .loc[lambda X: X['CO2_[ppm]'].ne(CO2_MISSING_SENTINEL)]
)


print(frame.head(20).to_markdown(tablefmt="grid"))

+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|    |   year |   month |   date_excel |   date_decimal |   CO2_[ppm] |   month_fraction_index |   month_fraction |
|  2 |   1958 |       3 |        21259 |        1958.2  |      315.7  |                      2 |         0.208333 |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  3 |   1958 |       4 |        21290 |        1958.29 |      317.45 |                      3 |         0.291667 |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  4 |   1958 |       5 |        21320 |        1958.37 |      317.51 |                      4 |         0.375    |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  6 |   1958 |       7 |        21381 |        1958.54 |      315.86 | 

In [24]:
# Compare (rows, columns) before and after cleaning to see how many records
# and columns the pre-processing step removed.
ic(raw_frame.shape)
ic(frame.shape)

ic| raw_frame.shape: (744, 10)
ic| frame.shape: (734, 7)


(734, 7)

## Train - Test Split 

In [25]:
# Chronological cross-validation splitter producing 4 train/test folds; the
# test indices of every fold come after its training indices.
tscv = TimeSeriesSplit(n_splits=4)

In [26]:

# Materialise the folds once so that the split used by later cells is chosen
# explicitly instead of depending on whatever the loop variables hold when
# the loop ends.
cv_folds = list(tscv.split(X = frame))

# Log the first positional indices and the size of every fold.
for train, test in cv_folds:

    ic(train[:5])
    ic(test[:5,])
    ic(train.shape)
    ic(test.shape)

# Later cells index `frame` with `train` / `test`: pin them to the final fold,
# i.e. the one with the longest training window.
train, test = cv_folds[-1]

ic| train[:5]: array([0, 1, 2, 3, 4])
ic| test[:5,]: array([150, 151, 152, 153, 154])
ic| train.shape: (150,)
ic| test.shape: (146,)
ic| train[:5]: array([0, 1, 2, 3, 4])
ic| test[:5,]: array([296, 297, 298, 299, 300])
ic| train.shape: (296,)
ic| test.shape: (146,)
ic| train[:5]: array([0, 1, 2, 3, 4])
ic| test[:5,]: array([442, 443, 444, 445, 446])
ic| train.shape: (442,)
ic| test.shape: (146,)
ic| train[:5]: array([0, 1, 2, 3, 4])
ic| test[:5,]: array([588, 589, 590, 591, 592])
ic| train.shape: (588,)
ic| test.shape: (146,)


In [27]:
# Training rows of the selected fold (positional lookup), renumbered from 0.
train_frame = frame.iloc[train].reset_index(drop=True)

# Show both ends of the training window.
for preview in (train_frame.head(), train_frame.tail()):
    print(preview.to_markdown(tablefmt="grid"))

+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|    |   year |   month |   date_excel |   date_decimal |   CO2_[ppm] |   month_fraction_index |   month_fraction |
|  0 |   1958 |       3 |        21259 |        1958.2  |      315.7  |                      2 |         0.208333 |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  1 |   1958 |       4 |        21290 |        1958.29 |      317.45 |                      3 |         0.291667 |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  2 |   1958 |       5 |        21320 |        1958.37 |      317.51 |                      4 |         0.375    |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  3 |   1958 |       7 |        21381 |        1958.54 |      315.86 | 

In [28]:
# Held-out rows of the selected fold (positional lookup), renumbered from 0.
test_rows = frame.iloc[test]
test_frame = test_rows.reset_index(drop=True)

# Show both ends of the test window.
print(test_frame.head().to_markdown(tablefmt="grid"))
print(test_frame.tail().to_markdown(tablefmt="grid"))

+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|    |   year |   month |   date_excel |   date_decimal |   CO2_[ppm] |   month_fraction_index |   month_fraction |
|  0 |   2007 |       8 |        39309 |        2007.62 |      381.84 |                    595 |          49.625  |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  1 |   2007 |       9 |        39340 |        2007.71 |      380.86 |                    596 |          49.7083 |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  2 |   2007 |      10 |        39370 |        2007.79 |      380.86 |                    597 |          49.7917 |
+----+--------+---------+--------------+----------------+-------------+------------------------+------------------+
|  3 |   2007 |      11 |        39401 |        2007.87 |      382.36 | 

# Linear Model

In [29]:
# Least-squares straight line: CO2 concentration as a function of the time
# index. Both X and y are passed as single-column DataFrames.
reg = LinearRegression()
reg.fit(
    X=train_frame[['month_fraction']],
    y=train_frame[['CO2_[ppm]']],
)

In [30]:
# In-sample fitted values of the linear model.
train_features = train_frame[['month_fraction']]
train_frame['CO2_[ppm]_nominal_linear_model'] = reg.predict(X=train_features)

# Residual = observed minus fitted.
train_frame['residuals_nominal_linear_model'] = (
    train_frame['CO2_[ppm]']
    - train_frame['CO2_[ppm]_nominal_linear_model']
)

In [31]:
# Overlay the fitted straight line on the observed training series.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x = train_frame.date_decimal,
        y = train_frame['CO2_[ppm]'],
        name='CO2 [ppm]'
    )
)
fig.add_trace(
    go.Scatter(
        x = train_frame.date_decimal,
        y = train_frame['CO2_[ppm]_nominal_linear_model'],
        # Legend label; spelling corrected from "Regreession".
        name='Nominal Linear Regression (F1)'
    )
)
# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title='CO2 concentration and nominal linear fit (training set)',
    xaxis_title='Date (decimal year)',
    yaxis_title='CO2 [ppm]'
)
fig.show()

In [36]:
# Residuals of the linear fit on the training set, plotted against the time
# index used as the regressor (month_fraction = (row index + 0.5) / 12).
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x = train_frame.month_fraction,
        y=train_frame['residuals_nominal_linear_model'],
        mode='markers',
        marker=dict(
            color = 'red'
        )
    )
)
# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title='Residuals of the nominal linear fit (training set)',
    xaxis_title='month_fraction (years from first record)',
    yaxis_title='Residual CO2 [ppm]'
)
# Render explicitly, as the previous figure cell does, instead of relying on
# the cell's last-expression display.
fig.show()

In [39]:
# Fitted parameters of the linear model: slope with respect to
# month_fraction, then the intercept.
ic(reg.coef_)
ic(reg.intercept_)

ic| reg.coef_: array([[1.40788184]])
ic| reg.intercept_: array([308.97162006])


array([308.97162006])

In [40]:
# Out-of-sample predictions: apply the model fitted on the training window to
# the held-out rows.
test_frame['CO2_[ppm]_nominal_linear_model'] = reg.predict(
    X=test_frame[
        [
            'month_fraction'
        ]
    ]
)

# Spot-check a few random rows; the fixed random_state keeps the preview
# identical across re-runs of the notebook.
print(test_frame.sample(5, random_state=42).to_markdown(tablefmt='grid'))

+-----+--------+---------+--------------+----------------+-------------+------------------------+------------------+----------------------------------+
|     |   year |   month |   date_excel |   date_decimal |   CO2_[ppm] |   month_fraction_index |   month_fraction |   CO2_[ppm]_nominal_linear_model |
|  41 |   2011 |       1 |        40558 |        2011.04 |      391.3  |                    636 |          53.0417 |                          383.648 |
+-----+--------+---------+--------------+----------------+-------------+------------------------+------------------+----------------------------------+
|  94 |   2015 |       6 |        42170 |        2015.45 |      402.88 |                    689 |          57.4583 |                          389.866 |
+-----+--------+---------+--------------+----------------+-------------+------------------------+------------------+----------------------------------+
| 112 |   2016 |      12 |        42719 |        2016.96 |      404.55 |                

In [46]:
# Root-mean-squared error of the linear model on the held-out test rows,
# expressed in the units of the target column (CO2 ppm).
ic(
    rmse(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_nominal_linear_model']
    )
)

ic| rmse(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_nominal_linear_model']
    ): 10.623211641288632


10.623211641288632

In [43]:
# Mean absolute percentage error of the linear model on the held-out test
# rows. NOTE(review): the printed value looks like a fraction rather than a
# 0-100 percentage (0.024 would be about 2.4 %) - confirm against sklearn docs.
ic(
    mape(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_nominal_linear_model']
    )
)

ic| mape(
        y_true=test_frame['CO2_[ppm]'],
        y_pred=test_frame['CO2_[ppm]_nominal_linear_model']
    ): 0.02445351279724722


0.02445351279724722

For higher-order polynomials, first transform the data with `PolynomialFeatures` from scikit-learn (`sklearn.preprocessing`).