In [40]:
import numpy as np
import pandas as pd
from datetime import timedelta
from sklearn.feature_selection import mutual_info_regression
import plotly.express as px
from simplify_deployment.organism import Organism
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import mutual_info_regression

In [41]:
# Read the stored predictions, add a residual column (y_true - y_pred), and
# append "_ga_lag_16" to every column name. Written as a single chain so the
# cell can be re-run without depending on a previously mutated frame.
predictions = (
    pd.read_parquet(
        "/home/thomas/repos/simplify_deployment/data/data_science/lag_16_si_only_100_gen.parquet",
    )
    .assign(residuals=lambda frame: frame["y_true"] - frame["y_pred"])
    .add_suffix("_ga_lag_16")
)

predictions

Unnamed: 0,y_pred_ga_lag_16,y_true_ga_lag_16,residuals_ga_lag_16
2022-07-29 22:14:00+00:00,-245.732406,-137.439,108.293406
2022-07-29 22:29:00+00:00,-41.613828,-200.462,-158.848172
2022-07-29 22:44:00+00:00,-194.010608,-266.786,-72.775392
2022-07-29 22:59:00+00:00,-218.353402,-69.500,148.853402
2022-07-29 23:14:00+00:00,-37.205121,-44.361,-7.155879
...,...,...,...
2023-06-30 20:59:00+00:00,36.928324,-29.614,-66.542324
2023-06-30 21:14:00+00:00,-56.405676,-40.824,15.581676
2023-06-30 21:29:00+00:00,18.784111,10.249,-8.535111
2023-06-30 21:44:00+00:00,23.915048,-6.148,-30.063048


In [42]:
# Fit a linear regression per fold on the features produced by the
# "lag_16_si_only" organism and collect the test-set predictions of all folds
# into one frame (`prediction_latest_4`).

# Location and number of the pre-split folds. Kept in one place so the four
# parquet reads below cannot drift apart when the data moves.
FOLDS_DIR = "/home/thomas/repos/simplify_deployment/data/simplify_1_0/folds"
N_FOLDS = 12

org_latest_4 = Organism.from_yaml(
    path_config="/home/thomas/repos/simplify_deployment/src/simplify_deployment/config/lag_16_si_only_config.yaml",
    path_genome="/home/thomas/repos/simplify_deployment/src/simplify_deployment/genomes/lag_16_si_only.yaml",
)
# One estimator object is reused; it is fitted again on every fold's
# training data before predicting that fold's test data.
model = LinearRegression()
prediction_list = []
for fold in range(N_FOLDS):
    # Raw (untransformed) train/test inputs and targets for this fold.
    X_train_raw = pd.read_parquet(f"{FOLDS_DIR}/X_train_fold_{fold}.parquet")
    y_train_raw = pd.read_parquet(f"{FOLDS_DIR}/y_train_fold_{fold}.parquet")

    X_test_raw = pd.read_parquet(f"{FOLDS_DIR}/X_test_fold_{fold}.parquet")
    y_test_raw = pd.read_parquet(f"{FOLDS_DIR}/y_test_fold_{fold}.parquet")

    # The organism turns the raw frames into the (y, X) pair used for
    # modelling; train and test go through the same call.
    y_train, X_train = org_latest_4.create_y_X(
        y_train_raw,
        X_train_raw,
    )

    y_test, X_test = org_latest_4.create_y_X(
        y_test_raw,
        X_test_raw,
    )

    model.fit(
        X_train,
        y_train,
    )
    single_prediction = pd.DataFrame(
        {
            "y_true_latest_4_lag_16": y_test,
            "y_pred_latest_4_lag_16": model.predict(X_test),
        }
    )
    # Residuals use the same sign convention as the GA predictions:
    # y_true - y_pred.
    single_prediction["residuals_latest_4_lag_16"] = (
        single_prediction["y_true_latest_4_lag_16"]
        - single_prediction["y_pred_latest_4_lag_16"]
    )
    prediction_list.append(single_prediction)
    print(f"Fold {fold} done.")

# Stack the per-fold test predictions row-wise.
# NOTE(review): assumes the folds' test indices do not overlap; overlapping
# folds would yield duplicate index values here — TODO confirm.
prediction_latest_4 = pd.concat(
    prediction_list,
    axis=0,
)

Fold 0 done.
Fold 1 done.
Fold 2 done.
Fold 3 done.
Fold 4 done.
Fold 5 done.
Fold 6 done.
Fold 7 done.
Fold 8 done.
Fold 9 done.
Fold 10 done.
Fold 11 done.


In [43]:
# Combine both prediction sets side by side, matching rows on the index.
# The inner join keeps only index values that occur in both frames.
df = predictions.merge(
    prediction_latest_4,
    how="inner",
    left_index=True,
    right_index=True,
)
df

Unnamed: 0,y_pred_ga_lag_16,y_true_ga_lag_16,residuals_ga_lag_16,y_true_latest_4_lag_16,y_pred_latest_4_lag_16,residuals_latest_4_lag_16
2022-11-29 23:29:00+00:00,-98.429577,-94.130,4.299577,-94.130,-17.255989,-76.874011
2022-11-29 23:44:00+00:00,-128.322369,-162.900,-34.577631,-162.900,-140.810674,-22.089326
2022-11-29 23:59:00+00:00,-70.687126,-63.882,6.805126,-63.882,-99.571445,35.689445
2022-11-30 00:14:00+00:00,-56.373276,-291.893,-235.519724,-291.893,-61.996481,-229.896519
2022-11-30 00:29:00+00:00,-207.043495,5.630,212.673495,5.630,-194.746090,200.376090
...,...,...,...,...,...,...
2023-06-30 20:59:00+00:00,36.928324,-29.614,-66.542324,-29.614,-0.377607,-29.236393
2023-06-30 21:14:00+00:00,-56.405676,-40.824,15.581676,-40.824,-19.582549,-21.241451
2023-06-30 21:29:00+00:00,18.784111,10.249,-8.535111,10.249,-12.187900,22.436900
2023-06-30 21:44:00+00:00,23.915048,-6.148,-30.063048,-6.148,18.778816,-24.926816


In [44]:
# Root-mean-square of the "residuals_ga_lag_16" column of the merged frame.
rmse_ga = np.sqrt(df["residuals_ga_lag_16"].pow(2).mean())
rmse_ga

111.75359628102181

In [45]:
# Root-mean-square of the "residuals_latest_4_lag_16" column of the merged frame.
rmse_latest_4 = np.sqrt(df["residuals_latest_4_lag_16"].pow(2).mean())
rmse_latest_4

119.11170899041707